Pythonic Prose: urllib

Showing posts with label urllib. Show all posts

Tuesday, October 6, 2009

Python - read robots.txt files with ease

# The robot parser gives access to a website's robots.txt file.

import robotparser
# Note: in Python 3 robotparser is found inside the urllib package,
# at urllib.robotparser.

# Related examples using urllib:
#   - copy image (or file) off web
#   - alter user agent string
#   - browse the web with python


# the robots.txt file to inspect
url = "http://pythonicprose.blogspot.com/robots.txt"

parser = robotparser.RobotFileParser()
parser.set_url(url)

# fetch the file and parse through its rules
parser.read()

# A web crawler or spider may need to keep track of how long it has
# been since robots.txt was last read: modified() marks the time and
# mtime() reads it back.
parser.modified()

# reading the marked time (the value is not used further here)
parser.mtime()

# may any user agent ("*") read the home page?
print(parser.can_fetch("*", "/"))
# output:
#   True

# may any user agent read the search page?
print(parser.can_fetch("*", "/search"))
# output:
#   False

# this many lines below set_url() the parser can still tell us
# which host it is processing
print(parser.host)
# output:
#   pythonicprose.blogspot.com

Wednesday, September 23, 2009

Python - copy images (or any file) from the web to local machine

# Download an image from the web and keep a copy on the local machine.
#  (this should work for any file on the web, not just images)

import urllib
import os

source_url = "http://www.example.com/images/example_image.jpg"
destination = "local_copy.jpg"

# fetch the remote file and store it under the destination name
urllib.urlretrieve(source_url, destination)

# show that the file was copied by listing every entry
# in the current working directory
directory_entries = os.listdir(os.curdir)
print(directory_entries)

Monday, September 21, 2009

Python - alter user agent string in web request

# Sometimes websites deny access to bots or unsupported browsers.
# Python allows you to change the user agent string of a request so you
# can impersonate any browser you desire.

import urllib
import urllib2

# unaltered header.

print("sending a request using the default python user agent string...")
url = 'http://www.example.com'
req = urllib2.Request(url)

response = urllib2.urlopen(req)
try:
    the_page = response.read()
finally:
    # release the connection even if reading the body fails
    response.close()


# alter the header to look like a real browser
print("sending a request with an altered user agent string...")
url = 'http://www.example.com'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

headers = { 'User-Agent' : user_agent }
# data=None means no request body is sent; only the headers are customised
req = urllib2.Request(url, data=None, headers=headers)

response = urllib2.urlopen(req)
try:
    the_page = response.read()
finally:
    # release the connection even if reading the body fails
    response.close()



# Apache logs on the server indicate:
#   a request made without altering the user_agent
#       "POST / HTTP/1.1" 200 2022 "-" "Python-urllib/2.6"
#
#   a request made with the altered user_agent
#       "POST / HTTP/1.1" 200 2022 "-" "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
#
# NOTE(review): neither request above carries a data payload, and urllib2
# issues a GET in that case, so these POST entries were presumably captured
# from a variant of the script that passed data -- confirm before relying
# on the method shown. The user agent column is the point of the example.

Python - browse the web with python

# retrieve the html from a web site
import urllib2
import urllib

# query string
# in this example these GET parameters don't
#   do anything.  They are just here to show
#   off the urlencode() function
qs = {}
qs['q'] = "items to search for"
qs['i'] = 22
qs_values = urllib.urlencode(qs)

# append everything together
url = "http://www.example.com"
full_url = url + '?' + qs_values
print("my full url: %s" % (full_url))

# get the data from the web
data = urllib2.urlopen(full_url)

# data now has all the html/css/javascript in it
try:
    for item in data:
        # each line keeps its own trailing newline, so print adds a
        # blank line after it (visible in the output below)
        print(item)
finally:
    # release the connection once the page has been printed,
    # or if printing fails part-way through
    data.close()


## output:
##my full url: http://www.example.com?q=items+to+search+for&i=22
##<HTML>
##
##<HEAD>
##
##  <TITLE>Example Web Page</TITLE>
##
##</HEAD>
##
##<body>
##
##<p>You have reached this web page by typing &quot;example.com&quot;,
##
##&quot;example.net&quot;,
##
##  or &quot;example.org&quot; into your web browser.</p>
##
##<p>These domain names are reserved for use in documentation and are not available
##
##  for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC
##
##  2606</a>, Section 3.</p>
##
##</BODY>
##
##</HTML>