# Sometimes websites deny access to bots or unsupported browsers.
# Python allows you to change the User-Agent string so you can
# impersonate any browser you desire.
import urllib
import urllib2
# Unaltered header: fetch the page with urllib2's default User-Agent.
# This snippet targets Python 2 (urllib2). The parenthesized print form
# prints exactly the same text as the print statement for one argument.
print("sending a request using the default python user agent string...")
url = 'http://www.example.com'
req = urllib2.Request(url)
response = urllib2.urlopen(req)
try:
    the_page = response.read()
finally:
    # Release the connection even if read() raises.
    response.close()

# Alter the header to look like a real browser.
print("sending a request with an altered user agent string...")
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
# data=None means no request body is sent; only the headers differ from
# the first request.
req = urllib2.Request(url, data=None, headers=headers)
response = urllib2.urlopen(req)
try:
    the_page = response.read()
finally:
    response.close()

# Apache logs on the server indicate:
# NOTE(review): the log lines below show POST, but both requests above pass
# no data, which urllib2 sends as GET. Presumably the logs were captured
# from a variant of this script that supplied a data argument -- confirm.
#
# a request made without altering the user_agent
# "POST / HTTP/1.1" 200 2022 "-" "Python-urllib/2.6"
#
# a request made with the altered user_agent
# "POST / HTTP/1.1" 200 2022 "-" "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
A Python example-based blog that shows how to accomplish Python goals and how to correct Python errors.
Showing posts with label urllib2. Show all posts
Showing posts with label urllib2. Show all posts
Monday, September 21, 2009
Python - alter user agent string in web request
Python - browse the web with python
# retrieve the html from a web site
import urllib2
import urllib
# Query string.
# In this example these GET parameters don't do anything on the server;
# they are only here to show off the urlencode() function.
qs = {}
qs['q'] = "items to search for"
qs['i'] = 22
qs_values = urllib.urlencode(qs)

# Append everything together: base URL + '?' + encoded parameters.
url = "http://www.example.com"
full_url = url + '?' + qs_values
print("my full url: %s" % full_url)

# Get the data from the web.
data = urllib2.urlopen(full_url)
try:
    # data now has all the html/css/javascript in it; iterating the
    # response yields it one line at a time.
    for item in data:
        # Each line keeps its own trailing newline, so print adds a second
        # one (hence the blank lines in the sample output).
        print(item)
finally:
    # Release the connection even if reading fails part-way through.
    data.close()
## output:
##my full url: http://www.example.com?q=items+to+search+for&i=22
##<HTML>
##
##<HEAD>
##
## <TITLE>Example Web Page</TITLE>
##
##</HEAD>
##
##<body>
##
##<p>You have reached this web page by typing "example.com",
##
##"example.net",
##
## or "example.org" into your web browser.</p>
##
##<p>These domain names are reserved for use in documentation and are not available
##
## for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC
##
## 2606</a>, Section 3.</p>
##
##</BODY>
##
##</HTML>
Subscribe to:
Posts (Atom)