Python/urllib
< Python
urllib
Basic GET
# Fetch a URL with urllib2 (Python 2) and read the entire body.
import urllib2

response = urllib2.urlopen('http://python.org/')
html = response.read()
import urllib2 response = urllib2.urlopen('http://python.org/') if response.code != 200: print "failure"
POST data:
import urllib import urllib2 url = 'http://www.someserver.com/cgi-bin/register.cgi' values = {'name' : 'Michael Foord', 'location' : 'Northampton', 'language' : 'Python' } data = urllib.urlencode(values) req = urllib2.Request(url, data) response = urllib2.urlopen(req) the_page = response.read() print the_page
Request Headers
# Set request headers by passing a header dict to Request (Python 2).
# ... (url and values are assumed to be defined earlier)
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0"
headers = {'User-Agent': user_agent}
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
# Or set headers one at a time with add_header (Python 2).
request = urllib2.Request('http://your.tld/...')
request.add_header('User-Agent', 'some fake agent string')
request.add_header('Referer', 'fake referrer')
# ... (further request setup elided)
response = urllib2.urlopen(request)
Response Headers
# Inspect headers on a urllib2 response object (Python 2).
print response.info().getheader('Content-Type')
print response.info().headers # list
response.url # response url
Cookies
# Automatic cookie handling: build an opener backed by a CookieJar (Python 2).
import cookielib

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# use this opener for all future requests so cookies persist across them
req = urllib2.Request(url, postdata, headers)
response = opener.open(req)
References:
- HOWTO Fetch Internet Resources Using urllib2 — Python v2.7.6 documentation - https://docs.python.org/2/howto/urllib2.html
Progress Bar
Progress bar: [1]
# Download a file and report progress via urlretrieve's reporthook (Python 2).
# NOTE(review): the original snippet called urlretrieve before dlProgress was
# defined, which raises NameError when run top-to-bottom; the function
# definition is moved ahead of the call here.

def dlProgress(count, blockSize, totalSize):
    # count: number of blocks transferred so far
    # blockSize: size of each block in bytes
    # totalSize: total file size in bytes (reported by the server)
    percent = int(count*blockSize*100/totalSize)
    sys.stdout.write("\r" + rem_file + "...%d%%" % percent)
    sys.stdout.flush()

global rem_file  # global variable to be used in dlProgress
urllib.urlretrieve(rem_file, loc_file, reporthook=dlProgress)
- http - Python urllib2 Progress Hook - Stack Overflow - http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
- How to write a download progress indicator in Python? - Python - K210.ORG - http://k210.org/python/how_to_write_a_download_progress_indicator_in_python/
Read web page 2.x
# Read a web page and its headers with urllib (Python 2).
import urllib

resp = urllib.urlopen('http://www.python.org')
html = resp.read()
head = resp.headers
headers = head.keys()  # or 'for header in head' or 'head.items()'
head['set-cookie']     # look up one header by name
URL Encode
#!/usr/bin/env python # urlencode a message from stdin or command parameters # author: Kenneth Burgener <kenneth@k.ttak.org> (c) 2013 import urllib import sys import select msg = "" if select.select([sys.stdin,],[],[],0.0)[0]: msg = sys.stdin.readline().strip() + " " if len(sys.argv) > 1: msg += " ".join(sys.argv[1:]) msg = msg.strip() print urllib.quote(msg)
cookies
cookies: [2]
# Manually carry a cookie from one request to the next (Python 2).
import urllib2

req1 = urllib2.Request(url1)
response = urllib2.urlopen(req1)
cookie = response.headers.get('Set-Cookie')

# Use the cookie in subsequent requests
req2 = urllib2.Request(url2)
req2.add_header('cookie', cookie)
response = urllib2.urlopen(req2)
Read web page 3.x
# Read a web page with urllib.request (Python 3).
import urllib.request

resp = urllib.request.urlopen('http://www.python.org')
html = resp.read()
Convert byte characters to string
# In Python 3 read() returns bytes; decode to get a str.
print(f.read(100).decode('utf-8'))
Download File
Download file: (2.x)
# Save a URL to a local file (Python 2).
import urllib

urllib.urlretrieve(url, filename)
Download file: (3.x)
# Save a URL to a local file (Python 3).
import urllib.request

urllib.request.urlretrieve(url, filename)
Open web browser
# Open a URL in the system's default web browser.
import webbrowser

webbrowser.open('http://www.google.com')
HTTP Basic Authentication
HTTP Basic Authentication [3]
# HTTP Basic Authentication by building the Authorization header by hand
# (Python 2).
# NOTE(review): the original had .replace('\n', ) -- a syntax error (the
# replacement argument was missing); fixed to .replace('\n', '').
import urllib2, base64

request = urllib2.Request("http://api.foursquare.com/v1/user")
# base64.encodestring appends a trailing newline; strip it before use.
base64string = base64.encodestring('%s:%s' % (username, password)).replace('\n', '')
request.add_header("Authorization", "Basic %s" % base64string)
result = urllib2.urlopen(request)
HTTP Basic Authentication Handler
HTTP Basic Authentication Handler: [4] [5] [6]
# HTTP Basic Auth via a password manager and handler (Python 2).
import urllib2

auth = urllib2.HTTPPasswordMgrWithDefaultRealm()
auth.add_password(None, 'http://twitter.com/account/', username, password)
auth_handler = urllib2.HTTPBasicAuthHandler(auth)
url_opener = urllib2.build_opener(auth_handler)
# Note: this will change the default opener.
# If you do not wish to do this, simply use "url_opener.open(url_request)" everywhere.
urllib2.install_opener(url_opener)
url_request = urllib2.Request('https://api.twitter.com/1/statuses/user_timeline.json?%s' % twitter_args)
url_output = urllib2.urlopen(url_request).read()
# Another hand-built Basic auth header; [:-1] strips the trailing newline
# that base64.encodestring appends (Python 2).
theurl = 'http://10.10.10.135/image.jpg'
req = urllib2.Request(theurl)
base64string = base64.encodestring('%s:%s' % (username, password))[:-1]
authheader = "Basic %s" % base64string
req.add_header("Authorization", authheader)
resp = urllib2.urlopen(req)
img = resp.read()
# The following has the problem of being required to already know the realm.
import urllib2

TRIM_API_URL = 'http://api.tr.im/api'
auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='tr.im',
                          uri=TRIM_API_URL,
                          user=USERNAME,
                          passwd=PASSWORD)
url_opener = urllib2.build_opener(auth_handler)
urllib2.install_opener(url_opener)
response = urllib2.urlopen('%s/trim_simple?url=%s' % (TRIM_API_URL, url_to_trim))
url = response.read().strip()
urllib Session Tracking
Session Tracking:
import urllib import urllib2 import random print "-" * 10, "REQ1", "-" * 10 data = {'username': 'test', 'password': 'password1', 'id': str(random.randint(1, 100)), } data = urllib.urlencode(data) req1 = urllib2.Request('http://demo.oeey.com/session_a.php', data) req1.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0') print "REQ1 HEADERS:", req1.headers.items() print "REQ1 DATA:", req1.data print "-" * 10, "RESP1", "-" * 10 resp1 = urllib2.urlopen(req1) cookie = resp1.headers.get('Set-Cookie') print 'RESP1 URL:', resp1.geturl() # new url, if redirected print 'RESP1 CODE:', resp1.getcode() # 200 print 'RESP1 COOKIE:', cookie print 'RESP1 HEADERS:', resp1.headers.items() print "RESP1 HTML:\n", resp1.read() print "-" * 10, "REQ2", "-" * 10 data = { 'product': '3', } data = urllib.urlencode(data) req2 = urllib2.Request('http://demo.oeey.com/session_b.php', data) req2.add_header('Cookie', cookie) req2.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0') print "REQ2 HEADERS:", req2.headers.items() print "REQ2 DATA:", req2.data print "-" * 10, "RESP2", "-" * 10 resp2 = urllib2.urlopen(req2) print 'RESP2 URL:', resp2.geturl() # new url, if redirected print 'RESP2 CODE:', resp2.getcode() # 200 print 'RESP2 HEADERS:', resp2.headers.items() print "RESP2 HTML:\n", resp2.read()
session_a.php:
<?php
// Start Session
session_start();

// Test redirect - resp1.geturl()
//$_SESSION['id'] = 'redirect';
//header('Location: http://demo.oeey.com/session_b.php');

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST); // $_REQUEST is combination of $_GET, $_POST and $_COOKIE.

// Remember the client-supplied id in the session, or generate one
if (isset($_REQUEST['id'])) {
    $_SESSION['id'] = $_REQUEST['id'];
} else {
    $_SESSION['id'] = rand();
}
echo "Request ID: " . $_SESSION['id'];
session_b.php:
<?php
// Start Session
session_start();

// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);

// Show session data
echo "Session Data: \n";
print_r($_SESSION);

// Show ID stored by session_a.php, if any
echo "ID: \n";
if (isset($_SESSION['id'])) {
    echo $_SESSION['id'];
} else {
    echo "ID not set";
}
simpler example
### LOGON url = 'http://apc.oeey.com/login.tgi' values = {'Username' : 'admin', 'Password' : 'admin', } data = urllib.urlencode(values) req = urllib2.Request(url, data) response = urllib2.urlopen(req, timeout=3) cookie = response.headers.get('Set-Cookie') print cookie # 'DLILPC="W5J/nTupJF0hyrv"; Version=1; Path=/' ### POWER OFF url = 'http://apc.oeey.com/outlet?8=OFF' req = urllib2.Request(url) req.add_header('Cookie', cookie) response = urllib2.urlopen(req, timeout=3) ### LOGOUT url = 'http://apc.oeey.com/logout' req = urllib2.Request(url) req.add_header('Cookie', cookie) response = urllib2.urlopen(req, timeout=3) cookie = response.headers.get('Set-Cookie') print cookie # 'DLILPC=""; Version=1; Max-Age=0; Path=/'
URL Cookie Session Tracking
import urllib import urllib2 import re url = 'http://apc.oeey.com/Forms/login1' values = {'login_username' : 'apc', 'login_password' : 'apc', } data = urllib.urlencode(values) req = urllib2.Request(url, data) response = urllib2.urlopen(req, timeout=3) # exceptions: # socket.timeout: timed out # urllib2.HTTPError: HTTP Error 403: Forbidden print response.url # http://apc.oeey.com/NMC/GGcOPeRq8+FWctMifeoezA/home.htm # get cookie match = re.findall('http://apc.oeey.com/NMC/(.*)/home.htm', response.url) print match cookie = match[0] # use cookie: url2 = 'http://apc.oeey.com/NMC/{}/Forms/outlctrl1'.format(cookie) values2 = { 'rPDUOutletCtrl': '4', 'OL_Cntrl_Col1_Btn': '?8,2', 'submit': 'Next >>'} data2 = urllib.urlencode(values2) req2 = urllib2.Request(url2, data2) response2 = urllib2.urlopen(req2) print response2.url # logoff: url6 = 'http://apc.oeey.com/NMC/{}/logout.htm'.format(cookie) req6 = urllib2.Request(url6) response6 = urllib2.urlopen(req6) print response6.url # http://apc.oeey.com/NMC/X7tmWWC4oYI0Z4hQbnlLaQ/logout.htm #html = response6.read()