Python/urllib: Difference between revisions
< Python
(No difference)
|
Latest revision as of 23:24, 8 January 2018
urllib
Basic Get
# Python 2: fetch a page with a plain GET and read the whole body into memory.
import urllib2
response = urllib2.urlopen('http://python.org/')
html = response.read()
import urllib2
response = urllib2.urlopen('http://python.org/')
if response.code != 200:
print "failure"
POST data:
import urllib
import urllib2
url = 'http://www.someserver.com/cgi-bin/register.cgi'
values = {'name' : 'Michael Foord',
'location' : 'Northampton',
'language' : 'Python' }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
the_page = response.read()
print the_page
Request Headers
...
# Option 1: pass all headers as a dict when constructing the Request.
# (`url` and `values` come from the elided setup above.)
user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0"
headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
# or
# Option 2: add headers one at a time to an existing Request.
request = urllib2.Request('http://your.tld/...')
request.add_header('User-Agent', 'some fake agent string')
request.add_header('Referer', 'fake referrer')
...
response = urllib2.urlopen(request)
Response Header
# Inspect a urllib2 response: one header value, the raw header list, and the URL.
print response.info().getheader('Content-Type')
print response.info().headers # list
response.url # response url
Cookies
import cookielib
import urllib2

# The jar collects cookies from responses and replays them on later requests
# made through the same opener.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# use opener for all future requests
req = urllib2.Request(url, postdata, headers)
response = opener.open(req)
References:
- HOWTO Fetch Internet Resources Using urllib2 — Python v2.7.6 documentation - https://docs.python.org/2/howto/urllib2.html
Progress Bar
Progress bar: [1]
# NOTE(review): a `global` statement at module level has no effect; rem_file
# only needs to be a module-level name so that dlProgress can read it.
global rem_file # global variable to be used in dlProgress
# dlProgress is passed as the report hook, so it must already be defined
# by the time this call executes.
urllib.urlretrieve(rem_file, loc_file, reporthook=dlProgress)
import sys


def dlProgress(count, blockSize, totalSize):
    """Report hook for urllib.urlretrieve() that rewrites one progress line.

    count      -- number of blocks transferred so far
    blockSize  -- size of one block in bytes
    totalSize  -- total size in bytes; urlretrieve passes -1 (or 0) when the
                  server does not report a size

    Reads the module-level name ``rem_file`` as the label to print.
    """
    received = count * blockSize
    if totalSize > 0:
        # The final block is usually short, so count * blockSize can
        # overshoot the real size; cap the figure at 100%.
        percent = min(100, int(received * 100 / totalSize))
        status = "...%d%%" % percent
    else:
        # Size unknown: a percentage would divide by zero or go negative.
        status = "...%d bytes" % received
    # "\r" returns to the start of the line so each update overwrites the last.
    sys.stdout.write("\r" + rem_file + status)
    sys.stdout.flush()
- http - Python urllib2 Progress Hook - Stack Overflow - http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
- How to write a download progress indicator in Python? - Python - K210.ORG - http://k210.org/python/how_to_write_a_download_progress_indicator_in_python/
Read web page 2.x
# Python 2: open a URL with urllib, then read the body and the headers.
import urllib
resp = urllib.urlopen('http://www.python.org')
html = resp.read()
head = resp.headers
headers = head.keys() # or 'for header in head' or 'head.items()'
# NOTE(review): presumably raises KeyError when no Set-Cookie header was sent — confirm
head['set-cookie']
URL Encode
#!/usr/bin/env python
# urlencode a message from stdin or command parameters
# author: Kenneth Burgener <kenneth@k.ttak.org> (c) 2013
import urllib
import sys
import select
msg = ""
if select.select([sys.stdin,],[],[],0.0)[0]:
msg = sys.stdin.readline().strip() + " "
if len(sys.argv) > 1:
msg += " ".join(sys.argv[1:])
msg = msg.strip()
print urllib.quote(msg)
cookies
cookies: [2]
import urllib2
# First request: capture the raw Set-Cookie header from the response.
req1 = urllib2.Request(url1)
response = urllib2.urlopen(req1)
cookie = response.headers.get('Set-Cookie')
# Use the cookie in subsequent requests
req2 = urllib2.Request(url2)
req2.add_header('cookie', cookie)
response = urllib2.urlopen(req2)
Read web page 3.x
# Python 3: urlopen lives in urllib.request.
import urllib.request
resp = urllib.request.urlopen('http://www.python.org')
html = resp.read()
Convert byte characters to string
# Read up to 100 bytes and decode them as UTF-8 before printing.
# (presumably `f` is an open response such as the one urlopen() returns — confirm)
print(f.read(100).decode('utf-8'))
Download File
Download file: (2.x)
import urllib
# Download url and save it to the local path filename.
urllib.urlretrieve(url, filename)
Download file: (3.x)
import urllib.request
# Download url and save it to the local path filename.
urllib.request.urlretrieve(url, filename)
Open web browser
# Open the URL with the webbrowser module.
import webbrowser
webbrowser.open('http://www.google.com')
HTTP Basic Authentication
HTTP Basic Authentication [3]
import urllib2, base64
request = urllib2.Request("http://api.foursquare.com/v1/user")
# encodestring() ends its output with a newline (and wraps long input), so
# every '\n' is removed before the value goes into a header.
base64string = base64.encodestring('%s:%s' % (username, password)).replace('\n', '')
request.add_header("Authorization", "Basic %s" % base64string)
result = urllib2.urlopen(request)
HTTP Basic Authentication Handler
HTTP Basic Authentication Handler: [4] [5] [6]
import urllib2
# Password manager keyed by URI; None as the realm means "any realm".
auth = urllib2.HTTPPasswordMgrWithDefaultRealm()
auth.add_password(None, 'http://twitter.com/account/', username, password)
auth_handler = urllib2.HTTPBasicAuthHandler(auth)
url_opener = urllib2.build_opener(auth_handler)
# Note: this will change the default opener.
# if you do not wish to do this, simply use "url_opener.open(url_request)" everywhere.
urllib2.install_opener(url_opener)
# NOTE(review): the credentials above are registered for
# http://twitter.com/account/ but this request targets https://api.twitter.com/
# — confirm the handler actually sends them for this URL.
url_request = urllib2.Request('https://api.twitter.com/1/statuses/user_timeline.json?%s' % twitter_args)
url_output = urllib2.urlopen(url_request).read()
import base64
import urllib2

theurl = 'http://10.10.10.135/image.jpg'
req = urllib2.Request(theurl)
# b64encode() never inserts line breaks, unlike encodestring(), whose output
# for long credentials contains embedded newlines that corrupt the header.
base64string = base64.b64encode('%s:%s' % (username, password))
authheader = "Basic %s" % base64string
# Send the credentials with the first request rather than waiting for a 401.
req.add_header("Authorization", authheader)
resp = urllib2.urlopen(req)
img = resp.read()
# the following has the problem of being required to already know the realm
import urllib2
TRIM_API_URL = 'http://api.tr.im/api'
# Register the credentials for one explicit realm/URI pair.
auth_handler = urllib2.HTTPBasicAuthHandler()
auth_handler.add_password(realm='tr.im',
uri=TRIM_API_URL,
user=USERNAME,
passwd=PASSWORD)
url_opener = urllib2.build_opener(auth_handler)
# Make this opener the default for every later urllib2.urlopen() call.
urllib2.install_opener(url_opener)
response = urllib2.urlopen('%s/trim_simple?url=%s'
% (TRIM_API_URL, url_to_trim))
url = response.read().strip()
urllib Session Tracking
Session Tracking:
# Two-request session demo: POST to session_a.php, capture the Set-Cookie
# header from its response, then replay it as a Cookie header on a second
# POST to session_b.php.
import urllib
import urllib2
import random
print "-" * 10, "REQ1", "-" * 10
data = {'username': 'test',
'password': 'password1',
'id': str(random.randint(1, 100)),
}
data = urllib.urlencode(data)
req1 = urllib2.Request('http://demo.oeey.com/session_a.php', data)
req1.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0')
print "REQ1 HEADERS:", req1.headers.items()
print "REQ1 DATA:", req1.data
print "-" * 10, "RESP1", "-" * 10
resp1 = urllib2.urlopen(req1)
# Raw Set-Cookie header value; reused verbatim on the second request.
cookie = resp1.headers.get('Set-Cookie')
print 'RESP1 URL:', resp1.geturl() # new url, if redirected
print 'RESP1 CODE:', resp1.getcode() # 200
print 'RESP1 COOKIE:', cookie
print 'RESP1 HEADERS:', resp1.headers.items()
print "RESP1 HTML:\n", resp1.read()
print "-" * 10, "REQ2", "-" * 10
data = {
'product': '3',
}
data = urllib.urlencode(data)
req2 = urllib2.Request('http://demo.oeey.com/session_b.php', data)
# Send the cookie captured from the first response with this request.
req2.add_header('Cookie', cookie)
req2.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0')
print "REQ2 HEADERS:", req2.headers.items()
print "REQ2 DATA:", req2.data
print "-" * 10, "RESP2", "-" * 10
resp2 = urllib2.urlopen(req2)
print 'RESP2 URL:', resp2.geturl() # new url, if redirected
print 'RESP2 CODE:', resp2.getcode() # 200
print 'RESP2 HEADERS:', resp2.headers.items()
print "RESP2 HTML:\n", resp2.read()
session_a.php:
<?php
// Echoes the request data and stores an 'id' value in the session.
// Start Session
session_start();
// Test redirect - resp1.geturl()
//$_SESSION['id'] = 'redirect';
//header('Location: http://demo.oeey.com/session_b.php');
// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);
// $_REQUEST is combination of $_GET, $_POST and $_COOKIE.
// Use the submitted id when present, otherwise fall back to a random one.
if(isset($_REQUEST['id'])) {
$_SESSION['id'] = $_REQUEST['id'];
} else {
$_SESSION['id'] = rand();
}
echo "Request ID: " . $_SESSION['id'];
session_b.php:
<?php
// Echoes the request data, the session data, and the 'id' stored in the session.
// Start Session
session_start();
// Show REQUEST data
echo "Request Data: \n";
print_r($_REQUEST);
// Show session data
echo "Session Data: \n";
print_r($_SESSION);
// Show ID
echo "ID: \n";
// Print the id from the session, or a marker when none is stored.
if(isset($_SESSION['id'])) {
echo $_SESSION['id'];
} else {
echo "ID not set";
}
simpler example
# Log on, issue one command, and log out, carrying the session by copying
# the Set-Cookie header into a Cookie header by hand.
# (uses urllib and urllib2; the imports are not shown in this fragment)
### LOGON
url = 'http://apc.oeey.com/login.tgi'
values = {'Username' : 'admin',
'Password' : 'admin', }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req, timeout=3)
cookie = response.headers.get('Set-Cookie')
print cookie # 'DLILPC="W5J/nTupJF0hyrv"; Version=1; Path=/'
### POWER OFF
url = 'http://apc.oeey.com/outlet?8=OFF'
req = urllib2.Request(url)
# Replay the logon cookie so the request is sent with the session.
req.add_header('Cookie', cookie)
response = urllib2.urlopen(req, timeout=3)
### LOGOUT
url = 'http://apc.oeey.com/logout'
req = urllib2.Request(url)
req.add_header('Cookie', cookie)
response = urllib2.urlopen(req, timeout=3)
cookie = response.headers.get('Set-Cookie')
print cookie # 'DLILPC=""; Version=1; Max-Age=0; Path=/'
URL Cookie Session Tracking
import urllib
import urllib2
import re
url = 'http://apc.oeey.com/Forms/login1'
values = {'login_username' : 'apc',
'login_password' : 'apc', }
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req, timeout=3)
# exceptions:
# socket.timeout: timed out
# urllib2.HTTPError: HTTP Error 403: Forbidden
print response.url # http://apc.oeey.com/NMC/GGcOPeRq8+FWctMifeoezA/home.htm
# get cookie
match = re.findall('http://apc.oeey.com/NMC/(.*)/home.htm', response.url)
print match
cookie = match[0]
# use cookie:
url2 = 'http://apc.oeey.com/NMC/{}/Forms/outlctrl1'.format(cookie)
values2 = { 'rPDUOutletCtrl': '4',
'OL_Cntrl_Col1_Btn': '?8,2',
'submit': 'Next >>'}
data2 = urllib.urlencode(values2)
req2 = urllib2.Request(url2, data2)
response2 = urllib2.urlopen(req2)
print response2.url
# logoff:
url6 = 'http://apc.oeey.com/NMC/{}/logout.htm'.format(cookie)
req6 = urllib2.Request(url6)
response6 = urllib2.urlopen(req6)
print response6.url # http://apc.oeey.com/NMC/X7tmWWC4oYI0Z4hQbnlLaQ/logout.htm
#html = response6.read()