Pythonic Prose: October 2009

Tuesday, October 27, 2009

Python - break a large mysql dump into small dumps

# created a dumpfile from my mysql db
# and found that it was too large to
# upload to my new db host.
 
# this python script breaks up the database
# into smaller pieces that you can more
# easily import through phpmyadmin
# (if I'd only had shell access I wouldn't
# have this problem at all!)
 
# indicates a new table is about to be
# created
dlmtr = "-- Table structure for table"
 
wholeFile = open("myDBDump.sql")
fileN = 0
oFile = open(str(fileN) + ".sql", 'w')
reducing = True
 
for line in wholeFile:
    if line.find(dlmtr) > -1:
        # this is the seam for the next file
        print "starting new file"
        oFile.close()
        fileN += 1
        oFile = open(str(fileN) + ".sql", 'w')
    oFile.write(line)
 
oFile.close()
print "Done"

Python - extract a tar.gz archive

# extract a tar.gz file with the tarfile module
import tarfile
 
# open the tarfile and use the 'r:gz' parameter
# the 'r:gz' mode enables gzip compression reading
tfile = tarfile.open("archive.tar.gz", 'r:gz')
try:
    # 99.9% of the time you just want to extract all
    # the contents of the archive.
    # NOTE: extractall() uses the paths stored inside the archive, so
    # only run it on archives that come from a source you trust.
    tfile.extractall('.')
finally:
    # release the underlying file handle even if extraction fails
    tfile.close()
 
# Maybe this isn't so amazing for you types out
# there using *nix, os x, or (anything other than
# windows that comes with tar and gunzip scripts).
# However, if you find yourself on windows and
# need to extract a tar.gz you're in for quite the
# freeware/spyware/spamware gauntlet.
 
# Python has everything you need built in!
# Hooray for python!
print "Done!"

Monday, October 26, 2009

Python - extract or unzip a tar file

# untar a tar file with python
# python can open, inspect contents, and extract
#   tar files with the built-in
#   tarfile module.
import tarfile
 
# tar file to extract
theTarFile = 'example.tar'
 
# tar file path to extract
extractTarPath = '.'
 
# open the tar file
tfile = tarfile.open(theTarFile)
 
if tarfile.is_tarfile(theTarFile):
    # list all contents
    print "tar file contents:"
    print tfile.list(verbose=False)
    # extract all contents
    tfile.extractall(extractTarPath)
else:
    print theTarFile + " is not a tarfile."

Tuesday, October 20, 2009

Python - reduce a web site's size

# I've recently started using iWeb (which for the non
# Mac OS X inclined is the application used to make web

# sites and what not).
#
# After creating an 80 page site I was horrified at the
# total size of the site.  Nearly 100 MB!! The site really wasn't
# very graphics intensive.  Each page had only one image on it
# at most!  I started examining the file structure and was
# horrified to realize that iWeb produces web sites that use
# huge image files.  Not even remotely compressed.

#
# I needed to rescale the jpg and png files down to
# reasonable sizes.


from PIL import Image
import glob

import os

# this is the default size for images:
size = 256, 256

# I provide 2 different sizes for other files
# that need to be larger.
# Identify the names of the files that need to
# be higher quality.
csize0files = 'PhotoGray_nav_bg.png', 'bg_round.jpg'

csize0 = 768, 768

# different custom sizes for other 'important files
#
csize1files = 'nonefiles', 'none'
csize1 = 512, 512

# create a list for all the files and then add
# them all in type by type.
# For my page I just had jpg and png images

all_matching_files = []
for i in glob.glob("*/*.jpg"):
    all_matching_files.append(i)

for i in glob.glob("*/*.png"):
    all_matching_files.append(i)


# if you are using this for iWeb checkout the file count!
# my 80 page site had 3000+ images!!!
print "total images to be resized: " + str(len(all_matching_files))
count = len(all_matching_files)

# loop through all the images and make changes
for infile in all_matching_files:
    scalesize = size
    im = Image.open(infile)
    # split out all the useful parts of the file's path

    thePath, theFile = os.path.split(infile)
    fileName, extension = os.path.splitext(theFile)
    # custom resize if necessary
    if theFile in csize1files:
        scalesize = csize1
    elif theFile in csize0files:
        scalesize = csize0

    # resize with PIL's awesome thumbnail method

    im.thumbnail(scalesize, Image.ANTIALIAS)

    # save back as appropriate type
    if extension == ".png":
        im.save(infile, "PNG")
    else:
        im.save(infile, "JPEG")

    count -= 1
    if count % 10 == 0:
        # output some useful stats

        print str(count) + " images remaining."

print "....done"

## output:
##    total images to be resized: 3907

##    3900 images remaining.
##    3890 images remaining.
##    ... [snip].....(there were a lot!)
##    30 images remaining.
##    20 images remaining.
##    10 images remaining.
##    0 images remaining.
##    ....done

# Running this script reduced my website size
# from 96MB to 39MB!!
#
# Certainly there is still room for improvement
# future posts will ideally be aimed at further
# efficiency gains.

Monday, October 19, 2009

Python - quickly update urls in a web page

# update a webpages url references. 
# Your web page has a collection of images and you 
# want to update all the "folder1" references 
# to the new folder you've populated called "newfolder" 
 
# normally you would open the actual html file 
# and iterate through the file one line at a time 
# like in this post.  
 
# but for simplicity sake lets just use a multi line string 
# for the example 
theString = """ <img src="folder1/pic2324.jpg" /> 
<img src="folder1/pic2255.png" /> 
<img src="folder2/pic552.jpg" /> 
<img src="folder1/pica2f.jpg" /> 
 
""" 
 
# all you need is to iterate through the 
# file and replace 'folder1' with 'newfolder' 
for line in theString.split('\n'):
    line = line.replace("folder1/", "newfolder/")
    print line
 
#output: 
#     <img src="newfolder/pic2324.jpg" /> 
#    <img src="newfolder/pic2255.png" /> 
#    <img src="folder2/pic552.jpg" /> 
#    <img src="newfolder/pica2f.jpg" />

Tuesday, October 6, 2009

Python - read robots.txt files with ease

# robot parser allows access to a websites 
# robots.txt file (more on robots.txt)  
 
import robotparser
# more on robotparser doc   
# Note: in python 3 robotparser will be found in 
# the urllib module at urllib.robotparser 
 
# examples using urllib 
#   - copy image (or file) off web  
#   - alter user agent string   
#   - browse the web with python   
 
 
# the site I want to read 
url = "http://pythonicprose.blogspot.com/robots.txt" 
 
rob = robotparser.RobotFileParser()
rob.set_url(url)
 
# read and parse through the file 
rob.read()
 
# if you are creating a web crawler or spider you may need to keep 
# track of how long it has been since you last read the robots.txt file 
# use modified to mark the time and mtime to read it 
rob.modified()
 
# to get the time: 
rob.mtime()
 
# check and see if any user agent can read the home page 
print rob.can_fetch("*", "/")
# output: 
#   True 
 
# check and see if any user agent can read the search page 
print rob.can_fetch("*", "/search")
# output: 
#   False 
 
# now that we are so many lines down from set_url we can check 
# the host we are processing 
print rob.host
# output: 
# 'pythonicprose.blogspot.com'

Sunday, October 4, 2009

Python - make your own class attributes iterable

# It can be useful to iterate through data contained 
# in your own custom objects. 
 
# Lets say you have your own class 
class ExampleClass(object):
    """Small container whose stored data can be iterated.

    Items added with addListItem() can be looped over either through
    iterateList() or by iterating the instance itself
    (``for item in instance``).
    """

    def __init__(self):
        self.objectList = []
        self.objectDict = {}
        self.maxItem = 100
        self.objectItem = ""

    def __iter__(self):
        """Iterate over the list items so the instance works in a for loop."""
        return iter(self.objectList)

    def iterateList(self):
        """Return the list of stored items (the live list, not a copy)."""
        return self.objectList

    def addListItem(self, item):
        """Append item to the stored list."""
        self.objectList.append(item)

    def addDictItem(self, item, value):
        """Store value in the dict under the key item."""
        self.objectDict[item] = value
 
 
# create an instance of the class
# and let's use its iterating methods
ec = ExampleClass()
 
# add some example data 
for i in xrange(10):
    ec.addListItem(i)
    ec.addDictItem(i, str(i)+"'s value")
 
# now that we have data lets iterate 
# through the data 
for item in ec.iterateList():
    print item
 
#output: 
#    0 
#    1 
#    2 
#    3 
#    4 
#    5 
#    6 
#    7 
#    8 
#    9

Saturday, October 3, 2009

Python - using sqlite3 module for persistent data

# The sqlite3 lets you create and use
# a database with just a file

import sqlite3
# more detailed python doc sqlite3 

import os
# in this example we get the current working dir path 

# Choose the file to use for the
# db and connect (create it)

conn = sqlite3.connect(os.path.abspath('.') + "tempdb")

# grab a cursor and we can create the db schema
c = conn.cursor()

# if you happen to run through this example a few times
# you may notice that the data is persistant.  For this example
# we'll ensure that we're starting from ground zero
# drop the database (if it exists)

c.execute('drop table if exists users')

# create a table
c.execute('create table users (name text, age text, email text)')

# insert data
c.execute("""insert into users values ('steve', '30', 'blah@blah.com')""")
c.execute("""insert into users values ('steve2', '32', 'blah@blah2.com')""")
c.execute("""insert into users values ('steve3', '33', 'blah@blah3.com')""")

#,
#                    ('steve II', '20', 'blah2@blah.com'),
#                    ('steve III', '10', 'blah3@blah.com')""")

# now lets select our data
c.execute('select * from users')

# iterate through the results with for each
for row in c:
    print row

# output:
#    (u'steve', u'30', u'blah@blah.com')
#    (u'steve2', u'32', u'blah@blah2.com')

#    (u'steve3', u'33', u'blah@blah3.com')

Python - create unit tests and ensure accurate documentation with doctest

""" 
The doctest module uses class and 
method documentation to run unit 
tests on your code. 
 
The doctest module reads the coding documentation 
you've created and uses that same documentation 
to conduct unit tests.  This helps ensure 
the documentation is accurate and creates a 
one stop destination for documentation and unit 
tests. 
""" 
 
class ExampleClass(object):
    """
    Example class that
    has one working method.

    >>> ec = ExampleClass()
    >>> ec.example(10)
    19

    >>> ec = ExampleClass()
    >>> ec.example(0)
    -1

    Parameters that cannot be converted to an int
    return nothing (None):

    >>> ec = ExampleClass()
    >>> ec.example("apple")
    >>> ec.example(None)

    Changing the multiplier changes the result (10 * 3 - 1):

    >>> ec = ExampleClass()
    >>> ec.a = 3
    >>> ec.example(10)
    29
    """

    def __init__(self):
        # multiplier and offset used by example()
        self.a = 2
        self.b = -1

    def example(self, n):
        """Return int(n) * a + b, or None when n cannot be made an int."""
        try:
            n = int(n)
        except (TypeError, ValueError):
            # ValueError: text such as "apple"; TypeError: None, lists, ...
            return None
        return n * self.a + self.b


if __name__ == '__main__':
    # run every example found in this module's docstrings
    import doctest
    doctest.testmod()
 
 
# this outputs: 
# nothing at all 
# if no errors are found then doctest doesn't complain 
 
# If you were to change some of the expected values in the
# documentation... for instance, take the last example:
#    >>> ec = ExampleClass()
#    >>> ec.a = 3
#    >>> ec.example(10)
#    29
# and change the expected response to 30
# 
# 
# the output would be: 
#    ********************************************************************** 
#    File "C:\python\docstringexample.py", line 27, in __main__.ExampleClass 
#    Failed example: 
#        ec.example(10) 
#    Expected: 
#        30 
#    Got: 
#        29 
#    ********************************************************************** 
#    1 items had failures: 
#       1 of   9 in __main__.ExampleClass 
#    ***Test Failed*** 1 failures.

Python - storing persistent objects in a file with shelve

# The shelve module is used to store objects in a file. 
# You use the file like a glorified dict with key, value 
# pairs. 
import shelve
# shelve python doc
# holds the example instances until they are written to the shelf
objList = []
# name of the shelf file on disk
filename = 'shelveFile.shelve' 
 
# open and or create the file 
# NOTE(review): the name `file` shadows Python 2's built-in `file`
# type for the rest of this script.
file = shelve.open(filename)
 
# Here is an example class we'll create 
# instances of and then store in the file 
class ExampleClass(object):
    """Plain record with three addends (a, b, c) and a key (k)."""

    def __init__(self):
        self.a, self.b, self.c = 0, 1, 2
        self.k = 0

    def getTotal(self):
        """Return the sum a + b + c."""
        total = self.a + self.b
        total = total + self.c
        return total
 
# create several instances 
for i in xrange(3):
    obj = ExampleClass()
    obj.k = i
    obj.a = i+1
    obj.b = i+2
    obj.c = i+3
    objList.append(obj)
 
# now add the objects to file object 
for i in objList:
    # keys are strings 
    file[str(i.k)] = i
 
# The sync command will explicitly 
# write changes to file 
file.sync()
 
# Closing the object will also execute 
# the sync command 
file.close()
 
# The file (and the 3 objects in it 
# are now saved. 
# Now we'll reopen and verify the data is there 
file2 = shelve.open(filename)
 
# Iterate through and print out 
# the object attributes (to verify 
# they are the values we assigned previously) 
for i in file2.keys():
    j = file2[str(i)]
    print "a,b,c,k = ", j.a, j.b, j.c, j.k
#output: 
#a,b,c,k =  1 2 3 0 
#a,b,c,k =  3 4 5 2 
#a,b,c,k =  2 3 4 1 
 
# You can edit these values. 
# Here will change all 'a' attributes to 7 
for i in file2.keys():
    # Take note of how these changes were made. 
    # You cannot merely alter an attribute 
    # like file2[str(i)].a = 7 (this will 
    # not work). 
    j = file2[str(i)]
    j.a = 7
    file2[str(j.k)] = j
 
# And verify that changes are made: 
for i in file2.keys():
    j = file2[str(i)]
    print "a,b,c,k = ", j.a, j.b, j.c, j.k
#output: 
#a,b,c,k =  7 2 3 0 
#a,b,c,k =  7 4 5 2 
#a,b,c,k =  7 3 4 1 
 
 
 
# now close the shelve file so you can 
# use the data objects another day. 
file2.close()

Thursday, October 1, 2009

Python - using filecmp to compare two or more files

# The filecmp module is a portable way to check
# whether two (or more) files are the same.

# The module only has two methods:
#   cmp(file1, file2)
#   cmpfiles(directory1, directory2, common)
import filecmp
# find more verbose filecmp python docs here  


# check whether two files are the same
# of course you need to have a /etc/hosts and .bak

#  for this example to work (feel free to change to filenames
#  that do exist on your setup)
if filecmp.cmp('/etc/hosts', '/etc/hosts.bak'):
    print "the files are the same"
else:
    print "the files are not the same"



# filecmp also allows you to compare directories
# you can use cmpfiles which returns 3 tuples:
#                           matches, mismatch, error
match, mismatch, error = filecmp.cmpfiles('folder1',
                                          'folder2',
                                            ['LICENSE.TXT',
                                             'README.TXT',
                                             'VERSION'])


print "Matching: ", match
print "Mismatched: ", mismatch
print "Errors: ", error

# output (for me):
# Matching: ['LICENSE.TXT', 'README.TXT']

# Mismatched: ['LICENSE']
# Errors: []

Python - copy or move files and directories

# shutil is used for high level copy and move needs. 
# shutil can operate on individual files or recursively 
# on a directory structure. 
import shutil
# shutil python doc
 
# of course these examples assume the
# file is in the current working directory
 
# it supports simply copying files 
# Step 1: copy hero.bmp to a new file named hero2.bmp
print "just copy file contents..." 
shutil.copy("hero.bmp", "hero2.bmp")
# The second hero2.bmp is now created. 
# However, all attributes of the file have been reset 
#   like creation dates and what not (depending 
#   on the file type) 
 
# to copy the stats from the first to the new copied  
# file you can use the copystat method. 
# To fix the first examples lack of stat copying 
# Step 2: apply hero.bmp's stat information to the copy made above
print "copy stats..." 
shutil.copystat("hero.bmp", "hero2.bmp")
 
# shutil can also copy an entire directory tree with copytree 
# stats are copied for the files. 
# NOTE(review): copytree is documented to require that the destination
# directory does not exist yet -- confirm C:/newtmp is absent first.
print "recursively copy directory tree..." 
shutil.copytree('C:/tmp','C:/newtmp')
 
# shutil can also remove a directory tree 
# (this deletes the directory created by the copytree call above,
# including everything inside it)
print "remove the copied directory tree..." 
shutil.rmtree('C:/newtmp')
 
#

Python - using glob to get lists of files and directories

import os.path
# glob is a simple and useful python module.
# It uses shell-style wildcards (not full regular
# expressions) to match directories and files for a
# given path.  If you've ever used the command line to
# 'ls' or 'dir' the current directory you may be aware
# that the command accepts * or ? or [] to
# match patterns.  glob is a python implementation
# of this functionality.
 
import glob
import os
 
# find all the .txt files in the current working directory  
print glob.glob('*.TXT')
# output: 
# ['LICENSE.txt', 'NEWS.txt', 'README.txt'] 
 
# you can also specify a full path 
# Here I'm searching for dll files in python 2.6 
print glob.glob('C:\Python26\DLLs\*.dll')
# output: 
#    ['C:\\Python26\\DLLs\\sqlite3.dll', 
#    'C:\\Python26\\DLLs\\tcl85.dll', 
#    'C:\\Python26\\DLLs\\tclpip85.dll', 
#    'C:\\Python26\\DLLs\\tk85.dll'] 
 
# If you are expecting a great deal of results 
# you should use the glob.iglob method that returns 
# matches as it goes and does not load everything 
# into memory first. 
# glob.iglob() example 
f = glob.iglob('C:\Python26\Lib\*')
 
spitItOut = True
while spitItOut:
    try:
        fileNameAndPath = f.next()
        # since glob gives you the full path you can 
        # use the output with some of the os module's methods 
        if os.path.isfile(fileNameAndPath):
            fileNameAndPath += " is a file." 
        else:
            fileNameAndPath += " is not a file." 
        print fileNameAndPath
    except StopIteration:
        spitItOut = False
 
#output (snipped a bit...since there a lot): 
#    C:\Python26\Lib\abc.py is a file. 
#    ....[snip] 
#    C:\Python26\Lib\compiler is not a file. 
#    ...[another snip] 
#    C:\Python26\Lib\getopt.py is a file. 
#    C:\Python26\Lib\getopt.pyc is a file. 
#    C:\Python26\Lib\getpass.py is a file. 
#    C:\Python26\Lib\gettext.py is a file. 
#    C:\Python26\Lib\glob.py is a file. 
#    C:\Python26\Lib\glob.pyc is a file.

Python - printing complex objects with pretty printing

# Pretty printing (using the pprint module) transforms 
# python objects into human readable output. 
# 
# Use pprint when you need to display a complex 
# data structure to users. 
 
 
import string
import pprint
# pprint python doc
 
d = {}
 
for i in string.ascii_lowercase:
    d[i] = string.ascii_lowercase.replace(i, ' ')
 
print "not useful output:" 
print d
# output: 
#   not useful output: 
#   {'a': ' bcdefghijklmnopqrstuvwxyz', 'c': 'ab defghijklmnopqrstuvwxyz', 'b': 
#   'a cdefghijklmnopqrstuvwxyz', 'e': 'abcd fghijklmnopqrstuvwxyz', 'd': 'abc 
#   efghijklmnopqrstuvwxyz', 'g': 'abcdef hijklmnopqrstuvwxyz', 'f': 'abcde ghij 
#   klmnopqrstuvwxyz', 'i': 'abcdefgh jklmnopqrstuvwxyz', 'h': 'abcdefg ijklmnop 
#   qrstuvwxyz', 'k': 'abcdefghij lmnopqrstuvwxyz', 'j': 'abcdefghi klmnopqrstuv 
#   wxyz', 'm': 'abcdefghijkl nopqrstuvwxyz', 'l': 'abcdefghijk mnopqrstuvwxyz', 
#   'o': 'abcdefghijklmn pqrstuvwxyz', 'n': 'abcdefghijklm opqrstuvwxyz', 'q': ' 
#   abcdefghijklmnop rstuvwxyz', 'p': 'abcdefghijklmno qrstuvwxyz', 's': 'abcdef 
#   ghijklmnopqr tuvwxyz', 'r': 'abcdefghijklmnopq stuvwxyz', 'u': 'abcdefghijkl 
#   mnopqrst vwxyz', 't': 'abcdefghijklmnopqrs uvwxyz', 'w': 'abcdefghijklmnopqr 
#   stuv xyz', 'v': 'abcdefghijklmnopqrstu wxyz', 'y': 'abcdefghijklmnopqrstuvwx 
#   z', 'x': 'abcdefghijklmnopqrstuvw yz', 'z': 'abcdefghijklmnopqrstuvwxy '} 
# 
# All the data is there but it is difficult to read. 
# You can use pprint (pretty print) to make things easy to read.  pprint 
#   formats python datastructures to be human readable. 
 
print "human readable output:" 
pprint.pprint(d, indent=4)
# output: 
#human readable output: 
#{   'a': ' bcdefghijklmnopqrstuvwxyz', 
#    'b': 'a cdefghijklmnopqrstuvwxyz', 
#    'c': 'ab defghijklmnopqrstuvwxyz', 
#    'd': 'abc efghijklmnopqrstuvwxyz', 
#    'e': 'abcd fghijklmnopqrstuvwxyz', 
#    'f': 'abcde ghijklmnopqrstuvwxyz', 
#    'g': 'abcdef hijklmnopqrstuvwxyz', 
#    'h': 'abcdefg ijklmnopqrstuvwxyz', 
#    'i': 'abcdefgh jklmnopqrstuvwxyz', 
#    'j': 'abcdefghi klmnopqrstuvwxyz', 
#    'k': 'abcdefghij lmnopqrstuvwxyz', 
#    'l': 'abcdefghijk mnopqrstuvwxyz', 
#    'm': 'abcdefghijkl nopqrstuvwxyz', 
#    'n': 'abcdefghijklm opqrstuvwxyz', 
#    'o': 'abcdefghijklmn pqrstuvwxyz', 
#    'p': 'abcdefghijklmno qrstuvwxyz', 
#    'q': 'abcdefghijklmnop rstuvwxyz', 
#    'r': 'abcdefghijklmnopq stuvwxyz', 
#    's': 'abcdefghijklmnopqr tuvwxyz', 
#    't': 'abcdefghijklmnopqrs uvwxyz', 
#    'u': 'abcdefghijklmnopqrst vwxyz', 
#    'v': 'abcdefghijklmnopqrstu wxyz', 
#    'w': 'abcdefghijklmnopqrstuv xyz', 
#    'x': 'abcdefghijklmnopqrstuvw yz', 
#    'y': 'abcdefghijklmnopqrstuvwx z', 
#    'z': 'abcdefghijklmnopqrstuvwxy '} 
# 
# Formatted in this fashion its easy to see what 
# data is being stored in the dict.

Python - hash with md5 and sha1 (and others!)

# There are many reasons to hash data.
# For this example we'll say that we
# want to hash passwords so we can store
# them in a database (or file)... or, for
# this example, a variable
 
# hashlib encapsulates the following functionality: 
#   md5 
#   sha1 
#   sha224 
#   sha256 
#   sha384 
#   sha512 
import hashlib
 
# When a user creates their account they'll input 
# a password.  For security purposes you hash 
# the password and store it (so they can log into 
# their account later). 
password = "$uperP@a$$w0rd" 
 
#pass the password to the sha1 constructor 
createSha1 = hashlib.sha1(password)
 
#dump the password out in text 
sha1_password = createSha1.hexdigest()
 
print sha1_password
#output: 
# 2d0b537e6673e1f6baf1c462cd4922dab32ee243 
 
# You'll notice that sha1 creates a 40 character hash. 
# All hashed strings (regardless of original size) will 
# be represented by sha1 as 40 characters. 
print len(sha1_password)
#output: 
# 40 
 
# You can store that hashed password and then later the 
# user will attempt to login.  Take their password and hash 
# it with the same algorithm (sha1 in our example). 
password_attempt_1 = "superP@a$$w0rd" 
password_attempt_2 = "$up3rP@a$$w0rd" 
password_attempt_3 = "$uperP@a$$w0rd" 
 
#take the attempts and hash them so you can compare passwords 
attempt1 = hashlib.sha1(password_attempt_1)
if sha1_password == attempt1.hexdigest():
    print "password attempt 1 is a success" 
else:
    print "password attempt 1 is a failure" 
 
attempt2 = hashlib.sha1(password_attempt_2)
if sha1_password == attempt2.hexdigest():
    print "password attempt 2 is a success" 
else:
    print "password attempt 2 is a failure" 
 
attempt3 = hashlib.sha1(password_attempt_3)
if sha1_password == attempt3.hexdigest():
    print "password attempt 3 is a success" 
else:
    print "password attempt 3 is a failure" 
 
#output: 
#    password attempt 1 is a failure 
#    password attempt 2 is a failure 
#    password attempt 3 is a success 
 
 
# Now that you understand how to use sha1 you 
# understand how to use all of the supported 
# algorithms in hashlib.  They all use the same 
# methods so you can easily adapt your code to 
# any of the hash types. 
# Check out the python docs for hashlib