Pythonic Prose: 2009

Thursday, November 19, 2009

Python - processing command line arguments

# python scripts are often run from the
# command line.

# python can retrieve and use command line
# arguments with the sys module.

import sys

# all arguments are stored in the sys.argv list
print "number of arguments passed: ", len(sys.argv)

# process through the argument list

for argument in sys.argv:
    print argument

# my input/output:
#    python commandlinearguments.py one two three four five six seven
#    number of arguments passed:  8
#    commandlinearguments.py
#    one
#    two
#    three
#    four
#    five
#    six
#    seven

Wednesday, November 18, 2009

Python - boolean and, or, and not

# python includes the basic and, 
# or, and not boolean operations 
 
a = 10
b = 20
c = 30
 
if a>9 and b+1==21 and c==a+b:
    print "boolean 'and' example equates true" 
 
if a==9 or b<10 or c<100:
    print "only one of these expressions needs to be true" 
 
if not a+b==31:
    print "if this is not true"

Tuesday, October 27, 2009

Python - break a large mysql dump into small dumps

# created a dumpfile from my mysql db
# and found that it was too large to
# upload to my new db host.
 
# this python script breaks up the database
# into smaller pieces that you can more
# easily import through phpmyadmin
# (if I'd only had shell access I wouldn't
# have this problem at all!)
 
# indicates a new table is about to be
# created
dlmtr = "-- Table structure for table"
 
wholeFile = open("myDBDump.sql")
fileN = 0
oFile = open(str(fileN) + ".sql", 'w')
reducing = True
 
for line in wholeFile:
    if line.find(dlmtr) > -1:
        # this is the seam for the next file
        print "starting new file"
        oFile.close()
        fileN += 1
        oFile = open(str(fileN) + ".sql", 'w')
    oFile.write(line)
 
oFile.close()
print "Done"

Python - extract a tar.gz archive

# extract a tar.gz file with the tarfile module
import tarfile
 
# open the tarfile and use the 'r:gz' parameter
# the 'r:gz' mode enables gzip compression reading
tfile = tarfile.open("archive.tar.gz", 'r:gz')
 
# 99.9% of the time you just want to extract all
# the contents of the archive.
tfile.extractall('.')
 
# Maybe this isn't so amazing for you types out
# there using *nix, os x, or (anything other than
# windows that comes with tar and gunzip scripts).
# However, if you find yourself on windows and
# need to extract a tar.gz you're in for quite the
# freeware/spyware/spamware gauntlet.
 
# Python has everything you need built in!
# Hooray for python!
print "Done!"

Monday, October 26, 2009

Python - extract or unzip a tar file

# untar a tar file with python
# python can open, inspect contents, and extract
#   tar files with the built-in
#   tarfile module.
import tarfile
 
# tar file to extract
theTarFile = 'example.tar'
 
# tar file path to extract
extractTarPath = '.'
 
# open the tar file
tfile = tarfile.open(theTarFile)
 
if tarfile.is_tarfile(theTarFile):
    # list all contents
    print "tar file contents:"
    print tfile.list(verbose=False)
    # extract all contents
    tfile.extractall(extractTarPath)
else:
    print theTarFile + " is not a tarfile."

Tuesday, October 20, 2009

Python - reduce a web site's size

# I've recently started using iWeb (which, for the non
# Mac OS X inclined, is the application used to make web
# sites and what not).
#
# After creating an 80 page site I was horrified at the
# total size of the site.  Nearly 100 MB!! The site really wasn't
# very graphics intensive.  Each page had only one image on it
# at most!  I started examining the file structure and was
# horrified to realize that the web sites iWeb produces use
# huge image files.  Not even remotely compressed.
#
# I needed to rescale the jpg and png files down to
# reasonable sizes.


from PIL import Image
import glob

import os

# this is the default size for images:
size = 256, 256

# I provide 2 different sizes for other files
# that need to be larger.
# Identify the names of the files that need to
# be higher quality.
csize0files = 'PhotoGray_nav_bg.png', 'bg_round.jpg'

csize0 = 768, 768

# different custom sizes for other 'important files
#
csize1files = 'nonefiles', 'none'
csize1 = 512, 512

# create a list for all the files and then add
# them all in type by type.
# For my page I just had jpg and png images

all_matching_files = []
for i in glob.glob("*/*.jpg"):
    all_matching_files.append(i)

for i in glob.glob("*/*.png"):
    all_matching_files.append(i)


# if you are using this for iWeb checkout the file count!
# my 80 page site had 3000+ images!!!
print "total images to be resized: " + str(len(all_matching_files))
count = len(all_matching_files)

# loop through all the images and make changes
for infile in all_matching_files:
    scalesize = size
    im = Image.open(infile)
    # split out all the useful parts of the file's path

    thePath, theFile = os.path.split(infile)
    fileName, extension = os.path.splitext(theFile)
    # custom resize if necessary
    if theFile in csize1files:
        scalesize = csize1
    elif theFile in csize0files:
        scalesize = csize0

    # resize with PIL's awesome thumbnail method

    im.thumbnail(scalesize, Image.ANTIALIAS)

    # save back as appropriate type
    if extension == ".png":
        im.save(infile, "PNG")
    else:
        im.save(infile, "JPEG")

    count -= 1
    if count % 10 == 0:
        # output some useful stats

        print str(count) + " images remaining."

print "....done"

## output:
##    total images to be resized: 3907

##    3900 images remaining.
##    3890 images remaining.
##    ... [snip].....(there were a lot!)
##    30 images remaining.
##    20 images remaining.
##    10 images remaining.
##    0 images remaining.
##    ....done

# Running this script reduced my website size
# from 96MB to 39MB!!
#
# Certainly there is still room for improvement
# future posts will ideally be aimed at further
# efficiency gains.

Monday, October 19, 2009

Python - quickly update urls in a web page

# update a webpages url references. 
# Your web page has a collection of images and you 
# want to update all the "folder1" references 
# to the new folder you've populated called "newfolder" 
 
# normally you would open the actual html file 
# and iterate through the file one line at a time 
# like in this post.  
 
# but for simplicity sake lets just use a multi line string 
# for the example 
theString = """ <img src="folder1/pic2324.jpg" /> 
<img src="folder1/pic2255.png" /> 
<img src="folder2/pic552.jpg" /> 
<img src="folder1/pica2f.jpg" /> 
 
""" 
 
# all you need is to iterate through the 
# file and replace 'folder1' with 'newfolder' 
for line in theString.split('\n'):
    line = line.replace("folder1/", "newfolder/")
    print line
 
#output: 
#     <img src="newfolder/pic2324.jpg" /> 
#    <img src="newfolder/pic2255.png" /> 
#    <img src="folder2/pic552.jpg" /> 
#    <img src="newfolder/pica2f.jpg" />

Tuesday, October 6, 2009

Python - read robots.txt files with ease

# robot parser allows access to a websites 
# robots.txt file (more on robots.txt)  
 
import robotparser
# more on robotparser doc   
# Note: in python 3 robotparser will be found in 
# the urllib module at urllib.robotparser 
 
# examples using urllib 
#   - copy image (or file) off web  
#   - alter user agent string   
#   - browse the web with python   
 
 
# the site I want to read 
url = "http://pythonicprose.blogspot.com/robots.txt" 
 
rob = robotparser.RobotFileParser()
rob.set_url(url)
 
# read and parse through the file 
rob.read()
 
# if you are creating a web crawler or spider you may need to keep 
# track of how long it has been since you last read the robots.txt file 
# use modified to mark the time and mtime to read it 
rob.modified()
 
# to get the time: 
rob.mtime()
 
# check and see if any user agent can read the home page 
print rob.can_fetch("*", "/")
# output: 
#   True 
 
# check and see if any user agent can read the search page 
print rob.can_fetch("*", "/search")
# output: 
#   False 
 
# now that we are so many lines down from set_url we can check 
# the host we are processing 
print rob.host
# output: 
# 'pythonicprose.blogspot.com'

Sunday, October 4, 2009

Python - make your own class attributes iterable

# It can be useful to iterate through data contained 
# in your own custom objects. 
 
# Lets say you have your own class 
class ExampleClass(object):
    """Small container holding a list and a dict of items.

    The list contents can be walked either through ``iterateList()``
    or by iterating over the instance itself (``for item in obj``).
    """

    def __init__(self):
        # items appended through addListItem, in insertion order
        self.objectList = []
        # key -> value pairs stored through addDictItem
        self.objectDict = {}
        # the two attributes below are not read by any method here
        self.maxItem = 100
        self.objectItem = ""

    def __iter__(self):
        """Return an iterator over the stored list items."""
        return iter(self.objectList)

    def iterateList(self):
        """Return the stored list (the list object itself, not a copy)."""
        return self.objectList

    def addListItem(self, item):
        """Append item to the stored list."""
        self.objectList.append(item)

    def addDictItem(self, item, value):
        """Store value in the dict under the key item."""
        self.objectDict[item] = value
 
 
# create an instance of the class 
# and lets use it's iterating methods 
ec = ExampleClass()
 
# add some example data 
for i in xrange(10):
    ec.addListItem(i)
    ec.addDictItem(i, str(i)+"'s value")
 
# now that we have data lets iterate 
# through the data 
for item in ec.iterateList():
    print item
 
#output: 
#    0 
#    1 
#    2 
#    3 
#    4 
#    5 
#    6 
#    7 
#    8 
#    9

Saturday, October 3, 2009

Python - using sqlite3 module for persistent data

# The sqlite3 lets you create and use
# a database with just a file

import sqlite3
# more detailed python doc sqlite3 

import os
# in this example we get the current working dir path 

# Choose the file to use for the
# db and connect (create it)

conn = sqlite3.connect(os.path.abspath('.') + "tempdb")

# grab a cursor and we can create the db schema
c = conn.cursor()

# if you happen to run through this example a few times
# you may notice that the data is persistant.  For this example
# we'll ensure that we're starting from ground zero
# drop the database (if it exists)

c.execute('drop table if exists users')

# create a table
c.execute('create table users (name text, age text, email text)')

# insert data
c.execute("""insert into users values ('steve', '30', 'blah@blah.com')""")
c.execute("""insert into users values ('steve2', '32', 'blah@blah2.com')""")
c.execute("""insert into users values ('steve3', '33', 'blah@blah3.com')""")

#,
#                    ('steve II', '20', 'blah2@blah.com'),
#                    ('steve III', '10', 'blah3@blah.com')""")

# now lets select our data
c.execute('select * from users')

# iterate through the results with for each
for row in c:
    print row

# output:
#    (u'steve', u'30', u'blah@blah.com')
#    (u'steve2', u'32', u'blah@blah2.com')

#    (u'steve3', u'33', u'blah@blah3.com')

Python - create unit tests and ensure accurate documentation with doctest

""" 
The doctest module uses class and 
method documentation to run unit 
tests on your code. 
 
The doctest module reads the coding documentation 
you've created and uses that same documentation 
to conduct unit tests.  This helps ensure 
the documentation is accurate and creates a 
one stop destination for documentation and unit 
tests. 
""" 
 
class ExampleClass(object):
    """
    Example class that
    has one working method.

    >>> ec = ExampleClass()
    >>> ec.example(10)
    19

    >>> ec = ExampleClass()
    >>> ec.example(0)
    -1

    Values that cannot be converted to an int return None,
    so nothing is echoed:

    >>> ec = ExampleClass()
    >>> ec.example("apple")
    >>> ec.example(None)

    Changing the multiplier changes the result:

    >>> ec = ExampleClass()
    >>> ec.a = 3
    >>> ec.example(10)
    29
    """

    def __init__(self):
        # multiplier and offset used by example()
        self.a = 2
        self.b = -1

    def example(self, n):
        """Return int(n) * self.a + self.b, or None if n has no int form."""
        try:
            n = int(n)
        except (TypeError, ValueError):
            # int("apple") raises ValueError, int(None) raises TypeError;
            # both mean "not an int", so both return None
            return None
        return n * self.a + self.b
 
 
if __name__ == '__main__':
    # more about  doctest features  
    import doctest
    doctest.testmod()
 
 
# this outputs: 
# nothing at all 
# if no errors are found then doctest doesn't complain 
 
# If you were to change some of the expected values in the 
# documention....for instance... the last example: 
#    >>> ec = ExampleClass() 
#    >>> ec.a = 3 
#    >>> ec.example(10) 
#    29 
# and change the expected response to 30 
# 
# 
# the output would be: 
#    ********************************************************************** 
#    File "C:\python\docstringexample.py", line 27, in __main__.ExampleClass 
#    Failed example: 
#        ec.example(10) 
#    Expected: 
#        30 
#    Got: 
#        29 
#    ********************************************************************** 
#    1 items had failures: 
#       1 of   9 in __main__.ExampleClass 
#    ***Test Failed*** 1 failures.

Python - storing persistent objects in a file with shelve

# The shelve module is used to store objects in a file. 
# You use the file like a glorified dict with key, value 
# pairs. 
import shelve
# shelve python doc
objList = []
filename = 'shelveFile.shelve' 
 
# open and or create the file 
file = shelve.open(filename)
 
# Here is an example class we'll create 
# instances of and then store in the file 
class ExampleClass(object):
    def __init__(self):
        self.a = 0
        self.b = 1
        self.c = 2
        self.k = 0
    def getTotal(self):
        return self.a + self.b + self.c
 
# create several instances 
for i in xrange(3):
    obj = ExampleClass()
    obj.k = i
    obj.a = i+1
    obj.b = i+2
    obj.c = i+3
    objList.append(obj)
 
# now add the objects to file object 
for i in objList:
    # keys are strings 
    file[str(i.k)] = i
 
# The sync command will explicitly 
# write changes to file 
file.sync()
 
# Closing the object will also execute 
# the sync command 
file.close()
 
# The file (and the 3 objects in it 
# are now saved. 
# Now we'll reopen and verify the data is there 
file2 = shelve.open(filename)
 
# Iterate through and print out 
# the object attributes (to verify 
# they are the values we assigned previously) 
for i in file2.keys():
    j = file2[str(i)]
    print "a,b,c,k = ", j.a, j.b, j.c, j.k
#output: 
#a,b,c,k =  1 2 3 0 
#a,b,c,k =  3 4 5 2 
#a,b,c,k =  2 3 4 1 
 
# You can edit these values. 
# Here will change all 'a' attributes to 7 
for i in file2.keys():
    # Take note of how these changes were made. 
    # You cannot merely alter an attribute 
    # like file2[str(i)].a = 7 (this will 
    # not work). 
    j = file2[str(i)]
    j.a = 7
    file2[str(j.k)] = j
 
# And verify that changes are made: 
for i in file2.keys():
    j = file2[str(i)]
    print "a,b,c,k = ", j.a, j.b, j.c, j.k
#output: 
#a,b,c,k =  7 2 3 0 
#a,b,c,k =  7 4 5 2 
#a,b,c,k =  7 3 4 1 
 
 
 
# now close the shelve file so you can 
# use the data objects another day. 
file2.close()

Thursday, October 1, 2009

Python - using filecmp to compare two or more files

# The filecmp module is a portable way to check
# whether two (or more) files are the same.

# The module only has two methods:
#   cmp(file1, file2)
#   cmpfiles(directory1, directory2, common)
import filecmp
# find more verbose filecmp python docs here  


# check whether two files are the same
# of course you need to have a /etc/hosts and .bak

#  for this example to work (feel free to change to filenames
#  that do exist on your setup)
if filecmp.cmp('/etc/hosts', '/etc/hosts.bak'):
    print "the files are the same"
else:
    print "the files are not the same"



# filecmp also allows you to compare directories
# you can use cmpfiles which returns 3 tuples:
#                           matches, mismatch, error
match, mismatch, error = filecmp.cmpfiles('folder1',
                                          'folder2',
                                            ['LICENSE.TXT',
                                             'README.TXT',
                                             'VERSION'])


print "Matching: ", match
print "Mismatched: ", mismatch
print "Errors: ", error

# output (for me):
# Matching: ['LICENSE.TXT', 'README.TXT']
# Mismatched: ['VERSION']
# Errors: []

Python - copy or move files and directories

# shutil is used for high level copy and move needs. 
# shutil can operate on individual files or recursively 
# on a directory structure. 
import shutil
# shutil python doc
 
# of course these examples assume the
# file is in the current working directory
 
# it supports simply copying files 
print "just copy file contents..." 
shutil.copy("hero.bmp", "hero2.bmp")
# The second hero2.bmp is now created. 
# However, all attributes of the file have been reset 
#   like creation dates and what not (depending 
#   on the file type) 
 
# to copy the stats from the first to the new copied  
# file you can use the copystat method. 
# To fix the first examples lack of stat copying 
print "copy stats..." 
shutil.copystat("hero.bmp", "hero2.bmp")
 
# shutil can also copy an entire directory tree with copytree 
# stats are copied for the files. 
print "recursively copy directory tree..." 
shutil.copytree('C:/tmp','C:/newtmp')
 
# shutil can also remove a directory tree 
print "remove the copied directory tree..." 
shutil.rmtree('C:/newtmp')
 
#

Python - using glob to get lists of files and directories

import os.path
# glob is a simple and useful python module.
# It uses shell-style wildcards (not full regular
# expressions) to match directories and files for a
# given path.  If you've ever used the command line to
# 'ls' or 'dir' the current directory you may be aware
# that the command accepts * or ? or [] to
# match patterns.  glob is a python implementation
# of this functionality.
 
import glob
import os
 
# find all the .txt files in the current working directory  
print glob.glob('*.TXT')
# output: 
# ['LICENSE.txt', 'NEWS.txt', 'README.txt'] 
 
# you can also specify a full path 
# Here I'm searching for dll files in python 2.6 
print glob.glob('C:\Python26\DLLs\*.dll')
# output: 
#    ['C:\\Python26\\DLLs\\sqlite3.dll', 
#    'C:\\Python26\\DLLs\\tcl85.dll', 
#    'C:\\Python26\\DLLs\\tclpip85.dll', 
#    'C:\\Python26\\DLLs\\tk85.dll'] 
 
# If you are expecting a great deal of results 
# you should use the glob.iglob method that returns 
# matches as it goes and does not load everything 
# into memory first. 
# glob.iglob() example 
f = glob.iglob('C:\Python26\Lib\*')
 
spitItOut = True
while spitItOut:
    try:
        fileNameAndPath = f.next()
        # since glob gives you the full path you can 
        # use the output with some of the os module's methods 
        if os.path.isfile(fileNameAndPath):
            fileNameAndPath += " is a file." 
        else:
            fileNameAndPath += " is not a file." 
        print fileNameAndPath
    except StopIteration:
        spitItOut = False
 
#output (snipped a bit...since there a lot): 
#    C:\Python26\Lib\abc.py is a file. 
#    ....[snip] 
#    C:\Python26\Lib\compiler is not a file. 
#    ...[another snip] 
#    C:\Python26\Lib\getopt.py is a file. 
#    C:\Python26\Lib\getopt.pyc is a file. 
#    C:\Python26\Lib\getpass.py is a file. 
#    C:\Python26\Lib\gettext.py is a file. 
#    C:\Python26\Lib\glob.py is a file. 
#    C:\Python26\Lib\glob.pyc is a file.

Python - printing complex objects with pretty printing

# Pretty printing (using the pprint module) transforms 
# python objects into human readable output. 
# 
# Use pprint when you need to display a complex 
# data structure to users. 
 
 
import string
import pprint
# pprint python doc
 
d = {}
 
for i in string.ascii_lowercase:
    d[i] = string.ascii_lowercase.replace(i, ' ')
 
print "not useful output:" 
print d
# output: 
#   not useful output: 
#   {'a': ' bcdefghijklmnopqrstuvwxyz', 'c': 'ab defghijklmnopqrstuvwxyz', 'b': 
#   'a cdefghijklmnopqrstuvwxyz', 'e': 'abcd fghijklmnopqrstuvwxyz', 'd': 'abc 
#   efghijklmnopqrstuvwxyz', 'g': 'abcdef hijklmnopqrstuvwxyz', 'f': 'abcde ghij 
#   klmnopqrstuvwxyz', 'i': 'abcdefgh jklmnopqrstuvwxyz', 'h': 'abcdefg ijklmnop 
#   qrstuvwxyz', 'k': 'abcdefghij lmnopqrstuvwxyz', 'j': 'abcdefghi klmnopqrstuv 
#   wxyz', 'm': 'abcdefghijkl nopqrstuvwxyz', 'l': 'abcdefghijk mnopqrstuvwxyz', 
#   'o': 'abcdefghijklmn pqrstuvwxyz', 'n': 'abcdefghijklm opqrstuvwxyz', 'q': ' 
#   abcdefghijklmnop rstuvwxyz', 'p': 'abcdefghijklmno qrstuvwxyz', 's': 'abcdef 
#   ghijklmnopqr tuvwxyz', 'r': 'abcdefghijklmnopq stuvwxyz', 'u': 'abcdefghijkl 
#   mnopqrst vwxyz', 't': 'abcdefghijklmnopqrs uvwxyz', 'w': 'abcdefghijklmnopqr 
#   stuv xyz', 'v': 'abcdefghijklmnopqrstu wxyz', 'y': 'abcdefghijklmnopqrstuvwx 
#   z', 'x': 'abcdefghijklmnopqrstuvw yz', 'z': 'abcdefghijklmnopqrstuvwxy '} 
# 
# All the data is there but it is difficult to read. 
# You can use pprint (pretty print) to make things easy to read.  pprint 
#   formats python datastructures to be human readable. 
 
print "human readable output:" 
pprint.pprint(d, indent=4)
# output: 
#human readable output: 
#{   'a': ' bcdefghijklmnopqrstuvwxyz', 
#    'b': 'a cdefghijklmnopqrstuvwxyz', 
#    'c': 'ab defghijklmnopqrstuvwxyz', 
#    'd': 'abc efghijklmnopqrstuvwxyz', 
#    'e': 'abcd fghijklmnopqrstuvwxyz', 
#    'f': 'abcde ghijklmnopqrstuvwxyz', 
#    'g': 'abcdef hijklmnopqrstuvwxyz', 
#    'h': 'abcdefg ijklmnopqrstuvwxyz', 
#    'i': 'abcdefgh jklmnopqrstuvwxyz', 
#    'j': 'abcdefghi klmnopqrstuvwxyz', 
#    'k': 'abcdefghij lmnopqrstuvwxyz', 
#    'l': 'abcdefghijk mnopqrstuvwxyz', 
#    'm': 'abcdefghijkl nopqrstuvwxyz', 
#    'n': 'abcdefghijklm opqrstuvwxyz', 
#    'o': 'abcdefghijklmn pqrstuvwxyz', 
#    'p': 'abcdefghijklmno qrstuvwxyz', 
#    'q': 'abcdefghijklmnop rstuvwxyz', 
#    'r': 'abcdefghijklmnopq stuvwxyz', 
#    's': 'abcdefghijklmnopqr tuvwxyz', 
#    't': 'abcdefghijklmnopqrs uvwxyz', 
#    'u': 'abcdefghijklmnopqrst vwxyz', 
#    'v': 'abcdefghijklmnopqrstu wxyz', 
#    'w': 'abcdefghijklmnopqrstuv xyz', 
#    'x': 'abcdefghijklmnopqrstuvw yz', 
#    'y': 'abcdefghijklmnopqrstuvwx z', 
#    'z': 'abcdefghijklmnopqrstuvwxy '} 
# 
# Formatted in this fashion its easy to see what 
# data is being stored in the dict.

Python - hash with md5 and sha1 (and others!)

# There are many reasons to hash data.
# For this example we'll say that we
# want to hash passwords so we can store
# them in a database (or file)...or for
# this example a variable
 
# hashlib encapsulates the following functionality: 
#   md5 
#   sha1 
#   sha224 
#   sha256 
#   sha384 
#   sha512 
import hashlib
 
# When a user creates their account they'll input 
# a password.  For security purposes you hash 
# the password and store it (so they can log into 
# their account later). 
password = "$uperP@a$$w0rd" 
 
#pass the password to the sha1 constructor 
createSha1 = hashlib.sha1(password)
 
#dump the password out in text 
sha1_password = createSha1.hexdigest()
 
print sha1_password
#output: 
# 2d0b537e6673e1f6baf1c462cd4922dab32ee243 
 
# You'll notice that sha1 creates a 40 character hash. 
# All hashed strings (regardless of original size) will 
# be represented by sha1 as 40 characters. 
print len(sha1_password)
#output: 
# 40 
 
# You can store that hashed password and then later the 
# user will attempt to login.  Take their password and hash 
# it with the same algorithm (sha1 in our example). 
password_attempt_1 = "superP@a$$w0rd" 
password_attempt_2 = "$up3rP@a$$w0rd" 
password_attempt_3 = "$uperP@a$$w0rd" 
 
#take the attempts and hash them so you can compare passwords 
attempt1 = hashlib.sha1(password_attempt_1)
if sha1_password == attempt1.hexdigest():
    print "password attempt 1 is a success" 
else:
    print "password attempt 1 is a failure" 
 
attempt2 = hashlib.sha1(password_attempt_2)
if sha1_password == attempt2.hexdigest():
    print "password attempt 2 is a success" 
else:
    print "password attempt 2 is a failure" 
 
attempt3 = hashlib.sha1(password_attempt_3)
if sha1_password == attempt3.hexdigest():
    print "password attempt 3 is a success" 
else:
    print "password attempt 3 is a failure" 
 
#output: 
#    password attempt 1 is a failure 
#    password attempt 2 is a failure 
#    password attempt 3 is a success 
 
 
# Now that you understand how to use sha1 you 
# understand how to use all of the supported 
# algorithms in hashlib.  They all use the same 
# methods so you can easily adapt your code to 
# any of the hash types. 
# Check out the python docs for hashlib

Wednesday, September 30, 2009

Python - regular expression backreference example

# use the power of regular expressions
# bite the bullet and review the regular expression syntax 

import re

# lets say you have created the next search engine
# your search engine extracts the contents of
# the <title></title> tags
theString = """ <lots of garbage and # what not and this

title is going to be cool> <myTitle> will be awesome.
And once you get <title>the title is here</title> and then
there is the end """

# you compile a regular expression to search

# for the contents of the title tag
# (this is where the regular expression syntax http://docs.python.org/library/re.html#regular-expression-syntax
# comes in handy)
# the one thing to certainly notice is that there are
# parentheses surrounding the contents of the title tag.
# These form a capturing group.  Once we've run the search
# we'll be able to reference the text the group matched.
p = re.compile('<title>(.+)<\/title>')


# now search theString
m = re.search(p, theString)

# you can test whether or not your
# regular expression was successfull
if m:
    print "regular expression search successfull!"
    # referencing group #1 references the first backreference

    print "the title contents are:", m.group(1)
    # group # 0 is the entire regular expression result
    print "the entire regular expression returned:", m.group(0)

else:
    print "regular expression search returns no results"

#output:
#   regular expression search successfull!
#   the title contents are: the title is here
#   the entire regular expression returned: <title>the title is here</title>

Python - using yaml for configuration files

import yaml
# checkout and download yaml for python 

# you should probably put this config in a separate file
# but for this example it is just a multi-line string
yamlConfigFile = """
cars:
    car0:
        type: toyota
        hp: 129
        mpg:
            city: 30
            highway: 35
        cost: 15,000
    car1:
        type: gm
        hp: 225
        mpg:
            city: 20
            highway: 25
        cost: 20,000
    car2:
        type: chevy
        hp: 220
        mpg:
            city: 22
            highway: 24
        cost: 21,000
"""

# the yaml file will be converted to a dict
# for sub sections the dict will nest dicts
theDict = yaml.load(yamlConfigFile)
print theDict
# output (I added some tabs and what not so you
#           could see the nested dict structure):
# {'cars':
#    {'car2':
#        {'mpg': {'city': 22, 'highway': 24},
#        'hp': 220,
#        'cost': '21,000',
#        'type': 'chevy'},
#    'car0':
#        {'mpg': {'city': 30, 'highway': 35},
#        'hp': 129,
#        'cost': '15,000',
#        'type': 'toyota'},
#    'car1':
#        {'mpg': {'city': 20, 'highway': 25},
#        'hp': 225,
#        'cost': '20,000',
#        'type': 'gm'}
#    }
#}

# to list the car types (like car1, car2, etc
print theDict['cars'].keys()
# output:
# ['car2', 'car0', 'car1']

# to display the type and cost of the vehicles
for c in theDict['cars'].keys():
    print theDict['cars'][c]['type'], "cost:", theDict['cars'][c]['cost']

# output:
#    chevy cost: 21,000
#    toyota cost: 15,000
#    gm cost: 20,000

# update the cost of toyota
theDict['cars']['car0']['cost'] = '25,000'
# the update is now in the dict representation of the yaml file

# to dump the yaml dict back to a file
# or in our case a multi-line string use the dump command
# which you could write to a file
print yaml.dump(theDict)
# output:
#    cars:
#      car0:
#        cost: 25,000
#        hp: 129
#        mpg: {city: 30, highway: 35}
#        type: toyota
#      car1:
#        cost: 20,000
#        hp: 225
#        mpg: {city: 20, highway: 25}
#        type: gm
#      car2:
#        cost: 21,000
#        hp: 220
#        mpg: {city: 22, highway: 24}
#        type: chevy

Tuesday, September 29, 2009

Python - simple regular expression examples

# regular expressions are extremely powerful
# here are some simple examples to get you started

import re

text = "Some example text to manipulate with regular expressions."

# find the location of all the vowels
# iterate through all vowels
# here I've used the finditer method to return and
#   and iterator through the results
for i in re.finditer('[aeiouy]', text):
    print "location:", i.start(), " to ", i.end()
    print "  found text was: ", text[i.start():i.end()]
# output:
#location: 1  to  2
#  found text was:  o
#location: 3  to  4
#  found text was:  e
#
# SNIP -- there are lots of vowels
#
#location: 49  to  50
#  found text was:  e
#location: 52  to  53
#  found text was:  i
#location: 53  to  54
#  found text was:  o

# use regular expressions to split your sentence into words
sentence = "This is my example sentence"
for word in re.split(' ', sentence):
    print word
#Output:
# This
# is
# my
# example
# sentence

# search and replace regular expression functionality
# replace 'regular expression' with 're'
text = "regular expression text goes here"
newText = re.sub('regular expression', 're', text)
print newText
#Outputs:
# re text goes here

Python - generate double dutch

# this example is similar 
# to the double dutch generator 
 
def createDoubleDutch(word):
    '''
        Return word with every lowercase vowel (y included)
        expanded to vowel + "b" + vowel; all other
        characters are kept as they are.
    '''
    vowels = "aeiouy"
    pieces = []
    for letter in word:
        if letter in vowels:
            # double dutch-ize the vowel
            pieces.append(letter + "b" + letter)
        else:
            pieces.append(letter)
    return "".join(pieces)
 
if __name__ == '__main__':
    ddSentence = "" 
    for w in "My sample sentence for double dutch".split(' '):
           ddSentence += createDoubleDutch(w) + " " 
    print ddSentence.strip()
 
#output: 
# Myby sabamplebe sebentebencebe fobor doboubublebe dubutch

Monday, September 28, 2009

Python - pig latin generator

 
 
 
def makePigLatin(word):
    """ convert one word into pig latin """
    # short words (and "the") are not converted
    if len(word) < 3 or word == "the":
        return word

    # position of the earliest vowel (y counts as one);
    # len(word) when the word contains no vowel at all
    vowelSpots = [word.find(v) for v in "aeiouy" if v in word]
    firstVowel = min(vowelSpots + [len(word)])

    # a word that starts with a vowel only gains a suffix
    if firstVowel == 0:
        return word + "way"

    # otherwise the leading consonants move to the end
    return word[firstVowel:] + word[:firstVowel] + "ay"
 
 
sentence = "Hooray for pig latin" 
pigLatinSentence = "" 
# iterate through words in sentence 
for w in sentence.split(' '):
    pigLatinSentence += makePigLatin(w) + " " 
 
print pigLatinSentence.strip()
 
# output: 
# oorayHay orfay igpay atinlay

python - split paragraph into sentences with regular expressions

# split up a paragraph into sentences
# using regular expressions


def splitParagraphIntoSentences(paragraph):
    ''' Break a paragraph into sentences and return them as a list.

        The paragraph is cut at every '.', '!' or '?'.  Each piece is
        stripped of surrounding whitespace and empty pieces (such as
        the one after the final terminator) are left out, so every
        list entry is a sentence.  The terminators themselves are
        not part of the returned sentences.
    '''
    import re
    # to split by multiple characters
    #   regular expressions are easiest (and fastest)
    sentenceEnders = re.compile('[.!?]')
    sentenceList = []
    for piece in sentenceEnders.split(paragraph):
        piece = piece.strip()
        # split() yields '' after a trailing terminator and between
        # consecutive terminators ("..."); those are not sentences
        if piece:
            sentenceList.append(piece)
    return sentenceList


if __name__ == '__main__':
    p = """This is a sentence.  This is an excited sentence! And do you think this is a question?"""

    sentences = splitParagraphIntoSentences(p)
    for s in sentences:
        print s.strip()

#output:
#   This is a sentence
#   This is an excited sentence

#   And do you think this is a question

Python - detect and label objects in images

Image to be analyzed

Detected Objects have now been outlined

from PIL import Image

# you'll need to get PIL 
# some other (shorter) scripts
# that use PIL:
#   create a thumbnail with PIL  
#   find the average image RGB  
#   replace image colors with PIL  
#
# this script is based on the

#   find the sun script   

class TheOutliner(object):
    ''' takes a dict of xy points and
        draws a rectangle around them '''

    def __init__(self):
        # colour painted on the outline (cyan, assuming an RGB image)
        self.outlineColor = 0, 255, 255
        self.pic = None    # image object returned by Image.open()
        self.picn = None   # pixel access object returned by pic.load()
        # bounding box of the most recently loaded points
        self.minX = 0
        self.minY = 0
        self.maxX = 0
        self.maxY = 0

    def doEverything(self, imgPath, dictPoints, theoutfile):
        ''' load imgPath, outline the points in dictPoints
            and save the result to theoutfile '''
        self.loadImage(imgPath)
        self.loadBrightPoints(dictPoints)
        self.drawBox()
        self.saveImg(theoutfile)

    def loadImage(self, imgPath):
        ''' open the image and keep its pixel access object '''
        self.pic = Image.open(imgPath)
        self.picn = self.pic.load()

    def loadBrightPoints(self, dictPoints):
        ''' iterate through all points and
            gather max/min x/y

            dictPoints is keyed by (x, y) tuples; the values are
            not looked at.  Raises IndexError when dictPoints is
            empty, because there is nothing to put a box around. '''
        if not dictPoints:
            raise IndexError("dictPoints is empty - no points to outline")

        # the max/min must come from dictPoints itself, so they are
        #   computed from its keys rather than seeded with zeros
        xs = [point[0] for point in dictPoints]
        ys = [point[1] for point in dictPoints]
        self.minX = min(xs)
        self.maxX = max(xs)
        self.minY = min(ys)
        self.maxY = max(ys)

    def drawBox(self):
        ''' draw the bounding box outline onto the loaded pixels '''
        # both ranges include maxX / maxY so that the bottom right
        #   corner pixel is painted as well
        for x in range(self.minX, self.maxX + 1):
            # top bar
            self.picn[x, self.minY] = self.outlineColor
            # bottom bar
            self.picn[x, self.maxY] = self.outlineColor
        for y in range(self.minY, self.maxY + 1):
            # left bar
            self.picn[self.minX, y] = self.outlineColor
            # right bar
            self.picn[self.maxX, y] = self.outlineColor

    def saveImg(self, theoutfile):
        ''' write the (outlined) image to theoutfile as a JPEG '''
        self.pic.save(theoutfile, "JPEG")



#class CollectBrightPoints(object):
#
#    def __init__(self):

#        self.brightThreshold = 240, 240, 240
#        self.pic = None
#        self.picn = None
#        self.brightDict = {}
#    def loadImage(self, imgPath):
#        self.pic = Image.open(imgPath)
#        self.picn = self.pic.load()
#    def collectBrightPoints(self):
#        for x in xrange(self.pic.size[0]):

#            for y in xrange(self.pic.size[1]):
#                r,g,b = self.picn[x,y]
#                if r > self.brightThreshold[0] and \
#                    g > self.brightThreshold[1] and \
#                    b > self.brightThreshold[2]:
#                    # then it is brighter than our threshold

#                    self.brightDict[x,y] = r,g,b


class ObjectDetector(object):
    ''' collects a list of dicts (self.objects), one per
        object detected in the image; each dict is keyed
        by the (x, y) points that belong to that object '''

    def __init__(self):
        # sampling step in pixels: only every detail-th column and
        #   row is visited, and neighbours are detail pixels apart
        self.detail = 4
        # one {(x, y): "go"} dict per detected object
        self.objects = []
        # an object must contain more than this many sampled points
        self.size = 1000
        # channel value written to all three channels of a pixel once
        #   it has been evaluated; pixels that already have this
        #   colour are therefore never treated as part of an object
        self.no = 255
        # a neighbour joins an object when every channel differs from
        #   the seed pixel by less than this
        self.close = 100
        self.pic = None
        self.picn = None
        self.picSize = None
        # not used by any method of this class
        self.brightDict = {}

    def loadImage(self, imgPath):
        ''' open the image and scale detail/size to its dimensions '''
        self.pic = Image.open(imgPath)
        self.picn = self.pic.load()
        self.picSize = self.pic.size
        total = self.picSize[0] + self.picSize[1]
        # floor division keeps both values integers; each must be at
        #   least 1 -- and the larger self.detail is, the faster the
        #   analysis will be
        self.detail = total // 2000 + 1
        self.size = total // 8 + 1

    def getSurroundingPoints(self, xy):
        ''' returns the eight adjoining points, self.detail away '''
        x = xy[0]
        y = xy[1]
        d = self.detail
        return (
            (x - d, y - d), (x, y - d), (x + d, y - d),
            (x - d, y), (x + d, y),
            (x - d, y + d), (x, y + d), (x + d, y + d),
        )

    def getRGBFor(self, x, y):
        ''' returns the pixel at x,y, or white when x,y is off the image '''
        # negative coordinates are off the image; reject them here
        #   instead of relying on the pixel accessor to raise
        #   NOTE(review): some Pillow versions appear to count negative
        #   indices from the opposite edge -- confirm
        if x < 0 or y < 0:
            return 255, 255, 255
        try:
            return self.picn[x, y]
        except IndexError:
            return 255, 255, 255

    def readyToBeEvaluated(self, xy):
        ''' True when xy is on the image and not yet marked evaluated '''
        x = xy[0]
        y = xy[1]
        if x < 0 or y < 0:
            return False
        try:
            # assumes three-channel (RGB) pixels -- TODO confirm
            r, g, b = self.picn[x, y]
        except IndexError:
            # beyond the right or bottom edge
            return False
        # compare against self.no, the same value markEvaluated writes
        return not (r == self.no and g == self.no and b == self.no)

    def markEvaluated(self, xy):
        ''' paints xy with the "evaluated" colour; off-image points
            are ignored '''
        x = xy[0]
        y = xy[1]
        if x < 0 or y < 0:
            return
        try:
            self.picn[x, y] = self.no, self.no, self.no
        except IndexError:
            pass

    def collectAllObjectPoints(self):
        ''' walks the image on a grid of self.detail pixels and
            appends every large enough region to self.objects

            This overwrites pixels of the loaded image as they are
            evaluated. '''
        for x in range(0, self.pic.size[0], self.detail):
            for y in range(0, self.pic.size[1], self.detail):
                r, g, b = self.picn[x, y]
                if r == self.no and g == self.no and b == self.no:
                    # already evaluated (or background): no more
                    continue
                ol = self._growRegion((x, y), (r, g, b))
                if len(ol) > self.size:
                    self.objects.append(ol)

    def _growRegion(self, seed, seedColor):
        ''' returns a dict of the points reachable from seed through
            neighbours whose colour is close to seedColor '''
        r, g, b = seedColor
        ol = {seed: "go"}
        pp = [seed]
        while pp:
            current = pp.pop()
            self.markEvaluated(current)
            # look for adjoining points
            for p in self.getSurroundingPoints(current):
                if not self.readyToBeEvaluated(p):
                    continue
                r2, g2, b2 = self.getRGBFor(p[0], p[1])
                if abs(r - r2) < self.close and \
                        abs(g - g2) < self.close and \
                        abs(b - b2) < self.close:
                    # then its close enough
                    ol[p] = "go"
                    pp.append(p)
                # marked whether or not it matched, so each point is
                #   only ever evaluated once
                self.markEvaluated(p)
        return ol








if __name__ == "__main__":
    print("Start Process")

    # works through every .jpg file in the working directory and
    #   writes an outlined copy next to it as <name>.out.jpg
    import os
    for f in os.listdir('.'):
        # only names that really end in .jpg, and not the .out.jpg
        #   files written by an earlier run of this script
        if not f.endswith(".jpg") or f.endswith(".out.jpg"):
            continue
        theFile = f
        print("working on " + theFile + "...")

        theOutFile = theFile + ".out.jpg"
        bbb = ObjectDetector()
        bbb.loadImage(theFile)
        print("     analyzing..")
        print("     file dimensions: " + str(bbb.picSize))
        print("        this files object weight: " + str(bbb.size))
        print("        this files analyzation detail: " + str(bbb.detail))
        bbb.collectAllObjectPoints()
        print("     objects detected: " + str(len(bbb.objects)))
        drawer = TheOutliner()
        print("     loading and drawing rectangles..")

        # the detector paints over the pixels it evaluates, so the
        #   boxes are drawn on a fresh copy loaded from disk
        drawer.loadImage(theFile)
        for o in bbb.objects:
            drawer.loadBrightPoints(o)
            drawer.drawBox()

        print("saving image...")
        drawer.saveImg(theOutFile)

        print("Process complete")

#output
#Start Process
#working on A Good Book to Have on Your Shelf.jpg...
#     analyzing..
#     file dimensions: (500, 667)
#        this files object weight: 146
#        this files analyzation detail: 1
#     objects detected: 6
#     loading and drawing rectangles..

#saving image...
#Process complete
#working on bamboo-forest.jpg...
#     analyzing..
#     file dimensions: (640, 480)
#        this files object weight: 141
#        this files analyzation detail: 1
#     objects detected: 68
#     loading and drawing rectangles..

#saving image...
#
# .............. SNIP .... (I had 20 jpeg files in the dir)
#
#working on Family_Photo.jpg...
#     analyzing..
#     file dimensions: (4200, 3300)
#        this files object weight: 938
#        this files analyzation detail: 4

#     objects detected: 20
#     loading and drawing rectangles..
#saving image...
#Process complete

Saturday, September 26, 2009

Python - sun image detector - outline objects in an image

The input:
Where oh where is the sun?

from PIL import Image
 
# find brightest region of image 
# and visually identify the region 
 
class TheOutliner(object):
    ''' draws a rectangle around a set of xy points
        on an image and saves the result '''

    def __init__(self):
        # colour painted on the outline (cyan, assuming an RGB image)
        self.outlineColor = 0, 255, 255
        self.pic = None    # image object returned by Image.open()
        self.picn = None   # pixel access object returned by pic.load()
        # bounding box of the most recently loaded points
        self.minX = 0
        self.minY = 0
        self.maxX = 0
        self.maxY = 0

    def doEverything(self, imgPath, dictPoints, theoutfile):
        ''' load imgPath, outline the points in dictPoints
            and save the result to theoutfile '''
        self.loadImage(imgPath)
        self.loadBrightPoints(dictPoints)
        self.drawBox()
        self.saveImg(theoutfile)

    def loadImage(self, imgPath):
        ''' open the image and keep its pixel access object '''
        self.pic = Image.open(imgPath)
        self.picn = self.pic.load()

    def loadBrightPoints(self, dictPoints):
        ''' gather the max/min x/y of all points in dictPoints

            dictPoints is keyed by (x, y) tuples; the values are
            not looked at.  Raises IndexError when dictPoints is
            empty, because there is nothing to put a box around. '''
        if not dictPoints:
            raise IndexError("dictPoints is empty - no points to outline")

        # the max/min must come from dictPoints itself, so they are
        #   computed from its keys rather than seeded with zeros
        xs = [point[0] for point in dictPoints]
        ys = [point[1] for point in dictPoints]
        self.minX = min(xs)
        self.maxX = max(xs)
        self.minY = min(ys)
        self.maxY = max(ys)

    def drawBox(self):
        ''' draw the box around the bright points '''
        # both ranges include maxX / maxY so that the bottom right
        #   corner pixel is painted as well
        for x in range(self.minX, self.maxX + 1):
            # top bar
            self.picn[x, self.minY] = self.outlineColor
            # bottom bar
            self.picn[x, self.maxY] = self.outlineColor
        for y in range(self.minY, self.maxY + 1):
            # left bar
            self.picn[self.minX, y] = self.outlineColor
            # right bar
            self.picn[self.maxX, y] = self.outlineColor

    def saveImg(self, theoutfile):
        ''' write the (outlined) image to theoutfile as a JPEG '''
        self.pic.save(theoutfile, "JPEG")
 
 
 
class CollectBrightPoints(object):
    ''' gathers every pixel of an image that is brighter than
        brightThreshold on all three channels '''

    def __init__(self):
        # per-channel (r, g, b) limits; a pixel must exceed all three
        self.brightThreshold = 240, 240, 240
        self.pic = None    # image object returned by Image.open()
        self.picn = None   # pixel access object returned by pic.load()
        # {(x, y): (r, g, b)} for every bright pixel collected so far
        self.brightDict = {}

    def loadImage(self, imgPath):
        ''' open the image and keep its pixel access object '''
        self.pic = Image.open(imgPath)
        self.picn = self.pic.load()

    def collectBrightPoints(self):
        ''' scan every pixel of the loaded image and record the
            bright ones in self.brightDict '''
        # the threshold does not change during the scan, so unpack it once
        minR, minG, minB = self.brightThreshold
        # range() rather than xrange() so the scan runs on Python 2 and 3
        for x in range(self.pic.size[0]):
            for y in range(self.pic.size[1]):
                # assumes three-channel (RGB) pixels -- TODO confirm
                r, g, b = self.picn[x, y]
                if r > minR and g > minG and b > minB:
                    # then it is brighter than our threshold
                    self.brightDict[x, y] = r, g, b
 
 
if __name__ == "__main__":
    print("Start Process")

    # assumes that four.jpg is in the
    #   working directory
    theFile = "four.jpg"
    theOutFile = "four.output.jpg"

    cbp = CollectBrightPoints()
    cbp.loadImage(theFile)
    cbp.collectBrightPoints()
    brightDict = cbp.brightDict

    if brightDict:
        drawer = TheOutliner()
        drawer.doEverything(theFile, brightDict, theOutFile)
    else:
        # no pixel beat the threshold, so there is no box to draw and
        #   the outliner would fail on the empty dict
        print("No bright points found; nothing to outline")
    print("Process complete")

The output:
The sun has been detected!