Wednesday, April 14, 2010

Python - frequency and location of words in text

# Split a paragraph into lines and find 
# the frequency and line number of words. 
 
from operator import itemgetter
 
# Maps each upper-cased word to a string of the line numbers on which it
# occurs (each number preceded by a space); filled in by groupWords().
wordDict = {}
 
# Sample text to parse: three lines separated by newline characters.
text = ''' this is the text on line one. 
this is line two text. 
here is the text of line number three.''' 
 
 
def groupWords(text):
    """Record, for every word in *text*, the lines on which it occurs.

    Results accumulate in the module-level ``wordDict``.  Each key is an
    upper-cased word with surrounding punctuation removed; each value is a
    string of 1-based line numbers, every number preceded by one space
    (for example ``" 1 3"``).  A word occurring several times on the same
    line gets that line number once per occurrence.

    Args:
        text: the text to index; lines are separated by newline characters.

    Returns:
        None.  ``wordDict`` is updated in place.
    """
    punctuation = ".,!?:;'"
    # break down by lines, numbering them from 1
    for lineCount, line in enumerate(text.split('\n'), 1):
        # words are words: case does not matter
        for rawWord in line.upper().split():
            # Strip punctuation per word rather than per line, so that
            # punctuation after an interior word, or before trailing
            # whitespace at the end of a line, is removed as well.
            word = rawWord.strip(punctuation)
            if not word:
                # the token was nothing but punctuation
                continue
            # dict.get works on Python 2 and 3 alike (has_key is Python 2 only)
            wordDict[word] = wordDict.get(word, "") + " " + str(lineCount)
 
# index the sample text
groupWords(text)

# alphabetical output: each word followed by the line numbers it occurs on.
# sorted() over the dict iterates its keys on both Python 2 and Python 3
# (dict.iterkeys() exists only on Python 2), and a parenthesised
# single-argument print produces the same output on both.
for k in sorted(wordDict):
    print(k + str(wordDict[k]))
 
# my output: 
##  HERE 3 
##  IS 1 2 3 
##  LINE 1 2 3 
##  NUMBER 3 
##  OF 3 
##  ON 1 
##  ONE 1 
##  TEXT 1 2 3 
##  THE 1 3 
##  THIS 1 2 
##  THREE 3 
##  TWO 2 
 
# most frequent style output
# put the dict in a list of (count, word, locations) tuples; the count is
# stored as a string so the tuples keep their original shape
wordList = []
for k in wordDict:
    # One whitespace-separated entry per occurrence.  Counting the characters
    # left after removing spaces would miscount any line number that has
    # more than one digit.
    occurrences = len(wordDict[k].split())
    wordList.append((str(occurrences), str(k), wordDict[k]))


# Sort on the numeric count: comparing the string form would place "10"
# below "9".  The sort is stable, so ties keep their list order.
for word in sorted(wordList, key=lambda entry: int(entry[0]), reverse=True):
    # prints e.g. "3 TEXT : 1 2 3" on both Python 2 and Python 3
    print("%s %s :%s" % word)
 
# my output: 
##  3 TEXT : 1 2 3 
##  3 IS : 1 2 3 
##  3 LINE : 1 2 3 
##  2 THIS : 1 2 
##  2 THE : 1 3 
##  1 ON : 1 
##  1 TWO : 2 
##  1 HERE : 3 
##  1 ONE : 1 
##  1 NUMBER : 3 
##  1 OF : 3 
##  1 THREE : 3 
 
 

No comments:

Post a Comment