Find the file here
import re,string,operator,os
def process_file(filename, guten):
hist = dict()
fp = open(filename)
if guten:
header = True
if not guten:
header = False
for line in fp:
if line[:20] == "*** END OF THIS PROJ": # There must be a better way to escape the header and footer
header = True
if not header:
process_line(line, hist)
if line[:20] == "*END*THE SMALL PRINT" or line[:20]=="*** START OF THIS PR": #this is only for the shakespeares folios "00ws110.tt"
header = False
#print "header escaped" #woo, debugging
return hist
def process_line(line, hist):
line = line.replace('-', ' ') #clean hyphenated words
for word in line.split(): #re.split('[\W_]+', line) #could do the split with regex, but regex is magic and doesn't strip punctuation quite as nicely
word = word.strip(string.punctuation + string.whitespace)
word = word.lower()
word = re.sub('[\W_]+','', word) #this regular expression should get rid of the few special iso characters not in string.punctuation
hist[word] = hist.get(word, 0) + 1
def top_20(hist):
hist_sorted = sorted(hist.iteritems(), key=operator.itemgetter(1), reverse=True) #according to stackexchange this is a really fast way to sort a dicitonary
output ="The twenty most common terms in this work are:\n"
for i in range(0,20):
output += str(hist_sorted[i]) +"\n"
return output
def compare_lists(list1, list2):
new_list = dict()
for word1 in list1:
new_list[word1] = True
#print word1
if list2.has_key(word1):
new_list[word1] = False
return new_list
def return_true_words(dic):
output_list = list()
for i in dic:
#print i
if dic[i]:
output_list.append(i)
return output_list
def exercise1(book):
'''
proccess_file() got rather complicated and too sofisticated for this problem
So we're just going to rewrite it here. We could define a new method,
but my preference is to only do that if we need to solve the same problem 3 or more times
'''
fp = open(book)
longstring = str()
for line in fp:
line = line.replace('-', ' ')
for word in line.split():
word = word.strip(string.punctuation + string.whitespace)
word = word.lower()
word = re.sub('[\W_]+','', word) #regular expressions clean up wierd characters not included in string.punctuation
longstring = longstring + word + " "
return longstring
def exercise2(book):
hist = process_file(book, True)
return "There are %d unique words in this work" %len(hist)
def exercise3(book):
hist = process_file(book, True)
output = top_20(hist)
return output
def exercise4(book):
output_list = return_true_words(compare_lists(
process_file(book, True), process_file("words.txt", False)))
return output_list #long
def writeitallout():
for i in range(1,5):
output = open("execise%d.txt" % i, "w")
method_name = "exercise%d" % i #because writing four method names is hard
outtext = eval(method_name) #eval() evaluates a string as python code
'''
eval() is kind of dangerous and has the potential to make it much easier
to excecute malicious, obfuscated code, but it works in this case.
I suppose this makes this bad code. The other methods I tried to solve this
problem did not work nearly as well.
'''
print "Writing exercise %d to file exercise%d.txt" % (i,i)
output.write(str(outtext('pg46.txt')))
output.close()
writeitallout()