[Python-talk] My code to parse the GNHLUG web page

Kent Johnson kent37 at tds.net
Fri Oct 26 09:14:51 EDT 2007


Here is my code from last night. I would like to clean it up, comment 
it, etc., but I am out of town this weekend, so I am posting it as is 
before I forget about it entirely...

getData() is a generator which yields clean triples of (date, location, 
count) where date is a datetime.date, location is a plain string and 
count is either an integer or None.

main() is a getData() client which accumulates counts by location and 
prints a list of location and total count, sorted by count.

Kent

#################################

import datetime, re, urllib2
from collections import defaultdict
from operator import itemgetter
from BeautifulSoup import BeautifulSoup

def main():
     counts = defaultdict(int)
     for dat, where, count in getData():
         if count is not None:
             counts[where] += count

     for where, count in sorted(counts.items(), key=itemgetter(1), 
reverse=True):
         print where, count


def getData(url=None):
    ''' Generate (date, where, count) triples from the events table.

    url is the page to fetch; when omitted, the GNHLUG PastEvents2007
    wiki page is used.  The first table in the page is read, and for
    every row after the first one the function yields
    (cleanDate(first cell), cleanWhere(second cell), cleanCount(last cell)).
    '''
    if url is None:
        url = ('http://wiki.gnhlug.org/twiki2/bin/view/Www/'
               'PastEvents2007?skin=print.pattern')

    # Close the connection even if reading the page fails.
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        response.close()
    # For offline testing a saved copy can be read instead:
    #data = open('/Users/kent/Desktop/PastEvents2007.html').read()
    soup = BeautifulSoup(data)

    # Row 0 is skipped -- presumably the table's header row.
    for tr in soup.table.findAll('tr')[1:]:
        cells = tr.contents
        yield cleanDate(cells[0]), cleanWhere(cells[1]), cleanCount(cells[-1])


def cleanDate(tag, year=2007):
    ''' Parse the day and month held in tag and return a datetime.date.

    If tag contains a <br>, whatever follows it (br.next) is used as the
    date text; otherwise tag.string is used.  A day range such as
    '12-13 Oct' is reduced to its first day.  The text carries no year,
    so year (default 2007) is appended before parsing.

    Returns None when there is no text to parse or when the text matches
    neither 'day abbreviated-month' nor 'day full-month'.
    '''
    br = tag.find('br')
    text = br.next if br is not None else tag.string
    if text is None:
        # tag.string is None when the cell is not a single plain string;
        # report "no date" instead of failing inside re.sub().
        return None

    # Drop the '-NN' end of a day range, keeping only the first day.
    text = re.sub(r'-\d+', '', text)
    stamp = '%s %s' % (text.strip(), year)

    # Month names appear both abbreviated and spelled out.
    for fmt in ('%d %b %Y', '%d %B %Y'):
        try:
            return datetime.datetime.strptime(stamp, fmt).date()
        except ValueError:
            continue

    return None


def cleanWhere(tag):
    ''' Return tag's text, trimmed, with any trailing " at ..." part removed '''
    text = textOnly(tag)
    # Everything from the whitespace before ' at ' to the end is discarded.
    return re.sub(r'\s+ at .*$', '', text.strip())


def cleanCount(tag):
    ''' Clean a count string and return an int or None.

    tag.string is trimmed of surrounding whitespace and of any leading or
    trailing '?', '-' and '~' characters, then converted with int().

    Returns None when tag.string is None, when nothing is left after
    trimming, when the value is 'N/A', or when the remaining text is not
    a valid integer.
    '''
    raw = tag.string
    if raw is None:
        # No single string in this cell, so there is no count to read.
        return None

    # The final strip() removes whitespace exposed by dropping the markers,
    # e.g. '~ 20' -> ' 20' -> '20'.
    s = raw.strip().strip('?-~').strip()
    if not s or s == 'N/A':
        return None

    try:
        return int(s)
    except ValueError:
        # Honour the "int or None" contract for text such as '15-20'
        # instead of aborting the whole run.
        return None


def textOnly(tag):
    ''' Join every unicode string yielded by tag.recursiveChildGenerator() '''
    # Keep only the unicode elements and ignore everything else.
    pieces = [node for node in tag.recursiveChildGenerator()
              if isinstance(node, unicode)]
    return ''.join(pieces)

# Called at module level, so the report runs whenever this file is
# executed or imported.
main()


More information about the Python-talk mailing list