[Python-talk] My code to parse the GNHLUG web page
Kent Johnson
kent37 at tds.net
Fri Oct 26 09:14:51 EDT 2007
Here is my code from last night. I would like to clean it up, comment
it, etc., but I am out of town this weekend, so I am posting it as-is
before I forget about it entirely...
getData() is a generator which yields clean triples of (date, location,
count) where date is a datetime.date (or None if the date cell could not
be parsed), location is a plain string and count is either an integer or
None.
main() is a getData() client which accumulates counts by location and
prints a list of location and total count, sorted by count.
Kent
#################################
import datetime, re, urllib2
from collections import defaultdict
from operator import itemgetter
from BeautifulSoup import BeautifulSoup
def main():
    ''' Print each location with its total count, largest total first.

    Consumes the (date, where, count) triples from getData(), ignores
    triples whose count is None, and sums the remaining counts per
    location.
    '''
    totals = defaultdict(int)
    for _date, place, attendance in getData():
        if attendance is None:
            # No usable count for this event; leave the total untouched.
            continue
        totals[place] += attendance

    ranked = sorted(totals.items(), key=itemgetter(1), reverse=True)
    for place, total in ranked:
        print('%s %s' % (place, total))
def getData():
    ''' Generate (date, where, count) triples from the PastEvents2007 page.

    Downloads the page, parses it with BeautifulSoup and walks the rows
    of the first table, skipping the first row (presumably the header).
    For each row the first, second and last children are passed to
    cleanDate(), cleanWhere() and cleanCount() respectively.

    Yields:
        (dat, where, count) where dat is whatever cleanDate() returns
        (a datetime.date or None), where is the string returned by
        cleanWhere(), and count is an int or None from cleanCount().
    '''
    # Written as one parenthesised literal so the long URL cannot be
    # broken into an invalid bare "url =" statement by line wrapping.
    url = ('http://wiki.gnhlug.org/twiki2/bin/view/Www/'
           'PastEvents2007?skin=print.pattern')
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()
    # For offline testing, read a saved copy of the page instead:
    #data = open('/Users/kent/Desktop/PastEvents2007.html').read()
    soup = BeautifulSoup(data)
    table = soup.table
    for tr in table.findAll('tr')[1:]:
        dat = cleanDate(tr.contents[0])
        where = cleanWhere(tr.contents[1])
        count = cleanCount(tr.contents[-1])
        yield dat, where, count
def cleanDate(tag):
    ''' Convert a date cell into a datetime.date in 2007, or None.

    tag must provide find() and a .string attribute (a BeautifulSoup
    tag in this script). If the cell contains a <br>, the text is taken
    from the element following it (br.next); otherwise tag.string is
    used. A hyphen followed by digits is removed (so '12-13 Oct' is
    read as '12 Oct'), ' 2007' is appended, and the result is parsed
    as 'day abbreviated-month year' or 'day full-month year'.

    Returns None when the cell has no text to parse or when neither
    format matches.
    '''
    br = tag.find('br')
    if br is not None:
        s = br.next
    else:
        s = tag.string
    if s is None:
        # No single text node to work with; report "no date" rather than
        # letting re.sub() fail with a TypeError.
        return None
    s = re.sub(r'-\d+', '', s)
    dat = s.strip() + ' 2007'
    for fmt in ('%d %b %Y', '%d %B %Y'):
        try:
            return datetime.datetime.strptime(dat, fmt).date()
        except ValueError:
            # Not in this format; try the next one.
            pass
    return None
def cleanWhere(tag):
    ''' Return the location text of a cell.

    The cell's text (as collected by textOnly()) is stripped of
    surrounding whitespace, then anything from a whitespace run
    followed by ' at ' through the end of the string is removed.
    '''
    text = textOnly(tag)
    return re.sub(r'\s+ at .*$', '', text.strip())
def cleanCount(tag):
    ''' Clean a count string and return an int or None.

    Reads tag.string, strips surrounding whitespace and any leading or
    trailing '?', '-' and '~' characters, and converts what is left
    with int().

    Returns None when the cell has no string at all, when nothing is
    left after stripping, or when the remaining text is 'N/A'.

    Raises ValueError if the remaining text is not an integer.
    '''
    raw = tag.string
    if raw is None:
        # Empty cell (no single text node): there is no count to report.
        return None
    # The final strip() removes whitespace exposed by dropping the
    # punctuation, so values such as '~ 15' or '? N/A' are recognised.
    s = raw.strip().strip('?-~').strip()
    if not s or s == 'N/A':
        return None
    return int(s)
def textOnly(tag):
    ''' Join every unicode item produced by tag.recursiveChildGenerator().

    Items that are not unicode instances are skipped; the remaining
    ones are concatenated in the order the generator yields them.
    '''
    pieces = []
    for node in tag.recursiveChildGenerator():
        if isinstance(node, unicode):
            pieces.append(node)
    return ''.join(pieces)
# Unguarded entry point: the report runs whenever this file is executed
# (or imported), which triggers the page download in getData().
main()
More information about the Python-talk
mailing list