Sgmlop Patterns
July 11, 2002 | Fredrik Lundh
Here’s a code snippet that uses sgmlop to extract the HREF values of all A (anchor) tags from a web page:
import sgmlop
import urllib


class anchor_parser:
    """sgmlop parser target that collects the HREF values of A tags.

    After a parse, ``anchors`` holds every collected value in the order
    the start tags were reported.
    """

    def __init__(self):
        self.anchors = []

    def finish_starttag(self, tag, attrs):
        """Record each ``href`` attribute value of an ``a`` start tag.

        ``attrs`` is iterated as (name, value) pairs; start tags other
        than ``a`` are ignored.
        """
        if tag != "a":
            return
        for k, v in attrs:
            if k == "href":
                self.anchors.append(v)


def sgmlop_parse(target, data):
    """Feed ``data`` through an sgmlop parser into ``target``.

    Returns ``target`` so the caller can read whatever it collected.
    """
    parser = sgmlop.SGMLParser()
    parser.register(target)
    parser.feed(data)
    parser.close()  # we're done
    return target


def getpage(page):
    """Fetch an entire web page and return its contents."""
    stream = urllib.urlopen(page)
    try:
        return stream.read()
    finally:
        # Close explicitly instead of relying on garbage collection, so
        # the connection is released even if read() raises.
        stream.close()


def getanchors(page):
    """Return the list of A HREF values found on the page at ``page``."""
    parser = sgmlop_parse(anchor_parser(), getpage(page))
    return parser.anchors


# Parenthesised so the line is valid both as a print statement and as a
# print() call; with a single argument the output is the same.
print(getanchors("http://www.python.org"))