Cleaning up HTML snippets
Fredrik Lundh | November 2003 | Originally posted to online.effbot.org
The following fixsnippet function takes an HTML fragment and turns it into a well-formed XHTML fragment, wrapped in a div element. It maps presentational tags to semantic tags, checks that the fragment only contains tags from the VALID_TAGS list (raising a SyntaxError if it finds any other tag), and removes all attributes except for href on a elements. You can use this to convert things like blog entries, comments, and HTML mails into glorious XHTML.
Note that you need both the ElementTree library and the ElementTidy extension; follow the links below for details.
from elementtidy.TidyHTMLTreeBuilder import TidyHTMLTreeBuilder
from elementtree.ElementTree import tostring

# tags that are allowed to remain in a cleaned-up fragment
VALID_TAGS = (
    "a", "p", "blockquote", "ul", "ol", "li", "dl", "dt", "dd",
    "em", "strong", "dfn", "code", "q", "samp", "kbd", "var",
    "cite", "abbr", "acronym", "sub", "sup"
    )

# presentational tag -> semantic replacement
TAG_MAP = {
    "b": "strong",
    "i": "em",
    "tt": "samp"
    }

# XHTML namespace, in the "{uri}" form that prefixes element tags
NS_XHTML = "{http://www.w3.org/1999/xhtml}"


def fixsnippet(snippet):
    """Turn an HTML fragment into a cleaned-up XHTML fragment.

    The snippet is fed through TidyHTMLTreeBuilder, and every element
    below the resulting XHTML body element is processed in turn:

    - the XHTML namespace prefix is removed from the tag
    - tags listed in TAG_MAP are replaced by their mapped tag
    - a tag that is not in VALID_TAGS raises SyntaxError
    - all attributes are removed, except a non-empty href on "a"

    Finally the body element itself is renamed to "div", stripped of
    its attributes, and serialised with tostring; that serialised
    value is returned.

    Elements visited before a SyntaxError is raised have already been
    modified in the (discarded) tree.
    """
    builder = TidyHTMLTreeBuilder()
    builder.feed(snippet)
    document = builder.close()
    body = document.find(NS_XHTML + "body")

    for child in body:
        for node in child.getiterator():
            tag = node.tag

            # drop the XHTML namespace prefix, if present
            if tag.startswith(NS_XHTML):
                tag = tag[len(NS_XHTML):]

            # swap presentational tags for their semantic counterparts
            tag = TAG_MAP.get(tag, tag)
            node.tag = tag

            # anything outside the accepted list is an error
            if tag not in VALID_TAGS:
                raise SyntaxError("invalid tag: %r" % tag)

            # wipe the attributes; only links get to keep their target
            # (extend as necessary)
            # NOTE(review): the href value is kept verbatim, with no
            # URL-scheme filtering -- confirm that is acceptable if the
            # snippet comes from an untrusted source.
            if tag == "a":
                target = node.get("href")
            else:
                target = None
            node.attrib.clear()
            if target:
                node.set("href", target)

    # hand the fragment back wrapped in a plain div instead of body
    body.tag = "div"
    body.attrib.clear()
    return tostring(body)