re.sub Patterns

Converting HTML to Plain Text #

January 15, 2003 | Fredrik Lundh

This function removes HTML tags, and also converts character entities and character references. If you only want to convert entities, see the next example.

import re

##
# Removes HTML markup from a text string.
#
# Tags are dropped.  Numeric character references (decimal, or
# hexadecimal written with either "&#x" or "&#X") and named entities
# are replaced by the characters they stand for.  Anything that cannot
# be decoded (unknown entity name, malformed or out-of-range number)
# is left in the output unchanged.
#
# @param text The HTML source.
# @return The plain text.  Decoded references are inserted as
#     Unicode characters.

def strip_html(text):
    # Resolve the Python 2 / Python 3 spellings once per call instead
    # of once per match.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2: code point -> unicode string
    except NameError:
        to_char = chr  # Python 3

    def fixup(m):
        text = m.group(0)
        if text[:1] == "<":
            return ""  # ignore tags
        try:
            if text[:2] == "&#":
                # character reference; the hex marker may be "x" or "X"
                if text[2:3] in ("x", "X"):
                    return to_char(int(text[3:-1], 16))
                return to_char(int(text[2:-1]))
            # named entity
            return to_char(name2codepoint[text[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name
            # ValueError: malformed digits or code point out of range
            # OverflowError: number too large for the conversion function
            return text  # leave as is
    return re.sub(r"(?s)<[^>]*>|&#?\w+;", fixup, text)

Unescape HTML Entities #

October 28, 2006 | Fredrik Lundh

This function converts HTML entities and character references to ordinary characters.

import re, htmlentitydefs

##
# Removes HTML or XML character references and entities from a text string.
#
# Numeric character references (decimal, or hexadecimal written with
# either "&#x" or "&#X") and named entities are replaced by the
# characters they stand for.  Anything that cannot be decoded (unknown
# entity name, malformed or out-of-range number) is left unchanged.
#
# @param text The HTML (or XML) source text.
# @return The plain text.  Decoded references are inserted as
#     Unicode characters.

def unescape(text):
    # Resolve the Python 2 / Python 3 spellings once per call instead
    # of once per match.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        to_char = unichr  # Python 2: code point -> unicode string
    except NameError:
        to_char = chr  # Python 3

    def fixup(m):
        text = m.group(0)
        try:
            if text[:2] == "&#":
                # character reference; the hex marker may be "x" or "X"
                if text[2:3] in ("x", "X"):
                    return to_char(int(text[3:-1], 16))
                return to_char(int(text[2:-1]))
            # named entity
            return to_char(name2codepoint[text[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name
            # ValueError: malformed digits or code point out of range
            # OverflowError: number too large for the conversion function
            return text  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)

Simple Templating #

June 19, 2002 | Fredrik Lundh

Here are a couple of functions that implement PEP 292-style string interpolation:

import re
import sys

try:
    import sre  # deprecated alias of re; not available on Python 3
except ImportError:
    sre = re

# match $var and ${var} and $$
# ($var fills group 1, ${var} fills group 2, $$ fills neither, so
# m.lastindex is None for $$).  Compiled with re: sre is only a
# deprecated alias of it and does not exist on Python 3.
_dosub = re.compile(r'\$(?:\$|(\w+)|\{([^}]*)\})').sub

def expandvars(string, vars):
    """Expand ``$var`` and ``${var}`` placeholders in *string*.

    Values are looked up in the mapping *vars*.  ``$$`` becomes a single
    ``$``.  A placeholder whose name is not found in *vars* is left in
    the result exactly as written.
    """
    def substitute(match):
        group = match.lastindex
        if not group:
            # neither capture group took part: the match was "$$"
            return "$"
        try:
            return vars[match.group(group)]
        except (KeyError, NameError):
            # unknown name: keep the original placeholder text
            return match.group(0)
    return _dosub(substitute, string)

def replacevars(string, vars):
    """Expand ``$var`` and ``${var}`` placeholders in *string*.

    Works like expandvars, except that an unknown name is not left in
    place: the exception raised by the ``vars[name]`` lookup propagates
    to the caller.  ``$$`` becomes a single ``$``.
    """
    def substitute(match):
        group = match.lastindex
        # no capture group took part when the match was "$$"
        return vars[match.group(group)] if group else "$"
    return _dosub(substitute, string)

def replacevars_from_scope(string):
    """Expand placeholders in *string* using the caller's variables.

    Same as replacevars, with the mapping built from the calling
    frame: its globals, overridden by its locals.
    """
    caller = sys._getframe(1)  # frame of whoever called this function
    scope = caller.f_globals.copy()
    scope.update(caller.f_locals)  # locals take precedence over globals
    return replacevars(string, scope)

#
# try it out...
#
# print(...) with a single argument behaves the same as the Python 2
# print statement and is also valid on Python 3.

s = '${name} was born in ${country}'
print(replacevars(s, {'name': 'Guido', 'country': 'the Netherlands'}))

# here the values come from the variables of the calling scope
name = 'Barry'
country = 'the USA'
print(replacevars_from_scope(s))

Using Callback Functions #

June 26, 2002 | Fredrik Lundh

Q. Look at the two functions quote and unquote. I wrote them without regular expressions because I think that is faster.

Faster to write, perhaps.

And faster to run, if you only use them on strings with no more than 2-3 characters.

But if you use a different set of test strings with more ordinary characters than escaped characters, e.g.

     # sample inputs: plain text, an empty string, and strings made of
     # backslashes and double quotes
     strings = ['foo', '', '\\', ' ', '"', '\\"', '\\\\']
     # append 20 ordinary characters to each sample and repeat it 10 times,
     # so that ordinary characters outnumber the escaped ones
     strings = [(x+"spamspamspamspamspam")*10 for x in strings]

you’ll find that an RE approach can be much faster. The following version is about four times faster than your code, under Python 2.2:

import re

def re_quote(string, sub=re.compile(r"[\\\"]").sub):
    """Return *string* with each backslash and double quote escaped.

    Every backslash or double quote is preceded by a backslash.  *sub*
    is the ``sub`` method of the pattern, bound as a default argument
    so the pattern is compiled once, when the function is defined.
    """
    return sub(lambda match: "\\" + match.group(0), string)

def re_unquote(string, sub=re.compile(r"(?s)\\(.)|\\").sub):
    """Undo re_quote: drop the backslash in front of each escaped character.

    Only a backslash or a double quote may follow a backslash.

    Raises ValueError if a backslash is the last character of *string*
    or is followed by any other character.
    """
    def unescape_one(match):
        escaped = match.group(1)
        if escaped is None:
            # the second alternative matched: a backslash with nothing after it
            raise ValueError("backslash at end of string")
        if escaped in r"\\\"":
            return escaped
        raise ValueError("unsupported character after backslash")
    return sub(unescape_one, string)

Note the use of callback functions instead of substitution templates. It’s usually faster (and in my opinion, also more pythonic) to use e.g.

    # callback form: build the replacement from groups 1 and 2 of each match
    def fixup(m):
        return "spam %s %s" % m.group(1, 2)
    re.sub(pattern, fixup, string)

or, if you prefer lambdas:

    # the same callback, written inline as a lambda
    re.sub(pattern, lambda m: "spam %s %s" % m.group(1, 2), string)

than the re.sub non-standard interpolation syntax:

    # template form: re.sub itself expands \1 and \2 to groups 1 and 2
    re.sub(pattern, "spam \\1 \\2", string)

(and where possible, it’s also slightly faster to use m.groups() instead of enumerating all the groups in m.group(…))

YMMV, as usual.