from lxml import etree import threading import re html_xsl = """\ """ # TODO: this should do real formatting pretty_html_xsl = """\ """ _local_transforms = threading.local() _local_transforms.html_transform = etree.XSLT(etree.XML(html_xsl)) _local_transforms.pretty_html_transform = etree.XSLT(etree.XML(pretty_html_xsl)) def tostring(doc, pretty=False): """ return HTML string representation of the document given note: this will create a meta http-equiv="Content" tag in the head and may replace any that are present """ if pretty: try: pretty_html_transform = _local_transforms.pretty_html_transform except AttributeError: pretty_html_transform = _local_transforms.pretty_html_transform = etree.XSLT(etree.XML(pretty_html_xsl)) return str(pretty_html_transform(doc)) else: try: html_transform = _local_transforms.html_transform except AttributeError: html_transform = _local_transforms.html_transform = etree.XSLT(etree.XML(html_xsl)) return str(html_transform(doc)) #HTTP_EQUIV_MATCHER_PAT = re.compile(r"\<\s*meta\s+([^\>])*http-equiv\s*=\s*(\'|\")\s*content-type\s*(\'|\")([^\>])*charset\s*=\s*(?P[\w-]+)([^\>])*\>",re.I|re.M) #OTHER_HTTP_EQUIV_MATCHER_PAT = re.compile(r"\<\s*meta\s+([^\>])*charset\s*=\s*(?P[\w-]+)([^\>])*http-equiv\s*=\s*(\'|\")\s*content-type\s*(\'|\")([^\>])*\>",re.I|re.M) def decodeAndParseHTML(text): """ if an html meta tag specifying a charset can be matched, decode the text to a python unicode string before parsing XXX - this is disabled and in camelCase for no good reason """ # m = HTTP_EQUIV_MATCHER_PAT.search(text) # if not m: # m = OTHER_HTTP_EQUIV_MATCHER_PAT.search(text) # # if m: # charset = m.group('charset') # text = text.decode(charset) content = etree.HTML(text) assert content is not None return content