""" utilities for manipulating html links """ from htmlserialize import decodeAndParseHTML, tostring import urlparse import re def fixup_text_links(doc, link_repl_func, remove_base_tags=True): """ fixup_links(), but work on text and returns text """ doc = decodeAndParseHTML(doc) fixup_links(doc, link_repl_func, remove_base_tags=remove_base_tags) return tostring(doc) def fixup_links(doc, link_repl_func, remove_base_tags=True): """ Takes a given document (already parsed by lxml) and modifies it in-place. Every link is passed through link_repl_func, and the output of that function replaces the link. """ if remove_base_tags: resolve_base_tags_in_document(doc) for attrib in 'href', 'src': els = doc.xpath('//*[@%s]' % attrib) for el in els: el.attrib[attrib] = link_repl_func(el.attrib[attrib]) fixup_css_links(doc, link_repl_func) fixup_style_links(doc, link_repl_func) def resolve_base_tags_in_document(doc): """ removes all html tags from the document given. """ base_href = None basetags = doc.xpath('//base[@href]') for b in basetags: base_href = b.attrib['href'] b.getparent().remove(b) if base_href is None: return # Now that we have a base_href (blech) we have to fix up all the # links in the document with this new information. def link_repl(href): return urlparse.urljoin(base_href, href) fixup_links(doc, link_repl, remove_base_tags=False) CSS_URL_PAT = re.compile(r'url\((.*?)\)', re.I) CSS_IMPORT_PAT = re.compile(r'@import "(.*?)"') def fixup_css_links(doc, link_repl_func): """ Fixes up any url(...) links in CSS style elements """ def absuri(matchobj): return 'url(%s)' % link_repl_func(matchobj.group(1)) def absimport(matchobj): return '@import "%s"' % link_repl_func(matchobj.group(1)) els = doc.xpath('//head/style') for el in els: if el.text: el.text = CSS_URL_PAT.sub(absuri, el.text) el.text = CSS_IMPORT_PAT.sub(absimport, el.text) def fixup_style_links(doc, link_repl_func): def absuri(matchobj): return 'url(%s)' % link_repl_func(matchobj.group(1)) for el in doc.xpath("//*[contains(@style, 'url(')]"): el.attrib['style'] = CSS_URL_PAT.sub(absuri, el.attrib['style'])