"""
utilities for manipulating html links
"""
from htmlserialize import decodeAndParseHTML, tostring
import urlparse
import re
def fixup_text_links(doc, link_repl_func, remove_base_tags=True):
    """
    Text-in / text-out variant of fixup_links().

    doc is HTML source text.  It is parsed with decodeAndParseHTML,
    every link is rewritten through link_repl_func (see fixup_links),
    and the resulting tree is serialized back with tostring and
    returned.  remove_base_tags is passed straight to fixup_links.
    """
    tree = decodeAndParseHTML(doc)
    fixup_links(tree, link_repl_func, remove_base_tags=remove_base_tags)
    serialized = tostring(tree)
    return serialized
def fixup_links(doc, link_repl_func,
                remove_base_tags=True):
    """
    Rewrite every link of an already-parsed document, in place.

    Each href and src attribute value is handed to link_repl_func and
    replaced with whatever that function returns.  The same function
    is then applied to CSS links through fixup_css_links and
    fixup_style_links.

    When remove_base_tags is true, resolve_base_tags_in_document is
    run first, before any link is passed to link_repl_func.
    """
    if remove_base_tags:
        resolve_base_tags_in_document(doc)

    # Plain link-carrying attributes.
    for attr_name in ('href', 'src'):
        for node in doc.xpath('//*[@%s]' % attr_name):
            old_value = node.attrib[attr_name]
            node.attrib[attr_name] = link_repl_func(old_value)

    # Links embedded in CSS text.
    fixup_css_links(doc, link_repl_func)
    fixup_style_links(doc, link_repl_func)
def resolve_base_tags_in_document(doc):
    """
    Removes all html <base href="..."> tags from the document given,
    after folding the base URL they declare into the document's links.

    doc is an already-parsed tree and is modified in place.  If no
    <base href> tag is present the document is left untouched.
    Otherwise every such tag is removed and every link is joined
    (urlparse.urljoin) against the href of the FIRST one: per the
    HTML specification only the first <base> carrying an href is
    honoured, later ones are ignored.  Returns None.
    """
    basetags = doc.xpath('//base[@href]')
    if not basetags:
        return

    # Assumes xpath() returns matches in document order (as lxml does),
    # so index 0 is the tag a browser would actually use.
    base_href = basetags[0].attrib['href']

    # Drop every <base href>, not just the one we used, so that no stale
    # base can re-interpret the links we are about to make absolute.
    for b in basetags:
        b.getparent().remove(b)

    # Now that we have a base_href (blech) we have to fix up all the
    # links in the document with this new information.
    def link_repl(href):
        return urlparse.urljoin(base_href, href)

    # remove_base_tags=False: the tags are already gone, and this keeps
    # fixup_links from calling back into this function.
    fixup_links(doc, link_repl, remove_base_tags=False)
# Matches a CSS url(...) reference, case-insensitively.  Group 1 is the
# raw text between the parentheses, including any quotes or padding.
CSS_URL_PAT = re.compile(r'url\((.*?)\)', flags=re.IGNORECASE)

# Matches a double-quoted CSS @import "..." rule (case-sensitive).
# Group 1 is the text between the quotes.
CSS_IMPORT_PAT = re.compile(r'@import "(.*?)"', flags=0)
def fixup_css_links(doc, link_repl_func):
    """
    Fixes up any url(...) and @import "..." links in CSS style elements

    Only <style> elements matched by //head/style that have text are
    processed; their text is rewritten in place.  link_repl_func is
    called with the bare URL and its return value is substituted.

    For url(...), surrounding whitespace and a matching pair of single
    or double quotes are stripped before link_repl_func is called and
    the same quotes are put back around the result, so that
    url("a.png") hands a.png (not "a.png") to link_repl_func.
    """
    def absuri(matchobj):
        target = matchobj.group(1).strip()
        quote = ''
        # CSS_URL_PAT captures everything between the parentheses, so a
        # quoted URL arrives here with its quotes still attached.
        if (len(target) >= 2 and target[0] in '"\''
                and target[-1] == target[0]):
            quote = target[0]
            target = target[1:-1]
        return 'url(%s%s%s)' % (quote, link_repl_func(target), quote)

    def absimport(matchobj):
        return '@import "%s"' % link_repl_func(matchobj.group(1))

    for el in doc.xpath('//head/style'):
        if not el.text:
            continue
        # url(...) first, then bare @import "..." rules.
        css = CSS_URL_PAT.sub(absuri, el.text)
        el.text = CSS_IMPORT_PAT.sub(absimport, css)
def fixup_style_links(doc, link_repl_func):
    """
    Fixes up any url(...) links in inline style="..." attributes

    Every element carrying a style attribute is examined and the
    attribute is rewritten in place when it contains a url(...)
    reference, matched case-insensitively by CSS_URL_PAT.
    link_repl_func is called with the bare URL: surrounding whitespace
    and a matching pair of single or double quotes are stripped first
    and the same quotes are put back around the returned value.
    """
    def absuri(matchobj):
        target = matchobj.group(1).strip()
        quote = ''
        # Inline styles usually quote the URL, e.g. url('a.png'); the
        # quotes must not reach link_repl_func as part of the URL.
        if (len(target) >= 2 and target[0] in '"\''
                and target[-1] == target[0]):
            quote = target[0]
            target = target[1:-1]
        return 'url(%s%s%s)' % (quote, link_repl_func(target), quote)

    # Select on the attribute alone: XPath contains() is case-sensitive
    # and would skip URL(...), which CSS_URL_PAT is meant to match.
    for el in doc.xpath('//*[@style]'):
        old_style = el.attrib['style']
        new_style = CSS_URL_PAT.sub(absuri, old_style)
        if new_style != old_style:
            el.attrib['style'] = new_style