# WSGI application that proxies requests to another site and uses lxml to
# rewrite the links in the proxied HTML so they point back at the proxy.

import logging
import re
import socket
import urllib
import urlparse

import chardet
import webob
from lxml.html import document_fromstring, tostring
from webob import exc
from wsgiproxy.exactproxy import proxy_exact_request

logger = logging.getLogger(__name__)


class LXMLProxy(object):
    """
    Proxies to another site, using lxml to rewrite all links from that
    site.

    ``dest`` is the base URL of the site being proxied; it is stored
    normalized (see ``url_normalize``) and without trailing slashes.
    """

    # Finds the ``domain=`` attribute inside a Set-Cookie header value.
    # Group 1 is the ``domain=`` prefix (with optional opening quote),
    # group 2 the domain itself, group 3 the optional closing quote.
    _cookie_domain_re = re.compile(r'(domain="?)([a-z0-9._-]*)("?)', re.I)

    # Finds whole <script> elements: group 1 is the opening tag, group 2
    # the script text, group 3 the closing tag.
    _script_re = re.compile(r'(<script[^>]*>)(.*?)(</script>)', re.I | re.S)

    def __init__(self, dest):
        self.dest = url_normalize(dest).rstrip('/')

    def __call__(self, environ, start_response):
        """WSGI entry point: proxy the request, rewrite the response."""
        req = Request(environ)
        resp, orig_base, proxied_url = self.proxy(req)
        resp = self.modify_response(req, resp, orig_base, proxied_url)
        return resp(environ, start_response)

    def proxy(self, req):
        """
        Send ``req`` on to ``self.dest`` and fetch the response.

        Returns a ``(resp, orig_base, proxied_url)`` tuple: the response
        (a 503 ``HTTPServiceUnavailable`` if the connection failed with a
        socket error), the normalized application URL of the incoming
        request, and the normalized URL that was actually requested.
        """
        orig_base = url_normalize(req.application_url)
        # Work on a copy of the environ so the caller's request is untouched.
        proxy_req = Request(req.environ.copy())
        scheme, netloc, path, query, fragment = urlparse.urlsplit(self.dest)
        path = urllib.unquote(path)
        assert not fragment
        proxy_req.path_info = path + req.path_info
        proxy_req.server_name = netloc.split(':', 1)[0]
        if ':' in netloc:
            proxy_req.server_port = netloc.split(':', 1)[1]
        elif scheme == 'http':
            proxy_req.server_port = '80'
        elif scheme == 'https':
            proxy_req.server_port = '443'
        else:
            assert 0, 'Unsupported scheme in destination: %r' % scheme
        proxy_req.host = netloc
        if query:
            # Append the destination's own query string to the request's.
            if proxy_req.query_string:
                proxy_req.query_string += '&'
            proxy_req.query_string += query
        proxy_req.scheme = scheme
        proxy_req.script_name = ''
        proxied_url = url_normalize(
            '%s://%s%s' % (scheme, netloc, proxy_req.path_qs))
        # Drop Accept-Encoding -- presumably so the upstream body arrives
        # uncompressed and can be parsed by lxml.
        proxy_req.accept_encoding = None
        try:
            logger.debug('redirecting to %s', proxy_req.url)
            resp = proxy_req.get_response(proxy_exact_request)
        except socket.error as e:
            if isinstance(e.args, tuple) and len(e.args) > 1:
                error = e.args[1]
            else:
                error = str(e)
            resp = exc.HTTPServiceUnavailable(
                'Could not proxy the request to %s:%s : %s'
                % (proxy_req.server_name, proxy_req.server_port, error))
        return resp, orig_base, proxied_url

    ## FIXME: instead of proxied_base/proxied_path, should I keep the
    ## modified request object?
    def modify_response(self, request, response, orig_base, proxied_url):
        """
        Modify the response however the user wanted.

        For non-empty ``text/html`` responses every link under the
        proxied site is rewritten to live under ``orig_base``.  For all
        responses, a ``Location`` header is rewritten the same way and
        the domain of any ``Set-Cookie`` header is switched from the
        proxied host to the host of ``request``.  Returns ``response``
        (modified in place).
        """
        # This might not have a trailing /:
        proxied_base = self.dest
        exact_proxied_base = self.dest
        if not proxied_base.endswith('/'):
            proxied_base += '/'
        exact_orig_base = orig_base
        if not orig_base.endswith('/'):
            orig_base += '/'
        assert (proxied_url.startswith(proxied_base)
                or proxied_url.split('?', 1)[0] == proxied_base[:-1]), (
            "Unexpected proxied_url %r, doesn't start with proxied_base %r"
            % (proxied_url, proxied_base))
        assert (request.url.startswith(orig_base)
                or request.url.split('?', 1)[0] == orig_base[:-1]), (
            "Unexpected request.url %r, doesn't start with orig_base %r"
            % (request.url, orig_base))

        def link_repl_func(link):
            """Rewrites a link to point to this proxy"""
            if link == exact_proxied_base:
                return exact_orig_base
            if not link.startswith(proxied_base):
                # External link, so we don't rewrite it
                return link
            return orig_base + link[len(proxied_base):]

        # Only HTML bodies are parsed; an empty body (e.g. on a redirect)
        # is skipped because there is nothing to rewrite.
        if response.content_type == 'text/html' and response.body:
            if response.charset:
                body = response.unicode_body
            else:
                ## FIXME: maybe we should guess the encoding?
                body = response.body
            body_doc = document_fromstring(body, base_url=proxied_url)
            body_doc.make_links_absolute()
            body_doc.rewrite_links(link_repl_func)
            response.body = self.fixup_scripts(tostring(body_doc))

        # Headers are rewritten whatever the content type, so redirects
        # and cookies on non-HTML responses still point at the proxy.
        if response.location:
            ## FIXME: if you give a proxy like
            ## http://openplans.org, and it redirects to
            ## http://www.openplans.org, it won't be rewritten and
            ## that can be confusing -- it *shouldn't* be
            ## rewritten, but some better log message is required
            loc = urlparse.urljoin(proxied_url, response.location)
            response.location = link_repl_func(loc)
        if 'set-cookie' in response.headers:
            self._rewrite_cookie_domains(request, response, proxied_url)
        return response

    def _rewrite_cookie_domains(self, request, response, proxied_url):
        """Point the domain of each Set-Cookie header at this proxy."""
        # Ports are stripped from both hosts before comparing, since the
        # comparison is against the bare ``domain=`` value of the cookie.
        old_domain = urlparse.urlsplit(proxied_url)[1].split(':', 1)[0].lower()
        new_domain = request.host.split(':', 1)[0].lower()

        def rewrite_domain(match):
            """Rewrites domains to point to this proxy"""
            if match.group(2).lower() == old_domain:
                ## FIXME: doesn't catch wildcards and the sort
                return match.group(1) + new_domain + match.group(3)
            return match.group(0)

        cookies = response.headers.getall('set-cookie')
        del response.headers['set-cookie']
        for cook in cookies:
            response.headers.add(
                'set-cookie',
                self._cookie_domain_re.sub(rewrite_domain, cook))

    def fixup_scripts(self, body):
        """
        Return ``body`` with every ``&lt;`` inside a <script> element
        turned back into a literal ``<``; text outside scripts is left
        alone.
        """
        return self._script_re.sub(
            lambda match: (match.group(1)
                           + match.group(2).replace('&lt;', '<')
                           + match.group(3)),
            body)


def make_lxml_proxy(global_conf, href):
    """
    Build an ``LXMLProxy`` for ``href``.

    ``global_conf`` is accepted but unused (presumably to fit a Paste
    Deploy style factory signature -- confirm against the entry point).
    """
    return LXMLProxy(href)


## Normalization ##

# Normalize URLs

def url_normalize(url):
    """Normalizes the quoting of URLs, quoting any characters that should
    be quoted (but not double-quoting already quoted characters)

    The scheme and network location are also lower-cased, and the
    default port (80 for http, 443 for https) is dropped.
    """
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    scheme = scheme.lower()
    if ':' in netloc:
        host, port = netloc.split(':', 1)
        if (scheme, port) in (('http', '80'), ('https', '443')):
            netloc = host
    netloc = netloc.lower()
    path = _quote_special(path)
    if query:
        path += '?' + query
    if fragment:
        path += '#' + fragment
    return '%s://%s%s' % (scheme, netloc, path)


_slash_re = re.compile(r'%2f', re.I)


def _quote_special(path):
    """Quotes any characters in the path that should be quoted, unquotes
    characters that don't need to be quoted.  Also % quoting is
    upper-cased"""
    # Split on encoded slashes first so they survive the unquote/quote
    # round trip instead of turning into real path separators.
    parts = [_quote_special_part(part) for part in _slash_re.split(path)]
    return '%2F'.join(parts)


_percent_re = re.compile(r'%[0-9a-f][0-9a-f]', re.I)


def _quote_special_part(part):
    """Unquote then re-quote one path piece that has no encoded slash."""
    return urllib.quote(urllib.unquote(part))


## Special req/response ##

class Response(webob.Response):
    """``webob.Response`` with no default charset whose ``unicode_body``
    guesses the encoding (via chardet) when none is declared."""

    default_charset = None
    unicode_errors = 'replace'

    def _unicode_body__get(self):
        """
        Get/set the unicode value of the body (using the charset of the
        Content-Type)
        """
        body = self.body
        charset = self.charset
        if not charset:
            charset = chardet.detect(body)['encoding']
            if charset:
                self.charset = charset
            else:
                # chardet could not guess (e.g. empty body); decode as
                # UTF-8 and let ``unicode_errors`` absorb bad bytes
                # rather than failing on ``decode(None, ...)``.
                charset = 'utf-8'
        return body.decode(charset, self.unicode_errors)

    def _unicode_body__set(self, value):
        if not self.charset:
            raise AttributeError(
                "You cannot access Response.unicode_body unless charset is set")
        if not isinstance(value, unicode):
            raise TypeError(
                "You can only set Response.unicode_body to a unicode string (not %s)"
                % type(value))
        self.body = value.encode(self.charset)

    def _unicode_body__del(self):
        del self.body

    unicode_body = property(_unicode_body__get, _unicode_body__set,
                            _unicode_body__del,
                            doc=_unicode_body__get.__doc__)


class Request(webob.Request):
    """``webob.Request`` whose responses use the ``Response`` class above."""

    ResponseClass = Response