import re
import urllib.parse

import httpx
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException, Request, Response

# NOTE(security-review): this service fetches arbitrary user-supplied URLs
# (an open proxy).  Deployed publicly it is an SSRF vector — consider an
# allow-list of target hosts and blocking private/link-local addresses.
app = FastAPI()

# ---------------------------------------------------------------------------
# Utility: Rewrite CSS URLs inside CSS text
# ---------------------------------------------------------------------------

# Matches: url('...'), url("..."), or url(...).  Compiled once at module load.
# BUG FIX: the original pattern spelled the named groups as "(?P[...]" instead
# of "(?P<name>[...]" — invalid regex syntax that raised re.error at import
# time (the "(?P=quote)" backreference shows the intended group names).
_CSS_URL_PATTERN = re.compile(
    r'url\(\s*(?P<quote>["\']?)(?P<url>[^"\')]+)(?P=quote)\s*\)'
)


def rewrite_css(css_text: str, target_url: str, proxy_prefix: str = "/?url=") -> str:
    """
    Rewrite every ``url(...)`` reference in *css_text* so the referenced
    resource is loaded through the proxy.

    Args:
        css_text: Raw CSS source.
        target_url: Absolute URL the CSS was served from; used to resolve
            relative references via ``urljoin``.
        proxy_prefix: Prefix prepended to the percent-encoded absolute URL.

    Returns:
        The CSS text with all eligible url() references rewritten.
    """

    def _replace(match: "re.Match[str]") -> str:
        original_url = match.group("url")
        # Leave untouched anything already proxied or inlined as a data URI.
        if original_url.startswith(proxy_prefix) or original_url.startswith("data:"):
            return match.group(0)
        absolute = urllib.parse.urljoin(target_url, original_url)
        quote = match.group("quote")
        return f"url({quote}{proxy_prefix}{urllib.parse.quote(absolute)}{quote})"

    return _CSS_URL_PATTERN.sub(_replace, css_text)


# ---------------------------------------------------------------------------
# Injected JavaScript for Dynamic Interception & Real-Time Updates
# ---------------------------------------------------------------------------
# Intended to intercept history changes, fetch, XHR, and anchor clicks, and to
# use a MutationObserver to rewrite dynamically inserted elements (including
# SVG xlink:href attributes for icons and logos).
# NOTE(review): the script body is empty in this revision, so injecting it is
# currently a no-op.  TODO: supply the actual <script> markup.
INJECTED_JS = """ """


# ---------------------------------------------------------------------------
# Helper Function: Rewrite URLs for All Relevant Attributes in HTML
# ---------------------------------------------------------------------------
def _rewrite_srcset(value: str, target_url: str, proxy_prefix: str) -> str:
    """
    Rewrite a ``srcset`` attribute value, candidate by candidate.

    ``srcset`` is a comma-separated list of "URL [descriptor]" candidates
    (e.g. ``"a.png 1x, b.png 2x"``), so each URL must be rewritten
    individually.  The original code ran urljoin/quote over the whole
    attribute value, corrupting any multi-candidate srcset.
    """
    rewritten = []
    for candidate in value.split(","):
        candidate = candidate.strip()
        if not candidate:
            continue
        pieces = candidate.split(None, 1)  # URL first, optional descriptor after
        url = pieces[0]
        descriptor = f" {pieces[1]}" if len(pieces) > 1 else ""
        if url.startswith(proxy_prefix) or url.startswith("data:"):
            rewritten.append(candidate)
            continue
        absolute = urllib.parse.urljoin(target_url, url)
        rewritten.append(f"{proxy_prefix}{urllib.parse.quote(absolute)}{descriptor}")
    return ", ".join(rewritten)


def rewrite_urls(soup, target_url, proxy_prefix="/?url="):
    """
    Rewrite URL-bearing attributes on every element of *soup* so they route
    through the proxy.

    Handles href, src, action, srcset, and xlink:href, plus CSS ``url()``
    references inside inline ``style`` attributes.

    Args:
        soup: Parsed BeautifulSoup document (mutated in place).
        target_url: Absolute page URL used to resolve relative references.
        proxy_prefix: Prefix prepended to the percent-encoded absolute URL.

    Returns:
        The same soup object, for chaining convenience.
    """
    url_attrs = ("href", "src", "action", "srcset", "xlink:href")
    # Values that must never be proxied: already-proxied URLs, non-fetchable
    # schemes, inline data URIs, and same-page fragment anchors.
    skip_prefixes = (proxy_prefix, "mailto:", "javascript:", "data:", "#")
    for element in soup.find_all(True):
        # Rewrite attributes that hold URLs.
        for attr in list(element.attrs):
            if attr not in url_attrs:
                continue
            orig_value = element.get(attr)
            if not orig_value or orig_value.startswith(skip_prefixes):
                continue
            if attr == "srcset":
                # srcset carries multiple URLs; rewrite each candidate.
                element[attr] = _rewrite_srcset(orig_value, target_url, proxy_prefix)
            else:
                absolute = urllib.parse.urljoin(target_url, orig_value)
                element[attr] = proxy_prefix + urllib.parse.quote(absolute)
        # Inline style attributes may contain CSS url() patterns (backgrounds).
        if element.has_attr("style"):
            element["style"] = rewrite_css(element["style"], target_url, proxy_prefix)
    return soup


# ---------------------------------------------------------------------------
# Core Function: Fetch and Rewrite the Target Resource
# ---------------------------------------------------------------------------
async def fetch_and_rewrite(target_url: str) -> Response:
    """
    Fetch *target_url* and adapt the response for proxying.

    - HTML: removes conflicting CSP meta tags, inserts a <base> tag, injects
      the dynamic-interception JavaScript, and rewrites URL attributes.
    - CSS: rewrites url() references so resources are proxied.
    - Other content types are returned as-is.

    Returns:
        A FastAPI ``Response`` preserving the upstream status code.
    """
    # follow_redirects=True: httpx does not follow redirects by default, so a
    # redirecting target would otherwise surface a raw 3xx whose Location
    # header points outside the proxy.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        resp = await client.get(target_url)
    content_type = resp.headers.get("Content-Type", "")

    # Process CSS files to rewrite url() references.
    if "text/css" in content_type:
        return Response(
            content=rewrite_css(resp.text, target_url),
            media_type="text/css",
            status_code=resp.status_code,
        )

    # Process HTML content.
    if "text/html" in content_type:
        soup = BeautifulSoup(resp.text, "html.parser")

        # CSP meta tags would block the injected script; drop them.
        for meta in soup.find_all(
            "meta", attrs={"http-equiv": "Content-Security-Policy"}
        ):
            meta.decompose()

        # Insert (or replace) a <base> tag so relative URLs resolve properly.
        parsed_target = urllib.parse.urlparse(target_url)
        base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
        base_tag = soup.new_tag("base", href=base_href)
        if soup.head:
            for base in soup.head.find_all("base"):
                base.decompose()
            soup.head.insert(0, base_tag)
        else:
            head_tag = soup.new_tag("head")
            head_tag.insert(0, base_tag)
            soup.insert(0, head_tag)

        # Inject dynamic JavaScript for real-time interception.
        injected = BeautifulSoup(INJECTED_JS, "html.parser")
        if soup.body:
            soup.body.insert(0, injected)
        else:
            soup.insert(0, injected)

        # Rewrite URLs (SVG icons, inline-style backgrounds, srcset, etc.).
        soup = rewrite_urls(soup, target_url)
        return Response(
            content=str(soup),
            media_type="text/html",
            status_code=resp.status_code,
        )

    # For other content types (JS, images, etc.) pass the bytes through.
    return Response(
        content=resp.content,
        media_type=content_type,
        status_code=resp.status_code,
    )


# ---------------------------------------------------------------------------
# Simple Proxy Endpoint: /proxy_full
# ---------------------------------------------------------------------------
@app.get("/proxy_full")
async def proxy_full(url: str):
    """
    Simple proxy endpoint: fetch *url* and rewrite its HTML/CSS so that all
    resource URLs route via the proxy.  Other content types pass through.
    """
    if not url:
        raise HTTPException(status_code=400, detail="Missing 'url' query parameter")
    async with httpx.AsyncClient(follow_redirects=True) as client:
        resp = await client.get(url)
    content_type = resp.headers.get("Content-Type", "")
    if "text/html" in content_type:
        soup = rewrite_urls(BeautifulSoup(resp.text, "html.parser"), url)
        # BUG FIX: propagate the upstream status code (previously an implicit
        # 200 for HTML/CSS while other types preserved it).
        return Response(
            content=str(soup),
            media_type="text/html",
            status_code=resp.status_code,
        )
    if "text/css" in content_type:
        return Response(
            content=rewrite_css(resp.text, url),
            media_type="text/css",
            status_code=resp.status_code,
        )
    return Response(
        content=resp.content,
        media_type=content_type,
        status_code=resp.status_code,
    )


# ---------------------------------------------------------------------------
# Catch-All Dynamic Proxy Endpoint
# ---------------------------------------------------------------------------
@app.get("/{full_path:path}")
async def catch_all(full_path: str, request: Request):
    """
    Catch-all endpoint for dynamic proxying.

    The target URL comes from the ``url`` query parameter when present;
    otherwise it is reconstructed from the ``target_base`` cookie (stored by
    a previous request) plus the requested path and query string.
    """
    query_params = dict(request.query_params)
    if "url" in query_params:
        target_url = query_params["url"]
    else:
        target_base = request.cookies.get("target_base")
        if not target_base:
            return Response("No target URL provided.", status_code=400)
        target_url = urllib.parse.urljoin(target_base, full_path)
        qs = request.url.query
        if qs:
            target_url += "?" + qs

    response = await fetch_and_rewrite(target_url)

    # Remember the target's origin so subsequent relative requests resolve.
    parsed_target = urllib.parse.urlparse(target_url)
    base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
    response.set_cookie("target_base", base_url)
    return response


# ---------------------------------------------------------------------------
# Run the Application on Port 7860
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)