triflix committed on
Commit
f3f8bd0
·
verified ·
1 Parent(s): 2100890

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +101 -55
main.py CHANGED
@@ -1,15 +1,39 @@
 
 
1
  from fastapi import FastAPI, Request, HTTPException, Response
2
  import httpx
3
  from bs4 import BeautifulSoup
4
- import urllib.parse
5
 
6
  app = FastAPI()
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # ---------------------------------------------------------------------------
9
  # Injected JavaScript for Dynamic Interception & Real-Time Updates
10
  # ---------------------------------------------------------------------------
11
- # This script intercepts client-side navigation (history, fetch, XHR, anchor clicks)
12
- # and uses MutationObserver to reprocess dynamically added DOM nodes.
 
13
  INJECTED_JS = """
14
  <script>
15
  (function() {
@@ -76,7 +100,7 @@ INJECTED_JS = """
76
 
77
  // Rewrites URL attributes for an element and its children.
78
  function reProxyElement(element) {
79
- const urlAttrs = ['href', 'src', 'action', 'srcset'];
80
  urlAttrs.forEach(function(attr) {
81
  if (element.hasAttribute(attr)) {
82
  const value = element.getAttribute(attr);
@@ -86,6 +110,16 @@ INJECTED_JS = """
86
  }
87
  }
88
  });
 
 
 
 
 
 
 
 
 
 
89
  Array.from(element.children).forEach(child => reProxyElement(child));
90
  }
91
  })();
@@ -93,76 +127,87 @@ INJECTED_JS = """
93
  """
94
 
95
  # ---------------------------------------------------------------------------
96
- # Helper Function: Rewrite URLs for All Relevant Attributes
97
  # ---------------------------------------------------------------------------
98
  def rewrite_urls(soup, target_url, proxy_prefix="/?url="):
99
  """
100
  Iterates over all elements in the parsed HTML and rewrites URL-like attributes.
101
- Supports attributes such as: href, src, action, srcset.
 
102
  """
103
- url_attrs = ['href', 'src', 'action', 'srcset']
104
  for element in soup.find_all(True):
 
105
  for attr in list(element.attrs):
106
  if attr in url_attrs:
107
  orig_value = element.get(attr)
108
- # Skip already proxied or special schemes.
109
  if not orig_value or orig_value.startswith(proxy_prefix) or orig_value.startswith(("mailto:", "javascript:")):
110
  continue
111
- # Resolve relative URLs.
112
  new_url = urllib.parse.urljoin(target_url, orig_value)
113
- # Rewrite URL to be loaded via the proxy.
114
  element[attr] = proxy_prefix + urllib.parse.quote(new_url)
 
 
 
 
 
115
  return soup
116
 
117
  # ---------------------------------------------------------------------------
118
- # Core Function: Fetch and Rewrite the Target HTML
119
  # ---------------------------------------------------------------------------
120
  async def fetch_and_rewrite(target_url: str) -> Response:
121
  """
122
- Asynchronously fetches the target URL, then parses and rewrites its HTML:
123
- - Removes conflicting Content Security Policy meta tags.
124
- - Inserts a <base> tag to resolve relative URLs.
125
- - Injects JavaScript to intercept dynamic navigation and update DOM in real time.
126
- - Rewrites URL attributes for a comprehensive set of tags.
127
  """
128
  async with httpx.AsyncClient() as client:
129
  resp = await client.get(target_url)
130
 
131
  content_type = resp.headers.get("Content-Type", "")
132
- if "text/html" not in content_type:
133
- # For non-HTML content, return the response directly.
134
- return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
135
 
136
- soup = BeautifulSoup(resp.text, "html.parser")
 
 
 
137
 
138
- # Remove Content Security Policy meta tags that might block our injected scripts.
139
- for meta in soup.find_all("meta", attrs={"http-equiv": "Content-Security-Policy"}):
140
- meta.decompose()
141
 
142
- # Insert (or replace) a <base> tag for proper resolution of relative URLs.
143
- parsed_target = urllib.parse.urlparse(target_url)
144
- base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
145
- if soup.head:
146
- for base in soup.head.find_all("base"):
147
- base.decompose()
148
- base_tag = soup.new_tag("base", href=base_href)
149
- soup.head.insert(0, base_tag)
150
- else:
151
- head_tag = soup.new_tag("head")
152
- base_tag = soup.new_tag("base", href=base_href)
153
- head_tag.insert(0, base_tag)
154
- soup.insert(0, head_tag)
155
 
156
- # Inject our dynamic JavaScript into the beginning of the <body>.
157
- if soup.body:
158
- soup.body.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
159
- else:
160
- soup.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- # Rewrite URLs for all relevant elements.
163
- soup = rewrite_urls(soup, target_url)
164
 
165
- return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
 
 
 
166
 
167
  # ---------------------------------------------------------------------------
168
  # Simple Proxy Endpoint: /proxy_full
@@ -170,8 +215,7 @@ async def fetch_and_rewrite(target_url: str) -> Response:
170
  @app.get("/proxy_full")
171
  async def proxy_full(url: str):
172
  """
173
- A simple proxy endpoint that fetches the given URL and rewrites
174
- its HTML so that all resource URLs route through the proxy.
175
  """
176
  if not url:
177
  raise HTTPException(status_code=400, detail="Missing 'url' query parameter")
@@ -180,13 +224,15 @@ async def proxy_full(url: str):
180
  resp = await client.get(url)
181
 
182
  content_type = resp.headers.get("Content-Type", "")
183
- if "text/html" not in content_type:
184
- return Response(resp.content, media_type=content_type, status_code=resp.status_code)
185
-
186
- soup = BeautifulSoup(resp.text, "html.parser")
187
- soup = rewrite_urls(soup, url)
 
 
188
 
189
- return Response(str(soup), media_type="text/html")
190
 
191
  # ---------------------------------------------------------------------------
192
  # Catch-All Dynamic Proxy Endpoint
@@ -195,8 +241,8 @@ async def proxy_full(url: str):
195
  async def catch_all(full_path: str, request: Request):
196
  """
197
  Catch-all endpoint for dynamic proxying.
198
- Determines the target URL via a query parameter or a stored cookie.
199
- Processes the HTML to inject dynamic JavaScript and rewrite URLs.
200
  """
201
  query_params = dict(request.query_params)
202
  if "url" in query_params:
@@ -212,7 +258,7 @@ async def catch_all(full_path: str, request: Request):
212
 
213
  response = await fetch_and_rewrite(target_url)
214
 
215
- # Save the target's base URL in a cookie for subsequent requests.
216
  parsed_target = urllib.parse.urlparse(target_url)
217
  base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
218
  response.set_cookie("target_base", base_url)
 
1
+ import re
2
+ import urllib.parse
3
  from fastapi import FastAPI, Request, HTTPException, Response
4
  import httpx
5
  from bs4 import BeautifulSoup
 
6
 
7
  app = FastAPI()
8
 
9
+ # ---------------------------------------------------------------------------
10
+ # Utility: Rewrite CSS URLs inside CSS text
11
+ # ---------------------------------------------------------------------------
12
def rewrite_css(css_text: str, target_url: str, proxy_prefix="/?url=") -> str:
    """
    Rewrite every url(...) reference inside a CSS document so the referenced
    resource is fetched through the proxy.

    Args:
        css_text: Raw CSS source to transform.
        target_url: Absolute URL the CSS was served from; relative references
            are resolved against it.
        proxy_prefix: Prefix that routes a URL through the proxy endpoint.

    Returns:
        The CSS text with each url(...) rewritten to proxy_prefix plus the
        percent-encoded absolute URL. Already-proxied URLs, data: URIs and
        fragment-only references (e.g. url(#blur), which point at in-document
        SVG elements and must not be fetched) are left untouched.
    """
    # Matches: url('...'), url("..."), or url(...)
    pattern = re.compile(r'url\(\s*(?P<quote>["\']?)(?P<url>[^"\')]+)(?P=quote)\s*\)')

    def replace_url(match):
        original_url = match.group("url")
        # Skip references that must not be proxied: already-proxied URLs,
        # inline data: URIs, and same-document fragments such as url(#clip).
        if original_url.startswith((proxy_prefix, "data:", "#")):
            return match.group(0)
        # Resolve relative references against the stylesheet's own URL.
        new_url = urllib.parse.urljoin(target_url, original_url)
        quote = match.group("quote")
        return f'url({quote}{proxy_prefix}{urllib.parse.quote(new_url)}{quote})'

    return pattern.sub(replace_url, css_text)
30
+
31
  # ---------------------------------------------------------------------------
32
  # Injected JavaScript for Dynamic Interception & Real-Time Updates
33
  # ---------------------------------------------------------------------------
34
+ # This script intercepts history changes, fetch, XHR, and anchor clicks,
35
+ # and uses MutationObserver to rewrite new elements. It also handles SVG
36
+ # attributes (including xlink:href) for icons and logos.
37
  INJECTED_JS = """
38
  <script>
39
  (function() {
 
100
 
101
  // Rewrites URL attributes for an element and its children.
102
  function reProxyElement(element) {
103
+ const urlAttrs = ['href', 'src', 'action', 'srcset', 'xlink:href'];
104
  urlAttrs.forEach(function(attr) {
105
  if (element.hasAttribute(attr)) {
106
  const value = element.getAttribute(attr);
 
110
  }
111
  }
112
  });
113
+ // Also rewrite inline style attribute.
114
+ if (element.hasAttribute('style')) {
115
+ let styleVal = element.getAttribute('style');
116
+ // Simple client-side rewriting: prepend proxy to URLs.
117
+ styleVal = styleVal.replace(/url\\((['"]?)(.*?)\\1\\)/g, function(match, quote, url) {
118
+ if (url.startsWith('/?url=') || url.startsWith('data:')) return match;
119
+ return "url(" + quote + "/?url=" + encodeURIComponent(url) + quote + ")";
120
+ });
121
+ element.setAttribute('style', styleVal);
122
+ }
123
  Array.from(element.children).forEach(child => reProxyElement(child));
124
  }
125
  })();
 
127
  """
128
 
129
  # ---------------------------------------------------------------------------
130
+ # Helper Function: Rewrite URLs for All Relevant Attributes in HTML
131
  # ---------------------------------------------------------------------------
132
def rewrite_urls(soup, target_url, proxy_prefix="/?url="):
    """
    Rewrite URL-bearing attributes on every element of a parsed HTML tree
    so the referenced resources load through the proxy.

    Handles href, src, action, srcset and xlink:href attributes, plus CSS
    url(...) references inside inline style attributes. srcset values are
    comma-separated "<url> <descriptor>" candidates, so each candidate URL
    is rewritten individually instead of treating the whole value as one URL.

    Args:
        soup: BeautifulSoup document (mutated in place).
        target_url: Absolute URL the document was served from; relative
            references are resolved against it.
        proxy_prefix: Prefix that routes a URL through the proxy endpoint.

    Returns:
        The same soup object, after mutation.
    """
    url_attrs = ['href', 'src', 'action', 'srcset', 'xlink:href']
    for element in soup.find_all(True):
        for attr in list(element.attrs):
            if attr not in url_attrs:
                continue
            orig_value = element.get(attr)
            # Skip empty values, already-proxied URLs, non-fetchable schemes,
            # and same-document fragment links (e.g. href="#top").
            if not orig_value or orig_value.startswith(proxy_prefix) or orig_value.startswith(("mailto:", "javascript:", "data:", "#")):
                continue
            if attr == 'srcset':
                # srcset holds a candidate list, not a single URL.
                element[attr] = _rewrite_srcset(orig_value, target_url, proxy_prefix)
            else:
                new_url = urllib.parse.urljoin(target_url, orig_value)
                element[attr] = proxy_prefix + urllib.parse.quote(new_url)
        # Rewrite CSS url() references inside inline style attributes.
        if element.has_attr("style"):
            element["style"] = rewrite_css(element.get("style"), target_url, proxy_prefix)
    return soup


def _rewrite_srcset(value, target_url, proxy_prefix):
    """Rewrite each "<url> [descriptor]" candidate of a srcset value."""
    candidates = []
    for candidate in value.split(","):
        candidate = candidate.strip()
        if not candidate:
            continue
        pieces = candidate.split(None, 1)
        absolute = urllib.parse.urljoin(target_url, pieces[0])
        rewritten = proxy_prefix + urllib.parse.quote(absolute)
        if len(pieces) > 1:
            rewritten += " " + pieces[1]
        candidates.append(rewritten)
    return ", ".join(candidates)
154
 
155
  # ---------------------------------------------------------------------------
156
+ # Core Function: Fetch and Rewrite the Target Resource
157
  # ---------------------------------------------------------------------------
158
async def fetch_and_rewrite(target_url: str) -> Response:
    """
    Fetch target_url and adapt the response for serving through the proxy.

    - HTML: strips CSP <meta> tags that would block the injected script,
      inserts a <base> tag so relative URLs resolve against the origin,
      injects INJECTED_JS for client-side interception, and rewrites all
      URL attributes via rewrite_urls().
    - CSS: rewrites url(...) references via rewrite_css().
    - Anything else (images, JS, fonts, ...) is returned unmodified.

    Args:
        target_url: Absolute URL to fetch.

    Returns:
        A FastAPI Response carrying the (possibly rewritten) payload with
        the upstream status code.
    """
    # follow_redirects=True: httpx does NOT follow redirects by default, so
    # redirecting targets would otherwise yield the bare 301/302 payload.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        resp = await client.get(target_url)

    # After redirects, relative URLs must resolve against the final URL,
    # not the originally requested one.
    final_url = str(resp.url)
    content_type = resp.headers.get("Content-Type", "")

    # CSS: rewrite url() references so nested resources are proxied too.
    if "text/css" in content_type:
        return Response(content=rewrite_css(resp.text, final_url),
                        media_type="text/css", status_code=resp.status_code)

    # Non-HTML, non-CSS content passes through untouched.
    if "text/html" not in content_type:
        return Response(content=resp.content, media_type=content_type,
                        status_code=resp.status_code)

    soup = BeautifulSoup(resp.text, "html.parser")

    # Remove CSP meta tags that could block our injected script.
    for meta in soup.find_all("meta", attrs={"http-equiv": "Content-Security-Policy"}):
        meta.decompose()

    # Insert (or replace) a <base> tag so relative URLs resolve properly.
    parsed_target = urllib.parse.urlparse(final_url)
    base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
    base_tag = soup.new_tag("base", href=base_href)
    if soup.head:
        for base in soup.head.find_all("base"):
            base.decompose()
        soup.head.insert(0, base_tag)
    else:
        head_tag = soup.new_tag("head")
        head_tag.insert(0, base_tag)
        soup.insert(0, head_tag)

    # Inject the dynamic-interception JavaScript at the top of <body>
    # (or the document root when no <body> exists).
    injection = BeautifulSoup(INJECTED_JS, "html.parser")
    if soup.body:
        soup.body.insert(0, injection)
    else:
        soup.insert(0, injection)

    # Rewrite URLs (SVG icons, inline-style backgrounds, srcset, etc.).
    soup = rewrite_urls(soup, final_url)
    return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
211
 
212
  # ---------------------------------------------------------------------------
213
  # Simple Proxy Endpoint: /proxy_full
 
215
  @app.get("/proxy_full")
216
  async def proxy_full(url: str):
217
  """
218
+ A simple proxy endpoint that fetches the given URL and rewrites its HTML/CSS so that all resource URLs route via the proxy.
 
219
  """
220
  if not url:
221
  raise HTTPException(status_code=400, detail="Missing 'url' query parameter")
 
224
  resp = await client.get(url)
225
 
226
  content_type = resp.headers.get("Content-Type", "")
227
+ if "text/html" in content_type:
228
+ soup = BeautifulSoup(resp.text, "html.parser")
229
+ soup = rewrite_urls(soup, url)
230
+ return Response(content=str(soup), media_type="text/html")
231
+ elif "text/css" in content_type:
232
+ new_css = rewrite_css(resp.text, url)
233
+ return Response(content=new_css, media_type="text/css")
234
 
235
+ return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
236
 
237
  # ---------------------------------------------------------------------------
238
  # Catch-All Dynamic Proxy Endpoint
 
241
  async def catch_all(full_path: str, request: Request):
242
  """
243
  Catch-all endpoint for dynamic proxying.
244
+ Determines the target URL via a query parameter or a stored cookie,
245
+ then processes the resource (HTML or CSS) to inject dynamic JS and rewrite URLs.
246
  """
247
  query_params = dict(request.query_params)
248
  if "url" in query_params:
 
258
 
259
  response = await fetch_and_rewrite(target_url)
260
 
261
+ # Store the target's base URL in a cookie for subsequent requests.
262
  parsed_target = urllib.parse.urlparse(target_url)
263
  base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
264
  response.set_cookie("target_base", base_url)