Spaces:
Paused
Paused
Update main.py
Browse files
main.py
CHANGED
|
@@ -5,11 +5,15 @@ import urllib.parse
|
|
| 5 |
|
| 6 |
app = FastAPI()
|
| 7 |
|
| 8 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
INJECTED_JS = """
|
| 10 |
<script>
|
| 11 |
-
// Intercept history.pushState so dynamic URL changes are routed through the proxy.
|
| 12 |
(function() {
|
|
|
|
| 13 |
const originalPushState = history.pushState;
|
| 14 |
history.pushState = function(state, title, url) {
|
| 15 |
if (url) {
|
|
@@ -19,7 +23,7 @@ INJECTED_JS = """
|
|
| 19 |
return originalPushState.call(history, state, title, url);
|
| 20 |
};
|
| 21 |
|
| 22 |
-
// Intercept fetch()
|
| 23 |
const originalFetch = window.fetch;
|
| 24 |
window.fetch = function(input, init) {
|
| 25 |
let url;
|
|
@@ -46,11 +50,10 @@ INJECTED_JS = """
|
|
| 46 |
return originalOpen.apply(this, [method, proxiedUrl, true]);
|
| 47 |
};
|
| 48 |
|
| 49 |
-
// Intercept anchor clicks to
|
| 50 |
document.addEventListener('click', function(event) {
|
| 51 |
const target = event.target.closest('a');
|
| 52 |
if (target && target.href) {
|
| 53 |
-
// Skip if already proxied or if special attributes exist.
|
| 54 |
if (target.getAttribute('data-no-proxy') || target.href.indexOf('/?url=') === 0) {
|
| 55 |
return;
|
| 56 |
}
|
|
@@ -58,32 +61,88 @@ INJECTED_JS = """
|
|
| 58 |
window.location.href = '/?url=' + encodeURIComponent(target.href);
|
| 59 |
}
|
| 60 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
})();
|
| 62 |
</script>
|
| 63 |
"""
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
async def fetch_and_rewrite(target_url: str) -> Response:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
async with httpx.AsyncClient() as client:
|
| 67 |
resp = await client.get(target_url)
|
| 68 |
-
content_type = resp.headers.get("Content-Type", "")
|
| 69 |
|
| 70 |
-
|
| 71 |
if "text/html" not in content_type:
|
|
|
|
| 72 |
return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
|
| 73 |
-
|
| 74 |
-
# Parse the HTML content.
|
| 75 |
soup = BeautifulSoup(resp.text, "html.parser")
|
| 76 |
-
|
| 77 |
-
# Remove Content-Security-Policy meta tags so they cannot block the injected script.
|
| 78 |
for meta in soup.find_all("meta", attrs={"http-equiv": "Content-Security-Policy"}):
|
| 79 |
meta.decompose()
|
| 80 |
-
|
| 81 |
-
#
|
| 82 |
-
# This ensures that relative URLs in the HTML resolve against the target domain.
|
| 83 |
parsed_target = urllib.parse.urlparse(target_url)
|
| 84 |
base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
|
| 85 |
if soup.head:
|
| 86 |
-
# Remove any existing <base> tags.
|
| 87 |
for base in soup.head.find_all("base"):
|
| 88 |
base.decompose()
|
| 89 |
base_tag = soup.new_tag("base", href=base_href)
|
|
@@ -93,38 +152,52 @@ async def fetch_and_rewrite(target_url: str) -> Response:
|
|
| 93 |
base_tag = soup.new_tag("base", href=base_href)
|
| 94 |
head_tag.insert(0, base_tag)
|
| 95 |
soup.insert(0, head_tag)
|
| 96 |
-
|
| 97 |
-
#
|
| 98 |
-
# This script intercepts dynamic navigation and network calls.
|
| 99 |
if soup.body:
|
| 100 |
soup.body.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
|
| 101 |
else:
|
| 102 |
soup.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
|
| 103 |
-
|
| 104 |
-
#
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
"a": "href",
|
| 108 |
-
"img": "src",
|
| 109 |
-
"script": "src",
|
| 110 |
-
"link": "href",
|
| 111 |
-
"form": "action"
|
| 112 |
-
}
|
| 113 |
-
for tag, attr in tags_attrs.items():
|
| 114 |
-
for element in soup.find_all(tag):
|
| 115 |
-
if element.has_attr(attr):
|
| 116 |
-
orig = element[attr]
|
| 117 |
-
# Skip if already proxied or if it’s a javascript/mailto link.
|
| 118 |
-
if orig.startswith("/?url=") or orig.startswith("javascript:") or orig.startswith("mailto:"):
|
| 119 |
-
continue
|
| 120 |
-
new_url = urllib.parse.urljoin(target_url, orig)
|
| 121 |
-
element[attr] = "/?url=" + urllib.parse.quote(new_url)
|
| 122 |
-
|
| 123 |
return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
|
| 124 |
|
| 125 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
@app.get("/{full_path:path}")
|
| 127 |
async def catch_all(full_path: str, request: Request):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
query_params = dict(request.query_params)
|
| 129 |
if "url" in query_params:
|
| 130 |
target_url = query_params["url"]
|
|
@@ -138,8 +211,17 @@ async def catch_all(full_path: str, request: Request):
|
|
| 138 |
target_url += "?" + qs
|
| 139 |
|
| 140 |
response = await fetch_and_rewrite(target_url)
|
| 141 |
-
|
|
|
|
| 142 |
parsed_target = urllib.parse.urlparse(target_url)
|
| 143 |
base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
|
| 144 |
response.set_cookie("target_base", base_url)
|
|
|
|
| 145 |
return response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
app = FastAPI()
|
| 7 |
|
| 8 |
+
# ---------------------------------------------------------------------------
|
| 9 |
+
# Injected JavaScript for Dynamic Interception & Real-Time Updates
|
| 10 |
+
# ---------------------------------------------------------------------------
|
| 11 |
+
# This script intercepts client-side navigation (history, fetch, XHR, anchor clicks)
|
| 12 |
+
# and uses MutationObserver to reprocess dynamically added DOM nodes.
|
| 13 |
INJECTED_JS = """
|
| 14 |
<script>
|
|
|
|
| 15 |
(function() {
|
| 16 |
+
// Intercept history.pushState to route dynamic navigations.
|
| 17 |
const originalPushState = history.pushState;
|
| 18 |
history.pushState = function(state, title, url) {
|
| 19 |
if (url) {
|
|
|
|
| 23 |
return originalPushState.call(history, state, title, url);
|
| 24 |
};
|
| 25 |
|
| 26 |
+
// Intercept fetch() calls.
|
| 27 |
const originalFetch = window.fetch;
|
| 28 |
window.fetch = function(input, init) {
|
| 29 |
let url;
|
|
|
|
| 50 |
return originalOpen.apply(this, [method, proxiedUrl, true]);
|
| 51 |
};
|
| 52 |
|
| 53 |
+
// Intercept anchor clicks to ensure navigation goes through the proxy.
|
| 54 |
document.addEventListener('click', function(event) {
|
| 55 |
const target = event.target.closest('a');
|
| 56 |
if (target && target.href) {
|
|
|
|
| 57 |
if (target.getAttribute('data-no-proxy') || target.href.indexOf('/?url=') === 0) {
|
| 58 |
return;
|
| 59 |
}
|
|
|
|
| 61 |
window.location.href = '/?url=' + encodeURIComponent(target.href);
|
| 62 |
}
|
| 63 |
});
|
| 64 |
+
|
| 65 |
+
// Use MutationObserver to catch and rewrite dynamically added elements.
|
| 66 |
+
const observer = new MutationObserver(function(mutations) {
|
| 67 |
+
mutations.forEach(function(mutation) {
|
| 68 |
+
mutation.addedNodes.forEach(function(node) {
|
| 69 |
+
if (node.nodeType === Node.ELEMENT_NODE) {
|
| 70 |
+
reProxyElement(node);
|
| 71 |
+
}
|
| 72 |
+
});
|
| 73 |
+
});
|
| 74 |
+
});
|
| 75 |
+
observer.observe(document.body, { childList: true, subtree: true });
|
| 76 |
+
|
| 77 |
+
// Rewrites URL attributes for an element and its children.
|
| 78 |
+
function reProxyElement(element) {
|
| 79 |
+
const urlAttrs = ['href', 'src', 'action', 'srcset'];
|
| 80 |
+
urlAttrs.forEach(function(attr) {
|
| 81 |
+
if (element.hasAttribute(attr)) {
|
| 82 |
+
const value = element.getAttribute(attr);
|
| 83 |
+
if (value && !value.startsWith('/?url=') &&
|
| 84 |
+
!value.startsWith('javascript:') && !value.startsWith('mailto:')) {
|
| 85 |
+
element.setAttribute(attr, '/?url=' + encodeURIComponent(value));
|
| 86 |
+
}
|
| 87 |
+
}
|
| 88 |
+
});
|
| 89 |
+
Array.from(element.children).forEach(child => reProxyElement(child));
|
| 90 |
+
}
|
| 91 |
})();
|
| 92 |
</script>
|
| 93 |
"""
|
| 94 |
|
| 95 |
+
# ---------------------------------------------------------------------------
|
| 96 |
+
# Helper Function: Rewrite URLs for All Relevant Attributes
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
# Value prefixes that never name a fetchable network resource; proxying them
# would break in-page anchors, inline data and script/mail/phone links.
_UNPROXIED_PREFIXES = ("#", "javascript:", "mailto:", "data:", "tel:", "blob:", "about:")


def _proxy_url(value, target_url, proxy_prefix):
    """Return *value* routed through the proxy, or unchanged if it must not be proxied."""
    stripped = value.strip()
    if not stripped or stripped.startswith(proxy_prefix):
        return value
    if stripped.lower().startswith(_UNPROXIED_PREFIXES):
        return value
    # Resolve relative references against the page that contained them.
    absolute = urllib.parse.urljoin(target_url, stripped)
    return proxy_prefix + urllib.parse.quote(absolute)


def _rewrite_srcset(srcset, target_url, proxy_prefix):
    """Rewrite every image candidate URL in a ``srcset`` value, keeping its descriptor."""
    candidates = []
    pos, end = 0, len(srcset)
    while pos < end:
        # Skip the whitespace/commas that separate candidates.
        while pos < end and (srcset[pos].isspace() or srcset[pos] == ","):
            pos += 1
        if pos >= end:
            break
        # The URL is the next run of non-whitespace characters.
        start = pos
        while pos < end and not srcset[pos].isspace():
            pos += 1
        url = srcset[start:pos]
        descriptor = ""
        if url.endswith(","):
            # A trailing comma ends the candidate: there is no descriptor.
            url = url.rstrip(",")
        else:
            # Everything up to the next comma is the descriptor ("2x", "480w").
            comma = srcset.find(",", pos)
            if comma == -1:
                comma = end
            descriptor = srcset[pos:comma].strip()
            pos = comma
        proxied = _proxy_url(url, target_url, proxy_prefix)
        candidates.append(f"{proxied} {descriptor}" if descriptor else proxied)
    return ", ".join(candidates)


def rewrite_urls(soup, target_url, proxy_prefix="/?url="):
    """
    Rewrite URL-bearing attributes so the resources load through the proxy.

    Every element returned by ``soup.find_all(True)`` is inspected for the
    attributes ``href``, ``src``, ``action`` and ``srcset``.  Each URL is
    resolved against ``target_url`` and replaced with
    ``proxy_prefix + urllib.parse.quote(absolute_url)``.

    Left untouched:
      - values that already start with ``proxy_prefix``;
      - fragment-only links and ``javascript:``, ``mailto:``, ``data:``,
        ``tel:``, ``blob:`` and ``about:`` values;
      - the ``href`` of a ``<base>`` element, so URLs resolved by the browser
        at runtime keep resolving against the target site.

    ``srcset`` is handled as a list of ``URL descriptor`` candidates; each URL
    is rewritten individually and its descriptor is preserved.

    Args:
        soup: Parsed document exposing ``find_all``; modified in place.
        target_url: URL the document was fetched from.
        proxy_prefix: Prefix prepended to each percent-encoded absolute URL.

    Returns:
        The same ``soup`` object, for call chaining.
    """
    for element in soup.find_all(True):
        if getattr(element, "name", None) == "base":
            continue
        for attr in ("href", "src", "action", "srcset"):
            value = element.get(attr)
            # Missing, empty or non-string values carry no single URL to rewrite.
            if not value or not isinstance(value, str):
                continue
            if attr == "srcset":
                rewritten = _rewrite_srcset(value, target_url, proxy_prefix)
            else:
                rewritten = _proxy_url(value, target_url, proxy_prefix)
            if rewritten != value:
                element[attr] = rewritten
    return soup
|
| 116 |
+
|
| 117 |
+
# ---------------------------------------------------------------------------
|
| 118 |
+
# Core Function: Fetch and Rewrite the Target HTML
|
| 119 |
+
# ---------------------------------------------------------------------------
|
| 120 |
async def fetch_and_rewrite(target_url: str) -> Response:
|
| 121 |
+
"""
|
| 122 |
+
Asynchronously fetches the target URL, then parses and rewrites its HTML:
|
| 123 |
+
- Removes conflicting Content Security Policy meta tags.
|
| 124 |
+
- Inserts a <base> tag to resolve relative URLs.
|
| 125 |
+
- Injects JavaScript to intercept dynamic navigation and update DOM in real time.
|
| 126 |
+
- Rewrites URL attributes for a comprehensive set of tags.
|
| 127 |
+
"""
|
| 128 |
async with httpx.AsyncClient() as client:
|
| 129 |
resp = await client.get(target_url)
|
|
|
|
| 130 |
|
| 131 |
+
content_type = resp.headers.get("Content-Type", "")
|
| 132 |
if "text/html" not in content_type:
|
| 133 |
+
# For non-HTML content, return the response directly.
|
| 134 |
return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
|
| 135 |
+
|
|
|
|
| 136 |
soup = BeautifulSoup(resp.text, "html.parser")
|
| 137 |
+
|
| 138 |
+
# Remove Content Security Policy meta tags that might block our injected scripts.
|
| 139 |
for meta in soup.find_all("meta", attrs={"http-equiv": "Content-Security-Policy"}):
|
| 140 |
meta.decompose()
|
| 141 |
+
|
| 142 |
+
# Insert (or replace) a <base> tag for proper resolution of relative URLs.
|
|
|
|
| 143 |
parsed_target = urllib.parse.urlparse(target_url)
|
| 144 |
base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
|
| 145 |
if soup.head:
|
|
|
|
| 146 |
for base in soup.head.find_all("base"):
|
| 147 |
base.decompose()
|
| 148 |
base_tag = soup.new_tag("base", href=base_href)
|
|
|
|
| 152 |
base_tag = soup.new_tag("base", href=base_href)
|
| 153 |
head_tag.insert(0, base_tag)
|
| 154 |
soup.insert(0, head_tag)
|
| 155 |
+
|
| 156 |
+
# Inject our dynamic JavaScript into the beginning of the <body>.
|
|
|
|
| 157 |
if soup.body:
|
| 158 |
soup.body.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
|
| 159 |
else:
|
| 160 |
soup.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
|
| 161 |
+
|
| 162 |
+
# Rewrite URLs for all relevant elements.
|
| 163 |
+
soup = rewrite_urls(soup, target_url)
|
| 164 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
|
| 166 |
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
# Simple Proxy Endpoint: /proxy_full
|
| 169 |
+
# ---------------------------------------------------------------------------
|
| 170 |
+
@app.get("/proxy_full")
async def proxy_full(url: str):
    """
    Fetch ``url`` and return it with resource URLs routed through the proxy.

    Non-HTML responses are passed through with the upstream content type and
    status code.  HTML responses are parsed, run through ``rewrite_urls`` and
    returned with the upstream status code.

    Args:
        url: Target URL, taken from the ``url`` query parameter.

    Raises:
        HTTPException: 400 if ``url`` is empty; 502 if the target cannot be
            fetched (invalid URL, connection failure, timeout, ...).
    """
    if not url:
        raise HTTPException(status_code=400, detail="Missing 'url' query parameter")

    # NOTE(review): ``url`` is caller-controlled and no host allow-list is
    # applied here, so this endpoint can reach any address the server can.
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.get(url)
    except (httpx.HTTPError, httpx.InvalidURL) as exc:
        # Report upstream failures as a gateway error instead of a bare 500.
        raise HTTPException(
            status_code=502, detail=f"Failed to fetch target URL: {exc}"
        ) from exc

    content_type = resp.headers.get("Content-Type", "")
    if "text/html" not in content_type:
        return Response(resp.content, media_type=content_type, status_code=resp.status_code)

    soup = BeautifulSoup(resp.text, "html.parser")
    soup = rewrite_urls(soup, url)

    # Propagate the upstream status so error pages are not reported as 200.
    return Response(str(soup), media_type="text/html", status_code=resp.status_code)
|
| 190 |
+
|
| 191 |
+
# ---------------------------------------------------------------------------
|
| 192 |
+
# Catch-All Dynamic Proxy Endpoint
|
| 193 |
+
# ---------------------------------------------------------------------------
|
| 194 |
@app.get("/{full_path:path}")
|
| 195 |
async def catch_all(full_path: str, request: Request):
|
| 196 |
+
"""
|
| 197 |
+
Catch-all endpoint for dynamic proxying.
|
| 198 |
+
Determines the target URL via a query parameter or a stored cookie.
|
| 199 |
+
Processes the HTML to inject dynamic JavaScript and rewrite URLs.
|
| 200 |
+
"""
|
| 201 |
query_params = dict(request.query_params)
|
| 202 |
if "url" in query_params:
|
| 203 |
target_url = query_params["url"]
|
|
|
|
| 211 |
target_url += "?" + qs
|
| 212 |
|
| 213 |
response = await fetch_and_rewrite(target_url)
|
| 214 |
+
|
| 215 |
+
# Save the target's base URL in a cookie for subsequent requests.
|
| 216 |
parsed_target = urllib.parse.urlparse(target_url)
|
| 217 |
base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
|
| 218 |
response.set_cookie("target_base", base_url)
|
| 219 |
+
|
| 220 |
return response
|
| 221 |
+
|
| 222 |
+
# ---------------------------------------------------------------------------
|
| 223 |
+
# Run the Application on Port 7860
|
| 224 |
+
# ---------------------------------------------------------------------------
|
| 225 |
+
if __name__ == "__main__":
    # uvicorn is only needed when this file is launched directly as a script,
    # so it is imported here rather than at module import time.
    from uvicorn import run as serve

    # Listen on all interfaces, port 7860.
    serve(app, host="0.0.0.0", port=7860)
|