triflix committed on
Commit
f459e61
·
verified ·
1 Parent(s): 365d093

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +67 -29
main.py CHANGED
@@ -5,23 +5,21 @@ import urllib.parse
5
 
6
  app = FastAPI()
7
 
8
- # JavaScript code that will be injected into proxied HTML pages.
9
- # It intercepts dynamic URL changes and client-side network requests.
10
  INJECTED_JS = """
11
  <script>
12
- // Intercept History API to rewrite URLs for proxying
13
  (function() {
 
14
  const originalPushState = history.pushState;
15
  history.pushState = function(state, title, url) {
16
  if (url) {
17
- // Rewrite the URL to use our proxy endpoint
18
- const proxiedUrl = '/proxy_full?url=' + encodeURIComponent(url);
19
  return originalPushState.call(history, state, title, proxiedUrl);
20
  }
21
  return originalPushState.call(history, state, title, url);
22
  };
23
 
24
- // Intercept fetch to ensure all dynamic requests go through the proxy
25
  const originalFetch = window.fetch;
26
  window.fetch = function(input, init) {
27
  let url;
@@ -32,7 +30,7 @@ INJECTED_JS = """
32
  } else {
33
  return originalFetch(input, init);
34
  }
35
- const proxiedUrl = '/proxy_full?url=' + encodeURIComponent(url);
36
  if (typeof input === 'object') {
37
  input = new Request(proxiedUrl, input);
38
  } else {
@@ -41,42 +39,54 @@ INJECTED_JS = """
41
  return originalFetch(input, init);
42
  };
43
 
44
- // Intercept XMLHttpRequest open() method to proxy XHR requests
45
  const originalOpen = XMLHttpRequest.prototype.open;
46
  XMLHttpRequest.prototype.open = function(method, url) {
47
- const proxiedUrl = '/proxy_full?url=' + encodeURIComponent(url);
48
  return originalOpen.apply(this, [method, proxiedUrl, true]);
49
  };
50
  })();
51
  </script>
52
  """
53
 
54
- @app.get("/proxy_full")
55
- async def proxy_full(url: str):
56
- if not url:
57
- raise HTTPException(status_code=400, detail="Missing 'url' query parameter")
58
-
59
- # Fetch the target URL using an async HTTP client.
60
  async with httpx.AsyncClient() as client:
61
- resp = await client.get(url)
62
-
63
  content_type = resp.headers.get("Content-Type", "")
64
- # For non-HTML content (images, CSS, JS, etc.), simply return the response.
 
65
  if "text/html" not in content_type:
66
  return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
67
 
68
- # Parse HTML with BeautifulSoup
69
  soup = BeautifulSoup(resp.text, "html.parser")
70
 
71
- # Inject our JavaScript to intercept client-side navigation and network calls.
 
 
 
72
  if soup.head:
73
- soup.head.append(BeautifulSoup(INJECTED_JS, "html.parser"))
74
- elif soup.body:
 
 
 
 
 
 
 
 
 
 
 
 
75
  soup.body.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
76
  else:
77
  soup.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
78
 
79
- # Rewrite URLs for key elements so that they go through the proxy.
 
80
  tags_attrs = {
81
  "a": "href",
82
  "img": "src",
@@ -87,12 +97,40 @@ async def proxy_full(url: str):
87
  for tag, attr in tags_attrs.items():
88
  for element in soup.find_all(tag):
89
  if element.has_attr(attr):
90
- original = element[attr]
91
- # Skip if already proxied
92
- if original.startswith("/proxy_full?url="):
93
  continue
94
- # Build an absolute URL and rewrite it to include our proxy
95
- new_url = urllib.parse.urljoin(url, original)
96
- element[attr] = "/proxy_full?url=" + urllib.parse.quote(new_url)
 
97
 
98
  return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  app = FastAPI()
7
 
8
+ # JavaScript injected into HTML to intercept dynamic navigation and AJAX calls.
 
9
  INJECTED_JS = """
10
  <script>
 
11
  (function() {
12
+ // Intercept history.pushState so dynamic URL changes use our proxy.
13
  const originalPushState = history.pushState;
14
  history.pushState = function(state, title, url) {
15
  if (url) {
16
+ const proxiedUrl = '/?url=' + encodeURIComponent(url);
 
17
  return originalPushState.call(history, state, title, proxiedUrl);
18
  }
19
  return originalPushState.call(history, state, title, url);
20
  };
21
 
22
+ // Intercept fetch() requests to route them through our proxy.
23
  const originalFetch = window.fetch;
24
  window.fetch = function(input, init) {
25
  let url;
 
30
  } else {
31
  return originalFetch(input, init);
32
  }
33
+ const proxiedUrl = '/?url=' + encodeURIComponent(url);
34
  if (typeof input === 'object') {
35
  input = new Request(proxiedUrl, input);
36
  } else {
 
39
  return originalFetch(input, init);
40
  };
41
 
42
+ // Intercept XMLHttpRequest open() calls.
43
  const originalOpen = XMLHttpRequest.prototype.open;
44
  XMLHttpRequest.prototype.open = function(method, url) {
45
+ const proxiedUrl = '/?url=' + encodeURIComponent(url);
46
  return originalOpen.apply(this, [method, proxiedUrl, true]);
47
  };
48
  })();
49
  </script>
50
  """
51
 
52
+ async def fetch_and_rewrite(target_url: str) -> Response:
 
 
 
 
 
53
  async with httpx.AsyncClient() as client:
54
+ # Fetch the target URL.
55
+ resp = await client.get(target_url)
56
  content_type = resp.headers.get("Content-Type", "")
57
+
58
+ # If not HTML (CSS, JS, images, etc.), return content directly.
59
  if "text/html" not in content_type:
60
  return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
61
 
62
+ # Parse HTML content.
63
  soup = BeautifulSoup(resp.text, "html.parser")
64
 
65
+ # --- Inject a <base> Tag ---
66
+ # This makes sure that all relative URLs in the HTML resolve correctly.
67
+ parsed_target = urllib.parse.urlparse(target_url)
68
+ base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
69
  if soup.head:
70
+ # Remove any existing <base> tags.
71
+ for base in soup.head.find_all('base'):
72
+ base.decompose()
73
+ base_tag = soup.new_tag("base", href=base_href)
74
+ soup.head.insert(0, base_tag)
75
+ else:
76
+ head_tag = soup.new_tag("head")
77
+ base_tag = soup.new_tag("base", href=base_href)
78
+ head_tag.insert(0, base_tag)
79
+ soup.insert(0, head_tag)
80
+
81
+ # --- Inject JavaScript for Dynamic Routing ---
82
+ # Place the JS in the <body> (or at the top if no body tag exists).
83
+ if soup.body:
84
  soup.body.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
85
  else:
86
  soup.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
87
 
88
+ # --- Rewrite Resource URLs ---
89
+ # Update links, images, scripts, CSS links, and form actions to route via the proxy.
90
  tags_attrs = {
91
  "a": "href",
92
  "img": "src",
 
97
  for tag, attr in tags_attrs.items():
98
  for element in soup.find_all(tag):
99
  if element.has_attr(attr):
100
+ orig = element[attr]
101
+ # Skip javascript: or mailto: links.
102
+ if orig.startswith("javascript:") or orig.startswith("mailto:"):
103
  continue
104
+ # Build an absolute URL using the target URL as base.
105
+ new_url = urllib.parse.urljoin(target_url, orig)
106
+ # Route it through the proxy.
107
+ element[attr] = "/?url=" + urllib.parse.quote(new_url)
108
 
109
  return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
110
+
111
# Catch-all route to handle any path.
@app.get("/{full_path:path}")
async def catch_all(full_path: str, request: Request):
    """Proxy the request to the target site and return the rewritten response.

    The target is chosen in one of two ways:

    * If a ``url`` query parameter is present, it is used as the target URL.
    * Otherwise the target is rebuilt from the ``target_base`` cookie, the
      requested path and the original query string.

    Returns a 400 response when no target can be determined or the target is
    not an absolute http(s) URL, a 502 response when the upstream request
    fails, and otherwise the response produced by ``fetch_and_rewrite``.
    """
    if "url" in request.query_params:
        target_url = request.query_params["url"]
    else:
        # No explicit target: rebuild it from the base stored in the cookie.
        target_base = request.cookies.get("target_base")
        if not target_base:
            return Response("No target URL provided.", status_code=400)
        # Plain concatenation instead of urljoin(): a path such as
        # "//other.host/x" or "http://other.host/x" would make urljoin()
        # discard the stored base and point the proxy at a different host.
        target_url = target_base.rstrip("/") + "/" + full_path.lstrip("/")
        qs = request.url.query  # Preserve any query string.
        if qs:
            target_url += "?" + qs

    # Reject anything that is not an absolute http(s) URL up front, so a bad
    # "url" parameter or cookie yields a 400 instead of an unhandled error.
    parsed_target = urllib.parse.urlparse(target_url)
    if parsed_target.scheme not in ("http", "https") or not parsed_target.netloc:
        return Response(
            "Target URL must be an absolute http(s) URL.", status_code=400
        )

    try:
        response = await fetch_and_rewrite(target_url)
    except httpx.RequestError as exc:
        # The upstream could not be reached (DNS failure, timeout, refused
        # connection, ...): report a gateway error instead of crashing.
        return Response(f"Upstream request failed: {exc}", status_code=502)

    # Remember the base URL (scheme + host) for later relative requests, but
    # only when an HTML page was served. Sub-resources fetched from other
    # hosts (CDN scripts, images, ...) must not overwrite the page's base.
    if "text/html" in (response.media_type or ""):
        base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
        response.set_cookie("target_base", base_url)

    return response