triflix committed on
Commit
f3f8bd0
·
verified ·
1 Parent(s): 2100890

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +101 -55
main.py CHANGED
@@ -1,15 +1,39 @@
 
 
1
  from fastapi import FastAPI, Request, HTTPException, Response
2
  import httpx
3
  from bs4 import BeautifulSoup
4
- import urllib.parse
5
 
6
  app = FastAPI()
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # ---------------------------------------------------------------------------
9
  # Injected JavaScript for Dynamic Interception & Real-Time Updates
10
  # ---------------------------------------------------------------------------
11
- # This script intercepts client-side navigation (history, fetch, XHR, anchor clicks)
12
- # and uses MutationObserver to reprocess dynamically added DOM nodes.
 
13
  INJECTED_JS = """
14
  <script>
15
  (function() {
@@ -76,7 +100,7 @@ INJECTED_JS = """
76
 
77
  // Rewrites URL attributes for an element and its children.
78
  function reProxyElement(element) {
79
- const urlAttrs = ['href', 'src', 'action', 'srcset'];
80
  urlAttrs.forEach(function(attr) {
81
  if (element.hasAttribute(attr)) {
82
  const value = element.getAttribute(attr);
@@ -86,6 +110,16 @@ INJECTED_JS = """
86
  }
87
  }
88
  });
 
 
 
 
 
 
 
 
 
 
89
  Array.from(element.children).forEach(child => reProxyElement(child));
90
  }
91
  })();
@@ -93,76 +127,87 @@ INJECTED_JS = """
93
  """
94
 
95
  # ---------------------------------------------------------------------------
96
- # Helper Function: Rewrite URLs for All Relevant Attributes
97
  # ---------------------------------------------------------------------------
98
  def rewrite_urls(soup, target_url, proxy_prefix="/?url="):
99
  """
100
  Iterates over all elements in the parsed HTML and rewrites URL-like attributes.
101
- Supports attributes such as: href, src, action, srcset.
 
102
  """
103
- url_attrs = ['href', 'src', 'action', 'srcset']
104
  for element in soup.find_all(True):
 
105
  for attr in list(element.attrs):
106
  if attr in url_attrs:
107
  orig_value = element.get(attr)
108
- # Skip already proxied or special schemes.
109
  if not orig_value or orig_value.startswith(proxy_prefix) or orig_value.startswith(("mailto:", "javascript:")):
110
  continue
111
- # Resolve relative URLs.
112
  new_url = urllib.parse.urljoin(target_url, orig_value)
113
- # Rewrite URL to be loaded via the proxy.
114
  element[attr] = proxy_prefix + urllib.parse.quote(new_url)
 
 
 
 
 
115
  return soup
116
 
117
  # ---------------------------------------------------------------------------
118
- # Core Function: Fetch and Rewrite the Target HTML
119
  # ---------------------------------------------------------------------------
120
  async def fetch_and_rewrite(target_url: str) -> Response:
121
  """
122
- Asynchronously fetches the target URL, then parses and rewrites its HTML:
123
- - Removes conflicting Content Security Policy meta tags.
124
- - Inserts a <base> tag to resolve relative URLs.
125
- - Injects JavaScript to intercept dynamic navigation and update DOM in real time.
126
- - Rewrites URL attributes for a comprehensive set of tags.
127
  """
128
  async with httpx.AsyncClient() as client:
129
  resp = await client.get(target_url)
130
 
131
  content_type = resp.headers.get("Content-Type", "")
132
- if "text/html" not in content_type:
133
- # For non-HTML content, return the response directly.
134
- return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
135
 
136
- soup = BeautifulSoup(resp.text, "html.parser")
 
 
 
137
 
138
- # Remove Content Security Policy meta tags that might block our injected scripts.
139
- for meta in soup.find_all("meta", attrs={"http-equiv": "Content-Security-Policy"}):
140
- meta.decompose()
141
 
142
- # Insert (or replace) a <base> tag for proper resolution of relative URLs.
143
- parsed_target = urllib.parse.urlparse(target_url)
144
- base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
145
- if soup.head:
146
- for base in soup.head.find_all("base"):
147
- base.decompose()
148
- base_tag = soup.new_tag("base", href=base_href)
149
- soup.head.insert(0, base_tag)
150
- else:
151
- head_tag = soup.new_tag("head")
152
- base_tag = soup.new_tag("base", href=base_href)
153
- head_tag.insert(0, base_tag)
154
- soup.insert(0, head_tag)
155
 
156
- # Inject our dynamic JavaScript into the beginning of the <body>.
157
- if soup.body:
158
- soup.body.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
159
- else:
160
- soup.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- # Rewrite URLs for all relevant elements.
163
- soup = rewrite_urls(soup, target_url)
164
 
165
- return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
 
 
 
166
 
167
  # ---------------------------------------------------------------------------
168
  # Simple Proxy Endpoint: /proxy_full
@@ -170,8 +215,7 @@ async def fetch_and_rewrite(target_url: str) -> Response:
170
  @app.get("/proxy_full")
171
  async def proxy_full(url: str):
172
  """
173
- A simple proxy endpoint that fetches the given URL and rewrites
174
- its HTML so that all resource URLs route through the proxy.
175
  """
176
  if not url:
177
  raise HTTPException(status_code=400, detail="Missing 'url' query parameter")
@@ -180,13 +224,15 @@ async def proxy_full(url: str):
180
  resp = await client.get(url)
181
 
182
  content_type = resp.headers.get("Content-Type", "")
183
- if "text/html" not in content_type:
184
- return Response(resp.content, media_type=content_type, status_code=resp.status_code)
185
-
186
- soup = BeautifulSoup(resp.text, "html.parser")
187
- soup = rewrite_urls(soup, url)
 
 
188
 
189
- return Response(str(soup), media_type="text/html")
190
 
191
  # ---------------------------------------------------------------------------
192
  # Catch-All Dynamic Proxy Endpoint
@@ -195,8 +241,8 @@ async def proxy_full(url: str):
195
  async def catch_all(full_path: str, request: Request):
196
  """
197
  Catch-all endpoint for dynamic proxying.
198
- Determines the target URL via a query parameter or a stored cookie.
199
- Processes the HTML to inject dynamic JavaScript and rewrite URLs.
200
  """
201
  query_params = dict(request.query_params)
202
  if "url" in query_params:
@@ -212,7 +258,7 @@ async def catch_all(full_path: str, request: Request):
212
 
213
  response = await fetch_and_rewrite(target_url)
214
 
215
- # Save the target's base URL in a cookie for subsequent requests.
216
  parsed_target = urllib.parse.urlparse(target_url)
217
  base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
218
  response.set_cookie("target_base", base_url)
 
1
+ import re
2
+ import urllib.parse
3
  from fastapi import FastAPI, Request, HTTPException, Response
4
  import httpx
5
  from bs4 import BeautifulSoup
 
6
 
7
  app = FastAPI()
8
 
9
+ # ---------------------------------------------------------------------------
10
+ # Utility: Rewrite CSS URLs inside CSS text
11
+ # ---------------------------------------------------------------------------
12
def rewrite_css(css_text: str, target_url: str, proxy_prefix="/?url=") -> str:
    """
    Rewrite every url(...) reference inside a CSS document so the referenced
    resource is fetched through the proxy.

    Args:
        css_text: Raw CSS source to transform.
        target_url: Absolute URL the CSS was served from; relative references
            are resolved against it.
        proxy_prefix: Prefix that routes a URL through the proxy endpoint.

    Returns:
        The CSS text with each url(...) rewritten to proxy_prefix plus the
        percent-encoded absolute URL. Already-proxied URLs, data: URIs and
        fragment-only references (e.g. url(#blur), which point at in-document
        SVG elements and must not be fetched) are left untouched.
    """
    # Matches: url('...'), url("..."), or url(...)
    pattern = re.compile(r'url\(\s*(?P<quote>["\']?)(?P<url>[^"\')]+)(?P=quote)\s*\)')

    def replace_url(match):
        original_url = match.group("url")
        # Skip references that must not be proxied: already-proxied URLs,
        # inline data: URIs, and same-document fragments such as url(#clip).
        if original_url.startswith((proxy_prefix, "data:", "#")):
            return match.group(0)
        # Resolve relative references against the stylesheet's own URL.
        new_url = urllib.parse.urljoin(target_url, original_url)
        quote = match.group("quote")
        return f'url({quote}{proxy_prefix}{urllib.parse.quote(new_url)}{quote})'

    return pattern.sub(replace_url, css_text)
30
+
31
  # ---------------------------------------------------------------------------
32
  # Injected JavaScript for Dynamic Interception & Real-Time Updates
33
  # ---------------------------------------------------------------------------
34
+ # This script intercepts history changes, fetch, XHR, and anchor clicks,
35
+ # and uses MutationObserver to rewrite new elements. It also handles SVG
36
+ # attributes (including xlink:href) for icons and logos.
37
  INJECTED_JS = """
38
  <script>
39
  (function() {
 
100
 
101
  // Rewrites URL attributes for an element and its children.
102
  function reProxyElement(element) {
103
+ const urlAttrs = ['href', 'src', 'action', 'srcset', 'xlink:href'];
104
  urlAttrs.forEach(function(attr) {
105
  if (element.hasAttribute(attr)) {
106
  const value = element.getAttribute(attr);
 
110
  }
111
  }
112
  });
113
+ // Also rewrite inline style attribute.
114
+ if (element.hasAttribute('style')) {
115
+ let styleVal = element.getAttribute('style');
116
+ // Simple client-side rewriting: prepend proxy to URLs.
117
+ styleVal = styleVal.replace(/url\\((['"]?)(.*?)\\1\\)/g, function(match, quote, url) {
118
+ if (url.startsWith('/?url=') || url.startsWith('data:')) return match;
119
+ return "url(" + quote + "/?url=" + encodeURIComponent(url) + quote + ")";
120
+ });
121
+ element.setAttribute('style', styleVal);
122
+ }
123
  Array.from(element.children).forEach(child => reProxyElement(child));
124
  }
125
  })();
 
127
  """
128
 
129
  # ---------------------------------------------------------------------------
130
+ # Helper Function: Rewrite URLs for All Relevant Attributes in HTML
131
  # ---------------------------------------------------------------------------
132
def rewrite_urls(soup, target_url, proxy_prefix="/?url="):
    """
    Rewrite URL-bearing attributes on every element of a parsed HTML tree
    so the referenced resources load through the proxy.

    Handles href, src, action, srcset and xlink:href attributes, plus CSS
    url(...) references inside inline style attributes. srcset values are
    comma-separated "<url> <descriptor>" candidates, so each candidate URL
    is rewritten individually instead of treating the whole value as one URL.

    Args:
        soup: BeautifulSoup document (mutated in place).
        target_url: Absolute URL the document was served from; relative
            references are resolved against it.
        proxy_prefix: Prefix that routes a URL through the proxy endpoint.

    Returns:
        The same soup object, after mutation.
    """
    url_attrs = ['href', 'src', 'action', 'srcset', 'xlink:href']
    for element in soup.find_all(True):
        for attr in list(element.attrs):
            if attr not in url_attrs:
                continue
            orig_value = element.get(attr)
            # Skip empty values, already-proxied URLs, non-fetchable schemes,
            # and same-document fragment links (e.g. href="#top").
            if not orig_value or orig_value.startswith(proxy_prefix) or orig_value.startswith(("mailto:", "javascript:", "data:", "#")):
                continue
            if attr == 'srcset':
                # srcset holds a candidate list, not a single URL.
                element[attr] = _rewrite_srcset(orig_value, target_url, proxy_prefix)
            else:
                new_url = urllib.parse.urljoin(target_url, orig_value)
                element[attr] = proxy_prefix + urllib.parse.quote(new_url)
        # Rewrite CSS url() references inside inline style attributes.
        if element.has_attr("style"):
            element["style"] = rewrite_css(element.get("style"), target_url, proxy_prefix)
    return soup


def _rewrite_srcset(value, target_url, proxy_prefix):
    """Rewrite each "<url> [descriptor]" candidate of a srcset value."""
    candidates = []
    for candidate in value.split(","):
        candidate = candidate.strip()
        if not candidate:
            continue
        pieces = candidate.split(None, 1)
        absolute = urllib.parse.urljoin(target_url, pieces[0])
        rewritten = proxy_prefix + urllib.parse.quote(absolute)
        if len(pieces) > 1:
            rewritten += " " + pieces[1]
        candidates.append(rewritten)
    return ", ".join(candidates)
154
 
155
  # ---------------------------------------------------------------------------
156
+ # Core Function: Fetch and Rewrite the Target Resource
157
  # ---------------------------------------------------------------------------
158
async def fetch_and_rewrite(target_url: str) -> Response:
    """
    Fetch target_url and adapt the response for serving through the proxy.

    - HTML: strips CSP <meta> tags that would block the injected script,
      inserts a <base> tag so relative URLs resolve against the origin,
      injects INJECTED_JS for client-side interception, and rewrites all
      URL attributes via rewrite_urls().
    - CSS: rewrites url(...) references via rewrite_css().
    - Anything else (images, JS, fonts, ...) is returned unmodified.

    Args:
        target_url: Absolute URL to fetch.

    Returns:
        A FastAPI Response carrying the (possibly rewritten) payload with
        the upstream status code.
    """
    # follow_redirects=True: httpx does NOT follow redirects by default, so
    # redirecting targets would otherwise yield the bare 301/302 payload.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        resp = await client.get(target_url)

    # After redirects, relative URLs must resolve against the final URL,
    # not the originally requested one.
    final_url = str(resp.url)
    content_type = resp.headers.get("Content-Type", "")

    # CSS: rewrite url() references so nested resources are proxied too.
    if "text/css" in content_type:
        return Response(content=rewrite_css(resp.text, final_url),
                        media_type="text/css", status_code=resp.status_code)

    # Non-HTML, non-CSS content passes through untouched.
    if "text/html" not in content_type:
        return Response(content=resp.content, media_type=content_type,
                        status_code=resp.status_code)

    soup = BeautifulSoup(resp.text, "html.parser")

    # Remove CSP meta tags that could block our injected script.
    for meta in soup.find_all("meta", attrs={"http-equiv": "Content-Security-Policy"}):
        meta.decompose()

    # Insert (or replace) a <base> tag so relative URLs resolve properly.
    parsed_target = urllib.parse.urlparse(final_url)
    base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
    base_tag = soup.new_tag("base", href=base_href)
    if soup.head:
        for base in soup.head.find_all("base"):
            base.decompose()
        soup.head.insert(0, base_tag)
    else:
        head_tag = soup.new_tag("head")
        head_tag.insert(0, base_tag)
        soup.insert(0, head_tag)

    # Inject the dynamic-interception JavaScript at the top of <body>
    # (or the document root when no <body> exists).
    injection = BeautifulSoup(INJECTED_JS, "html.parser")
    if soup.body:
        soup.body.insert(0, injection)
    else:
        soup.insert(0, injection)

    # Rewrite URLs (SVG icons, inline-style backgrounds, srcset, etc.).
    soup = rewrite_urls(soup, final_url)
    return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
211
 
212
  # ---------------------------------------------------------------------------
213
  # Simple Proxy Endpoint: /proxy_full
 
215
  @app.get("/proxy_full")
216
  async def proxy_full(url: str):
217
  """
218
+ A simple proxy endpoint that fetches the given URL and rewrites its HTML/CSS so that all resource URLs route via the proxy.
 
219
  """
220
  if not url:
221
  raise HTTPException(status_code=400, detail="Missing 'url' query parameter")
 
224
  resp = await client.get(url)
225
 
226
  content_type = resp.headers.get("Content-Type", "")
227
+ if "text/html" in content_type:
228
+ soup = BeautifulSoup(resp.text, "html.parser")
229
+ soup = rewrite_urls(soup, url)
230
+ return Response(content=str(soup), media_type="text/html")
231
+ elif "text/css" in content_type:
232
+ new_css = rewrite_css(resp.text, url)
233
+ return Response(content=new_css, media_type="text/css")
234
 
235
+ return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
236
 
237
  # ---------------------------------------------------------------------------
238
  # Catch-All Dynamic Proxy Endpoint
 
241
  async def catch_all(full_path: str, request: Request):
242
  """
243
  Catch-all endpoint for dynamic proxying.
244
+ Determines the target URL via a query parameter or a stored cookie,
245
+ then processes the resource (HTML or CSS) to inject dynamic JS and rewrite URLs.
246
  """
247
  query_params = dict(request.query_params)
248
  if "url" in query_params:
 
258
 
259
  response = await fetch_and_rewrite(target_url)
260
 
261
+ # Store the target's base URL in a cookie for subsequent requests.
262
  parsed_target = urllib.parse.urlparse(target_url)
263
  base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
264
  response.set_cookie("target_base", base_url)