triflix committed on
Commit
a0e40dd
·
verified ·
1 Parent(s): f459e61

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +39 -30
main.py CHANGED
@@ -5,11 +5,11 @@ import urllib.parse
5
 
6
  app = FastAPI()
7
 
8
- # JavaScript injected into HTML to intercept dynamic navigation and AJAX calls.
9
  INJECTED_JS = """
10
  <script>
 
11
  (function() {
12
- // Intercept history.pushState so dynamic URL changes use our proxy.
13
  const originalPushState = history.pushState;
14
  history.pushState = function(state, title, url) {
15
  if (url) {
@@ -19,7 +19,7 @@ INJECTED_JS = """
19
  return originalPushState.call(history, state, title, url);
20
  };
21
 
22
- // Intercept fetch() requests to route them through our proxy.
23
  const originalFetch = window.fetch;
24
  window.fetch = function(input, init) {
25
  let url;
@@ -39,36 +39,52 @@ INJECTED_JS = """
39
  return originalFetch(input, init);
40
  };
41
 
42
- // Intercept XMLHttpRequest open() calls.
43
  const originalOpen = XMLHttpRequest.prototype.open;
44
  XMLHttpRequest.prototype.open = function(method, url) {
45
  const proxiedUrl = '/?url=' + encodeURIComponent(url);
46
  return originalOpen.apply(this, [method, proxiedUrl, true]);
47
  };
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  })();
49
  </script>
50
  """
51
 
52
  async def fetch_and_rewrite(target_url: str) -> Response:
53
  async with httpx.AsyncClient() as client:
54
- # Fetch the target URL.
55
  resp = await client.get(target_url)
56
  content_type = resp.headers.get("Content-Type", "")
57
 
58
- # If not HTML (CSS, JS, images, etc.), return content directly.
59
  if "text/html" not in content_type:
60
  return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
61
-
62
- # Parse HTML content.
63
  soup = BeautifulSoup(resp.text, "html.parser")
64
-
65
- # --- Inject a <base> Tag ---
66
- # This makes sure that all relative URLs in the HTML resolve correctly.
 
 
 
 
67
  parsed_target = urllib.parse.urlparse(target_url)
68
  base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
69
  if soup.head:
70
  # Remove any existing <base> tags.
71
- for base in soup.head.find_all('base'):
72
  base.decompose()
73
  base_tag = soup.new_tag("base", href=base_href)
74
  soup.head.insert(0, base_tag)
@@ -77,16 +93,16 @@ async def fetch_and_rewrite(target_url: str) -> Response:
77
  base_tag = soup.new_tag("base", href=base_href)
78
  head_tag.insert(0, base_tag)
79
  soup.insert(0, head_tag)
80
-
81
  # --- Inject JavaScript for Dynamic Routing ---
82
- # Place the JS in the <body> (or at the top if no body tag exists).
83
  if soup.body:
84
  soup.body.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
85
  else:
86
  soup.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
87
-
88
  # --- Rewrite Resource URLs ---
89
- # Update links, images, scripts, CSS links, and form actions to route via the proxy.
90
  tags_attrs = {
91
  "a": "href",
92
  "img": "src",
@@ -98,39 +114,32 @@ async def fetch_and_rewrite(target_url: str) -> Response:
98
  for element in soup.find_all(tag):
99
  if element.has_attr(attr):
100
  orig = element[attr]
101
- # Skip javascript: or mailto: links.
102
- if orig.startswith("javascript:") or orig.startswith("mailto:"):
103
  continue
104
- # Build an absolute URL using the target URL as base.
105
  new_url = urllib.parse.urljoin(target_url, orig)
106
- # Route it through the proxy.
107
  element[attr] = "/?url=" + urllib.parse.quote(new_url)
108
-
109
  return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
110
 
111
- # Catch-all route to handle any path.
112
  @app.get("/{full_path:path}")
113
  async def catch_all(full_path: str, request: Request):
114
  query_params = dict(request.query_params)
115
-
116
- # If a "url" query parameter is provided, this is the initial load.
117
  if "url" in query_params:
118
  target_url = query_params["url"]
119
  else:
120
- # Otherwise, try to rebuild the target URL using a stored cookie.
121
  target_base = request.cookies.get("target_base")
122
  if not target_base:
123
  return Response("No target URL provided.", status_code=400)
124
- qs = request.url.query # Preserve any query string.
125
  target_url = urllib.parse.urljoin(target_base, full_path)
126
  if qs:
127
  target_url += "?" + qs
128
-
129
  response = await fetch_and_rewrite(target_url)
130
-
131
- # Store the base URL (scheme + host) in a cookie for subsequent requests.
132
  parsed_target = urllib.parse.urlparse(target_url)
133
  base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
134
  response.set_cookie("target_base", base_url)
135
-
136
  return response
 
5
 
6
  app = FastAPI()
7
 
8
+ # Injected JavaScript now also intercepts anchor clicks.
9
  INJECTED_JS = """
10
  <script>
11
+ // Intercept history.pushState so dynamic URL changes are routed through the proxy.
12
  (function() {
 
13
  const originalPushState = history.pushState;
14
  history.pushState = function(state, title, url) {
15
  if (url) {
 
19
  return originalPushState.call(history, state, title, url);
20
  };
21
 
22
+ // Intercept fetch() requests.
23
  const originalFetch = window.fetch;
24
  window.fetch = function(input, init) {
25
  let url;
 
39
  return originalFetch(input, init);
40
  };
41
 
42
+ // Intercept XMLHttpRequest.open().
43
  const originalOpen = XMLHttpRequest.prototype.open;
44
  XMLHttpRequest.prototype.open = function(method, url) {
45
  const proxiedUrl = '/?url=' + encodeURIComponent(url);
46
  return originalOpen.apply(this, [method, proxiedUrl, true]);
47
  };
48
+
49
+ // Intercept anchor clicks to keep navigation within the proxy.
50
+ document.addEventListener('click', function(event) {
51
+ const target = event.target.closest('a');
52
+ if (target && target.href) {
53
+ // Skip if already proxied or if special attributes exist.
54
+ if (target.getAttribute('data-no-proxy') || target.href.indexOf('/?url=') === 0) {
55
+ return;
56
+ }
57
+ event.preventDefault();
58
+ window.location.href = '/?url=' + encodeURIComponent(target.href);
59
+ }
60
+ });
61
  })();
62
  </script>
63
  """
64
 
65
  async def fetch_and_rewrite(target_url: str) -> Response:
66
  async with httpx.AsyncClient() as client:
 
67
  resp = await client.get(target_url)
68
  content_type = resp.headers.get("Content-Type", "")
69
 
70
+ # For non-HTML resources (CSS, JS, images, etc.), return the content directly.
71
  if "text/html" not in content_type:
72
  return Response(content=resp.content, media_type=content_type, status_code=resp.status_code)
73
+
74
+ # Parse the HTML content.
75
  soup = BeautifulSoup(resp.text, "html.parser")
76
+
77
+ # Remove any Content Security Policy meta tags that might block our injected scripts.
78
+ for meta in soup.find_all("meta", attrs={"http-equiv": "Content-Security-Policy"}):
79
+ meta.decompose()
80
+
81
+ # --- Insert a <base> Tag ---
82
+ # This ensures that relative URLs in the HTML resolve against the target domain.
83
  parsed_target = urllib.parse.urlparse(target_url)
84
  base_href = f"{parsed_target.scheme}://{parsed_target.netloc}"
85
  if soup.head:
86
  # Remove any existing <base> tags.
87
+ for base in soup.head.find_all("base"):
88
  base.decompose()
89
  base_tag = soup.new_tag("base", href=base_href)
90
  soup.head.insert(0, base_tag)
 
93
  base_tag = soup.new_tag("base", href=base_href)
94
  head_tag.insert(0, base_tag)
95
  soup.insert(0, head_tag)
96
+
97
  # --- Inject JavaScript for Dynamic Routing ---
98
+ # This script intercepts dynamic navigation and network calls.
99
  if soup.body:
100
  soup.body.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
101
  else:
102
  soup.insert(0, BeautifulSoup(INJECTED_JS, "html.parser"))
103
+
104
  # --- Rewrite Resource URLs ---
105
+ # Rewrite URLs in various tags so that they are loaded through the proxy.
106
  tags_attrs = {
107
  "a": "href",
108
  "img": "src",
 
114
  for element in soup.find_all(tag):
115
  if element.has_attr(attr):
116
  orig = element[attr]
117
+ # Skip if already proxied or if it’s a javascript/mailto link.
118
+ if orig.startswith("/?url=") or orig.startswith("javascript:") or orig.startswith("mailto:"):
119
  continue
 
120
  new_url = urllib.parse.urljoin(target_url, orig)
 
121
  element[attr] = "/?url=" + urllib.parse.quote(new_url)
122
+
123
  return Response(content=str(soup), media_type="text/html", status_code=resp.status_code)
124
 
125
+ # Catch-all route that uses a query parameter or cookie to rebuild target URLs.
126
  @app.get("/{full_path:path}")
127
  async def catch_all(full_path: str, request: Request):
128
  query_params = dict(request.query_params)
 
 
129
  if "url" in query_params:
130
  target_url = query_params["url"]
131
  else:
 
132
  target_base = request.cookies.get("target_base")
133
  if not target_base:
134
  return Response("No target URL provided.", status_code=400)
135
+ qs = request.url.query
136
  target_url = urllib.parse.urljoin(target_base, full_path)
137
  if qs:
138
  target_url += "?" + qs
139
+
140
  response = await fetch_and_rewrite(target_url)
141
+ # Store the target’s base URL in a cookie for subsequent requests.
 
142
  parsed_target = urllib.parse.urlparse(target_url)
143
  base_url = f"{parsed_target.scheme}://{parsed_target.netloc}"
144
  response.set_cookie("target_base", base_url)
 
145
  return response