dpv007 committed on
Commit
2651019
·
verified ·
1 Parent(s): 1bd5bd0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -22
app.py CHANGED
@@ -6,7 +6,6 @@ from urllib.parse import urljoin, quote
6
 
7
  app = FastAPI()
8
 
9
-
10
  HTML_INDEX = """
11
  <!doctype html>
12
  <html>
@@ -87,26 +86,38 @@ async def index():
87
  return HTML_INDEX
88
 
89
 
90
- async def fetch_url(url: str) -> httpx.Response:
91
  """
92
- Fetch target URL via httpx.
 
93
  """
94
- async with httpx.AsyncClient(follow_redirects=True, timeout=15) as client:
95
- # Basic headers to mimic a browser
96
- headers = {
97
- "User-Agent": (
98
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
99
- "AppleWebKit/537.36 (KHTML, like Gecko) "
100
- "Chrome/120.0 Safari/537.36"
101
- )
102
- }
 
 
 
 
 
 
 
 
 
 
103
  resp = await client.get(url, headers=headers)
104
  return resp
105
 
106
 
107
  def rewrite_html(html: str, base_url: str) -> str:
108
  """
109
- Rewrite links in HTML so sub-resources go through /proxy as well.
 
110
  """
111
  soup = BeautifulSoup(html, "html.parser")
112
 
@@ -116,20 +127,34 @@ def rewrite_html(html: str, base_url: str) -> str:
116
  original = tag.attrs.get(attr)
117
  if not original:
118
  return
119
- # Handle things like //cdn.example.com, /path, relative paths, etc.
120
  absolute = urljoin(base_url, original)
121
  tag.attrs[attr] = f"/proxy?url={quote(absolute, safe='')}"
122
 
123
- # rewrite common URL-carrying tags
124
- for tag in soup.find_all(["a", "img", "script", "link", "form", "iframe"]):
 
 
 
 
 
 
 
 
 
 
 
 
125
  if tag.name in ("a", "link"):
126
  proxify("href", tag)
127
- if tag.name in ("img", "script", "iframe"):
128
  proxify("src", tag)
129
  if tag.name == "form":
130
  proxify("action", tag)
 
 
 
131
 
132
- # Optionally, inject a small banner to indicate proxied content
133
  banner = soup.new_tag("div")
134
  banner.string = f"Proxied via HF Space — {base_url}"
135
  banner["style"] = (
@@ -147,9 +172,14 @@ def rewrite_html(html: str, base_url: str) -> str:
147
  async def proxy(url: str, request: Request):
148
  """
149
  Reverse-proxy endpoint: /proxy?url=https://example.com
 
 
 
 
 
150
  """
151
  try:
152
- upstream = await fetch_url(url)
153
  except Exception as e:
154
  return HTMLResponse(
155
  f"<h1>Error</h1><p>Could not fetch {url}</p><pre>{e}</pre>",
@@ -158,16 +188,16 @@ async def proxy(url: str, request: Request):
158
 
159
  content_type = upstream.headers.get("content-type", "")
160
 
161
- # HTML: rewrite links so that all further requests go via /proxy
162
  if "text/html" in content_type:
163
  rewritten = rewrite_html(upstream.text, base_url=url)
164
  return HTMLResponse(content=rewritten, status_code=upstream.status_code)
165
 
166
- # For non-HTML (images, JS, CSS, fonts...), just pass through
167
- # while stripping hop-by-hop headers.
168
  safe_headers = {}
169
  for k, v in upstream.headers.items():
170
  lk = k.lower()
 
171
  if lk in ("content-encoding", "transfer-encoding", "connection"):
172
  continue
173
  safe_headers[k] = v
 
6
 
7
  app = FastAPI()
8
 
 
9
  HTML_INDEX = """
10
  <!doctype html>
11
  <html>
 
86
  return HTML_INDEX
87
 
88
 
89
+ async def fetch_url(url: str, request: Request) -> httpx.Response:
90
  """
91
+ Fetch target URL via httpx, forwarding some useful headers
92
+ (like Range for video/audio).
93
  """
94
+ client_headers = request.headers
95
+
96
+ headers = {
97
+ "User-Agent": client_headers.get(
98
+ "user-agent",
99
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
100
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
101
+ "Chrome/120.0 Safari/537.36",
102
+ ),
103
+ "Accept": client_headers.get("accept", "*/*"),
104
+ "Accept-Language": client_headers.get("accept-language", "en-US,en;q=0.9"),
105
+ }
106
+
107
+ # Forward Range header for video/audio seeking
108
+ range_header = client_headers.get("range")
109
+ if range_header:
110
+ headers["Range"] = range_header
111
+
112
+ async with httpx.AsyncClient(follow_redirects=True, timeout=30) as client:
113
  resp = await client.get(url, headers=headers)
114
  return resp
115
 
116
 
117
  def rewrite_html(html: str, base_url: str) -> str:
118
  """
119
+ Rewrite links in HTML so sub-resources (scripts, css, images, video, etc.)
120
+ go through /proxy as well.
121
  """
122
  soup = BeautifulSoup(html, "html.parser")
123
 
 
127
  original = tag.attrs.get(attr)
128
  if not original:
129
  return
 
130
  absolute = urljoin(base_url, original)
131
  tag.attrs[attr] = f"/proxy?url={quote(absolute, safe='')}"
132
 
133
+ # Tags that can contain URLs
134
+ for tag in soup.find_all(
135
+ [
136
+ "a",
137
+ "img",
138
+ "script",
139
+ "link",
140
+ "form",
141
+ "iframe",
142
+ "video",
143
+ "audio",
144
+ "source",
145
+ ]
146
+ ):
147
  if tag.name in ("a", "link"):
148
  proxify("href", tag)
149
+ if tag.name in ("img", "script", "iframe", "video", "audio", "source"):
150
  proxify("src", tag)
151
  if tag.name == "form":
152
  proxify("action", tag)
153
+ # video poster attribute (thumbnail)
154
+ if tag.name == "video":
155
+ proxify("poster", tag)
156
 
157
+ # Optional: add a small banner so you know it's proxied
158
  banner = soup.new_tag("div")
159
  banner.string = f"Proxied via HF Space — {base_url}"
160
  banner["style"] = (
 
172
  async def proxy(url: str, request: Request):
173
  """
174
  Reverse-proxy endpoint: /proxy?url=https://example.com
175
+ Supports:
176
+ - HTML (rewritten)
177
+ - Images
178
+ - JS / CSS
179
+ - Video / audio (with Range header forwarded)
180
  """
181
  try:
182
+ upstream = await fetch_url(url, request)
183
  except Exception as e:
184
  return HTMLResponse(
185
  f"<h1>Error</h1><p>Could not fetch {url}</p><pre>{e}</pre>",
 
188
 
189
  content_type = upstream.headers.get("content-type", "")
190
 
191
+ # HTML: rewrite links so further requests go via /proxy
192
  if "text/html" in content_type:
193
  rewritten = rewrite_html(upstream.text, base_url=url)
194
  return HTMLResponse(content=rewritten, status_code=upstream.status_code)
195
 
196
+ # Non-HTML (images, videos, audio, JS, CSS, fonts...): pass through
 
197
  safe_headers = {}
198
  for k, v in upstream.headers.items():
199
  lk = k.lower()
200
+ # Strip hop-by-hop and encoding headers (let FastAPI handle compression)
201
  if lk in ("content-encoding", "transfer-encoding", "connection"):
202
  continue
203
  safe_headers[k] = v