sehsapneb committed on
Commit
c4c13a1
·
verified ·
1 Parent(s): c9ed46a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -36
app.py CHANGED
@@ -85,10 +85,8 @@ def is_allowed_url(url: str) -> bool:
85
  if parsed.scheme not in ("http", "https"):
86
  return False
87
 
88
- host = parsed.hostname or ""
89
- host = host.lower()
90
 
91
- # 禁止访问明显本地 / 内网地址
92
  if host in ("localhost", "127.0.0.1"):
93
  return False
94
 
@@ -106,7 +104,6 @@ def is_allowed_url(url: str) -> bool:
106
  return True
107
 
108
 
109
- # hop-by-hop 头:代理两边不应转发
110
  HOP_BY_HOP_HEADERS = {
111
  "connection",
112
  "keep-alive",
@@ -118,7 +115,6 @@ HOP_BY_HOP_HEADERS = {
118
  "upgrade",
119
  }
120
 
121
- # 会导致被 iframe 拦截的头
122
  BLOCKED_HEADERS = {
123
  "x-frame-options",
124
  "content-security-policy",
@@ -126,10 +122,7 @@ BLOCKED_HEADERS = {
126
 
127
 
128
  def rewrite_html(html: str, base_url: str) -> str:
129
- """
130
- 重写 HTML 里的链接,使之继续走 /proxy。
131
- 处理 a/link/img/script/iframe/video/source/form 等常见标签。
132
- """
133
  soup = BeautifulSoup(html, "html.parser")
134
 
135
  tag_attr_pairs = [
@@ -150,14 +143,11 @@ def rewrite_html(html: str, base_url: str) -> str:
150
  continue
151
 
152
  low = value.lower().strip()
153
- # 锚点、javascript: 不改
154
  if low.startswith("#") or low.startswith("javascript:"):
155
  continue
156
 
157
- # 相对链接 -> 绝对 URL
158
  absolute = urljoin(base_url, value)
159
  proxied = f"/proxy?url={quote(absolute, safe='')}"
160
-
161
  node[attr] = proxied
162
 
163
  return str(soup)
@@ -184,20 +174,22 @@ async def proxy(request: Request, url: str):
184
  detail="仅支持公网 http/https 地址,且不允许访问内网 / 本地地址。",
185
  )
186
 
187
- # 2. 准备要转发的请求头(过滤掉 hop-by-hop 头部和 Host)
188
  outgoing_headers = {}
189
  for k, v in request.headers.items():
190
  lk = k.lower()
191
  if lk in HOP_BY_HOP_HEADERS:
192
  continue
193
- if lk == "host":
194
  continue
195
  outgoing_headers[k] = v
196
 
197
- # 请求体
 
 
198
  body = await request.body()
199
 
200
- # 3. 用 httpx 转发请求
201
  async with httpx.AsyncClient(
202
  follow_redirects=True,
203
  timeout=20.0,
@@ -209,35 +201,26 @@ async def proxy(request: Request, url: str):
209
  headers=outgoing_headers,
210
  )
211
 
212
- # httpx 默认会帮你解压 gzip/deflate,upstream_resp.content 一般是“解压后的”内容
213
- content_type = upstream_resp.headers.get("content-type", "") or ""
214
 
215
- # 4. 返回响应头(删掉 hop-by-hop、content-length、以及会阻止 iframe 的头)
216
  response_headers = {}
217
  for k, v in upstream_resp.headers.items():
218
  lk = k.lower()
219
  if lk in HOP_BY_HOP_HEADERS:
220
  continue
221
- if lk == "content-length":
 
222
  continue
223
  if lk in BLOCKED_HEADERS:
224
  continue
225
  response_headers[k] = v
226
 
227
- # 5. 如果是 HTML,就重写里面的链接
228
- if "text/html" in content_type.lower() or "application/xhtml+xml" in content_type.lower():
229
- # upstream_resp.text 会根据 charset 自动解码成字符串
230
- html_text = upstream_resp.text
231
- rewritten_html = rewrite_html(
232
- html_text,
233
- base_url=str(upstream_resp.url),
234
- )
235
 
236
- # 我们重新编码 HTML,所以不能再带 content-encoding
237
- response_headers.pop("content-encoding", None)
238
- response_headers.pop("Content-Encoding", None)
239
-
240
- # 统一成 utf-8 文本,避免浏览器乱猜导致部分中文变成问号 / 乱码
241
  return HTMLResponse(
242
  content=rewritten_html,
243
  status_code=upstream_resp.status_code,
@@ -245,11 +228,33 @@ async def proxy(request: Request, url: str):
245
  media_type="text/html; charset=utf-8",
246
  )
247
 
248
- # 6. 其他类型(CSS/JS/图片/二进制等)原样透传
249
- # 这里直接把 bytes 给浏览器,保持原来的 Content-Type / Content-Encoding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  return Response(
251
  content=upstream_resp.content,
252
  status_code=upstream_resp.status_code,
253
  headers=response_headers,
254
- media_type=content_type or None,
255
  )
 
85
  if parsed.scheme not in ("http", "https"):
86
  return False
87
 
88
+ host = (parsed.hostname or "").lower()
 
89
 
 
90
  if host in ("localhost", "127.0.0.1"):
91
  return False
92
 
 
104
  return True
105
 
106
 
 
107
  HOP_BY_HOP_HEADERS = {
108
  "connection",
109
  "keep-alive",
 
115
  "upgrade",
116
  }
117
 
 
118
  BLOCKED_HEADERS = {
119
  "x-frame-options",
120
  "content-security-policy",
 
122
 
123
 
124
  def rewrite_html(html: str, base_url: str) -> str:
125
+ """重写 HTML 中的链接,使站内跳转继续走 /proxy。"""
 
 
 
126
  soup = BeautifulSoup(html, "html.parser")
127
 
128
  tag_attr_pairs = [
 
143
  continue
144
 
145
  low = value.lower().strip()
 
146
  if low.startswith("#") or low.startswith("javascript:"):
147
  continue
148
 
 
149
  absolute = urljoin(base_url, value)
150
  proxied = f"/proxy?url={quote(absolute, safe='')}"
 
151
  node[attr] = proxied
152
 
153
  return str(soup)
 
174
  detail="仅支持公网 http/https 地址,且不允许访问内网 / 本地地址。",
175
  )
176
 
177
+ # 2. 构造上游请求头(去掉 hop-by-hop、host、accept-encoding)
178
  outgoing_headers = {}
179
  for k, v in request.headers.items():
180
  lk = k.lower()
181
  if lk in HOP_BY_HOP_HEADERS:
182
  continue
183
+ if lk in ("host", "accept-encoding"):
184
  continue
185
  outgoing_headers[k] = v
186
 
187
+ # 统一告诉上游:不要压缩(避免各种编码问题)
188
+ outgoing_headers["Accept-Encoding"] = "identity"
189
+
190
  body = await request.body()
191
 
192
+ # 3. 转发请求
193
  async with httpx.AsyncClient(
194
  follow_redirects=True,
195
  timeout=20.0,
 
201
  headers=outgoing_headers,
202
  )
203
 
204
+ content_type = (upstream_resp.headers.get("content-type") or "").lower()
 
205
 
206
+ # 4. 构造要返回的响应头
207
  response_headers = {}
208
  for k, v in upstream_resp.headers.items():
209
  lk = k.lower()
210
  if lk in HOP_BY_HOP_HEADERS:
211
  continue
212
+ if lk in ("content-length", "content-encoding"):
213
+ # 长度和压缩交给我们自己处理
214
  continue
215
  if lk in BLOCKED_HEADERS:
216
  continue
217
  response_headers[k] = v
218
 
219
+ # 5. HTML:重写链接 + 直接输出页面
220
+ if "text/html" in content_type or "application/xhtml+xml" in content_type:
221
+ html_text = upstream_resp.text  # httpx 按 charset 解码
222
+ rewritten_html = rewrite_html(html_text, base_url=str(upstream_resp.url))
 
 
 
 
223
 
 
 
 
 
 
224
  return HTMLResponse(
225
  content=rewritten_html,
226
  status_code=upstream_resp.status_code,
 
228
  media_type="text/html; charset=utf-8",
229
  )
230
 
231
+ # 6. 图片 / 视频 / 音频 / 字体 / CSS / JS:原样透传(浏览器不会把它们显示成一大堆字)
232
+ if (
233
+ content_type.startswith("image/")
234
+ or content_type.startswith("video/")
235
+ or content_type.startswith("audio/")
236
+ or "font" in content_type
237
+ or content_type in (
238
+ "text/css",
239
+ "application/javascript",
240
+ "text/javascript",
241
+ "application/x-javascript",
242
+ )
243
+ ):
244
+ return Response(
245
+ content=upstream_resp.content,
246
+ status_code=upstream_resp.status_code,
247
+ headers=response_headers,
248
+ media_type=content_type or None,
249
+ )
250
+
251
+ # 7. 其它(如 application/octet-stream、zip、二进制流):强制当附件下载,避免在窗口里看到乱码
252
+ if "content-disposition" not in {k.lower(): v for k, v in response_headers.items()}:
253
+ response_headers["Content-Disposition"] = "attachment; filename=downloaded.bin"
254
+
255
  return Response(
256
  content=upstream_resp.content,
257
  status_code=upstream_resp.status_code,
258
  headers=response_headers,
259
+ media_type=content_type or "application/octet-stream",
260
  )