sehsapneb committed on
Commit
c56eb99
·
verified ·
1 Parent(s): 2b3f02b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -18
app.py CHANGED
@@ -6,7 +6,9 @@ from bs4 import BeautifulSoup
6
 
7
  app = FastAPI()
8
 
9
- # ---- 简单首页(输入网址的页面) ----
 
 
10
 
11
  INDEX_HTML = """
12
  <!DOCTYPE html>
@@ -62,7 +64,9 @@ async def index():
62
  return INDEX_HTML
63
 
64
 
65
- # ---- 工具函数:校验和规范 URL ----
 
 
66
 
67
  def normalize_url(raw: str) -> str:
68
  """如果没有 scheme,自动加上 http://"""
@@ -87,17 +91,22 @@ def is_allowed_url(url: str) -> bool:
87
  # 禁止访问一些明显的本地 / 内网地址
88
  if host in ("localhost", "127.0.0.1"):
89
  return False
90
- private_prefixes = ("10.", "192.168.", "172.16.", "172.17.", "172.18.",
91
- "172.19.", "172.20.", "172.21.", "172.22.",
92
- "172.23.", "172.24.", "172.25.", "172.26.",
93
- "172.27.", "172.28.", "172.29.", "172.30.",
94
- "172.31.")
 
 
 
 
95
  if any(host.startswith(p) for p in private_prefixes):
96
  return False
97
 
98
  return True
99
 
100
 
 
101
  HOP_BY_HOP_HEADERS = {
102
  "connection",
103
  "keep-alive",
@@ -109,13 +118,20 @@ HOP_BY_HOP_HEADERS = {
109
  "upgrade",
110
  }
111
 
 
 
 
 
 
112
 
113
- # ---- 重写 HTML 里的链接,使之继续走 /proxy ----
114
 
115
  def rewrite_html(html: str, base_url: str) -> str:
 
 
 
 
116
  soup = BeautifulSoup(html, "html.parser")
117
 
118
- # 要处理的标签和对应属性
119
  tag_attr_pairs = [
120
  ("a", "href"),
121
  ("link", "href"),
@@ -133,12 +149,12 @@ def rewrite_html(html: str, base_url: str) -> str:
133
  if not value:
134
  continue
135
 
136
- # 锚点或 javascript: 等不处理
137
  low = value.lower().strip()
 
138
  if low.startswith("#") or low.startswith("javascript:"):
139
  continue
140
 
141
- # 相对地址 -> 绝对地址
142
  absolute = urljoin(base_url, value)
143
  proxied = f"/proxy?url={quote(absolute, safe='')}"
144
 
@@ -147,7 +163,9 @@ def rewrite_html(html: str, base_url: str) -> str:
147
  return str(soup)
148
 
149
 
150
- # ---- 反向代理核心:/proxy ----
 
 
151
 
152
  @app.api_route(
153
  "/proxy",
@@ -166,7 +184,7 @@ async def proxy(request: Request, url: str):
166
  detail="仅支持公网 http/https 地址,且不允许访问内网 / 本地地址。",
167
  )
168
 
169
- # 2. 准备要转发的请求头(过滤掉 hop-by-hop 头部)
170
  outgoing_headers = {}
171
  for k, v in request.headers.items():
172
  lk = k.lower()
@@ -180,7 +198,10 @@ async def proxy(request: Request, url: str):
180
  body = await request.body()
181
 
182
  # 3. 用 httpx 转发请求
183
- async with httpx.AsyncClient(follow_redirects=True, timeout=20.0) as client:
 
 
 
184
  upstream_resp = await client.request(
185
  request.method,
186
  target_url,
@@ -190,30 +211,39 @@ async def proxy(request: Request, url: str):
190
 
191
  content_type = upstream_resp.headers.get("content-type", "")
192
 
193
- # 4. 回传响应头(过滤掉 hop-by-hop + 一些可能冲突的)
194
  response_headers = {}
195
  for k, v in upstream_resp.headers.items():
196
  lk = k.lower()
197
  if lk in HOP_BY_HOP_HEADERS:
198
  continue
199
- if lk in ("content-length", "content-encoding"):
200
- # 交给 FastAPI 重新计算
 
 
201
  continue
202
  response_headers[k] = v
203
 
204
  # 5. 如果是 HTML,就重写里面的链接
205
  if "text/html" in content_type:
 
206
  rewritten_html = rewrite_html(
207
  upstream_resp.text,
208
  base_url=str(upstream_resp.url),
209
  )
 
 
 
 
 
210
  return HTMLResponse(
211
  content=rewritten_html,
212
  status_code=upstream_resp.status_code,
213
  headers=response_headers,
214
  )
215
 
216
- # 6. 其他类型(CSS/JS/图片等)原样透传
 
217
  return Response(
218
  content=upstream_resp.content,
219
  status_code=upstream_resp.status_code,
 
6
 
7
  app = FastAPI()
8
 
9
+ # ==========================
10
+ # 首页:输入网址的简单页面
11
+ # ==========================
12
 
13
  INDEX_HTML = """
14
  <!DOCTYPE html>
 
64
  return INDEX_HTML
65
 
66
 
67
+ # ==========================
68
+ # 工具函数
69
+ # ==========================
70
 
71
  def normalize_url(raw: str) -> str:
72
  """如果没有 scheme,自动加上 http://"""
 
91
  # 禁止访问一些明显的本地 / 内网地址
92
  if host in ("localhost", "127.0.0.1"):
93
  return False
94
+
95
+ private_prefixes = (
96
+ "10.",
97
+ "192.168.",
98
+ "172.16.", "172.17.", "172.18.", "172.19.",
99
+ "172.20.", "172.21.", "172.22.", "172.23.",
100
+ "172.24.", "172.25.", "172.26.", "172.27.",
101
+ "172.28.", "172.29.", "172.30.", "172.31.",
102
+ )
103
  if any(host.startswith(p) for p in private_prefixes):
104
  return False
105
 
106
  return True
107
 
108
 
109
+ # hop-by-hop 头:代理两边不应转发
110
  HOP_BY_HOP_HEADERS = {
111
  "connection",
112
  "keep-alive",
 
118
  "upgrade",
119
  }
120
 
121
+ # 会导致被 iframe 拦截的头,在 Hugging Face 的 iframe 环境里可以适当去掉
122
+ BLOCKED_HEADERS = {
123
+ "x-frame-options",
124
+ "content-security-policy",
125
+ }
126
 
 
127
 
128
  def rewrite_html(html: str, base_url: str) -> str:
129
+ """
130
+ 重写 HTML 里的链接,使之继续走 /proxy。
131
+ 处理 a/link/img/script/iframe/video/source/form 等常见标签。
132
+ """
133
  soup = BeautifulSoup(html, "html.parser")
134
 
 
135
  tag_attr_pairs = [
136
  ("a", "href"),
137
  ("link", "href"),
 
149
  if not value:
150
  continue
151
 
 
152
  low = value.lower().strip()
153
+ # 锚点、javascript: 不改
154
  if low.startswith("#") or low.startswith("javascript:"):
155
  continue
156
 
157
+ # 相对链接 -> 绝对 URL
158
  absolute = urljoin(base_url, value)
159
  proxied = f"/proxy?url={quote(absolute, safe='')}"
160
 
 
163
  return str(soup)
164
 
165
 
166
+ # ==========================
167
+ # 反向代理主逻辑
168
+ # ==========================
169
 
170
  @app.api_route(
171
  "/proxy",
 
184
  detail="仅支持公网 http/https 地址,且不允许访问内网 / 本地地址。",
185
  )
186
 
187
+ # 2. 准备要转发的请求头(过滤掉 hop-by-hop 头部和 Host)
188
  outgoing_headers = {}
189
  for k, v in request.headers.items():
190
  lk = k.lower()
 
198
  body = await request.body()
199
 
200
  # 3. 用 httpx 转发请求
201
+ async with httpx.AsyncClient(
202
+ follow_redirects=True,
203
+ timeout=20.0,
204
+ ) as client:
205
  upstream_resp = await client.request(
206
  request.method,
207
  target_url,
 
211
 
212
  content_type = upstream_resp.headers.get("content-type", "")
213
 
214
+ # 4. 回传响应头(删掉 hop-by-hop、content-length、以及会阻止 iframe 的头)
215
  response_headers = {}
216
  for k, v in upstream_resp.headers.items():
217
  lk = k.lower()
218
  if lk in HOP_BY_HOP_HEADERS:
219
  continue
220
+ if lk == "content-length":
221
+ # FastAPI 自己计算长度
222
+ continue
223
+ if lk in BLOCKED_HEADERS:
224
  continue
225
  response_headers[k] = v
226
 
227
  # 5. 如果是 HTML,就重写里面的链接
228
  if "text/html" in content_type:
229
+ # upstream_resp.text 会根据 charset 自动解码成字符串
230
  rewritten_html = rewrite_html(
231
  upstream_resp.text,
232
  base_url=str(upstream_resp.url),
233
  )
234
+
235
+ # 这里我们已经重新编码 HTML 了,所以不能再带 content-encoding
236
+ response_headers.pop("content-encoding", None)
237
+ response_headers.pop("Content-Encoding", None)
238
+
239
  return HTMLResponse(
240
  content=rewritten_html,
241
  status_code=upstream_resp.status_code,
242
  headers=response_headers,
243
  )
244
 
245
+ # 6. 其他类型(CSS/JS/图片/二进制等)原样透传
246
+ # 保留 Content-Encoding,这样浏览器才能正确解压 / 展示,避免乱码
247
  return Response(
248
  content=upstream_resp.content,
249
  status_code=upstream_resp.status_code,