Update app.py
Browse files
app.py
CHANGED
|
@@ -85,10 +85,8 @@ def is_allowed_url(url: str) -> bool:
|
|
| 85 |
if parsed.scheme not in ("http", "https"):
|
| 86 |
return False
|
| 87 |
|
| 88 |
-
host = parsed.hostname or ""
|
| 89 |
-
host = host.lower()
|
| 90 |
|
| 91 |
-
# 禁止访问明显本地 / 内网地址
|
| 92 |
if host in ("localhost", "127.0.0.1"):
|
| 93 |
return False
|
| 94 |
|
|
@@ -106,7 +104,6 @@ def is_allowed_url(url: str) -> bool:
|
|
| 106 |
return True
|
| 107 |
|
| 108 |
|
| 109 |
-
# hop-by-hop 头:代理两边不应转发
|
| 110 |
HOP_BY_HOP_HEADERS = {
|
| 111 |
"connection",
|
| 112 |
"keep-alive",
|
|
@@ -118,7 +115,6 @@ HOP_BY_HOP_HEADERS = {
|
|
| 118 |
"upgrade",
|
| 119 |
}
|
| 120 |
|
| 121 |
-
# 会导致被 iframe 拦截的头
|
| 122 |
BLOCKED_HEADERS = {
|
| 123 |
"x-frame-options",
|
| 124 |
"content-security-policy",
|
|
@@ -126,10 +122,7 @@ BLOCKED_HEADERS = {
|
|
| 126 |
|
| 127 |
|
| 128 |
def rewrite_html(html: str, base_url: str) -> str:
|
| 129 |
-
"""
|
| 130 |
-
重写 HTML 里的链接,使之继续走 /proxy。
|
| 131 |
-
处理 a/link/img/script/iframe/video/source/form 等常见标签。
|
| 132 |
-
"""
|
| 133 |
soup = BeautifulSoup(html, "html.parser")
|
| 134 |
|
| 135 |
tag_attr_pairs = [
|
|
@@ -150,14 +143,11 @@ def rewrite_html(html: str, base_url: str) -> str:
|
|
| 150 |
continue
|
| 151 |
|
| 152 |
low = value.lower().strip()
|
| 153 |
-
# 锚点、javascript: 不改
|
| 154 |
if low.startswith("#") or low.startswith("javascript:"):
|
| 155 |
continue
|
| 156 |
|
| 157 |
-
# 相对链接 -> 绝对 URL
|
| 158 |
absolute = urljoin(base_url, value)
|
| 159 |
proxied = f"/proxy?url={quote(absolute, safe='')}"
|
| 160 |
-
|
| 161 |
node[attr] = proxied
|
| 162 |
|
| 163 |
return str(soup)
|
|
@@ -184,20 +174,22 @@ async def proxy(request: Request, url: str):
|
|
| 184 |
detail="仅支持公网 http/https 地址,且不允许访问内网 / 本地地址。",
|
| 185 |
)
|
| 186 |
|
| 187 |
-
# 2.
|
| 188 |
outgoing_headers = {}
|
| 189 |
for k, v in request.headers.items():
|
| 190 |
lk = k.lower()
|
| 191 |
if lk in HOP_BY_HOP_HEADERS:
|
| 192 |
continue
|
| 193 |
-
if lk
|
| 194 |
continue
|
| 195 |
outgoing_headers[k] = v
|
| 196 |
|
| 197 |
-
#
|
|
|
|
|
|
|
| 198 |
body = await request.body()
|
| 199 |
|
| 200 |
-
# 3.
|
| 201 |
async with httpx.AsyncClient(
|
| 202 |
follow_redirects=True,
|
| 203 |
timeout=20.0,
|
|
@@ -209,35 +201,26 @@ async def proxy(request: Request, url: str):
|
|
| 209 |
headers=outgoing_headers,
|
| 210 |
)
|
| 211 |
|
| 212 |
-
|
| 213 |
-
content_type = upstream_resp.headers.get("content-type", "") or ""
|
| 214 |
|
| 215 |
-
# 4. 回
|
| 216 |
response_headers = {}
|
| 217 |
for k, v in upstream_resp.headers.items():
|
| 218 |
lk = k.lower()
|
| 219 |
if lk in HOP_BY_HOP_HEADERS:
|
| 220 |
continue
|
| 221 |
-
if lk
|
|
|
|
| 222 |
continue
|
| 223 |
if lk in BLOCKED_HEADERS:
|
| 224 |
continue
|
| 225 |
response_headers[k] = v
|
| 226 |
|
| 227 |
-
# 5.
|
| 228 |
-
if "text/html" in content_type
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
rewritten_html = rewrite_html(
|
| 232 |
-
html_text,
|
| 233 |
-
base_url=str(upstream_resp.url),
|
| 234 |
-
)
|
| 235 |
|
| 236 |
-
# 我们重新编码 HTML,所以不能再带 content-encoding
|
| 237 |
-
response_headers.pop("content-encoding", None)
|
| 238 |
-
response_headers.pop("Content-Encoding", None)
|
| 239 |
-
|
| 240 |
-
# 统一成 utf-8 文本,避免浏览器乱猜导致部分中文变成问号 / 乱码
|
| 241 |
return HTMLResponse(
|
| 242 |
content=rewritten_html,
|
| 243 |
status_code=upstream_resp.status_code,
|
|
@@ -245,11 +228,33 @@ async def proxy(request: Request, url: str):
|
|
| 245 |
media_type="text/html; charset=utf-8",
|
| 246 |
)
|
| 247 |
|
| 248 |
-
# 6.
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
return Response(
|
| 251 |
content=upstream_resp.content,
|
| 252 |
status_code=upstream_resp.status_code,
|
| 253 |
headers=response_headers,
|
| 254 |
-
media_type=content_type or
|
| 255 |
)
|
|
|
|
| 85 |
if parsed.scheme not in ("http", "https"):
|
| 86 |
return False
|
| 87 |
|
| 88 |
+
host = (parsed.hostname or "").lower()
|
|
|
|
| 89 |
|
|
|
|
| 90 |
if host in ("localhost", "127.0.0.1"):
|
| 91 |
return False
|
| 92 |
|
|
|
|
| 104 |
return True
|
| 105 |
|
| 106 |
|
|
|
|
| 107 |
HOP_BY_HOP_HEADERS = {
|
| 108 |
"connection",
|
| 109 |
"keep-alive",
|
|
|
|
| 115 |
"upgrade",
|
| 116 |
}
|
| 117 |
|
|
|
|
| 118 |
BLOCKED_HEADERS = {
|
| 119 |
"x-frame-options",
|
| 120 |
"content-security-policy",
|
|
|
|
| 122 |
|
| 123 |
|
| 124 |
def rewrite_html(html: str, base_url: str) -> str:
|
| 125 |
+
"""重写 HTML 中的链接,使站内跳转继续走 /proxy。"""
|
|
|
|
|
|
|
|
|
|
| 126 |
soup = BeautifulSoup(html, "html.parser")
|
| 127 |
|
| 128 |
tag_attr_pairs = [
|
|
|
|
| 143 |
continue
|
| 144 |
|
| 145 |
low = value.lower().strip()
|
|
|
|
| 146 |
if low.startswith("#") or low.startswith("javascript:"):
|
| 147 |
continue
|
| 148 |
|
|
|
|
| 149 |
absolute = urljoin(base_url, value)
|
| 150 |
proxied = f"/proxy?url={quote(absolute, safe='')}"
|
|
|
|
| 151 |
node[attr] = proxied
|
| 152 |
|
| 153 |
return str(soup)
|
|
|
|
| 174 |
detail="仅支持公网 http/https 地址,且不允许访问内网 / 本地地址。",
|
| 175 |
)
|
| 176 |
|
| 177 |
+
# 2. 构造上游请求头(去掉 hop-by-hop、host、accept-encoding)
|
| 178 |
outgoing_headers = {}
|
| 179 |
for k, v in request.headers.items():
|
| 180 |
lk = k.lower()
|
| 181 |
if lk in HOP_BY_HOP_HEADERS:
|
| 182 |
continue
|
| 183 |
+
if lk in ("host", "accept-encoding"):
|
| 184 |
continue
|
| 185 |
outgoing_headers[k] = v
|
| 186 |
|
| 187 |
+
# 统一告诉上游:不要压缩(避免各种编码问题)
|
| 188 |
+
outgoing_headers["Accept-Encoding"] = "identity"
|
| 189 |
+
|
| 190 |
body = await request.body()
|
| 191 |
|
| 192 |
+
# 3. 转发请求
|
| 193 |
async with httpx.AsyncClient(
|
| 194 |
follow_redirects=True,
|
| 195 |
timeout=20.0,
|
|
|
|
| 201 |
headers=outgoing_headers,
|
| 202 |
)
|
| 203 |
|
| 204 |
+
content_type = (upstream_resp.headers.get("content-type") or "").lower()
|
|
|
|
| 205 |
|
| 206 |
+
# 4. 构造要返回的响应头
|
| 207 |
response_headers = {}
|
| 208 |
for k, v in upstream_resp.headers.items():
|
| 209 |
lk = k.lower()
|
| 210 |
if lk in HOP_BY_HOP_HEADERS:
|
| 211 |
continue
|
| 212 |
+
if lk in ("content-length", "content-encoding"):
|
| 213 |
+
# 长度和压缩交给我们自己处理
|
| 214 |
continue
|
| 215 |
if lk in BLOCKED_HEADERS:
|
| 216 |
continue
|
| 217 |
response_headers[k] = v
|
| 218 |
|
| 219 |
+
# 5. HTML:重写链接 + 直接输出页面
|
| 220 |
+
if "text/html" in content_type or "application/xhtml+xml" in content_type:
|
| 221 |
+
html_text = upstream_resp.text # httpx 会按 charset 解码
|
| 222 |
+
rewritten_html = rewrite_html(html_text, base_url=str(upstream_resp.url))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
return HTMLResponse(
|
| 225 |
content=rewritten_html,
|
| 226 |
status_code=upstream_resp.status_code,
|
|
|
|
| 228 |
media_type="text/html; charset=utf-8",
|
| 229 |
)
|
| 230 |
|
| 231 |
+
# 6. 图片 / 视频 / 音频 / 字体 / CSS / JS:原样透传(浏览器不会把它们显示成一大堆字)
|
| 232 |
+
if (
|
| 233 |
+
content_type.startswith("image/")
|
| 234 |
+
or content_type.startswith("video/")
|
| 235 |
+
or content_type.startswith("audio/")
|
| 236 |
+
or "font" in content_type
|
| 237 |
+
or content_type in (
|
| 238 |
+
"text/css",
|
| 239 |
+
"application/javascript",
|
| 240 |
+
"text/javascript",
|
| 241 |
+
"application/x-javascript",
|
| 242 |
+
)
|
| 243 |
+
):
|
| 244 |
+
return Response(
|
| 245 |
+
content=upstream_resp.content,
|
| 246 |
+
status_code=upstream_resp.status_code,
|
| 247 |
+
headers=response_headers,
|
| 248 |
+
media_type=content_type or None,
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
# 7. 其它(如 application/octet-stream、zip、二进制流):强制当附件下载,避免在窗口里看到乱码
|
| 252 |
+
if "content-disposition" not in {k.lower(): v for k, v in response_headers.items()}:
|
| 253 |
+
response_headers["Content-Disposition"] = "attachment; filename=downloaded.bin"
|
| 254 |
+
|
| 255 |
return Response(
|
| 256 |
content=upstream_resp.content,
|
| 257 |
status_code=upstream_resp.status_code,
|
| 258 |
headers=response_headers,
|
| 259 |
+
media_type=content_type or "application/octet-stream",
|
| 260 |
)
|