fromozu committed on
Commit
bcd40c4
·
verified ·
1 Parent(s): 6622127

Upload hf_backend/test_annas_download.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_backend/test_annas_download.py +154 -0
hf_backend/test_annas_download.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test using Playwright to handle Anna's Archive slow_download redirect.
4
+ Key insight: Use a VERY long timeout and see if DDoS-Guard eventually allows through.
5
+ """
6
+
7
+ from playwright.sync_api import sync_playwright
8
+ import time
9
+ import re
10
+
11
# Fixed identifier of the file requested by this test (named as an MD5 hash).
MD5 = "d94c20d1364af9b484949659398c4062"
# slow_download page for that identifier. The trailing "/0/3" path segments
# are presumably mirror/server selectors -- TODO confirm against the site.
SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"
13
+
14
def download_with_playwright_slow(url, timeout=180):
    """
    Open a slow_download page in headless Chromium and wait for a redirect.

    After the initial navigation the page is polled once per second, for up
    to ``timeout`` polls, until one of the following is observed:

    * the page URL ends with ``.epub``;
    * a browser ``download`` event has fired;
    * the page URL no longer contains ``annas-archive``.

    The URL found that way is then fetched with ``requests`` and accepted
    only if the body starts with the ZIP magic bytes ``PK``.

    Args:
        url: The slow_download page to open.
        timeout: Maximum number of one-second polls (an integer number of
            seconds). The initial navigation has its own fixed 120 s limit.

    Returns:
        ``(final_url, content_bytes)`` when a ZIP/EPUB body was downloaded,
        ``(None, None)`` in every other case.
    """
    print(f"目标: {url}")
    print(f"等待时间: 最多 {timeout} 秒")
    print()

    # Shared by the browser context and the follow-up requests call.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    )

    def safe_title(page):
        """Return the page title, or a placeholder when it cannot be read."""
        try:
            return page.title()
        except Exception:
            # Best effort: reading the title may fail while the page is in
            # the middle of a navigation; the title is only used for logging,
            # so it must never abort the wait loop.
            return "<unavailable>"

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            # "AutomationControlled" is the Blink feature behind
            # navigator.webdriver; "AutomationDetect" does not exist.
            args=["--disable-blink-features=AutomationControlled"],
        )
        try:
            context = browser.new_context(
                user_agent=user_agent,
                viewport={"width": 1920, "height": 1080},
                locale="en-US",
                timezone_id="America/New_York",
                extra_http_headers={
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                },
            )

            # Try to hide webdriver
            page = context.new_page()
            page.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
            """)

            download_info = {"url": None}

            def on_download(download):
                download_info["url"] = download.url
                print("检测到下载事件!")
                print(f"下载 URL: {download.url}")

            page.on("download", on_download)

            start_time = time.time()
            # Bound before the try block so a failed navigation cannot leave
            # the name undefined for the checks below.
            final_url = None

            try:
                print("步骤1: 导航到 slow_download...")
                response = page.goto(url, timeout=120000, wait_until="domcontentloaded")
                elapsed_s = time.time() - start_time
                print(f" 初始响应: 状态={response.status if response else 'None'}, "
                      f"URL={page.url}, 耗时={elapsed_s:.1f}秒")

                print(f"\n步骤2: 等待最多 {timeout} 秒让重定向完成...")

                for elapsed in range(1, timeout + 1):
                    # Wait through Playwright rather than time.sleep() so
                    # browser events (e.g. "download") keep being dispatched.
                    page.wait_for_timeout(1000)

                    current_url = page.url

                    # Check if URL became EPUB
                    if current_url.lower().endswith('.epub'):
                        final_url = current_url
                        print(f"\n*** 在第 {elapsed} 秒检测到 EPUB URL! ***")
                        print(f" URL: {final_url}")
                        break

                    # Check for download
                    if download_info["url"]:
                        final_url = download_info["url"]
                        break

                    # Check if we got redirected to a different domain (CDN)
                    if 'annas-archive' not in current_url and current_url.startswith('http'):
                        final_url = current_url
                        print(f"\n*** 在第 {elapsed} 秒检测到外部 URL! ***")
                        print(f" URL: {final_url}")
                        break

                    # Progress report every 15 seconds; the title is only
                    # needed here, so it is not fetched on every poll.
                    if elapsed % 15 == 0:
                        current_title = safe_title(page)
                        print(f" {elapsed}秒... 当前URL: {current_url[:60]}..., 标题: {current_title}")

                        # A title other than the DDoS-Guard interstitial
                        # means the page has moved on to something else.
                        if current_title != "DDoS-Guard":
                            print(f" [信息] 标题='{current_title}', 非 DDoS-Guard 页面")

            except Exception as e:
                elapsed_s = time.time() - start_time
                print(f"\n异常: {e}")
                print(f"发生时间: {elapsed_s:.1f}秒后")

            # A download event may have fired even though navigation or
            # polling raised; do not lose that URL.
            if final_url is None and download_info["url"]:
                final_url = download_info["url"]

            print(f"\n最终 URL: {page.url}")
            print(f"最终标题: {safe_title(page)}")

            # If we found a final URL, try to download the content
            if final_url:
                print("\n步骤3: 使用 requests 下载最终内容...")
                import requests
                try:
                    resp = requests.get(final_url, timeout=60, headers={
                        "User-Agent": user_agent
                    })
                    print(f" 状态: {resp.status_code}")
                    print(f" 内容大小: {len(resp.content)} 字节")
                    print(f" 内容类型: {resp.headers.get('content-type', 'unknown')}")

                    if resp.content[:2] == b'PK':  # EPUB is a ZIP file
                        print(" [OK] 内容是有效的 EPUB/ZIP格式")
                        return final_url, resp.content
                    print(" [警告] 内容不是以 PK 开头")

                except Exception as e:
                    print(f" 下载失败: {e}")
        finally:
            # Runs on the success return and on errors alike.
            browser.close()

    return None, None
136
+
137
def main():
    """Run the slow_download wait test once and print the outcome."""
    rule = "=" * 60

    print(rule)
    print("Anna's Archive slow_download Playwright 长等待测试")
    print(rule)

    final_url, payload = download_with_playwright_slow(SLOW_URL, timeout=120)

    # Nothing was resolved: report the failure and stop.
    if not final_url:
        print("\n未能获取到最终链接")
        return

    print("\n" + rule)
    print("成功!")
    print(f"最终URL: {final_url}")
    print(f"内容大小: {len(payload)} 字节")
    print(rule)
152
+
153
# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()