fromozu committed on
Commit
d382bea
·
verified ·
1 Parent(s): ebce8a1

Upload hf_backend/test_slow_download.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_backend/test_slow_download.py +274 -0
hf_backend/test_slow_download.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test multiple strategies to access slow_download and get final EPUB link.
4
+ """
5
+
6
+ from playwright.sync_api import sync_playwright
7
+ import time
8
+ import sys
9
+ import os
10
+
11
# MD5 digest embedded in the slow-download URL below.
MD5 = "d94c20d1364af9b484949659398c4062"

# Slow-download page that every strategy in this file opens.
SLOW_URL = "https://annas-archive.gl/slow_download/" + MD5 + "/0/3"
13
+
14
def strategy_1_regular_browser():
    """Poll the slow-download page in a headed (non-headless) Chromium.

    Opens ``SLOW_URL`` and checks the page URL once per second for up to
    60 seconds.

    Returns:
        The page URL as soon as it ends with ``.epub`` (case-insensitive),
        or ``None`` if that never happens or any error occurs.
    """
    print("\n[策略1] 使用非无头模式浏览器...")
    with sync_playwright() as p:
        browser = None
        try:
            # Launch inside the try so that a launch failure (e.g. a headed
            # browser on a machine without a display) is reported here
            # instead of propagating to the caller.
            browser = p.chromium.launch(headless=False)
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            response = page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")
            print(f" 状态码: {response.status if response else 'None'}")

            # Wait up to 60 seconds.
            for i in range(60):
                # Use Playwright's own wait rather than time.sleep(): the
                # sync API only processes browser events while one of its
                # calls is running, so page.url would otherwise stay stale.
                page.wait_for_timeout(1000)
                url = page.url
                if url.lower().endswith('.epub'):
                    print(f" 成功! 在第 {i+1} 秒 URL 变成 EPUB")
                    return url

                if (i + 1) % 10 == 0:
                    print(f" {i+1}秒... URL: {url[:80]}...")

        except Exception as e:
            print(f" 错误: {e}")
        finally:
            # Single cleanup point for every exit path.
            if browser is not None:
                browser.close()

    return None
45
+
46
def strategy_2_stealth_with_webdriver_hidden():
    """Poll the slow-download page in headless Chromium with an init script.

    The init script redefines ``navigator.webdriver`` before page scripts
    run. The page URL is then checked once per second for up to 60 seconds.

    Returns:
        The page URL as soon as it ends with ``.epub`` (case-insensitive),
        or ``None`` if that never happens or any error occurs.
    """
    print("\n[策略2] 隐藏 webdriver 属性...")
    with sync_playwright() as p:
        browser = None
        try:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                viewport={"width": 1920, "height": 1080},
                locale="en-US",
            )
            page = context.new_page()

            # Inject script to hide webdriver
            page.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined,
                    configurable: true
                });
                delete navigator.__webdriver;
                delete navigator.__proto__?.webdriver;
            """)

            response = page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")
            print(f" 状态码: {response.status if response else 'None'}")

            for i in range(60):
                # Playwright's wait keeps event processing running;
                # time.sleep() would leave page.url stale in the sync API.
                page.wait_for_timeout(1000)
                url = page.url
                if url.lower().endswith('.epub'):
                    print(f" 成功! 在第 {i+1} 秒 URL 变成 EPUB")
                    return url

                if (i + 1) % 10 == 0:
                    print(f" {i+1}秒... URL: {url[:80]}...")

        except Exception as e:
            print(f" 错误: {e}")
        finally:
            # Single cleanup point for every exit path.
            if browser is not None:
                browser.close()

    return None
88
+
89
def strategy_3_wait_for_network():
    """Load the slow-download page until the network is idle, then poll.

    After ``page.goto`` returns with ``wait_until="networkidle"``, the page
    URL is checked once per second for up to 30 seconds.

    Returns:
        The page URL as soon as it ends with ``.epub`` (case-insensitive),
        or ``None`` if that never happens or any error occurs.
    """
    print("\n[策略3] 等待网络空闲...")
    with sync_playwright() as p:
        browser = None
        try:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            # Page.goto accepts only timeout, wait_until and referer as
            # keyword options. The former network_idle_timeout keyword
            # raised TypeError on every call, so this strategy always failed.
            page.goto(SLOW_URL, timeout=120000, wait_until="networkidle")

            for i in range(30):
                # Playwright's wait keeps event processing running;
                # time.sleep() would leave page.url stale in the sync API.
                page.wait_for_timeout(1000)
                url = page.url
                if url.lower().endswith('.epub'):
                    print(f" 成功! 在第 {i+1} 秒 URL 变成 EPUB")
                    return url

            print(f" 最终 URL: {page.url}")

        except Exception as e:
            print(f" 错误: {e}")
        finally:
            # Single cleanup point for every exit path.
            if browser is not None:
                browser.close()

    return None
117
+
118
def strategy_4_check_for_download_event():
    """Listen for a page ``download`` event instead of watching the URL.

    Registers a ``download`` handler, opens ``SLOW_URL`` and waits up to
    60 seconds for the handler to record a download URL.

    Returns:
        The URL of the first download seen, or ``None`` if no download was
        observed.
    """
    print("\n[策略4] 监听下载事件...")
    download_info = {"url": None}

    with sync_playwright() as p:
        browser = None
        try:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            def on_download(download):
                download_info["url"] = download.url
                print(f" 检测到下载: {download.url}")

            page.on("download", on_download)

            page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")

            for i in range(60):
                # Event handlers in the sync API only run while a Playwright
                # call is in progress; with time.sleep() the download
                # handler would never be invoked during this loop.
                page.wait_for_timeout(1000)
                if download_info["url"]:
                    print(f" 在第 {i+1} 秒检测到下载")
                    return download_info["url"]

                if (i + 1) % 10 == 0:
                    print(f" {i+1}秒...")

        except Exception as e:
            print(f" 错误: {e}")
        finally:
            # Single cleanup point for every exit path.
            if browser is not None:
                browser.close()

    # A download may have been recorded before an exception interrupted the
    # flow above; report it rather than discarding it. Still None otherwise.
    return download_info["url"]
154
+
155
def strategy_5_check_page_content():
    """Look for an EPUB link in the page URL or in the page's HTML.

    Opens ``SLOW_URL`` and, once per second for up to 60 seconds, checks
    the page URL and then searches the serialized page content for an
    ``href`` attribute or a ``url = "..."`` / ``url: "..."`` assignment whose
    value contains ``.epub``.

    Returns:
        The page URL if it ends with ``.epub``, otherwise the first matching
        link text exactly as it appears in the page (it may be relative), or
        ``None`` if nothing is found or any error occurs.
    """
    # Local import: the module's import block does not provide ``re``.
    import re

    # Compiled once instead of on every polling iteration.
    href_pattern = re.compile(r'href=["\']([^"\']*\.epub[^"\']*)["\']', re.IGNORECASE)
    # Also covers meta-refresh or JavaScript redirects.
    js_pattern = re.compile(r'url\s*[=:]\s*["\']([^"\']*\.epub[^"\']*)["\']', re.IGNORECASE)

    print("\n[策略5] 检查页面内容中的 EPUB URL...")
    with sync_playwright() as p:
        browser = None
        try:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")

            for i in range(60):
                # Playwright's wait keeps event processing running, unlike
                # time.sleep().
                page.wait_for_timeout(1000)

                # Check URL
                url = page.url
                if url.lower().endswith('.epub'):
                    print(f" URL 变成 EPUB: {url}")
                    return url

                # Check page content for EPUB links. Both patterns are
                # case-insensitive and require ".epub", so no separate
                # substring pre-check is needed.
                content = page.content()

                href_match = href_pattern.search(content)
                if href_match:
                    print(f" 从页面内容找到 EPUB: {href_match.group(1)}")
                    return href_match.group(1)

                js_match = js_pattern.search(content)
                if js_match:
                    print(f" 从 JS 找到 EPUB: {js_match.group(1)}")
                    return js_match.group(1)

                if (i + 1) % 10 == 0:
                    title = page.title()
                    print(f" {i+1}秒... 标题: {title}")

        except Exception as e:
            print(f" 错误: {e}")
        finally:
            # Single cleanup point for every exit path.
            if browser is not None:
                browser.close()

    return None
205
+
206
def strategy_6_firefox_instead():
    """Poll the slow-download page in headless Firefox instead of Chromium.

    Opens ``SLOW_URL`` and checks the page URL once per second for up to
    60 seconds.

    Returns:
        The page URL as soon as it ends with ``.epub`` (case-insensitive),
        or ``None`` if Firefox cannot be launched, the URL never changes,
        or any error occurs.
    """
    print("\n[策略6] 使用 Firefox 浏览器...")
    with sync_playwright() as p:
        # Launch failures get their own message so they are distinguishable
        # from navigation errors.
        try:
            browser = p.firefox.launch(headless=True)
        except Exception as e:
            print(f" Firefox 启动失败: {e}")
            return None

        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
            )
            page = context.new_page()

            response = page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")
            print(f" 状态码: {response.status if response else 'None'}")

            for i in range(60):
                # Playwright's wait keeps event processing running;
                # time.sleep() would leave page.url stale in the sync API.
                page.wait_for_timeout(1000)
                url = page.url
                if url.lower().endswith('.epub'):
                    print(f" 成功! 在第 {i+1} 秒 URL 变成 EPUB")
                    return url

                if (i + 1) % 10 == 0:
                    print(f" {i+1}秒... URL: {url[:80]}...")

        except Exception as e:
            print(f" 错误: {e}")
        finally:
            # Single cleanup point for every exit path.
            browser.close()

    return None
241
+
242
def main():
    """Run each strategy in order and stop at the first one that succeeds.

    Returns:
        The link returned by the first successful strategy, or ``None`` if
        every strategy fails.
    """
    print("=" * 60)
    print("Anna's Archive slow_download 访问测试")
    print("=" * 60)
    print(f"目标: {SLOW_URL}")
    print("测试多种策略看哪种能绕过 DDoS-Guard...\n")

    strategies = [
        ("策略1-非无头浏览器", strategy_1_regular_browser),
        ("策略2-Stealth模式", strategy_2_stealth_with_webdriver_hidden),
        ("策略3-等待网络空闲", strategy_3_wait_for_network),
        ("策略4-监听下载事件", strategy_4_check_for_download_event),
        ("策略5-检查页面内容", strategy_5_check_page_content),
        ("策略6-Firefox", strategy_6_firefox_instead),
    ]

    for name, func in strategies:
        print(f"\n{'='*60}")
        print(f"尝试: {name}")
        print("=" * 60)
        try:
            result = func()
        except Exception as e:
            # An error escaping one strategy (for instance a browser that
            # cannot be launched) must not prevent the remaining strategies
            # from being tried.
            print(f" 错误: {e}")
            result = None
        if result:
            print(f"\n*** 成功! 最终链接: {result} ***")
            return result
        print(f" {name} 失败")

    print("\n" + "=" * 60)
    print("所有策略都失败了")
    print("=" * 60)
    return None
272
+
273
if __name__ == "__main__":
    # Exit non-zero when no strategy produced a link, so callers of the
    # script can detect failure from the process status.
    sys.exit(0 if main() else 1)