fromozu committed on
Commit
ad2dba4
·
verified ·
1 Parent(s): bcd40c4

Upload hf_backend/test_annas_downloader.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_backend/test_annas_downloader.py +163 -0
hf_backend/test_annas_downloader.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test the Anna's Archive download approach using Playwright.
4
+ """
5
+
6
+ from playwright.sync_api import sync_playwright
7
+ import time
8
+ import re
9
+ import requests
10
+
11
# MD5 digest identifying the file requested by this test.
MD5 = "d94c20d1364af9b484949659398c4062"

# slow_download page URL for that digest; this is the page the browser opens.
SLOW_URL = "https://annas-archive.gl/slow_download/" + MD5 + "/0/3"
13
+
14
# Desktop Chrome User-Agent shared by the browser context and the plain-HTTP fallbacks.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
)

# Hosts whose links are tried as a last resort when no EPUB link is found in the page.
_CDN_HOSTS = ("amazonaws.com", "cloudfront.net", "digitaloceanspaces.com")


def _filename_from_url(url):
    """Return the last path segment of *url* (query/fragment dropped), or a generic name."""
    from urllib.parse import urlsplit

    return urlsplit(url).path.rsplit("/", 1)[-1] or "downloaded.epub"


def _fetch_with_requests(url, label):
    """GET *url* with the browser User-Agent; return (filename, bytes), or None on failure."""
    try:
        resp = requests.get(url, timeout=60, headers={"User-Agent": _BROWSER_USER_AGENT})
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f" {label} 下载失败: {e}")
        return None
    return _filename_from_url(url), resp.content


def _wait_for_ddos_guard(page):
    """Poll the page title once per second (up to 120 s) until it is no longer "DDoS-Guard"."""
    print(" 等待 DDoS-Guard 通过 (最多 120 秒)...")
    for i in range(120):
        # wait_for_timeout (not time.sleep) keeps Playwright's dispatcher running,
        # so navigation and download events are still processed while we wait.
        page.wait_for_timeout(1000)
        title = page.title()
        if title != "DDoS-Guard":
            print(f" DDoS-Guard 在第 {i+1} 秒通过, 标题: {title}")
            return True
        if (i + 1) % 20 == 0:
            print(f" {i+1}秒...")
    print(" DDoS-Guard 未通过, 放弃")
    return False


def _wait_for_epub_url(page, downloads):
    """Wait up to 60 s for page.url to end in ".epub"; stop early once a download is captured.

    Returns the EPUB URL, or None if the URL never became one.
    """
    current_url = page.url
    if current_url.lower().endswith(".epub"):
        print(f" URL 已经变成 EPUB: {current_url}")
        return current_url

    print(" 等待 URL 变成 EPUB...")
    for i in range(60):
        if downloads:
            # The browser already started a download; the URL will not change further.
            return None
        page.wait_for_timeout(1000)
        current_url = page.url
        if current_url.lower().endswith(".epub"):
            print(f" 在第 {i+1} 秒 URL 变成 EPUB: {current_url}")
            return current_url
        if (i + 1) % 15 == 0:
            print(f" {i+1}秒... URL: {current_url[:60]}...")
    return None


def _read_download(download):
    """Wait for a Playwright download to finish and return (filename, bytes), or None if it failed."""
    # failure() blocks until the download completes and returns an error string or None.
    failure = download.failure()
    if failure:
        print(f" 浏览器下载失败: {failure}")
        return None
    # path is a method on Download; it must be called to obtain the file location.
    path = download.path()
    print(f" 从下载路径读取: {path}")
    with open(path, "rb") as f:
        content = f.read()
    return download.suggested_filename or "downloaded.epub", content


def _candidate_urls(page_html, base_url):
    """Yield (label, url) pairs found in the page HTML: first an EPUB href, then CDN links."""
    from html import unescape
    from urllib.parse import urljoin

    epub_match = re.search(r'href=["\']([^"\']*\.epub[^"\']*)["\']', page_html, re.IGNORECASE)
    if epub_match:
        # Serialized HTML escapes "&" in attributes and hrefs may be relative,
        # so unescape and resolve against the page URL before fetching.
        yield "EPUB URL", urljoin(base_url, unescape(epub_match.group(1)))

    for cdn in _CDN_HOSTS:
        cdn_match = re.search(
            r'https?://[^\s"\'<>]+' + re.escape(cdn) + r'[^\s"\'<>]*',
            page_html,
            re.IGNORECASE,
        )
        if cdn_match:
            yield "CDN URL", unescape(cdn_match.group(0))


def download_with_playwright(url, timeout=180):
    """
    Download from Anna's Archive slow_download URL using Playwright.

    Opens *url* in headless Chromium, waits for the DDoS-Guard interstitial
    (detected by page title) to go away, then tries, in order:

    1. a download captured by the browser's "download" event,
    2. a page URL that ends in ".epub", fetched with requests,
    3. an EPUB href or a known CDN link scraped from the page HTML.

    Args:
        url: slow_download page URL to open.
        timeout: accepted for caller compatibility; currently not consulted.
            The waits use fixed budgets (120 s navigation, 120 s DDoS-Guard,
            60 s URL wait, 60 s per HTTP request).

    Returns:
        (filename, content) on success, (None, None) on failure.
    """
    print(f"使用 Playwright 下载: {url}")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(user_agent=_BROWSER_USER_AGENT)
            page = context.new_page()

            # Keep the Download objects themselves; they are read later, outside
            # the event handler, while the browser context is still open.
            downloads = []

            def on_download(download):
                downloads.append(download)
                print(f" 检测到下载: {download.url}")

            page.on("download", on_download)

            print(" 导航到 slow_download...")
            page.goto(url, timeout=120000, wait_until="domcontentloaded")

            if not _wait_for_ddos_guard(page):
                return None, None

            # After DDoS passes, wait a bit for the page to render
            page.wait_for_timeout(3000)

            final_url = _wait_for_epub_url(page, downloads)

            # Strategy 1: the browser captured a download.
            if downloads:
                result = _read_download(downloads[0])
                if result:
                    return result

            # Strategy 2: the page itself navigated to an EPUB URL.
            if final_url:
                print(f" 从最终 URL 下载: {final_url}")
                result = _fetch_with_requests(final_url, "requests")
                if result:
                    return result

            # Strategy 3: scrape candidate links out of the rendered HTML.
            print(" 尝试从页面内容提取 EPUB URL...")
            page_html = page.content()
            for label, candidate in _candidate_urls(page_html, page.url):
                print(f" 从 HTML 找到 {label}: {candidate}")
                result = _fetch_with_requests(candidate, label)
                if result:
                    return result

            print(" 未能获取到 EPUB")
            return None, None
        finally:
            # Single cleanup point: runs on every return path and on exceptions.
            browser.close()
146
+
147
def main():
    """Run one download attempt against SLOW_URL and print a short report."""
    rule = "=" * 60
    print(rule)
    print("Anna's Archive Playwright 下载测试")
    print(rule)

    filename, content = download_with_playwright(SLOW_URL, timeout=180)

    # Either value being empty/None counts as a failed download.
    if not (filename and content):
        print("\n下载失败")
        return

    print("\n*** 成功! ***")
    print(f"文件名: {filename}")
    print(f"大小: {len(content)} 字节")
    # The leading bytes give a quick visual check of the file signature.
    print(f"前 20 字节: {content[:20]}")


# Run the test only when executed as a script, not on import.
if __name__ == "__main__":
    main()