fromozu committed on
Commit
45aefea
·
verified ·
1 Parent(s): 9c5225e

Upload hf_backend/test_download_when_ready.py with huggingface_hub

Browse files
hf_backend/test_download_when_ready.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test: Wait for DDoS-Guard to pass, then IMMEDIATELY try to get the EPUB URL.
4
+ Don't wait too long after the title changes - the redirect might happen quickly.
5
+ """
6
+
7
+ from playwright.sync_api import sync_playwright
8
+ import time
9
+ import re
10
+ import requests
11
+
12
# Hash embedded in the slow-download URL below (presumably the MD5 of the
# target file -- confirm against the site).
MD5 = "d94c20d1364af9b484949659398c4062"

# Page that the script opens in the browser; built from MD5 above.
SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"
14
+
15
def _wait_for_ddos_guard(page, max_seconds=120):
    """Poll the page title once per second until it is no longer "DDoS-Guard".

    Args:
        page: Playwright page that has already navigated to the target URL.
        max_seconds: Maximum number of one-second polls before giving up.

    Returns:
        True if the title changed within the limit, False otherwise.
    """
    for i in range(max_seconds):
        time.sleep(1)
        title = page.title()
        if title != "DDoS-Guard":
            print(f" 在第 {i+1} 秒 DDoS-Guard 通过! 标题: {title}")
            return True
        if (i + 1) % 15 == 0:
            print(f" {i+1}秒... 仍在等待...")
    return False


def _find_download_url(html, base_url):
    """Scan page HTML for the first plausible download URL.

    Checks run in a fixed priority order: direct ``.epub`` links, a
    ``<meta>`` refresh target, a JavaScript ``location`` assignment, URLs
    containing one of a fixed list of CDN/host names, and finally any
    ``href`` whose value mentions download/file/cdn.

    Args:
        html: Full HTML source of the page.
        base_url: URL of the page; relative redirect targets and hrefs are
            resolved against it.

    Returns:
        The first matching URL, or None when nothing matched.
    """
    # Local import so this block does not depend on the file's import header.
    from urllib.parse import urljoin

    # Look for EPUB URLs in the page
    epub_urls = re.findall(r'https?://[^\s"\'<>]+\.epub[^\s"\'<>]*', html, re.IGNORECASE)
    if epub_urls:
        print(f" 找到 EPUB URLs: {epub_urls}")
        return epub_urls[0]

    # Look for meta refresh. Whitespace around ';' is allowed because
    # content="0; url=..." is the common spelling.
    meta_refresh = re.findall(
        r'<meta[^>]*content=["\']?\d+\s*;\s*url=([^"\'>\s]+)', html, re.IGNORECASE
    )
    if meta_refresh:
        print(f" 找到 meta refresh: {meta_refresh}")
        return meta_refresh[0]

    # Look for JavaScript redirects, including the `location.href = "..."` form.
    js_redirects = re.findall(
        r'(?:window\.)?location(?:\.href)?\s*=\s*["\']([^"\']+)["\']', html
    )
    if js_redirects:
        print(f" 找到 JS redirect: {js_redirects}")
        # urljoin leaves absolute URLs untouched and resolves root-relative,
        # path-relative and protocol-relative targets the way a browser would.
        return urljoin(base_url, js_redirects[0])

    # Look for CDN URLs
    cdn_patterns = ['amazonaws.com', 'cloudfront.net', 'fastly.net', 'digitaloceanspaces.com', 'libgen.org']
    html_lower = html.lower()  # hoisted: invariant across the loop
    for cdn in cdn_patterns:
        if cdn not in html_lower:
            continue
        cdn_urls = re.findall(
            r'https?://[^\s"\'<>]+' + re.escape(cdn) + r'[^\s"\'<>]*', html, re.IGNORECASE
        )
        if cdn_urls:
            print(f" 找到 CDN URL ({cdn}): {cdn_urls[0]}")
            return cdn_urls[0]

    # Try to find any download-related URLs
    download_links = re.findall(
        r'href=["\']([^"\']*(?:download|file|cdn)[^"\']*)["\']', html, re.IGNORECASE
    )
    if download_links:
        print(f" 找到下载相关链接: {download_links}")
        # hrefs are often relative; return something a caller can fetch.
        return urljoin(base_url, download_links[0])

    return None


def _watch_for_redirect(page, seconds=10):
    """Poll ``page.url`` once per second and report a usable redirect target.

    A changed URL is accepted if it ends in ``.epub`` or if fetching it
    returns HTTP 200 with a body starting with ``PK`` (the ZIP signature;
    EPUB files are ZIP archives).

    Args:
        page: Playwright page to monitor.
        seconds: Number of one-second polls.

    Returns:
        The accepted URL, or None if no usable redirect was observed.
    """
    start_url = page.url
    for i in range(seconds):
        time.sleep(1)
        current_url = page.url
        if current_url != start_url:
            print(f" 在第 {i+1} 秒 URL 变化: {current_url}")
            if current_url.lower().endswith('.epub'):
                return current_url
            # If URL changed to something else, try to download from it
            try:
                resp = requests.get(current_url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
            except requests.RequestException as exc:
                # Best effort: a failed probe must not abort the monitoring loop.
                print(f" 请求新 URL 失败: {exc}")
            else:
                if resp.status_code == 200 and resp.content[:2] == b'PK':
                    print(f" 从新 URL 下载成功: {len(resp.content)} 字节")
                    return current_url
        if (i + 1) % 5 == 0:
            print(f" {i+1}秒... URL: {current_url[:60]}...")
    return None


def test_download_when_ready():
    """Wait for DDoS-Guard to pass, then look for a download URL immediately.

    Opens SLOW_URL in headless Chromium, waits up to 120 seconds for the page
    title to stop being "DDoS-Guard", then searches the page HTML for a
    download URL. If none is found, prints the visible body text and watches
    the page URL for 10 seconds for an automatic redirect.

    The browser is closed on every exit path, including exceptions.

    Returns:
        The discovered URL as a string, or None if DDoS-Guard never passed
        or no URL could be found.
    """
    print(f"目标: {SLOW_URL}")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            print("步骤1: 导航到 slow_download...")
            page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")

            print("步骤2: 等待 DDoS-Guard 通过...")
            if not _wait_for_ddos_guard(page):
                print(" DDoS-Guard 未通过")
                return None

            # Immediately after DDoS passes, try to get the page content
            print("\n步骤3: DDoS 通过后立即获取内容...")

            # Wait a short time for content to load
            time.sleep(3)

            # Get the page HTML
            html = page.content()
            print(f" HTML 长度: {len(html)} 字符")

            found = _find_download_url(html, page.url)
            if found:
                return found

            # Print visible text to see what the page says
            print("\n 页面可见文本:")
            try:
                text = page.inner_text("body")
                print(text[:1000])
            except Exception as exc:
                # Diagnostic output only; report the failure and carry on.
                print(f" 无法读取页面文本: {exc}")

            # Check whether an automatic redirect happens by monitoring the URL
            print("\n步骤4: 监测 URL 变化 10 秒...")
            return _watch_for_redirect(page)
        finally:
            browser.close()
124
+
125
def main():
    """Print a banner, run the download-link probe, and report the outcome."""
    rule = "=" * 60
    for banner_line in (rule, "Anna's Archive 下载链接获取", rule, ""):
        print(banner_line)

    link = test_download_when_ready()

    # Guard clause: a falsy result means no link was found.
    if not link:
        print("\n未能获取到下载链接")
        return
    print(f"\n*** 成功获取到链接: {link} ***")
137
+
138
# Run the probe only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()