fromozu committed on
Commit
fcaef18
·
verified ·
1 Parent(s): 0fc00c7

Upload hf_backend/test_same_context.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_backend/test_same_context.py +181 -0
hf_backend/test_same_context.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test: Use same browser context for search + slow_download to preserve cookies.
4
+ Also try checking if we can extract final URL from search results directly.
5
+ """
6
+
7
+ from playwright.sync_api import sync_playwright
8
+ import time
9
+ import re
10
+
11
# MD5 hash of the target file; every URL probed by this script is built from it.
MD5 = "d94c20d1364af9b484949659398c4062"

# slow_download endpoint for that file.
# NOTE(review): the meaning of the trailing "/0/3" path segments is not
# visible in this file -- confirm against the site before changing them.
SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"
13
+
14
def test_same_context_cookies():
    """Visit the search page, then SLOW_URL, inside one browser context.

    The search page is loaded first so that any cookies it sets are present
    when SLOW_URL is requested from the same context. After navigating to
    SLOW_URL the page URL is polled once per second for up to 45 seconds.

    Returns:
        The page URL as soon as its path ends with ".epub" (case-insensitive),
        otherwise None (timeout or navigation error).
    """
    # Local import so this block does not depend on the file-level imports.
    from urllib.parse import urlsplit

    print("[测试] 在同一浏览器上下文中访问搜索页和slow_download...")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            # Step 1: load the search page and list the cookies it produced.
            # Best effort: a failure is reported and step 2 still runs.
            print(" 1. 访问搜索页面获取cookies...")
            try:
                page.goto(
                    "https://annas-archive.gl/search?q=Capitalism+A+Global+History",
                    timeout=60000,
                    wait_until="networkidle",
                )
                print(f" 搜索页标题: {page.title()}")
                cookies = context.cookies()
                print(f" 获取到 {len(cookies)} 个 cookies")
                for cookie in cookies:
                    # Only a 30-character prefix of each value is printed.
                    print(f" {cookie['name']}: {cookie['value'][:30]}...")
            except Exception as e:
                print(f" 搜索页错误: {e}")

            # Step 2: request slow_download from the same page/context.
            print("\n 2. 在同一上下文中访问slow_download...")
            try:
                response = page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")
                print(f" 状态码: {response.status if response else 'None'}")

                for second in range(1, 46):
                    time.sleep(1)
                    url = page.url

                    # Compare the URL *path* so a query string or fragment
                    # after ".epub" does not hide a successful redirect.
                    if urlsplit(url).path.lower().endswith('.epub'):
                        print(f" 成功! 在第 {second} 秒 URL 变成 EPUB")
                        return url

                    # NOTE(review): page.title() is expected to be able to
                    # fail while the page is mid-navigation -- confirm. A
                    # failed read must not abort the remaining polling time.
                    try:
                        title = page.title()
                    except Exception:
                        title = None

                    if title is not None and title != "DDoS-Guard":
                        print(f" {second}秒: 标题={title}, URL={url[:60]}...")

                    # Heartbeat every 15 seconds.
                    if second % 15 == 0:
                        print(f" {second}秒...")

            except Exception as e:
                print(f" slow_download 错误: {e}")
        finally:
            # Single cleanup point for the success, timeout and error paths.
            browser.close()

    return None
64
+
65
def test_extract_from_search_md5():
    """Probe alternative download URL patterns built from MD5.

    Each candidate URL is opened in a fresh page and the HTTP status plus the
    URL the page ended up on are printed. Nothing is extracted or returned;
    the output is meant to be read by a human.

    Returns:
        Always None.
    """
    print("\n[测试] 分析MD5哈希看是否能直接构造最终URL...")

    # The MD5 is 32 hex characters (128 bits); the site's file identifiers
    # might be based on it.
    print(f" MD5: {MD5}")
    print(f" 可能的文件标识符: {MD5}")

    # Candidate endpoints to try in addition to slow_download.
    test_urls = [
        f"https://annas-archive.gl/fast_download/{MD5}/0/3",
        f"https://annas-archive.gl/download/{MD5}",
        f"https://annas-archive.gl/get/{MD5}",
    ]

    # One Playwright driver and one browser process serve all probes; only
    # the page is recreated per URL.
    # NOTE(review): relies on browser.new_page() giving each page its own
    # fresh context so probes stay isolated -- see the Playwright docs.
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            for url in test_urls:
                print(f"\n 测试: {url}")
                page = browser.new_page()
                try:
                    response = page.goto(url, timeout=15000, wait_until="domcontentloaded")
                    print(f" 状态码: {response.status if response else 'None'}")
                    print(f" 最终URL: {page.url}")
                except Exception as e:
                    # Keep the report short; only the first 50 characters.
                    print(f" 错误: {str(e)[:50]}")
                finally:
                    page.close()
                # Brief pause between probes.
                time.sleep(1)
        finally:
            browser.close()

    return None
102
+
103
def test_page_source_for_redirect():
    """Load SLOW_URL and print hints about how the page redirects.

    The rendered page source is scanned for meta-refresh tags,
    ``window.location`` / ``location.href`` / ``setTimeout`` usage and any
    absolute URL containing ".epub". Lines mentioning redirect-related
    keywords are echoed, truncated to 100 characters each.

    Returns:
        Always None; all findings are printed.
    """
    print("\n[测试] 检查slow_download页面源码中的重定向机制...")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            try:
                response = page.goto(SLOW_URL, timeout=30000, wait_until="domcontentloaded")
                print(f" 状态码: {response.status if response else 'None'}")

                content = page.content()
                print(f" 页面内容长度: {len(content)} 字符")

                # Lowercase once; every substring probe below is
                # case-insensitive and must therefore use a lowercase needle.
                lowered = content.lower()

                if 'meta' in lowered:
                    meta_refresh = re.findall(r'<meta[^>]*refresh[^>]*content=["\']([^"\']*)["\']', content, re.IGNORECASE)
                    if meta_refresh:
                        print(f" Meta Refresh: {meta_refresh}")

                if 'window.location' in lowered:
                    print(" 发现 window.location 重定向")

                if 'location.href' in lowered:
                    print(" 发现 location.href 重定向")

                # The needle has to be lowercase: a mixed-case 'setTimeout'
                # can never occur in lowercased text.
                if 'settimeout' in lowered:
                    print(" 发现 setTimeout 定时器")

                # Any absolute URL that contains ".epub".
                urls = re.findall(r'https?://[^\s"\'<>]+\.epub[^\s"\'<>]*', content, re.IGNORECASE)
                if urls:
                    print(f" 找到EPUB URLs: {urls}")

                # Echo the lines that mention redirect-related keywords.
                print("\n 页面内容片段:")
                keywords = ('refresh', 'location', 'timeout', 'redirect', 'epub', 'download')
                for line in content.split('\n'):
                    line_lower = line.lower()
                    if any(keyword in line_lower for keyword in keywords):
                        print(f" {line.strip()[:100]}")

            except Exception as e:
                print(f" 错误: {e}")
        finally:
            browser.close()

    return None
155
+
156
def main():
    """Run the three slow_download experiments in order.

    The same-context cookie test runs first; if it yields a link, that link
    is printed and returned immediately and the other two tests are skipped.
    Otherwise the URL-pattern probe and the page-source inspection run and a
    closing banner is printed.
    """
    rule = "=" * 60

    print(rule)
    print("Anna's Archive slow_download 深度测试")
    print(rule)

    print("\n目标 MD5:", MD5)
    print("目标 URL:", SLOW_URL)

    # Test 1: search page and slow_download in one browser context.
    link = test_same_context_cookies()
    if link:
        print(f"\n*** 成功! 链接: {link} ***")
        return link

    # Test 2: alternative URL patterns built from the MD5.
    test_extract_from_search_md5()

    # Test 3: inspect the slow_download page source.
    test_page_source_for_redirect()

    print("\n" + rule)
    print("所有测试完成")
    print(rule)


if __name__ == "__main__":
    main()