Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| Test: Use same browser context for search + slow_download to preserve cookies. | |
| Also try checking if we can extract final URL from search results directly. | |
| """ | |
| from playwright.sync_api import sync_playwright | |
| import time | |
| import re | |
# MD5 hash identifying the target file; every probe URL below is built from it.
MD5 = "d94c20d1364af9b484949659398c4062"
# slow_download page for that file. The trailing "/0/3" path segments are
# copied verbatim from the site URL; their meaning is not established here.
SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"
def test_same_context_cookies():
    """Visit the search page, then slow_download, in one browser context.

    The search page is loaded first so that any cookies it sets are sent
    along with the later slow_download request. The page is then polled
    once per second, for up to 45 seconds, until its URL points at an
    ``.epub`` file.

    Returns:
        The page URL once its path ends in ``.epub`` (case-insensitive),
        otherwise ``None`` (poll timed out or navigation failed).
    """
    # Local imports keep this function self-contained.
    from urllib.parse import urlsplit

    from playwright.sync_api import Error as PlaywrightError

    print("[测试] 在同一浏览器上下文中访问搜索页和slow_download...")
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            # First, visit the search page to get cookies.
            print(" 1. 访问搜索页面获取cookies...")
            try:
                page.goto(
                    "https://annas-archive.gl/search?q=Capitalism+A+Global+History",
                    timeout=60000,
                    wait_until="networkidle",
                )
                print(f" 搜索页标题: {page.title()}")
                cookies = context.cookies()
                print(f" 获取到 {len(cookies)} 个 cookies")
                for c in cookies:
                    print(f" {c['name']}: {c['value'][:30]}...")
            except Exception as e:
                # Best effort: still try slow_download without the cookies.
                print(f" 搜索页错误: {e}")

            # Now try slow_download in the same context.
            print("\n 2. 在同一上下文中访问slow_download...")
            try:
                response = page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")
                print(f" 状态码: {response.status if response else 'None'}")
                for i in range(45):
                    # Use Playwright's own timer instead of time.sleep():
                    # a blocking sleep stops the sync API from processing
                    # browser events, so page.url could be stale.
                    page.wait_for_timeout(1000)
                    url = page.url
                    # Compare the path only, so a query string or fragment
                    # after ".epub" does not hide a successful redirect.
                    if urlsplit(url).path.lower().endswith('.epub'):
                        print(f" 成功! 在第 {i+1} 秒 URL 变成 EPUB")
                        return url
                    try:
                        title = page.title()
                    except PlaywrightError:
                        # The page may be mid-navigation; skip the title
                        # for this tick rather than abort the whole poll.
                        title = None
                    if title is not None and title != "DDoS-Guard":
                        print(f" {i+1}秒: 标题={title}, URL={url[:60]}...")
                    if (i + 1) % 15 == 0:
                        print(f" {i+1}秒...")
            except Exception as e:
                print(f" slow_download 错误: {e}")
            return None
        finally:
            # Single cleanup point for every exit path, including the
            # early success return above.
            browser.close()
def test_extract_from_search_md5():
    """Probe alternative download endpoints built from the MD5.

    Navigates to a few guessed URL patterns and prints the HTTP status and
    the final URL for each, to see whether any of them leads to the file
    without going through slow_download.

    Returns:
        Always ``None``; the findings are only printed.
    """
    print("\n[测试] 分析MD5哈希看是否能直接构造最终URL...")
    # The MD5 is 32 hex characters = 128 bits; the site's file IDs might be
    # based on it, so check which other endpoints accept it.
    print(f" MD5: {MD5}")
    print(f" 可能的文件标识符: {MD5}")

    # Alternative URL patterns to try.
    test_urls = [
        f"https://annas-archive.gl/fast_download/{MD5}/0/3",
        f"https://annas-archive.gl/download/{MD5}",
        f"https://annas-archive.gl/get/{MD5}",
    ]

    # Launch the driver and browser once instead of once per URL.
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            for url in test_urls:
                print(f"\n 测试: {url}")
                # browser.new_page() puts the page in its own fresh context,
                # so each probe still starts without cookies from the others.
                page = browser.new_page()
                try:
                    response = page.goto(url, timeout=15000, wait_until="domcontentloaded")
                    print(f" 状态码: {response.status if response else 'None'}")
                    print(f" 最终URL: {page.url}")
                except Exception as e:
                    print(f" 错误: {str(e)[:50]}")
                finally:
                    page.close()
                # Brief pause between probes. No page is open at this point,
                # so a plain sleep cannot leave any page state stale.
                time.sleep(1)
        finally:
            browser.close()
    return None
def test_page_source_for_redirect():
    """Inspect the slow_download page source for redirect mechanisms.

    Loads SLOW_URL, then scans the HTML for meta-refresh tags, JavaScript
    redirect markers (``window.location``, ``location.href``,
    ``setTimeout``) and any ``.epub`` URLs, printing what it finds along
    with every line that mentions a redirect-related keyword.

    Returns:
        Always ``None``; the findings are only printed.
    """
    print("\n[测试] 检查slow_download页面源码中的重定向机制...")
    keywords = ('refresh', 'location', 'timeout', 'redirect', 'epub', 'download')
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()
            try:
                response = page.goto(SLOW_URL, timeout=30000, wait_until="domcontentloaded")
                print(f" 状态码: {response.status if response else 'None'}")

                # Get page content; lower-case it once for all marker checks.
                content = page.content()
                lowered = content.lower()
                print(f" 页面内容长度: {len(content)} 字符")

                # Look for meta refresh, javascript redirects, etc.
                if 'meta' in lowered:
                    meta_refresh = re.findall(
                        r'<meta[^>]*refresh[^>]*content=["\']([^"\']*)["\']',
                        content,
                        re.IGNORECASE,
                    )
                    if meta_refresh:
                        print(f" Meta Refresh: {meta_refresh}")
                if 'window.location' in lowered:
                    print(" 发现 window.location 重定向")
                if 'location.href' in lowered:
                    print(" 发现 location.href 重定向")
                # The needle must be lower-case too: a mixed-case
                # 'setTimeout' can never occur in lower-cased text.
                if 'settimeout' in lowered:
                    print(" 发现 setTimeout 定时器")

                # Look for any EPUB URLs in the page.
                urls = re.findall(r'https?://[^\s"\'<>]+\.epub[^\s"\'<>]*', content, re.IGNORECASE)
                if urls:
                    print(f" 找到EPUB URLs: {urls}")

                # Print the lines that mention a redirect-related keyword.
                print("\n 页面内容片段:")
                for line in content.split('\n'):
                    line_lowered = line.lower()
                    if any(keyword in line_lowered for keyword in keywords):
                        print(f" {line.strip()[:100]}")
            except Exception as e:
                print(f" 错误: {e}")
        finally:
            browser.close()
    return None
def main():
    """Run the slow_download probes in order, stopping early on success.

    Prints a header with the target MD5 and URL, then runs the
    same-context cookie test. If that yields a URL it is printed and
    returned immediately; otherwise the two diagnostic probes run and a
    closing banner is printed.

    Returns:
        The URL from test_same_context_cookies() when it is truthy,
        otherwise ``None``.
    """
    banner = "=" * 60

    print(banner)
    print("Anna's Archive slow_download 深度测试")
    print(banner)
    print("\n目标 MD5:", MD5)
    print("目标 URL:", SLOW_URL)

    # Probe 1: search page and slow_download sharing one cookie jar.
    epub_url = test_same_context_cookies()
    if epub_url:
        print(f"\n*** 成功! 链接: {epub_url} ***")
        return epub_url

    # Probes 2 and 3: guessed MD5 endpoints, then page-source inspection.
    for probe in (test_extract_from_search_md5, test_page_source_for_redirect):
        probe()

    print("\n" + banner)
    print("所有测试完成")
    print(banner)
# Run the probes only when executed as a script, not when imported.
if __name__ == "__main__":
    main()