Spaces:
Paused
Paused
File size: 6,537 Bytes
#!/usr/bin/env python3
"""
Test: Use same browser context for search + slow_download to preserve cookies.
Also try checking if we can extract final URL from search results directly.
"""
from playwright.sync_api import sync_playwright
import time
import re
# MD5 hash (32 hex characters) of the file that the probes in this script target.
MD5 = "d94c20d1364af9b484949659398c4062"
# slow_download page URL built from that hash. The meaning of the trailing
# "/0/3" path segments is not visible in this file -- TODO confirm.
SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"
def _poll_for_epub_url(page, seconds=45):
    """Poll *page* once per second until its URL ends with ``.epub``.

    Args:
        page: An open Playwright sync-API page that has already been navigated.
        seconds: Maximum number of one-second polls before giving up.

    Returns:
        The page URL as soon as it ends with ``.epub`` (case-insensitive),
        or ``None`` if that never happens within *seconds* polls.
    """
    for elapsed in range(1, seconds + 1):
        # Wait through Playwright rather than time.sleep(): the sync API
        # documents that time.sleep() keeps its internal events from being
        # processed, so page.url could be stale when read afterwards.
        page.wait_for_timeout(1000)

        url = page.url
        if url.lower().endswith('.epub'):
            print(f" 成功! 在第 {elapsed} 秒 URL 变成 EPUB")
            return url

        # Reading the title can fail while a navigation is in flight; that
        # must not abort the remaining polls, so report it and keep going.
        try:
            title = page.title()
        except Exception as e:
            print(f" {elapsed}秒: 标题读取失败: {str(e)[:50]}")
            title = None

        if title is not None and title != "DDoS-Guard":
            print(f" {elapsed}秒: 标题={title}, URL={url[:60]}...")
        if elapsed % 15 == 0:
            # Periodic heartbeat so long waits still show progress.
            print(f" {elapsed}秒...")
    return None


def test_same_context_cookies():
    """Visit the search page, then slow_download, in one browser context.

    The search page is loaded first so that any cookies it sets are present
    in the context when SLOW_URL is requested. The slow_download page is then
    polled for up to 45 seconds, waiting for its URL to end with ``.epub``.
    A failure on the search page is printed and does not stop the second step.

    Returns:
        The final URL if it ends with ``.epub``; ``None`` on timeout or error.
    """
    print("[测试] 在同一浏览器上下文中访问搜索页和slow_download...")
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            # First, visit the search page to get cookies
            print(" 1. 访问搜索页面获取cookies...")
            try:
                page.goto("https://annas-archive.gl/search?q=Capitalism+A+Global+History", timeout=60000, wait_until="networkidle")
                print(f" 搜索页标题: {page.title()}")
                cookies = context.cookies()
                print(f" 获取到 {len(cookies)} 个 cookies")
                for c in cookies:
                    # Only a prefix of each value is shown to keep output short.
                    print(f" {c['name']}: {c['value'][:30]}...")
            except Exception as e:
                print(f" 搜索页错误: {e}")

            # Now try slow_download in the same context
            print("\n 2. 在同一上下文中访问slow_download...")
            try:
                response = page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")
                print(f" 状态码: {response.status if response else 'None'}")
                return _poll_for_epub_url(page)
            except Exception as e:
                print(f" slow_download 错误: {e}")
                return None
        finally:
            # Single cleanup point for every exit path (success, timeout, error).
            browser.close()
def test_extract_from_search_md5():
    """Probe alternative download endpoints built directly from the MD5.

    Each candidate URL is opened in its own freshly launched headless
    browser; the HTTP status and the URL the page ended up on are printed.
    Errors are printed (truncated to 50 characters) and do not stop the
    remaining candidates. Nothing is returned beyond ``None``.
    """
    print("\n[测试] 分析MD5哈希看是否能直接构造最终URL...")

    # The MD5 is 32 hex characters (128 bits); the site's file identifiers
    # might be based on it, so see which other endpoints accept it.
    print(f" MD5: {MD5}")
    print(f" 可能的文件标识符: {MD5}")

    # Alternative URL patterns to try.
    candidates = [
        f"https://annas-archive.gl/fast_download/{MD5}/0/3",
        f"https://annas-archive.gl/download/{MD5}",
        f"https://annas-archive.gl/get/{MD5}",
    ]

    for candidate in candidates:
        print(f"\n 测试: {candidate}")
        with sync_playwright() as playwright:
            chromium = playwright.chromium.launch(headless=True)
            tab = chromium.new_page()
            try:
                reply = tab.goto(candidate, timeout=15000, wait_until="domcontentloaded")
                status = reply.status if reply else 'None'
                print(f" 状态码: {status}")
                print(f" 最终URL: {tab.url}")
            except Exception as err:
                print(f" 错误: {str(err)[:50]}")
            chromium.close()
        # Brief pause between candidates.
        time.sleep(1)

    return None
def test_page_source_for_redirect():
    """Load the slow_download page and report redirect hints in its HTML.

    Prints the response status, the content length, any meta-refresh
    targets, whether common JavaScript redirect markers appear, any
    ``.epub`` URLs found, and every source line mentioning a
    redirect-related keyword (truncated to 100 characters).

    Returns:
        None. Findings are only printed; errors are caught and printed.
    """
    print("\n[测试] 检查slow_download页面源码中的重定向机制...")
    # Keywords that make a source line worth echoing in the excerpt below.
    keywords = ('refresh', 'location', 'timeout', 'redirect', 'epub', 'download')

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()
            try:
                response = page.goto(SLOW_URL, timeout=30000, wait_until="domcontentloaded")
                print(f" 状态码: {response.status if response else 'None'}")

                # Get page content
                content = page.content()
                print(f" 页面内容长度: {len(content)} 字符")

                # Lowercase once; every marker below must itself be lowercase.
                lowered = content.lower()

                # Look for meta refresh, javascript redirects, etc.
                if 'meta' in lowered:
                    meta_refresh = re.findall(r'<meta[^>]*refresh[^>]*content=["\']([^"\']*)["\']', content, re.IGNORECASE)
                    if meta_refresh:
                        print(f" Meta Refresh: {meta_refresh}")
                if 'window.location' in lowered:
                    print(" 发现 window.location 重定向")
                if 'location.href' in lowered:
                    print(" 发现 location.href 重定向")
                # The needle must be lowercase too: a mixed-case 'setTimeout'
                # can never occur in lowercased text, so it would never match.
                if 'settimeout' in lowered:
                    print(" 发现 setTimeout 定时器")

                # Look for any EPUB URLs in the page
                urls = re.findall(r'https?://[^\s"\'<>]+\.epub[^\s"\'<>]*', content, re.IGNORECASE)
                if urls:
                    print(f" 找到EPUB URLs: {urls}")

                # Print relevant parts of the page
                print("\n 页面内容片段:")
                for line in content.split('\n'):
                    lowered_line = line.lower()
                    if any(keyword in lowered_line for keyword in keywords):
                        print(f" {line.strip()[:100]}")
            except Exception as e:
                print(f" 错误: {e}")
        finally:
            # Close the browser on every exit path.
            browser.close()
    return None
def main():
    """Run the probes in order, stopping early if the first one succeeds.

    Returns:
        The EPUB URL from the same-context probe when it finds one.
        Otherwise the two diagnostic probes run and ``None`` is returned.
    """
    rule = "=" * 60
    print(rule)
    print("Anna's Archive slow_download 深度测试")
    print(rule)
    print("\n目标 MD5:", MD5)
    print("目标 URL:", SLOW_URL)

    # Probe 1: reuse cookies from the search page in the same context.
    epub_url = test_same_context_cookies()
    if epub_url:
        print(f"\n*** 成功! 链接: {epub_url} ***")
        return epub_url

    # Probe 2: alternative endpoints derived from the MD5.
    test_extract_from_search_md5()

    # Probe 3: inspect the page source for redirect mechanisms.
    test_page_source_for_redirect()

    print("\n" + rule)
    print("所有测试完成")
    print(rule)
# Run the probes only when executed as a script, not when imported.
if __name__ == "__main__":
    main()