#!/usr/bin/env python3
"""
Test using Playwright to capture the final EPUB URL from Anna's Archive.

The slow-download page is opened in headless Chromium, the script waits until
the DDoS-Guard interstitial is gone (detected through the page title), and the
resulting HTML is scanned with regular expressions for a download URL.
"""

import re
import time
from typing import Optional

MD5 = "d94c20d1364af9b484949659398c4062"
SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"

# Desktop Chrome user agent sent with every request of the browser context.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
)

# Page title shown while the DDoS-Guard interstitial is still active.
DDOS_GUARD_TITLE = "DDoS-Guard"

# Tried in order; the last pattern is a catch-all for any quoted ".epub" string.
_EPUB_PATTERNS = (
    r'href=["\']([^"\']*\.epub[^"\']*)["\']',
    r'src=["\']([^"\']*\.epub[^"\']*)["\']',
    r'url\(["\']?([^"\')]*\.epub[^"\')]*)["\']?\)',
    r'["\']([^"\']*\.epub[^"\']*)["\']',
)

# (log label, pattern) pairs for JavaScript-driven redirects, tried in order.
_JS_REDIRECT_PATTERNS = (
    ("JS redirect", r'window\.location\s*=\s*["\']([^"\']+)["\']'),
    ("location.href", r'location\.href\s*=\s*["\']([^"\']+)["\']'),
)

# Substrings that identify URLs hosted on common CDNs, tried in order.
_CDN_KEYWORDS = ('amazonaws', 'cloudfront', 'fastly', 'azure', 'googleapis', 'gstatic')

# Substrings that make an href worth printing in main()'s link overview.
_INTERESTING_LINK_KEYWORDS = ('epub', 'download', 'file', 'cdn', 'amazonaws', 'cloudfront')


def get_page_content_when_ready(url, timeout=120):
    """Open *url* in headless Chromium and return its HTML once DDoS-Guard is gone.

    Args:
        url: Page to load.
        timeout: Budget in seconds. It is used both as the navigation timeout
            and as the number of one-second polls of the page title.

    Returns:
        The page HTML as a string, or ``None`` if the title was still
        "DDoS-Guard" after *timeout* polls.
    """
    # Imported here so the regex helpers in this module stay importable (and
    # testable) on machines without Playwright installed.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(user_agent=USER_AGENT)
            page = context.new_page()
            # Playwright expects milliseconds; honour the caller's budget
            # instead of a hard-coded value.
            page.goto(url, timeout=timeout * 1000, wait_until="domcontentloaded")

            # Wait for DDoS-Guard to pass: poll the title once per second.
            for i in range(timeout):
                time.sleep(1)
                title = page.title()
                if title != DDOS_GUARD_TITLE:
                    print(f"DDoS-Guard 在第 {i+1} 秒通过")
                    print(f"页面标题: {title}")
                    break
            else:
                print(f"{timeout}秒内 DDoS-Guard 未通过")
                return None

            # Small delay to let content load
            time.sleep(2)

            return page.content()
        finally:
            # Always release the browser, including when navigation raises.
            browser.close()


def extract_epub_url(content):
    """Extract a candidate EPUB/download URL from page HTML.

    Strategies are tried in this order and the first hit wins:

    1. A quoted string containing ``.epub`` (``href``, ``src``, CSS ``url()``,
       then any quoted string); only the first five matches of each pattern
       are considered and ``data:`` URIs are skipped.
    2. The ``url=`` target of a ``<meta ... content=...>`` refresh tag.
    3. A JavaScript ``window.location = "..."`` or ``location.href = "..."``
       assignment.
    4. Any http(s) URL containing a known CDN keyword.

    Args:
        content: HTML text; ``None`` or an empty string is accepted.

    Returns:
        The matched URL string exactly as it appears in *content*, or ``None``
        if nothing matched.
    """
    if not content:
        return None

    # Look for direct EPUB links
    for pattern in _EPUB_PATTERNS:
        matches = re.findall(pattern, content, re.IGNORECASE)
        for match in matches[:5]:
            if match and not match.startswith('data:'):
                print(f"找到 EPUB URL (pattern): {match[:100]}")
                return match

    # Look for meta refresh which might redirect to EPUB. Anchoring on "<meta"
    # keeps unrelated attributes that merely end in "content=" from matching.
    meta_refresh = re.findall(
        r'<meta[^>]*content=["\']?[^"\']*url=([^"\'>\s]+)',
        content,
        re.IGNORECASE,
    )
    if meta_refresh:
        print(f"找到 meta refresh: {meta_refresh}")
        return meta_refresh[0]

    # Look for JavaScript redirects
    for label, pattern in _JS_REDIRECT_PATTERNS:
        js_redirects = re.findall(pattern, content)
        if js_redirects:
            print(f"找到 {label}: {js_redirects}")
            return js_redirects[0]

    # Look for any URLs with common CDN domains
    lowered = content.lower()  # computed once for the cheap substring pre-check
    for cdn in _CDN_KEYWORDS:
        if cdn not in lowered:
            continue
        urls = re.findall(
            r'https?://[^\s"\'<>\)]+' + re.escape(cdn) + r'[^\s"\'<>\)]*',
            content,
            re.IGNORECASE,
        )
        if urls:
            print(f"找到 CDN URL ({cdn}): {urls[0][:100]}")
            return urls[0]

    return None


def extract_body_text(content) -> Optional[str]:
    """Return the text inside ``<body>...</body>`` with tags removed.

    Tags are replaced by spaces and runs of whitespace are collapsed to one
    space. Returns ``None`` when *content* has no ``<body>`` element.
    """
    body_match = re.search(
        r'<body[^>]*>(.*?)</body>', content, re.DOTALL | re.IGNORECASE
    )
    if not body_match:
        return None
    body_text = re.sub(r'<[^>]+>', ' ', body_match.group(1))
    return re.sub(r'\s+', ' ', body_text).strip()


def main():
    """Fetch the slow-download page and report any EPUB URL found in it."""
    print("=" * 60)
    print("获取 Anna's Archive 慢下载页面内容")
    print("=" * 60)
    print(f"URL: {SLOW_URL}\n")

    content = get_page_content_when_ready(SLOW_URL, timeout=120)
    if not content:
        print("\n获取页面内容失败")
        return

    print(f"\n页面内容长度: {len(content)} 字符")

    # Find all links
    links = re.findall(r'href=["\']([^"\']+)["\']', content)
    print(f"\n找到 {len(links)} 个 href 链接")

    # Look for any interesting links (only the first 20 hrefs are inspected)
    for link in links[:20]:
        lowered_link = link.lower()
        if any(keyword in lowered_link for keyword in _INTERESTING_LINK_KEYWORDS):
            print(f"  相关链接: {link[:100]}")

    # Extract EPUB URL
    print("\n提取 EPUB URL...")
    epub_url = extract_epub_url(content)

    if epub_url:
        print("\n*** 成功找到 EPUB URL ***")
        print(f"URL: {epub_url}")
        return

    print("\n页面中未找到 EPUB URL")
    print("\n页面内容片段:")
    # Print body text so a failed run shows what the page actually contained.
    body_text = extract_body_text(content)
    if body_text is not None:
        print(body_text[:2000])


if __name__ == "__main__":
    main()