Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| Test using Playwright to capture the final EPUB URL from Anna's Archive. | |
| Wait for the DDoS-Guard check to clear by polling the page title, then scan the rendered HTML for the link. | |
| """ | |
| from playwright.sync_api import sync_playwright | |
| import time | |
| import re | |
# Hash that selects the target file; it is embedded in the slow-download
# URL below (presumably the file's MD5 digest, going by the name).
MD5 = "d94c20d1364af9b484949659398c4062"

# Slow-download page that the script opens for the file above.
SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"
def get_page_content_when_ready(url, timeout=120):
    """Open *url* in headless Chromium and return its HTML once DDoS-Guard clears.

    The page title is polled once per second. As soon as it is anything other
    than ``"DDoS-Guard"`` the challenge is treated as passed, the page is given
    two more seconds to load, and its serialized HTML is returned.

    Args:
        url: Address to navigate to.
        timeout: Maximum number of one-second polls to wait for the
            DDoS-Guard title to go away.

    Returns:
        The page HTML as a string, or ``None`` when the title is still
        ``"DDoS-Guard"`` after ``timeout`` polls.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        # try/finally guarantees the browser is closed on every exit path,
        # including exceptions raised by goto()/title()/content().
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()
            # Navigation timeout is in milliseconds and is independent of the
            # `timeout` polling budget below.
            page.goto(url, timeout=120000, wait_until="domcontentloaded")

            # Wait for DDoS-Guard to pass.
            # page.wait_for_timeout() is used instead of time.sleep() so that
            # Playwright keeps processing browser events while we wait.
            # NOTE(review): page.title() may raise if a navigation is in
            # flight at the moment of the call -- confirm whether a retry is
            # wanted here.
            for i in range(timeout):
                page.wait_for_timeout(1000)
                title = page.title()
                if title != "DDoS-Guard":
                    print(f"DDoS-Guard 在第 {i+1} 秒通过")
                    print(f"页面标题: {title}")
                    break
            else:
                # Loop finished without a break: the challenge never cleared.
                print(f"{timeout}秒内 DDoS-Guard 未通过")
                return None

            # Small delay to let content load.
            page.wait_for_timeout(2000)
            return page.content()
        finally:
            browser.close()
def extract_epub_url(content):
    """Return the most likely EPUB download URL found in *content*, or None.

    Strategies are tried in order and the first hit wins:

    1. Links containing ``.epub`` in ``href=``, ``src=``, CSS ``url(...)`` or
       any quoted string (``data:`` URIs are skipped).
    2. The target of a ``<meta ... content="...url=...">`` tag.
    3. A ``window.location = "..."`` or ``location.href = "..."`` assignment.
    4. Any ``http(s)`` URL that mentions a well-known CDN name.

    Values taken from HTML attributes (``href``, ``src``, meta ``content``)
    are entity-decoded, so ``&amp;`` in a query string comes back as ``&``.

    Args:
        content: HTML text to scan; may be empty or ``None``.

    Returns:
        The extracted URL string, or ``None`` when nothing matched.
    """
    # Local import keeps this block self-contained; the module's top-level
    # imports do not include `html`.
    import html

    if not content:
        return None

    # (regex, value_comes_from_an_html_attribute)
    link_patterns = [
        (r'href=["\']([^"\']*\.epub[^"\']*)["\']', True),
        (r'src=["\']([^"\']*\.epub[^"\']*)["\']', True),
        (r'url\(["\']?([^"\')]*\.epub[^"\')]*)["\']?\)', False),
        (r'["\']([^"\']*\.epub[^"\']*)["\']', False),
    ]
    for pattern, from_attribute in link_patterns:
        # Scan every match: a page may list many inline data: URIs before
        # the first real link, so capping the search would miss it.
        for found in re.finditer(pattern, content, re.IGNORECASE):
            candidate = found.group(1)
            if candidate.startswith('data:'):
                continue
            if from_attribute:
                # Serialized HTML escapes '&' as '&amp;' inside attributes.
                candidate = html.unescape(candidate)
            print(f"找到 EPUB URL (pattern): {candidate[:100]}")
            return candidate

    # Look for meta refresh which might redirect to EPUB
    meta_refresh = re.findall(
        r'<meta[^>]*content=["\']?[^"\']*url=([^"\'>\s]+)', content, re.IGNORECASE
    )
    if meta_refresh:
        print(f"找到 meta refresh: {meta_refresh}")
        return html.unescape(meta_refresh[0])

    # Look for JavaScript redirects. Script text is not entity-escaped, so
    # these values are returned verbatim.
    js_patterns = (
        (r'window\.location\s*=\s*["\']([^"\']+)["\']', "JS redirect"),
        (r'location\.href\s*=\s*["\']([^"\']+)["\']', "location.href"),
    )
    for pattern, label in js_patterns:
        js_redirects = re.findall(pattern, content)
        if js_redirects:
            print(f"找到 {label}: {js_redirects}")
            return js_redirects[0]

    # Look for any URLs with common CDN domains
    cdn_patterns = ['amazonaws', 'cloudfront', 'fastly', 'azure', 'googleapis', 'gstatic']
    lowered = content.lower()  # hoisted: invariant across the loop
    for cdn in cdn_patterns:
        if cdn not in lowered:
            continue
        # '*' (not '+') before the CDN name so hosts that *start* with it,
        # e.g. https://fastly.net/..., are matched too.
        urls = re.findall(
            r'https?://[^\s"\'<>\)]*' + cdn + r'[^\s"\'<>\)]*', content, re.IGNORECASE
        )
        if urls:
            print(f"找到 CDN URL ({cdn}): {urls[0][:100]}")
            return urls[0]

    return None
def main():
    """Fetch the slow-download page and report any EPUB URL found in it.

    Prints a banner, loads ``SLOW_URL`` via ``get_page_content_when_ready``,
    lists keyword-matching links among the first 20 ``href`` values, then runs
    ``extract_epub_url``. When no URL is found, the first 2000 characters of
    the tag-stripped ``<body>`` text are printed instead.
    """
    banner = "=" * 60
    print(banner)
    print("获取 Anna's Archive 慢下载页面内容")
    print(banner)
    print(f"URL: {SLOW_URL}\n")

    content = get_page_content_when_ready(SLOW_URL, timeout=120)
    if not content:
        print("\n获取页面内容失败")
        return

    print(f"\n页面内容长度: {len(content)} 字符")

    # Collect every href value in the page.
    links = re.findall(r'href=["\']([^"\']+)["\']', content)
    print(f"\n找到 {len(links)} 个 href 链接")

    # Only the first 20 links are checked against the keyword list.
    keywords = ('epub', 'download', 'file', 'cdn', 'amazonaws', 'cloudfront')
    for link in links[:20]:
        lowered_link = link.lower()
        if any(keyword in lowered_link for keyword in keywords):
            print(f" 相关链接: {link[:100]}")

    print("\n提取 EPUB URL...")
    epub_url = extract_epub_url(content)
    if epub_url:
        print(f"\n*** 成功找到 EPUB URL ***")
        print(f"URL: {epub_url}")
        return

    print("\n页面中未找到 EPUB URL")
    print("\n页面内容片段:")

    # Fall back to showing the visible body text to aid debugging.
    body_match = re.search(r'<body[^>]*>(.*?)</body>', content, re.DOTALL | re.IGNORECASE)
    if body_match is None:
        return
    without_tags = re.sub(r'<[^>]+>', ' ', body_match.group(1))
    collapsed = re.sub(r'\s+', ' ', without_tags).strip()
    print(collapsed[:2000])


# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()