ebook-executor / hf_backend /test_final_link.py
fromozu's picture
Upload hf_backend/test_final_link.py with huggingface_hub
1ff5a24 verified
#!/usr/bin/env python3
"""
Test using Playwright to capture the final EPUB URL from Anna's Archive.
Handle the navigation properly by monitoring URL changes.
"""
import html
import re
import time

from playwright.sync_api import sync_playwright
# Hex digest interpolated into the slow-download URL below
# (presumably the MD5 that identifies the target book — confirm).
MD5 = "d94c20d1364af9b484949659398c4062"
# Slow-download page that main() fetches.
# NOTE(review): the meaning of the trailing "/0/3" path segments is not visible in this file — confirm.
SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"
def get_page_content_when_ready(url, timeout=120):
    """Open *url* in headless Chromium and return its HTML once DDoS-Guard clears.

    After the initial navigation the page title is polled once per second.
    The page counts as ready as soon as the title is anything other than
    "DDoS-Guard".

    Args:
        url: Address to navigate to.
        timeout: Maximum number of one-second polls to wait for the
            interstitial to clear. The navigation itself uses a fixed
            120-second timeout that is independent of this value.

    Returns:
        The HTML returned by ``page.content()``, or ``None`` if the title is
        still "DDoS-Guard" after *timeout* polls.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        # One try/finally closes the browser on every exit path, including
        # exceptions raised by goto()/title()/content().
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()
            page.goto(url, timeout=120000, wait_until="domcontentloaded")

            # Wait for DDoS-Guard to pass.
            for i in range(timeout):
                time.sleep(1)
                title = page.title()
                if title != "DDoS-Guard":
                    print(f"DDoS-Guard 在第 {i+1} 秒通过")
                    print(f"页面标题: {title}")
                    break
            else:
                # Loop ran to completion without a break: still on the interstitial.
                print(f"{timeout}秒内 DDoS-Guard 未通过")
                return None

            # Small delay to let the real page finish rendering.
            time.sleep(2)
            return page.content()
        finally:
            browser.close()
def extract_epub_url(content):
    """Return the first plausible EPUB download URL found in *content*.

    Heuristics are tried in order and the first hit wins:

    1. A string containing ``.epub`` inside ``href=``, ``src=``, CSS
       ``url(...)`` or any quoted string (only the first five matches of each
       pattern are examined; ``data:`` URIs are skipped).
    2. The ``url=`` target of a ``<meta ... content=...>`` tag.
    3. A string assigned to ``window.location`` or ``location.href``.
    4. Any http(s) URL containing a known CDN name.

    Values captured from HTML attributes (``href``, ``src``, meta ``content``)
    are entity-decoded before being returned, because serialized HTML writes
    ``&`` as ``&amp;`` and the raw text would not be a usable URL. Values
    from script, CSS or unknown contexts are returned as found.

    Args:
        content: HTML text to scan; may be ``None`` or empty.

    Returns:
        The matched URL string, or ``None`` when nothing matches or *content*
        is falsy.
    """
    if not content:
        return None

    # (regex, captured value is an HTML attribute and needs entity decoding)
    epub_patterns = [
        (r'href=["\']([^"\']*\.epub[^"\']*)["\']', True),
        (r'src=["\']([^"\']*\.epub[^"\']*)["\']', True),
        (r'url\(["\']?([^"\')]*\.epub[^"\')]*)["\']?\)', False),
        (r'["\']([^"\']*\.epub[^"\']*)["\']', False),
    ]
    for pattern, is_attribute in epub_patterns:
        for match in re.findall(pattern, content, re.IGNORECASE)[:5]:
            if match and not match.startswith('data:'):
                url = html.unescape(match) if is_attribute else match
                print(f"找到 EPUB URL (pattern): {url[:100]}")
                return url

    # A meta refresh may redirect to the EPUB; its target lives in an attribute.
    meta_refresh = re.findall(
        r'<meta[^>]*content=["\']?[^"\']*url=([^"\'>\s]+)', content, re.IGNORECASE
    )
    if meta_refresh:
        print(f"找到 meta refresh: {meta_refresh}")
        return html.unescape(meta_refresh[0])

    # JavaScript redirects: script text is not entity-encoded, so return it raw.
    js_redirects = re.findall(r'window\.location\s*=\s*["\']([^"\']+)["\']', content)
    if js_redirects:
        print(f"找到 JS redirect: {js_redirects}")
        return js_redirects[0]
    js_redirects = re.findall(r'location\.href\s*=\s*["\']([^"\']+)["\']', content)
    if js_redirects:
        print(f"找到 location.href: {js_redirects}")
        return js_redirects[0]

    # Fall back to any URL that mentions a common CDN domain.
    cdn_patterns = ['amazonaws', 'cloudfront', 'fastly', 'azure', 'googleapis', 'gstatic']
    lowered = content.lower()  # computed once; used as a cheap pre-filter
    for cdn in cdn_patterns:
        if cdn in lowered:
            urls = re.findall(
                r'https?://[^\s"\'<>\)]+' + cdn + r'[^\s"\'<>\)]*', content, re.IGNORECASE
            )
            if urls:
                print(f"找到 CDN URL ({cdn}): {urls[0][:100]}")
                return urls[0]
    return None
def main():
    """Fetch the slow-download page and report what EPUB URL, if any, it exposes.

    Prints a banner, loads ``SLOW_URL``, lists keyword-matching links among
    the first 20 ``href`` values, then prints either the extracted EPUB URL
    or, failing that, up to 2000 characters of the page body with tags
    stripped.
    """
    rule = "=" * 60
    print(rule)
    print("获取 Anna's Archive 慢下载页面内容")
    print(rule)
    print(f"URL: {SLOW_URL}\n")

    page_html = get_page_content_when_ready(SLOW_URL, timeout=120)
    if not page_html:
        print("\n获取页面内容失败")
        return

    print(f"\n页面内容长度: {len(page_html)} 字符")

    # Collect every href value on the page.
    hrefs = re.findall(r'href=["\']([^"\']+)["\']', page_html)
    print(f"\n找到 {len(hrefs)} 个 href 链接")

    # Surface links among the first 20 that mention a download-related keyword.
    keywords = ('epub', 'download', 'file', 'cdn', 'amazonaws', 'cloudfront')
    for href in hrefs[:20]:
        lowered = href.lower()
        if any(word in lowered for word in keywords):
            print(f" 相关链接: {href[:100]}")

    print("\n提取 EPUB URL...")
    found = extract_epub_url(page_html)
    if found:
        print(f"\n*** 成功找到 EPUB URL ***")
        print(f"URL: {found}")
        return

    print("\n页面中未找到 EPUB URL")
    print("\n页面内容片段:")
    # Show the visible body text so a human can see what the page said instead.
    body = re.search(r'<body[^>]*>(.*?)</body>', page_html, re.DOTALL | re.IGNORECASE)
    if body is None:
        return
    text = re.sub(r'<[^>]+>', ' ', body.group(1))
    text = re.sub(r'\s+', ' ', text).strip()
    print(text[:2000])
# Run the probe only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()