#!/usr/bin/env python3
"""
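Probe script: use Playwright to load an Anna's Archive slow-download page and
look for the final EPUB URL.

The script waits for the DDoS-Guard interstitial to clear by polling the page
title, then scans the rendered HTML for candidate download links.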
"""

import html
import re
import time

from playwright.sync_api import sync_playwright

# Identifier of the item to fetch, embedded in the slow-download URL below.
# NOTE(review): presumably the file's MD5 hash on Anna's Archive -- confirm.
MD5 = "d94c20d1364af9b484949659398c4062"
# Slow-download page for that item.
# NOTE(review): the meaning of the trailing "/0/3" path segments is not
# visible in this file -- confirm before changing them.
SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"

def get_page_content_when_ready(url, timeout=120):
    """Load *url* in headless Chromium and return its HTML once DDoS-Guard clears.

    The page title is polled once per second; the page is treated as ready as
    soon as the title is anything other than ``"DDoS-Guard"``.

    Args:
        url: Address to navigate to.
        timeout: Maximum number of one-second polls to wait for the
            interstitial to disappear. The initial navigation has its own
            fixed 120-second limit.

    Returns:
        The serialized page HTML, or ``None`` if the title was still
        ``"DDoS-Guard"`` after *timeout* polls.

    Raises:
        Any Playwright error raised while launching, navigating or reading
        the page is propagated; the browser is closed first.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            page.goto(url, timeout=120000, wait_until="domcontentloaded")

            # Poll the title until the DDoS-Guard interstitial is gone.
            for i in range(timeout):
                time.sleep(1)
                title = page.title()
                if title != "DDoS-Guard":
                    print(f"DDoS-Guard 在第 {i+1} 秒通过")
                    print(f"页面标题: {title}")
                    break
            else:
                # Loop ran to completion without ever seeing a different title.
                print(f"{timeout}秒内 DDoS-Guard 未通过")
                return None

            # Short grace period so late-loading content lands in the DOM.
            time.sleep(2)

            return page.content()
        finally:
            # Close on every path, including exceptions from goto/title/content.
            browser.close()

def extract_epub_url(content):
    """Return the most likely EPUB download URL found in an HTML document.

    Strategies are tried in order and the first hit wins:

    1. A quoted string containing ``.epub`` in an ``href`` attribute, a
       ``src`` attribute, a CSS ``url(...)`` or any quoted string. Only the
       first five matches of each pattern are considered and ``data:`` URIs
       are skipped.
    2. The target of a ``<meta>`` refresh.
    3. A ``window.location = "..."`` or ``location.href = "..."`` assignment.
    4. The first URL containing a known CDN host fragment.

    URLs taken from HTML attributes (``href``, ``src``, meta refresh) are
    entity-decoded before being returned, because serialized HTML stores
    ``&`` as ``&amp;`` and the escaped form is not a usable URL.

    Args:
        content: HTML text to scan; may be ``None`` or empty.

    Returns:
        The URL string, or ``None`` when nothing matched or *content* is
        empty.
    """
    if not content:
        return None

    # Each entry: (regex, whether the capture comes from an HTML attribute
    # and therefore needs entity decoding).
    epub_patterns = [
        (r'href=["\']([^"\']*\.epub[^"\']*)["\']', True),
        (r'src=["\']([^"\']*\.epub[^"\']*)["\']', True),
        (r'url\(["\']?([^"\')]*\.epub[^"\')]*)["\']?\)', False),
        (r'["\']([^"\']*\.epub[^"\']*)["\']', False),
    ]

    for pattern, from_attribute in epub_patterns:
        matches = re.findall(pattern, content, re.IGNORECASE)
        for match in matches[:5]:
            if match and not match.startswith('data:'):
                url = html.unescape(match) if from_attribute else match
                print(f"找到 EPUB URL (pattern): {url[:100]}")
                return url

    # A meta refresh may redirect to the EPUB.
    meta_refresh = re.findall(r'<meta[^>]*content=["\']?[^"\']*url=([^"\'>\s]+)', content, re.IGNORECASE)
    if meta_refresh:
        print(f"找到 meta refresh: {meta_refresh}")
        # The target sits inside an attribute value, so decode entities.
        return html.unescape(meta_refresh[0])

    # JavaScript redirects; script text is not entity-escaped, so the
    # captured value is returned as-is.
    js_redirects = re.findall(r'window\.location\s*=\s*["\']([^"\']+)["\']', content)
    if js_redirects:
        print(f"找到 JS redirect: {js_redirects}")
        return js_redirects[0]

    js_redirects = re.findall(r'location\.href\s*=\s*["\']([^"\']+)["\']', content)
    if js_redirects:
        print(f"找到 location.href: {js_redirects}")
        return js_redirects[0]

    # Last resort: any URL mentioning a common CDN host fragment.
    cdn_patterns = ['amazonaws', 'cloudfront', 'fastly', 'azure', 'googleapis', 'gstatic']
    lowered = content.lower()  # computed once instead of once per CDN name
    for cdn in cdn_patterns:
        if cdn in lowered:
            urls = re.findall(r'https?://[^\s"\'<>\)]+' + cdn + r'[^\s"\'<>\)]*', content, re.IGNORECASE)
            if urls:
                print(f"找到 CDN URL ({cdn}): {urls[0][:100]}")
                return urls[0]

    return None

def main():
    """Fetch the slow-download page, list notable links and report the EPUB URL.

    All results are printed to stdout; nothing is returned. When no EPUB URL
    is found, up to 2000 characters of the page's tag-stripped body text are
    printed instead to help with debugging.
    """
    banner = "=" * 60
    print(banner)
    print("获取 Anna's Archive 慢下载页面内容")
    print(banner)
    print(f"URL: {SLOW_URL}\n")

    page_html = get_page_content_when_ready(SLOW_URL, timeout=120)
    if not page_html:
        print("\n获取页面内容失败")
        return

    print(f"\n页面内容长度: {len(page_html)} 字符")

    # Collect every href value on the page.
    hrefs = re.findall(r'href=["\']([^"\']+)["\']', page_html)
    print(f"\n找到 {len(hrefs)} 个 href 链接")

    # Among the first 20 links, show the ones that mention a download-ish keyword.
    keywords = ('epub', 'download', 'file', 'cdn', 'amazonaws', 'cloudfront')
    for href in hrefs[:20]:
        href_lower = href.lower()
        for keyword in keywords:
            if keyword in href_lower:
                print(f"  相关链接: {href[:100]}")
                break

    print("\n提取 EPUB URL...")
    found_url = extract_epub_url(page_html)
    if found_url:
        print(f"\n*** 成功找到 EPUB URL ***")
        print(f"URL: {found_url}")
        return

    print("\n页面中未找到 EPUB URL")
    print("\n页面内容片段:")
    # Fall back to dumping the visible body text.
    body = re.search(r'<body[^>]*>(.*?)</body>', page_html, re.DOTALL | re.IGNORECASE)
    if body is None:
        return
    stripped = re.sub(r'<[^>]+>', ' ', body.group(1))
    collapsed = re.sub(r'\s+', ' ', stripped).strip()
    print(collapsed[:2000])

# Run the probe only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()