Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

fromozu committed on May 4

Commit

ad2dba4

verified ·

1 Parent(s): bcd40c4

Upload hf_backend/test_annas_downloader.py with huggingface_hub

Browse files

Files changed (1) hide show

hf_backend/test_annas_downloader.py +163 -0

hf_backend/test_annas_downloader.py ADDED Viewed

	@@ -0,0 +1,163 @@

+#!/usr/bin/env python3
+"""
+Test the Anna's Archive download approach using Playwright.
+"""
+from playwright.sync_api import sync_playwright
+import time
+import re
+import requests
+# MD5 hex digest interpolated into the slow_download URL below
+# (presumably identifies the target book on the site -- confirm).
+MD5 = "d94c20d1364af9b484949659398c4062"
+# Page URL passed to download_with_playwright() by main().
+# NOTE(review): the meaning of the trailing "/0/3" path segments is not shown here.
+SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"
+def download_with_playwright(url, timeout=180):
+    """
+    Download from Anna's Archive slow_download URL using Playwright.
+    Waits for DDoS-Guard to pass, then downloads the EPUB.
+    Returns (filename, content) on success, (None, None) on failure.
+    """
+    print(f"使用 Playwright 下载: {url}")
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        context = browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
+        )
+        page = context.new_page()
+        # Enable download tracking
+        download_info = {"path": None, "url": None}
+        def on_download(download):
+            download_info["path"] = download.path
+            download_info["url"] = download.url
+            print(f"   检测到下载: {download.url}")
+        page.on("download", on_download)
+        print("   导航到 slow_download...")
+        page.goto(url, timeout=120000, wait_until="domcontentloaded")
+        # Wait for DDoS-Guard to pass
+        print("   等待 DDoS-Guard 通过 (最多 120 秒)...")
+        ddos_passed = False
+        for i in range(120):
+            time.sleep(1)
+            title = page.title()
+            if title != "DDoS-Guard":
+                print(f"   DDoS-Guard 在第 {i+1} 秒通过, 标题: {title}")
+                ddos_passed = True
+                break
+            if (i + 1) % 20 == 0:
+                print(f"   {i+1}秒...")
+        if not ddos_passed:
+            print("   DDoS-Guard 未通过, 放弃")
+            browser.close()
+            return None, None
+        # After DDoS passes, wait a bit for the page to render
+        time.sleep(3)
+        # Check if URL already changed to EPUB
+        current_url = page.url
+        if current_url.lower().endswith('.epub'):
+            print(f"   URL 已经变成 EPUB: {current_url}")
+            final_url = current_url
+        else:
+            # Wait for redirect to EPUB
+            print("   等待 URL 变成 EPUB...")
+            final_url = None
+            for i in range(60):
+                time.sleep(1)
+                current_url = page.url
+                if current_url.lower().endswith('.epub'):
+                    final_url = current_url
+                    print(f"   在第 {i+1} 秒 URL 变成 EPUB: {final_url}")
+                    break
+                if (i + 1) % 15 == 0:
+                    print(f"   {i+1}秒... URL: {current_url[:60]}...")
+        # If we have a download path, read the file
+        if download_info["path"]:
+            print(f"   从下载路径读取: {download_info['path']}")
+            with open(download_info["path"], "rb") as f:
+                content = f.read()
+            filename = current_url.split("/")[-1] if current_url else "downloaded.epub"
+            browser.close()
+            return filename, content
+        # If we found an EPUB URL, download using requests
+        if final_url:
+            print(f"   从最终 URL 下载: {final_url}")
+            try:
+                resp = requests.get(final_url, timeout=60, headers={
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+                })
+                resp.raise_for_status()
+                filename = final_url.split("/")[-1] or "downloaded.epub"
+                browser.close()
+                return filename, resp.content
+            except Exception as e:
+                print(f"   requests 下载失败: {e}")
+        # Try to extract URL from page content
+        print("   尝试从页面内容提取 EPUB URL...")
+        html = page.content()
+        # Look for EPUB URLs
+        epub_match = re.search(r'href=["\']([^"\']*\.epub[^"\']*)["\']', html, re.IGNORECASE)
+        if epub_match:
+            epub_url = epub_match.group(1)
+            print(f"   从 HTML 找到 EPUB URL: {epub_url}")
+            try:
+                resp = requests.get(epub_url, timeout=60, headers={
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+                })
+                resp.raise_for_status()
+                filename = epub_url.split("/")[-1] or "downloaded.epub"
+                browser.close()
+                return filename, resp.content
+            except Exception as e:
+                print(f"   EPUB URL 下载失败: {e}")
+        # Look for CDN URLs
+        for cdn in ['amazonaws.com', 'cloudfront.net', 'digitaloceanspaces.com']:
+            if cdn in html.lower():
+                cdn_match = re.search(r'https?://[^\s"\'<>]+' + cdn.replace('.', r'\.') + r'[^\s"\'<>]*', html, re.IGNORECASE)
+                if cdn_match:
+                    cdn_url = cdn_match.group(0)
+                    print(f"   从 HTML 找到 CDN URL: {cdn_url}")
+                    try:
+                        resp = requests.get(cdn_url, timeout=60, headers={
+                            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+                        })
+                        resp.raise_for_status()
+                        filename = cdn_url.split("/")[-1] or "downloaded.epub"
+                        browser.close()
+                        return filename, resp.content
+                    except Exception as e:
+                        print(f"   CDN URL 下载失败: {e}")
+        print("   未能获取到 EPUB")
+        browser.close()
+        return None, None
+def main():
+    print("=" * 60)
+    print("Anna's Archive Playwright 下载测试")
+    print("=" * 60)
+    filename, content = download_with_playwright(SLOW_URL, timeout=180)
+    if filename and content:
+        print(f"\n*** 成功! ***")
+        print(f"文件名: {filename}")
+        print(f"大小: {len(content)} 字节")
+        print(f"前 20 字节: {content[:20]}")
+    else:
+        print("\n下载失败")
+# Run the download test only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()