Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

fromozu committed on May 4

Commit

bcd40c4

verified ·

1 Parent(s): 6622127

Upload hf_backend/test_annas_download.py with huggingface_hub

Browse files

Files changed (1) hide show

hf_backend/test_annas_download.py +154 -0

hf_backend/test_annas_download.py ADDED Viewed

	@@ -0,0 +1,154 @@

+#!/usr/bin/env python3
+"""
+Test using Playwright to handle Anna's Archive slow_download redirect.
+Key insight: Use a VERY long timeout and see if DDoS-Guard eventually allows through.
+"""
+from playwright.sync_api import sync_playwright
+import time
+import re
+MD5 = "d94c20d1364af9b484949659398c4062"  # presumably the MD5 of the target book (verify); interpolated into SLOW_URL
+SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"  # page under test; NOTE(review): meaning of the trailing /0/3 segments is not shown in this file - confirm
+def download_with_playwright_slow(url, timeout=180):
+    """
+    Use Playwright to navigate to slow_download and wait for redirect.
+    Extended timeout to see if DDoS-Guard eventually allows access.
+    """
+    print(f"目标: {url}")
+    print(f"等待时间: 最多 {timeout} 秒")
+    print()
+    with sync_playwright() as p:
+        browser = p.chromium.launch(
+            headless=True,
+            args=['--disable-blink-features=AutomationDetect']
+        )
+        context = browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
+            viewport={"width": 1920, "height": 1080},
+            locale="en-US",
+            timezone_id="America/New_York",
+            extra_http_headers={
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.9",
+                "Accept-Encoding": "gzip, deflate, br",
+            }
+        )
+        # Try to hide webdriver
+        page = context.new_page()
+        page.add_init_script("""
+            Object.defineProperty(navigator, 'webdriver', {
+                get: () => undefined
+            });
+        """)
+        download_info = {"url": None, "content": None}
+        def on_download(download):
+            download_info["url"] = download.url
+            print(f"检测到下载事件!")
+            print(f"下载 URL: {download.url}")
+        page.on("download", on_download)
+        start_time = time.time()
+        try:
+            print("步骤1: 导航到 slow_download...")
+            response = page.goto(url, timeout=120000, wait_until="domcontentloaded")
+            elapsed = time.time() - start_time
+            print(f"   初始响应: 状态={response.status if response else 'None'}, "
+                  f"URL={page.url}, 耗时={elapsed:.1f}秒")
+            print(f"\n步骤2: 等待最多 {timeout} 秒让重定向完成...")
+            final_url = None
+            for i in range(timeout):
+                time.sleep(1)
+                elapsed = i + 1
+                current_url = page.url
+                current_title = page.title()
+                # Check if URL became EPUB
+                if current_url.lower().endswith('.epub'):
+                    final_url = current_url
+                    print(f"\n*** 在第 {elapsed} 秒检测到 EPUB URL! ***")
+                    print(f"    URL: {final_url}")
+                    break
+                # Check for download
+                if download_info["url"]:
+                    final_url = download_info["url"]
+                    break
+                # Check if we got redirected to a different domain (CDN)
+                if 'annas-archive' not in current_url and current_url.startswith('http'):
+                    final_url = current_url
+                    print(f"\n*** 在第 {elapsed} 秒检测到外部 URL! ***")
+                    print(f"    URL: {final_url}")
+                    break
+                # Progress report every 15 seconds
+                if elapsed % 15 == 0:
+                    print(f"    {elapsed}秒... 当前URL: {current_url[:60]}..., 标题: {current_title}")
+                # If title is not DDoS-Guard and URL changed, something happened
+                if current_title != "DDoS-Guard" and i > 5:
+                    if elapsed % 15 == 0:
+                        print(f"    [信息] 标题='{current_title}', 非 DDoS-Guard 页面")
+        except Exception as e:
+            elapsed = time.time() - start_time
+            print(f"\n异常: {e}")
+            print(f"发生时间: {elapsed:.1f}秒后")
+        print(f"\n最终 URL: {page.url}")
+        print(f"最终标题: {page.title()}")
+        # If we found a final URL, try to download the content
+        if final_url:
+            print(f"\n步骤3: 使用 requests 下载最终内容...")
+            import requests
+            try:
+                resp = requests.get(final_url, timeout=60, headers={
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+                })
+                print(f"    状态: {resp.status_code}")
+                print(f"    内容大小: {len(resp.content)} 字节")
+                print(f"    内容类型: {resp.headers.get('content-type', 'unknown')}")
+                if resp.content[:2] == b'PK':  # EPUB is a ZIP file
+                    print("    [OK] 内容是有效的 EPUB/ZIP格式")
+                    return final_url, resp.content
+                else:
+                    print("    [警告] 内容不是以 PK 开头")
+            except Exception as e:
+                print(f"    下载失败: {e}")
+        browser.close()
+    return None, None
+def main():
+    print("=" * 60)
+    print("Anna's Archive slow_download Playwright 长等待测试")
+    print("=" * 60)
+    url, content = download_with_playwright_slow(SLOW_URL, timeout=120)
+    if url:
+        print(f"\n" + "=" * 60)
+        print("成功!")
+        print(f"最终URL: {url}")
+        print(f"内容大小: {len(content)} 字节")
+        print("=" * 60)
+    else:
+        print("\n未能获取到最终链接")
+if __name__ == "__main__":  # run the test only when executed as a script, not on import
+    main()