Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

fromozu committed on May 4

Commit

45aefea

verified ·

1 Parent(s): 9c5225e

Upload hf_backend/test_download_when_ready.py with huggingface_hub

Browse files

Files changed (1) hide show

hf_backend/test_download_when_ready.py +139 -0

hf_backend/test_download_when_ready.py ADDED Viewed

	@@ -0,0 +1,139 @@

+#!/usr/bin/env python3
+"""
+Test: Wait for DDoS-Guard to pass, then IMMEDIATELY try to get the EPUB URL.
+Don't wait too long after the title changes - the redirect might happen quickly.
+"""
+from playwright.sync_api import sync_playwright
+import time
+import re
+import requests
+# MD5 hex digest that identifies the target file inside the download URL.
+MD5 = "d94c20d1364af9b484949659398c4062"
+# Slow-download page for that MD5. The trailing "/0/3" path segments are
+# presumably site-specific selectors (e.g. a server index) — TODO confirm.
+SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"
+def test_download_when_ready():
+    """Wait for DDoS-Guard to pass, then download immediately."""
+    print(f"目标: {SLOW_URL}")
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        context = browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
+        )
+        page = context.new_page()
+        print("步骤1: 导航到 slow_download...")
+        page.goto(SLOW_URL, timeout=120000, wait_until="domcontentloaded")
+        print("步骤2: 等待 DDoS-Guard 通过...")
+        ddos_passed = False
+        for i in range(120):
+            time.sleep(1)
+            title = page.title()
+            if title != "DDoS-Guard":
+                print(f"   在第 {i+1} 秒 DDoS-Guard 通过! 标题: {title}")
+                ddos_passed = True
+                break
+            if (i + 1) % 15 == 0:
+                print(f"   {i+1}秒... 仍在等待...")
+        if not ddos_passed:
+            print("   DDoS-Guard 未通过")
+            browser.close()
+            return None
+        # Immediately after DDoS passes, try to get the page content
+        print("\n步骤3: DDoS 通过后立即获取内容...")
+        # Wait a short time for content to load
+        time.sleep(3)
+        # Get the page HTML
+        html = page.content()
+        print(f"   HTML 长度: {len(html)} 字符")
+        # Look for EPUB URLs in the page
+        epub_urls = re.findall(r'https?://[^\s"\'<>]+\.epub[^\s"\'<>]*', html, re.IGNORECASE)
+        if epub_urls:
+            print(f"   找到 EPUB URLs: {epub_urls}")
+            return epub_urls[0]
+        # Look for meta refresh
+        meta_refresh = re.findall(r'<meta[^>]*content=["\']?\d+;url=([^"\'>\s]+)', html, re.IGNORECASE)
+        if meta_refresh:
+            print(f"   找到 meta refresh: {meta_refresh}")
+            return meta_refresh[0]
+        # Look for JavaScript redirects
+        js_redirects = re.findall(r'(?:window\.)?location\s*=\s*["\']([^"\']+)["\']', html)
+        if js_redirects:
+            print(f"   找到 JS redirect: {js_redirects}")
+            # Try to follow this redirect
+            redirect_url = js_redirects[0]
+            if not redirect_url.startswith('http'):
+                redirect_url = "https://annas-archive.gl" + redirect_url
+            return redirect_url
+        # Look for CDN URLs
+        cdn_patterns = ['amazonaws.com', 'cloudfront.net', 'fastly.net', 'digitaloceanspaces.com', 'libgen.org']
+        for cdn in cdn_patterns:
+            if cdn in html.lower():
+                cdn_urls = re.findall(r'https?://[^\s"\'<>]+' + cdn.replace('.', r'\.') + r'[^\s"\'<>]*', html, re.IGNORECASE)
+                if cdn_urls:
+                    print(f"   找到 CDN URL ({cdn}): {cdn_urls[0]}")
+                    return cdn_urls[0]
+        # Try to find any download-related URLs
+        download_links = re.findall(r'href=["\']([^"\']*(?:download|file|cdn)[^"\']*)["\']', html, re.IGNORECASE)
+        if download_links:
+            print(f"   找到下载相关链接: {download_links}")
+            return download_links[0]
+        # Print visible text to see what the page says
+        print("\n   页面可见文本:")
+        try:
+            text = page.inner_text("body")
+            print(text[:1000])
+        except:
+            pass
+        # Try to check if there's an automatic redirect happening by monitoring URL
+        print("\n步骤4: 监测 URL 变化 10 秒...")
+        start_url = page.url
+        for i in range(10):
+            time.sleep(1)
+            current_url = page.url
+            if current_url != start_url:
+                print(f"   在第 {i+1} 秒 URL 变化: {current_url}")
+                if current_url.lower().endswith('.epub'):
+                    return current_url
+                # If URL changed to something else, try to download from it
+                try:
+                    resp = requests.get(current_url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
+                    if resp.status_code == 200 and resp.content[:2] == b'PK':
+                        print(f"   从新 URL 下载成功: {len(resp.content)} 字节")
+                        return current_url
+                except:
+                    pass
+            if (i + 1) % 5 == 0:
+                print(f"   {i+1}秒... URL: {current_url[:60]}...")
+        browser.close()
+        return None
+def main():
+    print("=" * 60)
+    print("Anna's Archive 下载链接获取")
+    print("=" * 60)
+    print()
+    result = test_download_when_ready()
+    if result:
+        print(f"\n*** 成功获取到链接: {result} ***")
+    else:
+        print("\n未能获取到下载链接")
+# Run the probe only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()