Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

fromozu committed on May 4

Commit

216f552

verified ·

1 Parent(s): 9d7a623

Upload hf_backend/test_md5_page.py with huggingface_hub

Browse files

Files changed (1) hide show

hf_backend/test_md5_page.py +77 -0

hf_backend/test_md5_page.py ADDED Viewed

	@@ -0,0 +1,77 @@

+#!/usr/bin/env python3
+"""
+从 md5 页面提取文件信息，看看有没有其他下载方式
+"""
+from playwright.sync_api import sync_playwright
+import time
+import re
+# MD5 hash identifying the target record; interpolated into the page URL below.
+MD5 = "d94c20d1364af9b484949659398c4062"
+# Address of the md5 detail page that get_md5_page_info() opens.
+MD5_URL = f"https://annas-archive.gl/md5/{MD5}"
+def get_md5_page_info():
+    """获取 md5 页面的所有信息"""
+    print(f"目标: {MD5_URL}\n")
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        context = browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
+        )
+        page = context.new_page()
+        print("访问 md5 页面...")
+        page.goto(MD5_URL, timeout=60000, wait_until="domcontentloaded")
+        page.wait_for_timeout(3000)
+        print(f"页面标题: {page.title()}")
+        # 获取所有文本内容
+        content = page.content()
+        print(f"HTML长度: {len(content)}")
+        # 查找所有链接
+        links = page.query_selector_all("a[href]")
+        print(f"找到 {len(links)} 个链接")
+        # 查找包含 epub 或 download 的链接
+        print("\n相关链接:")
+        for link in links:
+            href = link.get_attribute("href")
+            if href and any(k in href.lower() for k in ['epub', 'download', 'filepath', 'zlib', 'libgen']):
+                text = link.inner_text().strip()
+                print(f"  {href[:80]} - {text[:30]}")
+        # 查找 JavaScript 中的数据
+        print("\n查找 JavaScript 中的文件信息...")
+        scripts = page.query_selector_all("script")
+        for script in scripts:
+            text = script.inner_text()
+            if 'filepath' in text or 'zlib' in text or 'no-category' in text:
+                print(f"找到相关脚本内容:")
+                # 提取相关部分
+                if 'filepath' in text:
+                    filepath_match = re.search(r'filepath[=:]["\']([^"\']+)["\']', text)
+                    if filepath_match:
+                        print(f"  filepath: {filepath_match.group(1)}")
+        # 获取可见文本
+        print("\n页面可见文本片段:")
+        try:
+            text = page.inner_text("body")
+            # 只打印相关部分
+            for line in text.split('\n'):
+                line = line.strip()
+                if line and any(k in line.lower() for k in ['epub', 'zlib', 'beckert', 'capitalism', 'download']):
+                    print(f"  {line[:100]}")
+        except:
+            pass
+        browser.close()
+if __name__ == "__main__":
+    print("=" * 60)
+    print("获取 md5 页面信息")
+    print("=" * 60)
+    get_md5_page_info()