Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

fromozu committed on May 4

Commit

f50cc3f

verified ·

1 Parent(s): 7c2a335

Upload hf_backend/test_zlibrary_search.py with huggingface_hub

Browse files

Files changed (1) hide show

hf_backend/test_zlibrary_search.py +135 -0

hf_backend/test_zlibrary_search.py ADDED Viewed

	@@ -0,0 +1,135 @@

+#!/usr/bin/env python3
+"""
+Exploratory test: search Z-Library and look for EPUB download links.
+"""
+from playwright.sync_api import sync_playwright
+import time
+import re
+def test_zlibrary_search(query):
+    """测试 Z-Library 搜索功能"""
+    print(f"搜索: {query}\n")
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=False)
+        context = browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
+        )
+        page = context.new_page()
+        print("步骤1: 访问搜索页...")
+        search_url = f"https://z-library.sk/search?q={query.replace(' ', '+')}&languages=1"
+        print(f"   URL: {search_url}")
+        page.goto(search_url, timeout=60000, wait_until="domcontentloaded")
+        page.wait_for_timeout(3000)
+        print(f"   标题: {page.title()}")
+        # 等待搜索结果加载
+        print("\n步骤2: 等待搜索结果...")
+        for i in range(15):
+            time.sleep(1)
+            content = page.content()
+            if 'book' in content.lower() or 'result' in content.lower():
+                print(f"   {i+1}秒: 内容已加载")
+                break
+            print(f"   {i+1}秒: 等待中...")
+        # 获取页面内容分析
+        print("\n步骤3: 分析页面内容...")
+        # 查找书籍条目
+        links = page.query_selector_all("a[href]")
+        book_links = []
+        download_links = []
+        for link in links:
+            href = link.get_attribute("href")
+            text = link.inner_text().strip()
+            if href:
+                # 查找书籍详情页
+                if ('/book/' in href or '/b/' in href) and href not in book_links:
+                    if text and len(text) > 3:
+                        book_links.append((href, text))
+                # 查找下载链接
+                if 'download' in href.lower() or '.epub' in href.lower():
+                    download_links.append((href, text))
+        print(f"   找到书籍链接: {len(book_links)}")
+        for href, text in book_links[:5]:
+            print(f"      {href[:60]} - {text[:40]}")
+        print(f"\n   找到下载链接: {len(download_links)}")
+        for href, text in download_links[:5]:
+            print(f"      {href[:60]} - {text[:30]}")
+        # 尝试提取 EPUB 直接下载链接
+        print("\n步骤4: 查找 EPUB 直接下载链接...")
+        content = page.content()
+        # 查找 .epub 链接
+        epub_matches = re.findall(r'href=["\']([^"\']*\.epub[^"\']*)["\']', content, re.IGNORECASE)
+        if epub_matches:
+            print(f"   找到 EPUB 链接: {len(epub_matches)}")
+            for m in epub_matches[:3]:
+                print(f"      {m[:80]}")
+        # 查找 MD5 或 ID
+        md5_matches = re.findall(r'md5[=:]?["\']?([a-f0-9]{32})', content, re.IGNORECASE)
+        if md5_matches:
+            print(f"   找到 MD5: {md5_matches[:3]}")
+        # 尝试点击第一本书查看详情
+        if book_links:
+            first_book = book_links[0]
+            print(f"\n步骤5: 点击第一本书查看详情...")
+            print(f"   链接: {first_book[0]}")
+            try:
+                # 创建新标签页访问书籍详情
+                page2 = context.new_page()
+                page2.goto(first_book[0], timeout=60000, wait_until="domcontentloaded")
+                page2.wait_for_timeout(3000)
+                print(f"   详情页标题: {page2.title()}")
+                # 查找下载按钮
+                download_btns = page2.query_selector_all("a[href*='download'], button[class*='download']")
+                print(f"   下载按钮: {len(download_btns)}")
+                # 查找 EPUB 下载链接
+                detail_content = page2.content()
+                epub_in_detail = re.findall(r'href=["\']([^"\']*\.epub[^"\']*)["\']', detail_content, re.IGNORECASE)
+                if epub_in_detail:
+                    print(f"   EPUB 链接: {epub_in_detail[:3]}")
+                # 查找下载 URL 模式
+                download_patterns = [
+                    r'/download/[^\s"\']+\.epub',
+                    r'https?://[^\s"\']+\.epub',
+                    r'"md5"[ :]?"([a-f0-9]{32})"',
+                ]
+                for pattern in download_patterns:
+                    matches = re.findall(pattern, detail_content, re.IGNORECASE)
+                    if matches:
+                        print(f"   模式 {pattern[:30]}: {matches[:3]}")
+                page2.close()
+            except Exception as e:
+                print(f"   错误: {e}")
+        print("\n按回车键关闭浏览器...")
+        input()
+        browser.close()
+if __name__ == "__main__":
+    test_zlibrary_search("Capitalism: A Global History")