fromozu committed on
Commit
f50cc3f
·
verified ·
1 Parent(s): 7c2a335

Upload hf_backend/test_zlibrary_search.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_backend/test_zlibrary_search.py +135 -0
hf_backend/test_zlibrary_search.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 测试 Z-Library 搜索和下载 EPUB
4
+ """
5
+
6
+ from playwright.sync_api import sync_playwright
7
+ import time
8
+ import re
9
+
10
def test_zlibrary_search(query):
    """Probe the Z-Library search page for *query* and print what is found.

    Opens a visible Chromium window through Playwright, loads the search
    page, lists anchors that look like book-detail or download links, scans
    the HTML for ``.epub`` URLs and MD5 hashes, then opens the first book's
    detail page in a second tab and repeats the scan there. All findings are
    printed to stdout. The function blocks on ``input()`` before closing the
    browser so the window can be inspected manually.

    Args:
        query: Free-text search string. It is URL-encoded before being put
            into the search URL.

    Returns:
        None.
    """
    # Imported locally so the block stays self-contained.
    from urllib.parse import quote_plus, urljoin

    # Same href pattern is applied to both the result page and detail page.
    epub_href_pattern = r'href=["\']([^"\']*\.epub[^"\']*)["\']'

    print(f"搜索: {query}\n")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context(
                user_agent=(
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/123.0.0.0 Safari/537.36"
                ),
            )
            page = context.new_page()

            print("步骤1: 访问搜索页...")
            # quote_plus encodes spaces as '+' and also escapes characters
            # such as ':', '&' and '#' that would otherwise break the query.
            search_url = (
                f"https://z-library.sk/search?q={quote_plus(query)}&languages=1"
            )
            print(f" URL: {search_url}")

            page.goto(search_url, timeout=60000, wait_until="domcontentloaded")
            page.wait_for_timeout(3000)

            print(f" 标题: {page.title()}")

            # Poll for up to 15 seconds until the page mentions results.
            print("\n步骤2: 等待搜索结果...")
            for i in range(15):
                # wait_for_timeout keeps Playwright's event processing alive,
                # unlike time.sleep.
                page.wait_for_timeout(1000)
                content = page.content().lower()
                if 'book' in content or 'result' in content:
                    print(f" {i+1}秒: 内容已加载")
                    break
                print(f" {i+1}秒: 等待中...")

            print("\n步骤3: 分析页面内容...")

            book_links = []
            download_links = []
            # book_links stores (href, text) tuples, so duplicates must be
            # tracked by href separately.
            seen_book_hrefs = set()

            for link in page.query_selector_all("a[href]"):
                href = link.get_attribute("href")
                if not href:
                    continue
                text = link.inner_text().strip()

                # Book detail pages
                looks_like_book = '/book/' in href or '/b/' in href
                if (
                    looks_like_book
                    and href not in seen_book_hrefs
                    and len(text) > 3
                ):
                    seen_book_hrefs.add(href)
                    book_links.append((href, text))

                # Download links
                href_lower = href.lower()
                if 'download' in href_lower or '.epub' in href_lower:
                    download_links.append((href, text))

            print(f" 找到书籍链接: {len(book_links)}")
            for href, text in book_links[:5]:
                print(f" {href[:60]} - {text[:40]}")

            print(f"\n 找到下载链接: {len(download_links)}")
            for href, text in download_links[:5]:
                print(f" {href[:60]} - {text[:30]}")

            # Look for direct EPUB links in the raw HTML.
            print("\n步骤4: 查找 EPUB 直接下载链接...")
            content = page.content()

            epub_matches = re.findall(epub_href_pattern, content, re.IGNORECASE)
            if epub_matches:
                print(f" 找到 EPUB 链接: {len(epub_matches)}")
                for m in epub_matches[:3]:
                    print(f" {m[:80]}")

            # Look for MD5 hashes.
            md5_matches = re.findall(
                r'md5[=:]?["\']?([a-f0-9]{32})', content, re.IGNORECASE
            )
            if md5_matches:
                print(f" 找到 MD5: {md5_matches[:3]}")

            # Open the first book's detail page in a second tab.
            if book_links:
                # hrefs may be site-relative; goto() needs an absolute URL.
                detail_url = urljoin(page.url, book_links[0][0])
                print("\n步骤5: 点击第一本书查看详情...")
                print(f" 链接: {detail_url}")

                page2 = None
                try:
                    page2 = context.new_page()
                    page2.goto(
                        detail_url, timeout=60000, wait_until="domcontentloaded"
                    )
                    page2.wait_for_timeout(3000)

                    print(f" 详情页标题: {page2.title()}")

                    # Download buttons
                    download_btns = page2.query_selector_all(
                        "a[href*='download'], button[class*='download']"
                    )
                    print(f" 下载按钮: {len(download_btns)}")

                    detail_content = page2.content()

                    epub_in_detail = re.findall(
                        epub_href_pattern, detail_content, re.IGNORECASE
                    )
                    if epub_in_detail:
                        print(f" EPUB 链接: {epub_in_detail[:3]}")

                    # Candidate download URL / hash patterns
                    download_patterns = [
                        r'/download/[^\s"\']+\.epub',
                        r'https?://[^\s"\']+\.epub',
                        r'"md5"[ :]?"([a-f0-9]{32})"',
                    ]

                    for pattern in download_patterns:
                        matches = re.findall(
                            pattern, detail_content, re.IGNORECASE
                        )
                        if matches:
                            print(f" 模式 {pattern[:30]}: {matches[:3]}")

                except Exception as e:
                    # Best-effort step: report the failure and continue to
                    # the manual-inspection prompt.
                    print(f" 错误: {e}")
                finally:
                    # Close the detail tab even when navigation failed.
                    if page2 is not None:
                        page2.close()

            print("\n按回车键关闭浏览器...")
            input()
        finally:
            # Always release the browser, including on unexpected errors.
            browser.close()
133
+
134
if __name__ == "__main__":
    import sys

    # Use the command-line arguments as the search query when given;
    # otherwise fall back to the sample title so a bare run behaves as before.
    cli_query = " ".join(sys.argv[1:]).strip()
    test_zlibrary_search(cli_query or "Capitalism: A Global History")