fromozu committed on
Commit
158ec06
·
verified ·
1 Parent(s): 9bb2166

Upload hf_backend/test_zlibrary_full.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_backend/test_zlibrary_full.py +90 -0
hf_backend/test_zlibrary_full.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 测试 Z-Library 搜索功能,获取书籍列表和下载页 URL
4
+ """
5
+
6
+ from playwright.sync_api import sync_playwright
7
+ import time
8
+ import re
9
+ from urllib.parse import quote
10
+
11
def test_zlibrary_search(query):
    """Search Z-Library for *query* and print the links found in the pages.

    Launches a visible (non-headless) Chromium window through Playwright,
    loads the search page and prints the book detail links and ``/dl/``
    links found in its HTML. If at least one book link is found, the first
    book's detail page is opened in a second tab and the first ``/dl/`` link
    and the first quoted ``.epub`` URL found there are printed. The function
    then waits for Enter on stdin before closing the browser.

    Args:
        query: Search text. It is fully percent-encoded (including ``/``)
            before being placed in the URL path.

    Returns:
        None. All results are written to stdout.
    """
    base_url = "https://z-library.sk"
    print(f"搜索: {query}\n")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            # Visit the search page. safe="" makes quote() escape "/" as
            # well, so a query containing a slash cannot alter the URL path.
            encoded_query = quote(query, safe="")
            search_url = f"{base_url}/s/{encoded_query}"
            print(f"搜索 URL: {search_url}")

            page.goto(search_url, timeout=60000, wait_until="domcontentloaded")
            # Fixed pause after DOMContentLoaded; presumably gives page
            # scripts time to render the result list.
            page.wait_for_timeout(5000)

            print(f"页面标题: {page.title()}")

            # Grab the rendered HTML of the search page.
            content = page.content()

            # Find book detail links, format: /book/{id}/{title}.html
            # The regex returns every occurrence in the HTML, so duplicates
            # are dropped while keeping first-seen order.
            book_pattern = r'/book/([a-zA-Z0-9]+)/([^"]+\.html)'
            book_matches = list(dict.fromkeys(re.findall(book_pattern, content)))

            print(f"\n找到书籍数量: {len(book_matches)}")

            for book_id, book_title in book_matches[:10]:
                title = book_title.replace('.html', '').replace('-', ' ')
                full_url = f"{base_url}/book/{book_id}/{book_title}"
                print(f" - {title[:50]}")
                print(f" URL: {full_url}")

            # Find download links, format: /dl/{id}
            # dict.fromkeys de-duplicates deterministically (page order),
            # unlike set(), whose order is arbitrary.
            dl_pattern = r'/dl/([a-zA-Z0-9]+)'
            unique_dl = list(dict.fromkeys(re.findall(dl_pattern, content)))
            print(f"\n找到下载链接: {len(unique_dl)}")
            for dl_id in unique_dl[:5]:
                print(f" {base_url}/dl/{dl_id}")

            # Open the first book's detail page to get its download URL.
            if book_matches:
                first_book_id, first_book_title = book_matches[0]
                book_url = f"{base_url}/book/{first_book_id}/{first_book_title}"
                print(f"\n访问第一本书详情页: {book_url}")

                page2 = context.new_page()
                try:
                    page2.goto(book_url, timeout=60000, wait_until="domcontentloaded")
                    page2.wait_for_timeout(3000)

                    detail_content = page2.content()

                    # Look for /dl/ links on the detail page.
                    dl_in_detail = re.findall(r'/dl/([a-zA-Z0-9]+)', detail_content)
                    if dl_in_detail:
                        dl_id = dl_in_detail[0]
                        dl_url = f"{base_url}/dl/{dl_id}"
                        print(f"下载页 URL: {dl_url}")

                    # Look for a direct, double-quoted URL to an EPUB file.
                    epub_pattern = r'"(https?://[^"]*\.epub[^"]*)"'
                    epub_matches = re.findall(epub_pattern, detail_content)
                    if epub_matches:
                        print(f"EPUB 直链: {epub_matches[0]}")
                finally:
                    # Close the detail tab even if navigation or parsing fails.
                    page2.close()

            input("按回车键关闭浏览器...")
        finally:
            # Always release the browser, including on errors or Ctrl+C.
            browser.close()
88
+
89
if __name__ == "__main__":
    # Manual run: search for a sample title when executed as a script.
    test_zlibrary_search("Capitalism: A Global History")