fromozu committed on
Commit
d9ee57b
·
verified ·
1 Parent(s): 158ec06

Upload hf_backend/test_zlibrary_func.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_backend/test_zlibrary_func.py +87 -0
hf_backend/test_zlibrary_func.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ 测试 Z-Library 搜索功能,验证能否获取 EPUB 直链
4
+ """
5
+
6
+ from playwright.sync_api import sync_playwright
7
+ import time
8
+ import re
9
+ from urllib.parse import quote
10
+ import requests
11
+
12
def _is_epub_url(url):
    """Return True when the path component of *url* ends with ``.epub``."""
    # Function-scope import so this block does not depend on edits to the
    # file-level import list.
    from urllib.parse import urlsplit

    # Compare against the path only, so that a direct link carrying a query
    # string or fragment (e.g. ``.../book.epub?token=...``) is still recognised.
    return urlsplit(url).path.lower().endswith('.epub')


def _probe_download_page(context, dl_url):
    """Open *dl_url* in a new tab, poll its title, and report the final URL."""
    print("\n用 Playwright 访问下载页...")
    page3 = context.new_page()
    try:
        # NOTE(review): if this URL starts a file download instead of a normal
        # navigation, goto() may raise -- confirm against the Page.goto docs.
        page3.goto(dl_url, timeout=60000, wait_until="domcontentloaded")

        # Wait for the Cloudflare check: poll once per second, for at most
        # 30 seconds, until the title no longer looks like a challenge page.
        for i in range(30):
            # wait_for_timeout keeps Playwright's event processing running;
            # time.sleep() would stall it in the sync API.
            page3.wait_for_timeout(1000)
            title = page3.title()
            print(f" {i+1}秒: {title}")

            if 'Checking' not in title and 'browser' not in title.lower():
                break

        final_url = page3.url
        print(f"\n最终 URL: {final_url}")

        if _is_epub_url(final_url):
            print("[成功] 获取到 EPUB 直链!")
    finally:
        # Close the tab even when navigation or polling fails.
        page3.close()


def test_zlibrary_search(query):
    """Search Z-Library for *query* and probe the first hit for an EPUB link.

    Launches a visible (non-headless) Chromium instance, loads the search
    results page, extracts the first ``/book/<id>/<slug>.html`` link from the
    HTML, opens that detail page, extracts the first ``/dl/<id>`` link from it
    and finally opens the download URL, printing progress at every step.

    This is an interactive manual check: it blocks on ``input()`` before the
    browser is closed, and it returns nothing.

    Args:
        query: Search text; it is percent-encoded into the search URL.
    """
    print(f"搜索: {query}\n")

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            )
            page = context.new_page()

            # Open the search results page and give it time to render.
            search_url = f"https://z-library.sk/s/{quote(query)}"
            page.goto(search_url, timeout=60000, wait_until="domcontentloaded")
            page.wait_for_timeout(5000)

            print(f"页面标题: {page.title()}")

            # Grab the rendered HTML of the results page.
            content = page.content()

            # Find links to book detail pages: (book id, "<slug>.html") pairs.
            book_pattern = r'/book/([a-zA-Z0-9]+)/([^"]+\.html)'
            book_matches = re.findall(book_pattern, content)

            print(f"找到书籍数量: {len(book_matches)}")

            # Follow only the first book found.
            if book_matches:
                first_book_id, first_book_title = book_matches[0]
                book_url = f"https://z-library.sk/book/{first_book_id}/{first_book_title}"
                print(f"\n访问第一本书详情页: {book_url}")

                page2 = context.new_page()
                try:
                    page2.goto(book_url, timeout=60000, wait_until="domcontentloaded")
                    page2.wait_for_timeout(3000)

                    # Look for a /dl/<id> download link in the detail page HTML.
                    detail_content = page2.content()
                    dl_matches = re.findall(r'/dl/([a-zA-Z0-9]+)', detail_content)
                    if dl_matches:
                        dl_id = dl_matches[0]
                        dl_url = f"https://z-library.sk/dl/{dl_id}"
                        print(f"下载页 URL: {dl_url}")

                        # Try to open the download page with Playwright.
                        _probe_download_page(context, dl_url)
                finally:
                    page2.close()

            # Keep the window open until the operator has inspected it.
            input("按回车键关闭浏览器...")
        finally:
            # Release the browser even if a navigation step raised.
            browser.close()
85
+
86
# Manual entry point: run one sample search when executed as a script.
if __name__ == "__main__":
    test_zlibrary_search("Capitalism: A Global History")