fromozu committed on
Commit
4e2658f
·
verified ·
1 Parent(s): b194eda

Revert: remove Playwright fallback

Browse files
Files changed (1) hide show
  1. hf_backend/fetcher.py +978 -1009
hf_backend/fetcher.py CHANGED
@@ -1,1009 +1,978 @@
1
- from __future__ import annotations
2
-
3
- import math
4
- import re
5
- import time
6
- import zipfile
7
- from difflib import SequenceMatcher
8
- from io import BytesIO
9
- from pathlib import PurePosixPath
10
- from typing import Any
11
- from urllib.parse import parse_qsl, quote, unquote, urlencode, urljoin, urlparse, urlunparse
12
-
13
- import requests
14
-
15
- try:
16
- import cloudscraper
17
- except ImportError:
18
- cloudscraper = None
19
-
20
- try:
21
- from bs4 import BeautifulSoup
22
- except ImportError:
23
- BeautifulSoup = None
24
-
25
- try:
26
- from playwright.sync_api import sync_playwright
27
- except ImportError:
28
- sync_playwright = None
29
-
30
- from hf_backend.config import AppConfig
31
- from hf_backend.filename_utils import normalize_source_filename
32
-
33
-
34
class FetchError(RuntimeError):
    """Error raised by this module when a search, download, or EPUB check fails."""
36
-
37
-
38
# Generic message raised to the user when no usable candidate could be fetched.
USER_FACING_NOT_FOUND = "未找到可用的英文 EPUB,请提供直链"

# Language identifiers that the language check treats as English.
ENGLISH_CODES = {"en", "eng", "en-us", "en-gb", "english"}
47
-
48
-
49
def fetch_book_input(config: AppConfig, query: str) -> dict[str, Any]:
    """Turn a user query (book title or direct link) into a downloaded EPUB.

    If the query looks like an http(s) URL it is downloaded directly.
    Otherwise every search provider is queried, the candidates are ranked
    with ``_score_candidate`` and downloads are attempted from best to worst.

    Args:
        config: Application configuration passed through to the providers
            and downloaders.
        query: Book title or EPUB download link typed by the user.

    Returns:
        A dict with ``filename``, ``content``, ``origin``, ``provider``,
        ``query`` and ``download_url``; title searches also carry ``title``
        and ``author``.

    Raises:
        FetchError: If the query is empty, nothing was found, or every
            download attempt failed.
    """
    normalized_query = str(query or "").strip()
    if not normalized_query:
        raise FetchError("请输入书名或 EPUB 下载链接")

    if _looks_like_url(normalized_query):
        filename, content = download_epub_from_url(config, normalized_query)
        return {
            "filename": normalize_source_filename(filename, default_extension=".epub"),
            "content": content,
            "origin": "link_fetch",
            "provider": "direct_link",
            "query": normalized_query,
            "download_url": normalized_query,
        }

    candidates: list[dict[str, Any]] = []
    last_error: Exception | None = None

    # Only the src_a and src_b providers are consulted here.
    for search_provider in (search_src_a, search_src_b):
        try:
            candidates.extend(search_provider(config, normalized_query))
        except (FetchError, requests.RequestException, ValueError) as exc:
            # One failing provider must not prevent the other from being tried.
            last_error = exc

    if not candidates:
        if last_error:
            error_msg = str(last_error)
            if "src_a" in error_msg:
                raise FetchError("未找到可用的英文 EPUB,请尝试提供直链或使用其他书名")
            raise FetchError(f"搜索失败:{error_msg[:100]}")
        raise FetchError(USER_FACING_NOT_FOUND)

    # Rank candidates by score, then try downloading from best to worst.
    scored = [
        (candidate, _score_candidate(normalized_query, candidate))
        for candidate in candidates
    ]
    scored.sort(key=lambda item: item[1], reverse=True)

    top_score = scored[0][1] if scored else 0.0
    # A fallback candidate must reach at least this fraction of the top score.
    fallback_min_ratio = 0.6

    download_error: Exception | None = None
    for candidate, score in scored:
        # Don't fall back to books that are too different from the top match.
        if top_score > 0.5 and score < top_score * fallback_min_ratio:
            break

        download_url = candidate.get("download_url")
        if not download_url:
            # Without a URL there is nothing to download; indexing the key
            # directly would raise an uncaught KeyError and abort the loop.
            continue

        provider_name = candidate.get("provider", "")
        try:
            filename, content = download_epub_from_url(
                config,
                download_url,
                filename_hint=candidate.get("filename", ""),
                provider=provider_name,
            )
        except (FetchError, requests.RequestException, ValueError) as exc:
            download_error = exc
            continue

        return {
            "filename": normalize_source_filename(filename, default_extension=".epub"),
            "content": content,
            "origin": "title_fetch",
            "provider": provider_name,
            "query": normalized_query,
            "title": candidate.get("title", ""),
            "author": candidate.get("author", ""),
            "download_url": download_url,
        }

    raise FetchError(str(download_error) if download_error else USER_FACING_NOT_FOUND)
134
-
135
-
136
def download_epub_from_url(
    config: AppConfig,
    url: str,
    *,
    filename_hint: str = "",
    provider: str = "",
) -> tuple[str, bytes]:
    """Download an EPUB from ``url`` and return ``(filename, content)``.

    The request is handed to the src_a or src_b downloader when the provider
    hint names it or when the URL starts with that provider's configured base
    URL and contains its download path marker. Any other URL is fetched with
    a plain ``requests.get`` and validated as an EPUB.

    Raises:
        FetchError: If the response body is empty or is not an EPUB.
        requests.RequestException: If the plain HTTP request fails.
    """

    def _belongs_to(base_url: str, marker: str) -> bool:
        # True when the URL lives under base_url and carries the path marker.
        if not base_url:
            return False
        lowered = url.lower()
        return lowered.startswith(base_url.lower()) and marker in lowered

    if provider == "src_a" or _belongs_to(config.src_a_base_url, "/slow_download/"):
        return _download_from_src_a(config, url, filename_hint)

    if provider == "src_b" or _belongs_to(config.src_b_base_url, "/dl/"):
        return _download_from_src_b(config, url, filename_hint)

    response = requests.get(
        _normalize_download_url(url),
        headers={"user-agent": config.fetch_user_agent},
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    response.raise_for_status()

    payload = response.content
    if not payload:
        raise FetchError("下载结果为空")

    name = _derive_filename(response, response.url or url, filename_hint)
    _validate_epub_bytes(
        payload,
        filename=name,
        content_type=response.headers.get("content-type", ""),
    )

    if name.lower().endswith(".epub"):
        return name, payload
    return f"{name}.epub", payload
177
-
178
-
179
_SRC_A_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)


def _download_from_src_a(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """Download an EPUB for a src_a ``slow_download`` URL.

    Attempts, in order: the Libgen mirror (plain ``requests``), the src_a
    ``fast_download`` servers and then the ``slow_download`` servers (both via
    cloudscraper), and finally a Playwright headless browser. Each attempt is
    best-effort; a failure just moves on to the next one.

    Args:
        config: Supplies ``src_a_base_url`` and ``fetch_timeout_seconds``.
        url: URL of the form ``.../slow_download/{md5}/...``.
        filename_hint: Preferred filename, may be empty.

    Returns:
        ``(filename, content)`` of the first download that validates as EPUB.

    Raises:
        FetchError: If no md5 can be extracted from ``url`` or every attempt
            fails.
    """
    # Extract md5 from URL (format: .../slow_download/{md5}/0/3)
    md5_match = re.search(r"/slow_download/([a-f0-9]+)", url)
    if not md5_match:
        raise FetchError("无法从 URL 提取 md5")
    md5 = md5_match.group(1)

    # --- Attempt 1: Libgen ---
    result = _src_a_try_libgen(config, md5, filename_hint)
    if result is not None:
        return result

    # --- Attempts 2 and 3: src_a fast_download, then slow_download ---
    if cloudscraper is not None:
        base_url = config.src_a_base_url.rstrip("/")
        mirror_plan = (
            ("fast_download", 6, config.fetch_timeout_seconds),
            ("slow_download", 4, 180),  # slow servers may make the client wait
        )
        for endpoint, server_count, timeout in mirror_plan:
            for server_id in range(server_count):
                mirror_url = f"{base_url}/{endpoint}/{md5}/0/{server_id}"
                result = _src_a_try_mirror(mirror_url, timeout, filename_hint)
                if result is not None:
                    return result

    # --- Attempt 4: Playwright headless browser ---
    if sync_playwright is not None:
        base_url = config.src_a_base_url.rstrip("/")
        for server_id in range(3):
            try:
                slow_url = f"{base_url}/slow_download/{md5}/0/{server_id}"
                content = _download_with_playwright(slow_url)
                if content and len(content) > 1000:
                    _validate_epub_bytes(
                        content=content,
                        filename=filename_hint or "book.epub",
                        content_type="",
                    )
                    if filename_hint and filename_hint.lower().endswith(".epub"):
                        fname = filename_hint
                    else:
                        fname = f"{filename_hint or 'book'}.epub"
                    return fname, content
            except Exception:
                # Best-effort: any failure just moves on to the next server.
                continue

    raise FetchError("所有下载方式均失败(Libgen 和 Anna's Archive)")


def _src_a_try_libgen(
    config: AppConfig,
    md5: str,
    filename_hint: str,
) -> tuple[str, bytes] | None:
    """Try the Libgen mirror for ``md5``; return ``None`` on any failure."""
    if BeautifulSoup is None:
        return None

    libgen_headers = {"user-agent": _SRC_A_USER_AGENT}
    try:
        ads_response = requests.get(
            f"https://libgen.li/ads.php?md5={md5}",
            headers=libgen_headers,
            timeout=15,
        )
        ads_response.raise_for_status()
        ads_soup = BeautifulSoup(ads_response.text, "lxml")
        get_link = ads_soup.find("a", string=re.compile("GET"))
        get_href = get_link.get("href", "") if get_link else ""
        if not get_href:
            return None

        # urljoin handles relative, root-relative and absolute hrefs alike;
        # plain concatenation would corrupt an absolute link.
        download_url = urljoin("https://libgen.li/", get_href)
        response = requests.get(
            download_url,
            headers=libgen_headers,
            timeout=config.fetch_timeout_seconds,
            allow_redirects=True,
        )
        response.raise_for_status()
        content = response.content
        if not content:
            return None

        filename = _derive_filename(response, response.url or download_url, filename_hint)
        _validate_epub_bytes(
            content,
            filename=filename,
            content_type=response.headers.get("content-type", ""),
        )
        if not filename.lower().endswith(".epub"):
            filename = f"{filename}.epub"
        return filename, content
    except Exception:
        # Deliberately best-effort: the caller falls through to src_a itself.
        return None


def _src_a_try_mirror(
    mirror_url: str,
    timeout: Any,
    filename_hint: str,
) -> tuple[str, bytes] | None:
    """Fetch one src_a download URL via cloudscraper; ``None`` on any failure."""
    try:
        scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
        response = scraper.get(
            mirror_url,
            headers={"user-agent": _SRC_A_USER_AGENT, "accept": "*/*"},
            timeout=timeout,
            allow_redirects=True,
        )
        # Small bodies are treated as error/challenge pages, not books.
        if response.status_code != 200 or len(response.content) <= 1000:
            return None

        filename = _derive_filename(response, response.url or mirror_url, filename_hint)
        _validate_epub_bytes(
            content=response.content,
            filename=filename,
            content_type=response.headers.get("content-type", ""),
        )
        if not filename.lower().endswith(".epub"):
            filename = f"{filename}.epub"
        return filename, response.content
    except Exception:
        # Covers FetchError from validation as well as transport errors.
        return None
309
-
310
-
311
def _download_with_playwright(url: str, *, wait_seconds: int = 50) -> bytes | None:
    """Open ``url`` in headless Chromium and try to capture a file download.

    Once per second, for up to ``wait_seconds`` seconds, the page is scanned
    for links that look like downloads. Each such link is clicked; if the
    click does not start a download within five seconds, the link target is
    fetched through the browser context instead.

    Args:
        url: Page to open (a src_a ``slow_download`` page in this module).
        wait_seconds: Maximum number of one-second polling rounds.

    Returns:
        The downloaded bytes, or ``None`` if Playwright is unavailable, the
        page navigated away, or nothing could be captured.
    """
    if sync_playwright is None:
        return None

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            page = context.new_page()
            page.goto(url, timeout=60_000, wait_until="domcontentloaded")

            # Wait for the countdown to finish and a download link to appear.
            for _ in range(wait_seconds):
                time.sleep(1)

                # Stop polling once the page has navigated away from the
                # download pages; nothing further is fetched in that case.
                current = page.url
                if current != url and "slow_download" not in current and "fast_download" not in current:
                    break

                try:
                    for link in page.query_selector_all("a[href]"):
                        href = link.get_attribute("href") or ""
                        text = (link.text_content() or "").strip().lower()
                        looks_like_download = (
                            "get.php" in href
                            or href.endswith(".epub")
                            or ("download" in text and href and href != "#")
                        )
                        if not looks_like_download:
                            continue

                        try:
                            with page.expect_download(timeout=5_000) as dl_info:
                                link.click()
                            dl_path = dl_info.value.path()
                            if dl_path:
                                with open(dl_path, "rb") as f:
                                    return f.read()
                        except Exception:
                            # The click did not trigger a download; fetch the
                            # target directly, resolving relative links
                            # against the page rather than a fixed domain.
                            resp = context.request.get(urljoin(current, href), timeout=120_000)
                            if resp.status == 200:
                                body = resp.body()
                                if len(body) > 1000:
                                    return body
                except Exception:
                    # Best-effort: try again on the next polling round.
                    pass
        except Exception:
            # Best-effort: callers treat None as "this attempt failed".
            pass
        finally:
            browser.close()

    return None
373
-
374
-
375
def _download_from_src_b(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """Download an EPUB from a src_b download URL.

    Uses cloudscraper when it is installed. If the cloudscraper attempt fails
    with anything other than ``FetchError``, or cloudscraper is missing, the
    URL is fetched again with plain ``requests``.

    Returns:
        ``(filename, content)`` of the validated EPUB.

    Raises:
        FetchError: On HTTP 403, an empty body, or a non-EPUB body.
        requests.RequestException: If the plain-requests fallback fails.
    """
    if cloudscraper is not None:
        try:
            scraper = cloudscraper.create_scraper(
                browser={"browser": "chrome", "platform": "windows", "mobile": False},
                delay=10,
            )
            response = scraper.get(
                url,
                headers={
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                    "accept": "*/*",
                },
                timeout=120,
                allow_redirects=True,
            )
            return _src_b_epub_from_response(response, url, filename_hint)
        except FetchError:
            # A definite verdict (blocked / empty / not an EPUB): do not retry.
            raise
        except Exception:
            pass  # Fall through to plain requests

    response = requests.get(
        url,
        headers={"user-agent": config.fetch_user_agent},
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    return _src_b_epub_from_response(response, url, filename_hint)


def _src_b_epub_from_response(response: Any, url: str, filename_hint: str) -> tuple[str, bytes]:
    """Check a src_b HTTP response and return ``(filename, content)``."""
    if response.status_code == 403:
        raise FetchError("下载被阻止,请手动下载")
    response.raise_for_status()

    content = response.content
    if not content:
        raise FetchError("下载结果为空")

    filename = _derive_filename(response, response.url or url, filename_hint)
    _validate_epub_bytes(
        content,
        filename=filename,
        content_type=response.headers.get("content-type", ""),
    )
    if not filename.lower().endswith(".epub"):
        filename = f"{filename}.epub"
    return filename, content
435
-
436
-
437
def pick_best_candidate(query: str, candidates: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the highest-scoring candidate with a score of at least 0.45.

    Ties go to the candidate that appears first in ``candidates``. Returns
    ``None`` when no candidate reaches the threshold.
    """
    winner: dict[str, Any] | None = None
    winning_score: float | None = None
    for entry in candidates:
        entry_score = _score_candidate(query, entry)
        if not entry_score >= 0.45:
            continue
        # Strict comparison keeps the earliest candidate on equal scores.
        if winning_score is None or entry_score > winning_score:
            winner = entry
            winning_score = entry_score
    return winner
447
-
448
-
449
def search_standard_ebooks(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Standard Ebooks and return download candidates.

    Fetches the search page, follows at most the first six distinct book
    links, and records one candidate per detail page that offers an EPUB
    download link. Any HTTP error aborts the whole search.
    """
    search_url = config.standard_ebooks_search_url
    request_headers = {"user-agent": config.fetch_user_agent}

    listing = requests.get(
        search_url,
        params={"query": query},
        headers=request_headers,
        timeout=_provider_timeout(config),
    )
    listing.raise_for_status()

    found: list[dict[str, Any]] = []
    book_paths = _unique_matches(r'href="(/ebooks/[^"/]+/[^"/]+)"', listing.text)
    for book_path in book_paths[:6]:
        detail = requests.get(
            urljoin(search_url, book_path),
            headers=request_headers,
            timeout=_provider_timeout(config),
        )
        detail.raise_for_status()

        epub_path = _pick_standard_ebooks_download(detail.text)
        if epub_path:
            found.append(
                {
                    "provider": "standard_ebooks",
                    "title": _extract_html_title(detail.text) or _title_from_book_path(book_path),
                    "author": _author_from_book_path(book_path),
                    "language": "en",
                    "download_url": _normalize_download_url(urljoin(search_url, epub_path)),
                    "filename": PurePosixPath(epub_path).name,
                }
            )
    return found
484
-
485
-
486
def search_project_gutenberg(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Project Gutenberg and return download candidates.

    Fetches the search page, visits at most the first five distinct book ids,
    and records one candidate per detail page that links to an EPUB. Any HTTP
    error aborts the whole search.
    """
    request_headers = {"user-agent": config.fetch_user_agent}

    listing = requests.get(
        config.project_gutenberg_search_url,
        params={"query": query},
        headers=request_headers,
        timeout=_provider_timeout(config),
    )
    listing.raise_for_status()

    found: list[dict[str, Any]] = []
    for book_id in _unique_matches(r'href="/ebooks/(\d+)"', listing.text)[:5]:
        detail_url = f"https://www.gutenberg.org/ebooks/{book_id}"
        detail = requests.get(
            detail_url,
            headers=request_headers,
            timeout=_provider_timeout(config),
        )
        detail.raise_for_status()

        page_html = detail.text
        epub_link = _pick_gutenberg_epub(page_html)
        if epub_link:
            found.append(
                {
                    "provider": "project_gutenberg",
                    "title": _extract_html_title(page_html) or f"Project Gutenberg {book_id}",
                    "author": "",
                    "language": "en",
                    "download_url": urljoin(detail_url, epub_link),
                    "filename": PurePosixPath(urlparse(epub_link).path).name or f"pg{book_id}.epub",
                }
            )
    return found
520
-
521
-
522
def search_internet_archive(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search the Internet Archive and return download candidates.

    Runs an advanced search for English texts whose title matches ``query``
    (8 rows, most downloaded first). Results that are English, list an
    ``epub`` format and have an identifier get their metadata fetched; the
    first ``.epub`` file in that metadata becomes the download target. Any
    HTTP error aborts the whole search.
    """
    request_headers = {"user-agent": config.fetch_user_agent}
    search_params = {
        "q": f"title:({query}) AND mediatype:(texts) AND (language:(english) OR language:(eng) OR language:(en))",
        "fl[]": ["identifier", "title", "creator", "language", "downloads", "format"],
        "sort[]": "downloads desc",
        "rows": 8,
        "page": 1,
        "output": "json",
    }
    search_response = requests.get(
        config.internet_archive_advancedsearch_url,
        params=search_params,
        headers=request_headers,
        timeout=_provider_timeout(config),
    )
    search_response.raise_for_status()
    docs = search_response.json().get("response", {}).get("docs", [])

    found: list[dict[str, Any]] = []
    for doc in docs:
        if not _is_english(doc.get("language")):
            continue

        # "format" may arrive as a single string or as a list of strings.
        formats = doc.get("format") or []
        if isinstance(formats, str):
            formats = [formats]
        has_epub = any(str(fmt).strip().lower() == "epub" for fmt in formats)
        if not has_epub:
            continue

        identifier = str(doc.get("identifier") or "").strip()
        if not identifier:
            continue

        metadata_response = requests.get(
            config.internet_archive_metadata_url_template.format(identifier=identifier),
            headers=request_headers,
            timeout=_provider_timeout(config),
        )
        metadata_response.raise_for_status()
        epub_name = _pick_archive_epub_filename(metadata_response.json())
        if not epub_name:
            continue

        found.append(
            {
                "provider": "internet_archive",
                "title": str(doc.get("title") or ""),
                "author": _first_text(doc.get("creator")),
                "language": _first_text(doc.get("language")),
                "downloads": int(doc.get("downloads") or 0),
                "download_url": f"https://archive.org/download/{identifier}/{quote(epub_name)}",
                "filename": epub_name,
            }
        )
    return found
575
-
576
-
577
def search_src_a(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_a for EPUB files matching ``query``.

    The search page is fetched with cloudscraper and parsed with
    BeautifulSoup. Every ``/md5/`` link whose text is empty or ``Save`` and
    whose enclosing container mentions an ``.epub`` path yields one
    candidate pointing at the ``slow_download`` URL for that md5.

    Returns:
        A list of candidate dicts. The list is empty when src_a is not
        configured or cloudscraper / BeautifulSoup are not installed.

    Raises:
        FetchError: If the search page cannot be loaded or parsed.
    """
    if not config.src_a_search_url or not config.src_a_base_url:
        return []

    if cloudscraper is None or BeautifulSoup is None:
        return []

    # Strip a trailing slash so the download URL never contains "//".
    base_url = config.src_a_base_url.rstrip("/")
    candidates: list[dict[str, Any]] = []

    try:
        scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
        response = scraper.get(
            f"{config.src_a_search_url}?q={quote(query)}",
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "accept-language": "en-US,en;q=0.9",
            },
            timeout=60,
        )
        response.raise_for_status()

        html = response.text
        if not html or len(html) < 500:
            raise FetchError("搜索页面加载失败")

        soup = BeautifulSoup(html, "lxml")

        seen_md5: set[str] = set()
        # Links containing /md5/ are the book entry links.
        for link in soup.find_all("a", href=lambda h: h and "/md5/" in h):
            link_text = (link.get_text() or "").strip()
            # Keep only links with empty text or the text "Save".
            if link_text and link_text != "Save":
                continue

            md5_match = re.search(r"/md5/([a-f0-9]+)", link.get("href", ""))
            if not md5_match:
                continue
            md5 = md5_match.group(1)
            if md5 in seen_md5:
                continue

            # The surrounding container carries the entry's descriptive text.
            container = link.find_parent("div") or link.parent
            if not container:
                continue
            container_text = container.get_text(separator="\n") or ""

            # Only entries that mention an EPUB file are of interest.
            epub_match = re.search(r"([\w./-]+\.epub)", container_text, re.IGNORECASE)
            if not epub_match:
                continue
            seen_md5.add(md5)

            # Metadata line, e.g.: English [en] · EPUB · 1.2MB · 2020
            meta_match = re.search(
                r"English\s*\[([^\]]+)\]\s*[·•]\s*([A-Z]+)\s*[·•]\s*([\d.]+(?:MB|GB))\s*[·•]\s*(\d{4})",
                container_text,
            )
            title, author = _src_a_title_and_author(container_text)

            candidates.append({
                "provider": "src_a",
                "title": title,
                "author": author,
                "language": "en",
                "download_url": f"{base_url}/slow_download/{md5}/0/3",
                "filename": epub_match.group(1).split("/")[-1],
                "filesize": meta_match.group(3) if meta_match else "",
                "year": meta_match.group(4) if meta_match else "",
            })

    except FetchError:
        raise
    except Exception as exc:
        # Keep the original error attached as the cause for debugging.
        raise FetchError(f"搜索失败: {str(exc)[:100]}") from exc

    return candidates


def _src_a_title_and_author(container_text: str) -> tuple[str, str]:
    """Pick title and author from the text lines of a src_a result container.

    Lines naming an ``.epub`` file, purely numeric lines, and lines starting
    with ``English``, ``Save`` or a digit are skipped. The first remaining
    line is the title (cut to 200 characters); the next remaining line
    shorter than 100 characters is the author.
    """
    title = ""
    author = ""
    for raw_line in container_text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        if ".epub" in line.lower():
            continue
        if re.match(r"^[\d.,]+$", line):
            continue
        if re.match(r"^(English|Save|\d+)", line):
            continue
        if not title:
            title = line[:200]
        elif not author and len(line) < 100:
            author = line
            break
    return title, author
698
-
699
-
700
def search_src_b(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_b for EPUB files matching ``query``.

    The configured base URL is tried first, followed by the built-in
    alternative domains. The result of the first domain that answers without
    an error is returned, even if it is empty.

    Returns:
        Candidate dicts, or an empty list when src_b is not configured or
        cloudscraper / BeautifulSoup are not installed.

    Raises:
        FetchError, requests.RequestException, ValueError: The error from the
            last domain tried, when every domain failed.
    """
    if not config.src_b_base_url:
        return []
    if cloudscraper is None or BeautifulSoup is None:
        return []

    primary_base = config.src_b_base_url.rstrip("/")
    bases_to_try = [primary_base]
    for alternative in ("https://z-lib.is", "https://z-library.se"):
        # Do not query the primary domain a second time.
        if alternative.rstrip("/") != primary_base:
            bases_to_try.append(alternative)

    failure = None
    for base in bases_to_try:
        try:
            return _search_src_b_at_domain(base, query)
        except (FetchError, requests.RequestException, ValueError) as exc:
            failure = exc

    if failure:
        raise failure
    return []
731
-
732
-
733
def _search_src_b_at_domain(base: str, query: str) -> list[dict[str, Any]]:
    """Search a single src_b domain for EPUB files.

    Loads ``{base}/s/{query}``, collects the distinct ``/book/`` links that
    have more than three characters of text, and visits at most the first 15
    book pages. A page whose text mentions "epub" yields one candidate. A
    book page that fails to load or parse is skipped.

    Args:
        base: Domain root without a trailing slash.
        query: Search text; it is URL-quoted into the path.

    Raises:
        FetchError: If the search page is too short or lists no books.
        requests.RequestException: If the search request itself fails.
    """
    candidates: list[dict[str, Any]] = []

    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "mobile": False},
        delay=10,
    )
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-language": "en-US,en;q=0.9",
    }

    # Search page
    response = scraper.get(f"{base}/s/{quote(query)}", headers=headers, timeout=60)
    response.raise_for_status()

    html = response.text
    if not html or len(html) < 200:
        raise FetchError("搜索页面加载失败")

    soup = BeautifulSoup(html, "lxml")

    # Book items are identified by links to /book/.
    book_links = soup.find_all("a", href=lambda h: h and "/book/" in h)
    if not book_links:
        raise FetchError("未找到任何书籍")

    # Deduplicate by href, keeping only links with usable text.
    seen_hrefs: set[str] = set()
    books: list[dict[str, str]] = []
    for link in book_links:
        href = link.get("href", "")
        if not href or href in seen_hrefs:
            continue
        seen_hrefs.add(href)
        text = (link.get_text() or "").strip()
        if text and len(text) > 3:
            books.append({"href": href, "text": text})

    # Visit each book page to check for EPUB format.
    for book in books[:15]:
        # urljoin resolves absolute, root-relative and bare relative hrefs;
        # gluing base and href together breaks hrefs without a leading slash.
        book_url = urljoin(f"{base}/", book["href"])

        try:
            book_resp = scraper.get(book_url, headers=headers, timeout=30)
            book_resp.raise_for_status()
            book_html = book_resp.text
            page_text = BeautifulSoup(book_html, "lxml").get_text(separator="\n") or ""

            if "epub" not in page_text.lower():
                continue

            # Extract author
            author = ""
            author_match = re.search(r"Author[s]?[:\s]*([^\n]+)", page_text, re.IGNORECASE)
            if author_match:
                author = author_match.group(1).strip()[:100]

            # Use the /dl/ link when the page has one, else the book page itself.
            dl_match = re.search(r"/dl/([a-zA-Z0-9]+)", book_html)
            download_url = f"{base}/dl/{dl_match.group(1)}" if dl_match else book_url

            title_text = book["text"].split("\n")[0].strip()
            if not title_text:
                title_text = book["text"][:100]

            candidates.append({
                "provider": "src_b",
                "title": title_text,
                "author": author,
                "language": "en",
                "download_url": download_url,
                "filename": "",
                "filesize": "",
            })

        except Exception:
            # Best-effort: one broken book page must not abort the search.
            continue

    return candidates
822
-
823
-
824
- def _validate_epub_bytes(content: bytes, *, filename: str, content_type: str) -> None:
825
- if len(content) < 4 or not content.startswith(b"PK"):
826
- raise FetchError("下载内容不是 EPUB")
827
-
828
- try:
829
- with zipfile.ZipFile(BytesIO(content)) as archive:
830
- mimetype = archive.read("mimetype").decode("utf-8", errors="replace").strip()
831
- except (KeyError, zipfile.BadZipFile) as exc:
832
- raise FetchError("下载内容不是 EPUB") from exc
833
-
834
- if mimetype != "application/epub+zip":
835
- raise FetchError("下载内容不是 EPUB")
836
-
837
- lowered_content_type = content_type.lower()
838
- if filename.lower().endswith(".epub"):
839
- return
840
- if "application/epub+zip" in lowered_content_type:
841
- return
842
-
843
-
844
- def _derive_filename(response: requests.Response, url: str, filename_hint: str) -> str:
845
- hint = str(filename_hint or "").strip()
846
- if hint:
847
- return normalize_source_filename(PurePosixPath(unquote(hint)).name, default_extension=".epub")
848
-
849
- disposition = response.headers.get("content-disposition", "")
850
- match = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?', disposition, flags=re.IGNORECASE)
851
- if match:
852
- return normalize_source_filename(
853
- PurePosixPath(unquote(match.group(1).strip())).name,
854
- default_extension=".epub",
855
- )
856
-
857
- parsed = urlparse(url)
858
- name = PurePosixPath(unquote(parsed.path)).name
859
- if name:
860
- return normalize_source_filename(name, default_extension=".epub")
861
- return "downloaded_book.epub"
862
-
863
-
864
- def _score_candidate(query: str, candidate: dict[str, Any]) -> float:
865
- if not candidate.get("download_url") or not _is_english(candidate.get("language")):
866
- return -1.0
867
-
868
- normalized_query = _normalize_text(query)
869
- normalized_title = _normalize_text(candidate.get("title", ""))
870
-
871
- # Word overlap scoring - more accurate for title matching
872
- query_words = set(normalized_query.split())
873
- title_words = set(normalized_title.split())
874
-
875
- if not query_words or not title_words:
876
- return -1.0
877
-
878
- # Calculate word overlap
879
- common_words = query_words & title_words
880
- all_words = query_words | title_words
881
-
882
- # Jaccard similarity (word overlap / total unique words)
883
- jaccard_score = len(common_words) / len(all_words) if all_words else 0
884
-
885
- # Sequence similarity for word order
886
- sequence_score = SequenceMatcher(None, normalized_query, normalized_title).ratio()
887
-
888
- # Combined base score (weighted toward word overlap)
889
- title_score = jaccard_score * 0.7 + sequence_score * 0.3
890
-
891
- # Strong bonus for exact match
892
- if normalized_query == normalized_title:
893
- title_score += 1.0
894
- # Bonus for all query words in title
895
- elif query_words <= title_words:
896
- title_score += 0.3
897
- # Penalty for titles with no meaningful word overlap
898
- elif len(common_words) == 0:
899
- title_score -= 0.3
900
-
901
- provider_bonus = {
902
- "src_a": 0.1,
903
- "src_b": 0.08,
904
- }.get(candidate.get("provider"), 0.0)
905
-
906
- filename = str(candidate.get("filename") or "").lower()
907
- download_url = str(candidate.get("download_url") or "").lower()
908
- epub_bonus = 0.05 if filename.endswith(".epub") or download_url.endswith(".epub") else 0.0
909
-
910
- downloads = max(int(candidate.get("downloads") or 0), 0)
911
- downloads_bonus = min(math.log10(downloads + 1) / 20, 0.05) if downloads else 0.0
912
- return title_score + provider_bonus + epub_bonus + downloads_bonus
913
-
914
-
915
- def _looks_like_url(value: str) -> bool:
916
- parsed = urlparse(value)
917
- return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
918
-
919
-
920
- def _normalize_text(value: str) -> str:
921
- lowered = re.sub(r"[^a-z0-9]+", " ", str(value or "").lower())
922
- return " ".join(lowered.split())
923
-
924
-
925
def _is_english(value: Any) -> bool:
    """Return True when *value* denotes the English language.

    *value* may be a single language tag/name or a list, tuple or set of
    them; a collection matches when any member matches. Matching is done
    against ``ENGLISH_CODES`` after ``_normalize_text``.
    """
    if isinstance(value, (list, tuple, set)):
        return any(_is_english(item) for item in value)
    normalized = _normalize_text(str(value or ""))
    if normalized in ENGLISH_CODES:
        return True
    # _normalize_text replaces "-" (and "_") with a space, so "en-US" arrives
    # here as "en us". Re-hyphenate so the "en-us" / "en-gb" entries of
    # ENGLISH_CODES are actually reachable.
    return normalized.replace(" ", "-") in ENGLISH_CODES
930
-
931
-
932
- def _unique_matches(pattern: str, text: str) -> list[str]:
933
- results: list[str] = []
934
- for match in re.findall(pattern, text):
935
- value = match.strip()
936
- if value and value not in results:
937
- results.append(value)
938
- return results
939
-
940
-
941
def _pick_standard_ebooks_download(html: str) -> str:
    """Pick an EPUB download path from a Standard Ebooks detail page.

    The first link that is neither a ``.kepub.epub`` nor an ``_advanced.epub``
    variant wins; if every link is one of those variants the first link is
    returned, and ``""`` when the page has no matching link at all.
    """
    found = _unique_matches(r'href="(/ebooks/[^"]+/downloads/[^"]+?\.epub)"', html)
    if not found:
        return ""
    plain = (
        href
        for href in found
        if ".kepub.epub" not in href.lower() and "_advanced.epub" not in href.lower()
    )
    return next(plain, found[0])
949
-
950
-
951
def _pick_gutenberg_epub(html: str) -> str:
    """Return the first EPUB link (``.epub`` or ``.epub.images``) found in *html*, or ``""``."""
    hrefs = _unique_matches(r'href="([^"]+\.epub(?:\.images)?)"', html)
    matching = (
        href
        for href in hrefs
        if href.lower().endswith(".epub") or ".epub." in href.lower()
    )
    return next(matching, "")
958
-
959
-
960
- def _pick_archive_epub_filename(metadata: dict[str, Any]) -> str:
961
- for item in metadata.get("files", []) or []:
962
- name = str(item.get("name") or "")
963
- if name.lower().endswith(".epub"):
964
- return name
965
- return ""
966
-
967
-
968
- def _extract_html_title(html: str) -> str:
969
- title_match = re.search(r"<title>\s*([^<]+?)\s*</title>", html, flags=re.IGNORECASE)
970
- if not title_match:
971
- return ""
972
- title = re.sub(r"\s+", " ", title_match.group(1)).strip()
973
- title = re.sub(r"\s*[-|]\s*Standard Ebooks.*$", "", title, flags=re.IGNORECASE)
974
- title = re.sub(r"\s*[-|]\s*Project Gutenberg.*$", "", title, flags=re.IGNORECASE)
975
- title = re.sub(r",\s*by\s+.+$", "", title, flags=re.IGNORECASE)
976
- return title
977
-
978
-
979
- def _author_from_book_path(path: str) -> str:
980
- parts = [part for part in path.strip("/").split("/") if part]
981
- if len(parts) < 3:
982
- return ""
983
- return " ".join(part.capitalize() for part in parts[1].split("-"))
984
-
985
-
986
- def _title_from_book_path(path: str) -> str:
987
- parts = [part for part in path.strip("/").split("/") if part]
988
- if len(parts) < 3:
989
- return ""
990
- return " ".join(part.capitalize() for part in parts[2].split("-"))
991
-
992
-
993
- def _first_text(value: Any) -> str:
994
- if isinstance(value, (list, tuple)):
995
- return str(value[0]) if value else ""
996
- return str(value or "")
997
-
998
-
999
- def _provider_timeout(config: AppConfig) -> int:
1000
- return max(5, min(int(config.fetch_timeout_seconds), 10))
1001
-
1002
-
1003
- def _normalize_download_url(url: str) -> str:
1004
- parsed = urlparse(url)
1005
- if parsed.netloc.endswith("standardebooks.org") and "/downloads/" in parsed.path:
1006
- params = dict(parse_qsl(parsed.query, keep_blank_values=True))
1007
- params.setdefault("source", "download")
1008
- return urlunparse(parsed._replace(query=urlencode(params)))
1009
- return url
 
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ import re
5
+ import time
6
+ import zipfile
7
+ from difflib import SequenceMatcher
8
+ from io import BytesIO
9
+ from pathlib import PurePosixPath
10
+ from typing import Any
11
+ from urllib.parse import parse_qsl, quote, unquote, urlencode, urljoin, urlparse, urlunparse
12
+
13
+ import requests
14
+
15
+ try:
16
+ import cloudscraper
17
+ except ImportError:
18
+ cloudscraper = None
19
+
20
+ try:
21
+ from bs4 import BeautifulSoup
22
+ except ImportError:
23
+ BeautifulSoup = None
24
+
25
+
26
+ from hf_backend.config import AppConfig
27
+ from hf_backend.filename_utils import normalize_source_filename
28
+
29
+
30
class FetchError(RuntimeError):
    """Raised when a book cannot be searched for, downloaded, or validated as an EPUB."""
32
+
33
+
34
# Message raised to the caller when no provider yields a downloadable candidate.
USER_FACING_NOT_FOUND = "未找到可用的英文 EPUB,请提供直链"

# Language tags/names treated as English when filtering candidates.
ENGLISH_CODES = {"en", "eng", "en-us", "en-gb", "english"}
43
+
44
+
45
def fetch_book_input(config: AppConfig, query: str) -> dict[str, Any]:
    """Resolve *query* (a book title or a direct EPUB URL) to downloaded EPUB bytes.

    A query that looks like an http(s) URL is downloaded directly. Otherwise
    the src_a and src_b providers are searched, the candidates are ranked with
    ``_score_candidate`` and downloads are attempted from best to worst.

    Returns:
        A dict with ``filename``, ``content``, ``origin``, ``provider``,
        ``query`` and ``download_url``; title searches also carry ``title``
        and ``author``.

    Raises:
        FetchError: the query is empty, no candidate was found, or every
            attempted download failed.
    """
    normalized_query = str(query or "").strip()
    if not normalized_query:
        raise FetchError("请输入书名或 EPUB 下载链接")

    # Direct link: no search, download exactly what was given.
    if _looks_like_url(normalized_query):
        filename, content = download_epub_from_url(config, normalized_query)
        return {
            "filename": normalize_source_filename(filename, default_extension=".epub"),
            "content": content,
            "origin": "link_fetch",
            "provider": "direct_link",
            "query": normalized_query,
            "download_url": normalized_query,
        }

    # Failures of these kinds are remembered and the next option is tried.
    recoverable = (FetchError, requests.RequestException, ValueError)

    # Only use Anna's Archive and Z-Library
    candidates: list[dict[str, Any]] = []
    last_error = None
    for search in (search_src_a, search_src_b):
        try:
            candidates.extend(search(config, normalized_query))
        except recoverable as exc:
            last_error = exc

    if not candidates:
        if not last_error:
            raise FetchError(USER_FACING_NOT_FOUND)
        error_msg = str(last_error)
        if "src_a" in error_msg:
            raise FetchError("未找到可用的英文 EPUB,请尝试提供直链或使用其他书名")
        raise FetchError(f"搜索失败:{error_msg[:100]}")

    # Rank candidates by score (stable sort), then download from best to worst.
    ranked = sorted(
        ((candidate, _score_candidate(normalized_query, candidate)) for candidate in candidates),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_score = ranked[0][1]
    # A fallback must score at least this fraction of the top score.
    fallback_min_ratio = 0.6

    download_error = None
    for candidate, score in ranked:
        # Don't fall back to books that are too different from the top match.
        if top_score > 0.5 and score < top_score * fallback_min_ratio:
            break

        provider_name = candidate.get("provider", "")
        try:
            filename, content = download_epub_from_url(
                config,
                candidate["download_url"],
                filename_hint=candidate.get("filename", ""),
                provider=provider_name,
            )
        except recoverable as exc:
            download_error = exc
            continue
        return {
            "filename": normalize_source_filename(filename, default_extension=".epub"),
            "content": content,
            "origin": "title_fetch",
            "provider": provider_name,
            "query": normalized_query,
            "title": candidate.get("title", ""),
            "author": candidate.get("author", ""),
            "download_url": candidate["download_url"],
        }

    raise FetchError(str(download_error) if download_error else USER_FACING_NOT_FOUND)
130
+
131
+
132
def download_epub_from_url(
    config: AppConfig,
    url: str,
    *,
    filename_hint: str = "",
    provider: str = "",
) -> tuple[str, bytes]:
    """Download *url* and return ``(filename, content)`` for a validated EPUB.

    src_a / src_b URLs (identified by the *provider* hint or by the configured
    base URL plus a path marker) are delegated to their specialised
    downloaders; everything else is fetched with a plain ``requests.get``.

    Raises:
        FetchError: the response body is empty or is not an EPUB.
        requests.RequestException: the HTTP request failed.
    """
    lowered_url = url.lower()

    def _under(base_url: str, marker: str) -> bool:
        # True when the URL lives under a configured base and carries the marker.
        return bool(base_url) and lowered_url.startswith(base_url.lower()) and marker in lowered_url

    # Route to specialized downloaders based on provider hint or URL pattern.
    if provider == "src_a" or _under(config.src_a_base_url, "/slow_download/"):
        return _download_from_src_a(config, url, filename_hint)
    if provider == "src_b" or _under(config.src_b_base_url, "/dl/"):
        return _download_from_src_b(config, url, filename_hint)

    response = requests.get(
        _normalize_download_url(url),
        headers={"user-agent": config.fetch_user_agent},
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    response.raise_for_status()

    payload = response.content
    if not payload:
        raise FetchError("下载结果为空")

    name = _derive_filename(response, response.url or url, filename_hint)
    _validate_epub_bytes(payload, filename=name, content_type=response.headers.get("content-type", ""))

    if name.lower().endswith(".epub"):
        return name, payload
    return f"{name}.epub", payload
173
+
174
+
175
def _download_from_src_a(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """Download an EPUB from Anna's Archive (src_a).

    Mirrors are tried in this order, each one best-effort:

    1. Libgen (``libgen.li``), fetched with plain ``requests``.
    2. ``fast_download`` servers 0-5 via cloudscraper.
    3. ``slow_download`` servers 0-3 via cloudscraper (longer timeout).

    Args:
        config: Supplies ``src_a_base_url`` and ``fetch_timeout_seconds``.
        url: A ``.../slow_download/{md5}/...`` URL; only the md5 is used.
        filename_hint: Preferred filename, passed to ``_derive_filename``.

    Returns:
        ``(filename, content)`` where *filename* ends in ``.epub``.

    Raises:
        FetchError: no md5 could be extracted from *url*, or no mirror
            produced a valid EPUB.
    """
    # Extract md5 from URL (format: .../slow_download/{md5}/0/3)
    md5_match = re.search(r"/slow_download/([a-f0-9]+)", url)
    if not md5_match:
        raise FetchError("无法从 URL 提取 md5")
    md5 = md5_match.group(1)

    browser_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    def _finalize(response: Any, fallback_url: str) -> tuple[str, bytes]:
        # Validate the body as an EPUB and pair it with a ".epub" filename.
        filename = _derive_filename(response, response.url or fallback_url, filename_hint)
        _validate_epub_bytes(
            response.content,
            filename=filename,
            content_type=response.headers.get("content-type", ""),
        )
        if not filename.lower().endswith(".epub"):
            filename = f"{filename}.epub"
        return filename, response.content

    # --- Attempt 1: Libgen ---
    if BeautifulSoup is not None:
        try:
            libgen_headers = {"user-agent": browser_ua}
            ads_response = requests.get(
                f"https://libgen.li/ads.php?md5={md5}",
                headers=libgen_headers,
                timeout=15,
            )
            ads_response.raise_for_status()
            ads_soup = BeautifulSoup(ads_response.text, "lxml")
            get_link = ads_soup.find("a", string=re.compile("GET"))
            get_href = get_link.get("href", "") if get_link else ""
            if get_href:
                download_url = f"https://libgen.li/{get_href.lstrip('/')}"
                response = requests.get(
                    download_url,
                    headers=libgen_headers,
                    timeout=config.fetch_timeout_seconds,
                    allow_redirects=True,
                )
                response.raise_for_status()
                if response.content:
                    return _finalize(response, download_url)
        except Exception:
            # Deliberately best-effort: any Libgen failure (network, parsing,
            # invalid EPUB) just moves on to the Anna's Archive mirrors.
            pass

    # --- Attempts 2 and 3: Anna's Archive fast_download, then slow_download ---
    if cloudscraper is not None:
        base_url = config.src_a_base_url.rstrip("/")
        scraper_headers = {"user-agent": browser_ua, "accept": "*/*"}
        mirror_plans = (
            ("fast_download", range(6), config.fetch_timeout_seconds),
            # slow_download may make the client wait, hence the long timeout.
            ("slow_download", range(4), 180),
        )
        for endpoint, server_ids, timeout in mirror_plans:
            for server_id in server_ids:
                mirror_url = f"{base_url}/{endpoint}/{md5}/0/{server_id}"
                try:
                    scraper = cloudscraper.create_scraper(
                        browser={"browser": "chrome", "platform": "windows", "mobile": False},
                        delay=10,
                    )
                    response = scraper.get(
                        mirror_url,
                        headers=scraper_headers,
                        timeout=timeout,
                        allow_redirects=True,
                    )
                    # Tiny bodies are error/challenge pages, not books.
                    if response.status_code != 200 or len(response.content) <= 1000:
                        continue
                    return _finalize(response, mirror_url)
                except Exception:
                    # Includes FetchError from EPUB validation: try the next server.
                    continue

    # Every mirror failed. Raise instead of falling into browser automation:
    # the Playwright import was removed from this module, so the old tail
    # would have died with NameError (and returned the wrong type anyway).
    raise FetchError("下载失败:所有下载源均不可用,请提供直链")
342
+
343
+
344
def _download_from_src_b(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """Download an EPUB from a src_b download URL.

    cloudscraper is tried first when it is installed. A ``FetchError`` from
    that attempt propagates; any other failure falls back to a plain
    ``requests.get`` using the configured user agent and timeout.

    Raises:
        FetchError: HTTP 403, an empty body, or a body that is not an EPUB.
        requests.RequestException: the fallback request failed.
    """

    def _consume(response) -> tuple[str, bytes]:
        # Shared post-processing for both transports.
        if response.status_code == 403:
            raise FetchError("下载被阻止,请手动下载")
        response.raise_for_status()
        payload = response.content
        if not payload:
            raise FetchError("下载结果为空")
        name = _derive_filename(response, response.url or url, filename_hint)
        _validate_epub_bytes(payload, filename=name, content_type=response.headers.get("content-type", ""))
        if not name.lower().endswith(".epub"):
            name = f"{name}.epub"
        return name, payload

    if cloudscraper is not None:
        try:
            scraper = cloudscraper.create_scraper(
                browser={"browser": "chrome", "platform": "windows", "mobile": False},
                delay=10,
            )
            protected_response = scraper.get(
                url,
                headers={
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                    "accept": "*/*",
                },
                timeout=120,
                allow_redirects=True,
            )
            return _consume(protected_response)
        except FetchError:
            raise
        except Exception:
            pass  # Fall through to requests

    # Fallback to plain requests
    plain_response = requests.get(
        url,
        headers={"user-agent": config.fetch_user_agent},
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    return _consume(plain_response)
404
+
405
+
406
def pick_best_candidate(query: str, candidates: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the highest-scoring candidate for *query*, or ``None``.

    Only candidates scoring at least 0.45 are eligible; on a tie the earliest
    candidate in *candidates* wins.
    """
    eligible = [
        (score, candidate)
        for candidate in candidates
        for score in (_score_candidate(query, candidate),)
        if score >= 0.45
    ]
    if not eligible:
        return None
    # max() returns the first maximal item, matching a stable descending sort.
    return max(eligible, key=lambda pair: pair[0])[1]
416
+
417
+
418
def search_standard_ebooks(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Standard Ebooks for *query* and return EPUB candidates.

    The first six book paths on the results page are visited; a book
    contributes a candidate only when its detail page exposes an EPUB link.

    Raises:
        requests.RequestException: the search or any detail request failed.
    """
    search_url = config.standard_ebooks_search_url
    request_options = {
        "headers": {"user-agent": config.fetch_user_agent},
        "timeout": _provider_timeout(config),
    }

    listing = requests.get(search_url, params={"query": query}, **request_options)
    listing.raise_for_status()

    results: list[dict[str, Any]] = []
    book_paths = _unique_matches(r'href="(/ebooks/[^"/]+/[^"/]+)"', listing.text)
    for path in book_paths[:6]:
        detail = requests.get(urljoin(search_url, path), **request_options)
        detail.raise_for_status()
        download_path = _pick_standard_ebooks_download(detail.text)
        if not download_path:
            continue
        absolute_download = urljoin(search_url, download_path)
        results.append(
            {
                "provider": "standard_ebooks",
                "title": _extract_html_title(detail.text) or _title_from_book_path(path),
                "author": _author_from_book_path(path),
                "language": "en",
                "download_url": _normalize_download_url(absolute_download),
                "filename": PurePosixPath(download_path).name,
            }
        )
    return results
453
+
454
+
455
def search_project_gutenberg(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Project Gutenberg for *query* and return EPUB candidates.

    The first five book ids on the results page are visited; a book
    contributes a candidate only when its detail page links to an EPUB.

    Raises:
        requests.RequestException: the search or any detail request failed.
    """
    request_options = {
        "headers": {"user-agent": config.fetch_user_agent},
        "timeout": _provider_timeout(config),
    }

    listing = requests.get(
        config.project_gutenberg_search_url,
        params={"query": query},
        **request_options,
    )
    listing.raise_for_status()

    results: list[dict[str, Any]] = []
    for book_id in _unique_matches(r'href="/ebooks/(\d+)"', listing.text)[:5]:
        detail_url = f"https://www.gutenberg.org/ebooks/{book_id}"
        detail = requests.get(detail_url, **request_options)
        detail.raise_for_status()
        page = detail.text
        download_path = _pick_gutenberg_epub(page)
        if not download_path:
            continue
        results.append(
            {
                "provider": "project_gutenberg",
                "title": _extract_html_title(page) or f"Project Gutenberg {book_id}",
                "author": "",
                "language": "en",
                "download_url": urljoin(detail_url, download_path),
                "filename": PurePosixPath(urlparse(download_path).path).name or f"pg{book_id}.epub",
            }
        )
    return results
489
+
490
+
491
def search_internet_archive(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search the Internet Archive for English EPUB texts titled like *query*.

    Up to eight results (sorted by downloads, descending) are requested. Each
    English result that lists an ``epub`` format has its metadata fetched to
    find the actual ``.epub`` file name.

    Raises:
        requests.RequestException: the search or a metadata request failed.
        ValueError: a response body was not valid JSON.
    """
    request_options = {
        "headers": {"user-agent": config.fetch_user_agent},
        "timeout": _provider_timeout(config),
    }
    search_params = {
        "q": f"title:({query}) AND mediatype:(texts) AND (language:(english) OR language:(eng) OR language:(en))",
        "fl[]": ["identifier", "title", "creator", "language", "downloads", "format"],
        "sort[]": "downloads desc",
        "rows": 8,
        "page": 1,
        "output": "json",
    }
    listing = requests.get(
        config.internet_archive_advancedsearch_url,
        params=search_params,
        **request_options,
    )
    listing.raise_for_status()
    docs = listing.json().get("response", {}).get("docs", [])

    results: list[dict[str, Any]] = []
    for doc in docs:
        if not _is_english(doc.get("language")):
            continue

        # "format" may be a single string or a list of strings.
        formats = doc.get("format") or []
        if isinstance(formats, str):
            formats = [formats]
        has_epub = any(str(fmt).strip().lower() == "epub" for fmt in formats)
        if not has_epub:
            continue

        identifier = str(doc.get("identifier") or "").strip()
        if not identifier:
            continue

        metadata_response = requests.get(
            config.internet_archive_metadata_url_template.format(identifier=identifier),
            **request_options,
        )
        metadata_response.raise_for_status()
        epub_name = _pick_archive_epub_filename(metadata_response.json())
        if not epub_name:
            continue

        results.append(
            {
                "provider": "internet_archive",
                "title": str(doc.get("title") or ""),
                "author": _first_text(doc.get("creator")),
                "language": _first_text(doc.get("language")),
                "downloads": int(doc.get("downloads") or 0),
                "download_url": f"https://archive.org/download/{identifier}/{quote(epub_name)}",
                "filename": epub_name,
            }
        )
    return results
544
+
545
+
546
def search_src_a(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_a for EPUB files matching *query*.

    The results page is fetched with cloudscraper and parsed with
    BeautifulSoup. Each ``/md5/`` link whose text is empty or ``"Save"`` and
    whose enclosing container mentions an ``.epub`` path becomes one candidate.

    Returns:
        Candidate dicts (``provider``, ``title``, ``author``, ``language``,
        ``download_url``, ``filename``, ``filesize``, ``year``). The list is
        empty when src_a is not configured or when cloudscraper /
        BeautifulSoup are not installed.

    Raises:
        FetchError: the page failed to load or any other error occurred
            while searching (the original exception is chained as the cause).
    """
    if not config.src_a_search_url or not config.src_a_base_url:
        return []
    if cloudscraper is None or BeautifulSoup is None:
        return []

    # Strip the trailing slash so a base URL configured as "https://host/"
    # does not produce "https://host//slow_download/...".
    base_url = config.src_a_base_url.rstrip("/")
    candidates: list[dict[str, Any]] = []

    try:
        scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-language": "en-US,en;q=0.9",
        }
        response = scraper.get(
            f"{config.src_a_search_url}?q={quote(query)}",
            headers=headers,
            timeout=60,
        )
        response.raise_for_status()

        html = response.text
        if not html or len(html) < 500:
            raise FetchError("搜索页面加载失败")

        soup = BeautifulSoup(html, "lxml")

        # Links containing /md5/ are the book entry links.
        seen_md5: set[str] = set()
        for link in soup.find_all("a", href=lambda h: h and "/md5/" in h):
            # Only links with empty text or "Save" text are considered.
            link_text = (link.get_text() or "").strip()
            if link_text and link_text != "Save":
                continue

            md5_match = re.search(r"/md5/([a-f0-9]+)", link.get("href", ""))
            if not md5_match:
                continue
            md5 = md5_match.group(1)
            if md5 in seen_md5:
                continue

            # The enclosing <div> (or direct parent) carries the entry's text.
            container = link.find_parent("div") or link.parent
            if not container:
                continue
            container_text = container.get_text(separator="\n") or ""

            # Skip entries that do not mention an EPUB file.
            epub_match = re.search(r"([\w./-]+\.epub)", container_text, re.IGNORECASE)
            if not epub_match:
                continue
            epub_path = epub_match.group(1)
            seen_md5.add(md5)

            # Metadata line looks like: English [en] · EPUB · 1.2MB · 2020
            meta_match = re.search(
                r"English\s*\[([^\]]+)\]\s*[·•]\s*([A-Z]+)\s*[·•]\s*([\d.]+(?:MB|GB))\s*[·•]\s*(\d{4})",
                container_text,
            )
            filesize = meta_match.group(3) if meta_match else ""
            year = meta_match.group(4) if meta_match else ""

            # First usable text line is the title, the next short one the author.
            title = ""
            author = ""
            for raw_line in container_text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                if ".epub" in line.lower():
                    continue
                if re.match(r"^[\d.,]+$", line):
                    continue
                if re.match(r"^(English|Save|\d+)", line):
                    continue
                if not title:
                    title = line[:200]
                elif not author and len(line) < 100:
                    author = line
                    break

            candidates.append({
                "provider": "src_a",
                "title": title,
                "author": author,
                "language": "en",
                "download_url": f"{base_url}/slow_download/{md5}/0/3",
                "filename": epub_path.split("/")[-1],
                "filesize": filesize,
                "year": year,
            })

    except FetchError:
        raise
    except Exception as exc:
        raise FetchError(f"搜索失败: {str(exc)[:100]}") from exc

    return candidates
667
+
668
+
669
def search_src_b(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_b for EPUB files matching *query*.

    The configured base URL is tried first, then the built-in alternative
    Z-Library domains. The result of the first domain that does not raise is
    returned (even if it is empty).

    Returns:
        Candidate dicts; an empty list when src_b is not configured or when
        cloudscraper / BeautifulSoup are not installed.

    Raises:
        FetchError, requests.RequestException, ValueError: the error from the
            last domain tried, when every domain failed.
    """
    if not config.src_b_base_url:
        return []
    if cloudscraper is None or BeautifulSoup is None:
        return []

    # Primary domain first, then alternatives that differ from it.
    primary_base = config.src_b_base_url.rstrip("/")
    bases_to_try = [primary_base]
    for alternative in ("https://z-lib.is", "https://z-library.se"):
        if alternative.rstrip("/") != primary_base:
            bases_to_try.append(alternative)

    last_error = None
    for base in bases_to_try:
        try:
            return _search_src_b_at_domain(base, query)
        except (FetchError, requests.RequestException, ValueError) as exc:
            last_error = exc

    if last_error:
        raise last_error
    return []
700
+
701
+
702
def _search_src_b_at_domain(base: str, query: str) -> list[dict[str, Any]]:
    """Search a single Z-Library domain (*base*) for EPUB files.

    The ``/s/<query>`` results page is scanned for ``/book/`` links; up to 15
    of the linked book pages are visited and those mentioning "epub" become
    candidates. A failure on an individual book page skips that book.

    Raises:
        FetchError: the results page is too short or lists no books.
        requests.RequestException: the results page request failed.
    """
    scraper = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "mobile": False},
        delay=10,
    )
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-language": "en-US,en;q=0.9",
    }

    # Search page
    listing = scraper.get(f"{base}/s/{quote(query)}", headers=headers, timeout=60)
    listing.raise_for_status()
    listing_html = listing.text
    if not listing_html or len(listing_html) < 200:
        raise FetchError("搜索页面加载失败")

    # Book items are the links pointing at /book/
    book_links = BeautifulSoup(listing_html, "lxml").find_all(
        "a", href=lambda h: h and "/book/" in h
    )
    if not book_links:
        raise FetchError("未找到任何书籍")

    # Deduplicate by href. An href counts as seen even when its link text is
    # too short to keep, so only the first link per href is ever considered.
    seen_hrefs: set[str] = set()
    books: list[tuple[str, str]] = []
    for link in book_links:
        href = link.get("href", "")
        if not href or href in seen_hrefs:
            continue
        seen_hrefs.add(href)
        label = (link.get_text() or "").strip()
        if label and len(label) > 3:
            books.append((href, label))

    # Visit each book page to check for EPUB format
    results: list[dict[str, Any]] = []
    for href, label in books[:15]:
        book_url = href if href.startswith("http") else f"{base}{href}"
        try:
            book_response = scraper.get(book_url, headers=headers, timeout=30)
            book_response.raise_for_status()
            book_html = book_response.text
            page_text = BeautifulSoup(book_html, "lxml").get_text(separator="\n") or ""

            if "epub" not in page_text.lower():
                continue

            # Author: rest of the first line that follows "Author"/"Authors".
            author_match = re.search(r"Author[s]?[:\s]*([^\n]+)", page_text, re.IGNORECASE)
            author = author_match.group(1).strip()[:100] if author_match else ""

            # Download URL from the /dl/ link; fall back to the book page itself.
            dl_match = re.search(r"/dl/([a-zA-Z0-9]+)", book_html)
            dl_id = dl_match.group(1) if dl_match else ""
            download_url = f"{base}/dl/{dl_id}" if dl_id else book_url

            # Title: first line of the link text, else its first 100 characters.
            title_text = label.split("\n")[0].strip() or label[:100]

            results.append({
                "provider": "src_b",
                "title": title_text,
                "author": author,
                "language": "en",
                "download_url": download_url,
                "filename": "",
                "filesize": "",
            })
        except Exception:
            continue

    return results
791
+
792
+
793
+ def _validate_epub_bytes(content: bytes, *, filename: str, content_type: str) -> None:
794
+ if len(content) < 4 or not content.startswith(b"PK"):
795
+ raise FetchError("下载内容不是 EPUB")
796
+
797
+ try:
798
+ with zipfile.ZipFile(BytesIO(content)) as archive:
799
+ mimetype = archive.read("mimetype").decode("utf-8", errors="replace").strip()
800
+ except (KeyError, zipfile.BadZipFile) as exc:
801
+ raise FetchError("下载内容不是 EPUB") from exc
802
+
803
+ if mimetype != "application/epub+zip":
804
+ raise FetchError("下载内容不是 EPUB")
805
+
806
+ lowered_content_type = content_type.lower()
807
+ if filename.lower().endswith(".epub"):
808
+ return
809
+ if "application/epub+zip" in lowered_content_type:
810
+ return
811
+
812
+
813
def _derive_filename(response: requests.Response, url: str, filename_hint: str) -> str:
    """Choose a filename for a downloaded book.

    Sources are consulted in order: the caller's *filename_hint*, the
    ``Content-Disposition`` header of *response*, then the last path segment
    of *url*. The chosen name is URL-unquoted, reduced to its final path
    component and passed through ``normalize_source_filename``. When no source
    yields a name, ``"downloaded_book.epub"`` is returned.
    """

    def _basename(raw: str) -> str:
        # URL-unquote and keep only the final path component.
        return PurePosixPath(unquote(raw)).name

    hint = str(filename_hint or "").strip()
    if hint:
        return normalize_source_filename(_basename(hint), default_extension=".epub")

    disposition = response.headers.get("content-disposition", "")
    header_match = re.search(
        r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?',
        disposition,
        flags=re.IGNORECASE,
    )
    if header_match:
        return normalize_source_filename(
            _basename(header_match.group(1).strip()),
            default_extension=".epub",
        )

    url_name = _basename(urlparse(url).path)
    if not url_name:
        return "downloaded_book.epub"
    return normalize_source_filename(url_name, default_extension=".epub")
831
+
832
+
833
def _score_candidate(query: str, candidate: dict[str, Any]) -> float:
    """Score how well *candidate* matches the title *query*; higher is better.

    Returns ``-1.0`` for candidates without a download URL, with a non-English
    language, or when either the query or the title normalizes to no words.
    Otherwise the score is a title-similarity term (70% word-set Jaccard, 30%
    ``SequenceMatcher`` ratio, adjusted for exact / superset / disjoint
    titles) plus small bonuses for provider, ``.epub`` naming and downloads.
    """
    if not candidate.get("download_url") or not _is_english(candidate.get("language")):
        return -1.0

    normalized_query = _normalize_text(query)
    normalized_title = _normalize_text(candidate.get("title", ""))
    query_words = set(normalized_query.split())
    title_words = set(normalized_title.split())
    if not query_words or not title_words:
        return -1.0

    # Word overlap (Jaccard); the union is non-empty because both sets are.
    shared_words = query_words & title_words
    jaccard_score = len(shared_words) / len(query_words | title_words)

    # Character-level sequence similarity captures word order.
    sequence_score = SequenceMatcher(None, normalized_query, normalized_title).ratio()

    # Combined base score, weighted toward word overlap.
    title_score = jaccard_score * 0.7 + sequence_score * 0.3

    if normalized_query == normalized_title:
        title_score += 1.0  # strong bonus for an exact match
    elif query_words <= title_words:
        title_score += 0.3  # every query word appears in the title
    elif not shared_words:
        title_score -= 0.3  # no word overlap at all

    provider_bonuses = {"src_a": 0.1, "src_b": 0.08}
    provider_bonus = provider_bonuses.get(candidate.get("provider"), 0.0)

    filename = str(candidate.get("filename") or "").lower()
    download_url = str(candidate.get("download_url") or "").lower()
    looks_like_epub = filename.endswith(".epub") or download_url.endswith(".epub")
    epub_bonus = 0.05 if looks_like_epub else 0.0

    # Popularity bonus grows logarithmically and is capped at 0.05.
    downloads = max(int(candidate.get("downloads") or 0), 0)
    downloads_bonus = min(math.log10(downloads + 1) / 20, 0.05) if downloads else 0.0

    return title_score + provider_bonus + epub_bonus + downloads_bonus
882
+
883
+
884
def _looks_like_url(value: str) -> bool:
    """Return True when *value* is an absolute ``http``/``https`` URL with a host.

    Text that ``urlparse`` rejects outright (for example an unbalanced IPv6
    bracket such as ``"http://[::1"``) is reported as not-a-URL instead of
    letting the ``ValueError`` propagate.
    """
    try:
        parsed = urlparse(value)
    except ValueError:
        return False
    return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
887
+
888
+
889
def _normalize_text(value: str) -> str:
    """Lower-case *value* and reduce it to space-separated ``a-z0-9`` runs.

    Falsy input yields an empty string. Every character outside ``a-z`` and
    ``0-9`` (after lower-casing) acts as a separator.
    """
    text = str(value or "").lower()
    tokens = re.split(r"[^a-z0-9]+", text)
    # Splitting can leave empty strings at either end; drop them.
    return " ".join(token for token in tokens if token)
892
+
893
+
894
def _is_english(value: Any) -> bool:
    """Return True when *value* names English according to ``ENGLISH_CODES``.

    A list, tuple, set or frozenset is English when any of its items is.
    Any other value is converted to text and compared case-insensitively.

    ``_normalize_text`` replaces hyphens and underscores with spaces, so a
    tag such as ``"en-US"`` normalizes to ``"en us"`` and would never equal
    the hyphenated ``"en-us"`` entry. To make those entries reachable the
    value is also checked in its raw lower-cased form and with the
    normalized spaces turned back into hyphens.
    """
    if isinstance(value, (list, tuple, set, frozenset)):
        return any(_is_english(item) for item in value)

    raw = str(value or "").strip().lower()
    if raw in ENGLISH_CODES:
        return True

    normalized = _normalize_text(raw)
    if normalized in ENGLISH_CODES:
        return True
    # Covers separator variants such as "en_US" or "en us".
    return normalized.replace(" ", "-") in ENGLISH_CODES
899
+
900
+
901
def _unique_matches(pattern: str, text: str) -> list[str]:
    """Return the distinct, non-empty matches of *pattern* in *text*.

    Each ``re.findall`` result is stripped of surrounding whitespace. Results
    keep the order of their first occurrence.
    """
    stripped = (found.strip() for found in re.findall(pattern, text))
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(item for item in stripped if item))
908
+
909
+
910
def _pick_standard_ebooks_download(html: str) -> str:
    """Pick an EPUB download path from the ``/ebooks/.../downloads/`` links in *html*.

    Links containing ``.kepub.epub`` or ``_advanced.epub`` are passed over
    when another link exists. If every link is one of those, the first link
    is returned. With no links at all, the result is an empty string.
    """
    candidates = _unique_matches(r'href="(/ebooks/[^"]+/downloads/[^"]+?\.epub)"', html)
    if not candidates:
        return ""

    skipped_markers = (".kepub.epub", "_advanced.epub")
    preferred = (
        href
        for href in candidates
        if not any(marker in href.lower() for marker in skipped_markers)
    )
    return next(preferred, candidates[0])
918
+
919
+
920
def _pick_gutenberg_epub(html: str) -> str:
    """Return the first ``.epub`` / ``.epub.images`` href found in *html*.

    Returns an empty string when no such link is present.
    """
    candidates = _unique_matches(r'href="([^"]+\.epub(?:\.images)?)"', html)
    epub_links = (
        href
        for href in candidates
        if href.lower().endswith(".epub") or ".epub." in href.lower()
    )
    return next(epub_links, "")
927
+
928
+
929
def _pick_archive_epub_filename(metadata: dict[str, Any]) -> str:
    """Return the first entry name in ``metadata["files"]`` ending in ``.epub``.

    The suffix comparison is case-insensitive. A missing or falsy ``files``
    value, or a list with no EPUB entry, yields an empty string.
    """
    entries = metadata.get("files", []) or []
    names = (str(entry.get("name") or "") for entry in entries)
    return next((name for name in names if name.lower().endswith(".epub")), "")
935
+
936
+
937
def _extract_html_title(html: str) -> str:
    """Extract a bare book title from the ``<title>`` element of *html*.

    Whitespace runs are collapsed, then a trailing "- Standard Ebooks ..." or
    "| Project Gutenberg ..." site suffix and a trailing ", by <author>" are
    removed. Returns an empty string when no ``<title>`` element is found.
    """
    found = re.search(r"<title>\s*([^<]+?)\s*</title>", html, flags=re.IGNORECASE)
    if found is None:
        return ""

    cleaned = re.sub(r"\s+", " ", found.group(1)).strip()
    # Applied in this order: site suffixes first, then the author suffix.
    trailing_patterns = (
        r"\s*[-|]\s*Standard Ebooks.*$",
        r"\s*[-|]\s*Project Gutenberg.*$",
        r",\s*by\s+.+$",
    )
    for trailing in trailing_patterns:
        cleaned = re.sub(trailing, "", cleaned, flags=re.IGNORECASE)
    return cleaned
946
+
947
+
948
def _author_from_book_path(path: str) -> str:
    """Turn the second segment of a book path into a display name.

    Empty segments are ignored. With fewer than three segments the result is
    an empty string. Otherwise the second segment is split on hyphens and
    each piece is capitalized, e.g. ``"jane-austen"`` becomes
    ``"Jane Austen"``.
    """
    segments = list(filter(None, path.strip("/").split("/")))
    if len(segments) >= 3:
        return " ".join(map(str.capitalize, segments[1].split("-")))
    return ""
953
+
954
+
955
def _title_from_book_path(path: str) -> str:
    """Turn the third segment of a book path into a display title.

    Empty segments are ignored. With fewer than three segments the result is
    an empty string. Otherwise the third segment is split on hyphens and each
    piece is capitalized, e.g. ``"pride-and-prejudice"`` becomes
    ``"Pride And Prejudice"``.
    """
    segments = list(filter(None, path.strip("/").split("/")))
    if len(segments) >= 3:
        return " ".join(map(str.capitalize, segments[2].split("-")))
    return ""
960
+
961
+
962
def _first_text(value: Any) -> str:
    """Return *value* as text, using the first item when it is a list or tuple.

    An empty list/tuple and any falsy scalar both give an empty string.
    """
    if not isinstance(value, (list, tuple)):
        return str(value or "")
    if not value:
        return ""
    return str(value[0])
966
+
967
+
968
def _provider_timeout(config: AppConfig) -> int:
    """Return ``config.fetch_timeout_seconds`` as an int clamped to 5..10."""
    requested = int(config.fetch_timeout_seconds)
    if requested > 10:
        return 10
    if requested < 5:
        return 5
    return requested
970
+
971
+
972
def _normalize_download_url(url: str) -> str:
    """Add ``source=download`` to Standard Ebooks download URLs.

    A URL is rewritten only when its host is ``standardebooks.org`` or a
    subdomain of it and its path contains ``/downloads/``. An existing
    ``source`` parameter is left as it is. Every other URL is returned
    unchanged.

    The host is taken from ``hostname`` rather than ``netloc`` so that an
    explicit port or upper-case letters do not defeat the check, and the
    comparison requires a dot boundary so look-alike domains such as
    ``notstandardebooks.org`` are not rewritten.
    """
    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    on_standard_ebooks = host == "standardebooks.org" or host.endswith(".standardebooks.org")
    if not on_standard_ebooks or "/downloads/" not in parsed.path:
        return url

    params = dict(parse_qsl(parsed.query, keep_blank_values=True))
    params.setdefault("source", "download")
    return urlunparse(parsed._replace(query=urlencode(params)))