fromozu commited on
Commit
35b16a0
·
verified ·
1 Parent(s): 026edb5

Upload hf_backend/fetcher.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_backend/fetcher.py +828 -0
hf_backend/fetcher.py ADDED
@@ -0,0 +1,828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ import re
5
+ import time
6
+ import zipfile
7
+ from difflib import SequenceMatcher
8
+ from io import BytesIO
9
+ from pathlib import PurePosixPath
10
+ from typing import Any
11
+ from urllib.parse import parse_qsl, quote, unquote, urlencode, urljoin, urlparse, urlunparse
12
+
13
+ import requests
14
+
15
+ try:
16
+ import cloudscraper
17
+ except ImportError:
18
+ cloudscraper = None
19
+
20
+ try:
21
+ from bs4 import BeautifulSoup
22
+ except ImportError:
23
+ BeautifulSoup = None
24
+
25
+ from hf_backend.config import AppConfig
26
+ from hf_backend.filename_utils import normalize_source_filename
27
+
28
+
29
class FetchError(RuntimeError):
    """Error raised by this module when a book cannot be searched, downloaded, or validated."""
31
+
32
+
33
# Fallback message raised when no usable English EPUB could be found.
USER_FACING_NOT_FOUND = "未找到可用的英文 EPUB,请提供直链"

# Language values that the language check accepts as English.
ENGLISH_CODES = {"en", "eng", "en-us", "en-gb", "english"}
42
+
43
+
44
def fetch_book_input(config: AppConfig, query: str) -> dict[str, Any]:
    """Turn a user query into a downloaded EPUB plus descriptive metadata.

    A query that parses as an http(s) URL is downloaded directly. Any other
    query is treated as a book title: the search providers are queried in
    order, their candidates are ranked with ``_score_candidate`` and downloads
    are attempted from the best match downwards until one succeeds.

    Args:
        config: Application configuration handed to providers and downloaders.
        query: Book title or direct EPUB download link.

    Returns:
        A dict with ``filename``, ``content``, ``origin``, ``provider``,
        ``query`` and ``download_url``; title searches also include
        ``title`` and ``author``.

    Raises:
        FetchError: If the query is empty, no provider yields a usable
            candidate, or every candidate download fails.
        requests.RequestException: May propagate from a direct-link download.
    """
    normalized_query = str(query or "").strip()
    if not normalized_query:
        raise FetchError("请输入书名或 EPUB 下载链接")

    if _looks_like_url(normalized_query):
        filename, content = download_epub_from_url(config, normalized_query)
        return {
            "filename": normalize_source_filename(filename, default_extension=".epub"),
            "content": content,
            "origin": "link_fetch",
            "provider": "direct_link",
            "query": normalized_query,
            "download_url": normalized_query,
        }

    candidates: list[dict[str, Any]] = []
    last_error: Exception | None = None

    # Provider order: src_a first (most comprehensive), then src_b, then other sources
    for provider in (
        search_src_a,
        search_src_b,
        search_standard_ebooks,
        search_project_gutenberg,
        search_internet_archive,
    ):
        try:
            candidates.extend(provider(config, normalized_query))
        except (FetchError, requests.RequestException, ValueError) as exc:
            # A blocked, failing, or empty source must not stop the remaining providers.
            last_error = exc
            continue
        # Stop querying further providers once a near-exact match is available.
        best_candidate = pick_best_candidate(normalized_query, candidates)
        if best_candidate and _score_candidate(normalized_query, best_candidate) >= 0.9:
            break

    if not candidates:
        if last_error:
            error_msg = str(last_error)
            if "src_a" in error_msg:
                raise FetchError("未找到可用的英文 EPUB,请尝试提供直链或使用其他书名") from last_error
            raise FetchError(f"搜索失败:{error_msg[:100]}") from last_error
        raise FetchError(USER_FACING_NOT_FOUND)

    # Score every candidate once, rank best-first (stable for ties), and drop
    # candidates the scorer marks unusable (negative score: no download URL
    # or not English) so they are never downloaded.
    scored = [(_score_candidate(normalized_query, candidate), candidate) for candidate in candidates]
    scored.sort(key=lambda item: item[0], reverse=True)
    ranked = [candidate for score, candidate in scored if score >= 0]

    download_error: Exception | None = None
    for candidate in ranked:
        try:
            filename, content = download_epub_from_url(
                config,
                candidate["download_url"],
                filename_hint=candidate.get("filename", ""),
                provider=candidate.get("provider", ""),
            )
        except (FetchError, requests.RequestException, ValueError) as exc:
            download_error = exc
            continue  # Try next candidate
        return {
            "filename": normalize_source_filename(filename, default_extension=".epub"),
            "content": content,
            "origin": "title_fetch",
            "provider": candidate.get("provider", ""),
            "query": normalized_query,
            "title": candidate.get("title", ""),
            "author": candidate.get("author", ""),
            "download_url": candidate["download_url"],
        }

    raise FetchError(str(download_error) if download_error else USER_FACING_NOT_FOUND) from download_error
126
+
127
+
128
def download_epub_from_url(
    config: AppConfig,
    url: str,
    *,
    filename_hint: str = "",
    provider: str = "",
) -> tuple[str, bytes]:
    """Download an EPUB from ``url`` and return ``(filename, content)``.

    Requests for the ``src_a`` / ``src_b`` providers (identified by the
    ``provider`` hint or by the URL matching the configured base URL and
    path marker) are delegated to their specialised downloaders. Everything
    else is fetched with a plain ``requests.get`` and validated as an EPUB.

    Raises:
        FetchError: If the response body is empty or is not an EPUB.
        requests.RequestException: On HTTP or connection failures.
    """

    def _matches_source(base_url: str, marker: str) -> bool:
        # True when the URL lives under the configured base and has the path marker.
        if not base_url:
            return False
        lowered = url.lower()
        return lowered.startswith(base_url.lower()) and marker in lowered

    # Route to specialized downloaders based on provider hint or URL pattern
    if provider == "src_a" or _matches_source(config.src_a_base_url, "/slow_download/"):
        return _download_from_src_a(config, url, filename_hint)

    if provider == "src_b" or _matches_source(config.src_b_base_url, "/dl/"):
        return _download_from_src_b(config, url, filename_hint)

    reply = requests.get(
        _normalize_download_url(url),
        headers={"user-agent": config.fetch_user_agent},
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    reply.raise_for_status()

    payload = reply.content
    if not payload:
        raise FetchError("下载结果为空")

    name = _derive_filename(reply, reply.url or url, filename_hint)
    _validate_epub_bytes(payload, filename=name, content_type=reply.headers.get("content-type", ""))

    # Make sure the returned name always carries the .epub extension.
    if name.lower().endswith(".epub"):
        return name, payload
    return f"{name}.epub", payload
169
+
170
+
171
def _download_from_src_a(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """Download an EPUB from a src_a slow_download URL.

    Tries ``cloudscraper`` first when it is installed; any non-``FetchError``
    failure on that path falls through to a plain ``requests.get``. A 403
    response, an empty body, or a non-EPUB body raises ``FetchError``.
    """
    target = _normalize_download_url(url)

    def _finish(reply) -> tuple[str, bytes]:
        # Shared post-processing for both the cloudscraper and the requests path.
        if reply.status_code == 403:
            raise FetchError("下载被阻止,请在浏览器中打开此链接手动下载")
        reply.raise_for_status()
        payload = reply.content
        if not payload:
            raise FetchError("下载结果为空")
        name = _derive_filename(reply, reply.url or url, filename_hint)
        _validate_epub_bytes(payload, filename=name, content_type=reply.headers.get("content-type", ""))
        if not name.lower().endswith(".epub"):
            name = f"{name}.epub"
        return name, payload

    if cloudscraper is not None:
        try:
            session = cloudscraper.create_scraper(
                browser={"browser": "chrome", "platform": "windows", "mobile": False},
                delay=10,
            )
            browser_headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "accept": "*/*",
            }
            return _finish(
                session.get(
                    target,
                    headers=browser_headers,
                    timeout=120,
                    allow_redirects=True,
                )
            )
        except FetchError:
            raise
        except Exception:
            pass  # Fall through to requests

    # Fallback to plain requests
    return _finish(
        requests.get(
            target,
            headers={"user-agent": config.fetch_user_agent},
            timeout=config.fetch_timeout_seconds,
            allow_redirects=True,
        )
    )
233
+
234
+
235
def _download_from_src_b(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """Download an EPUB from a src_b download URL.

    Tries ``cloudscraper`` first when it is installed; any non-``FetchError``
    failure on that path falls through to a plain ``requests.get``. A 403
    response, an empty body, or a non-EPUB body raises ``FetchError``.
    """

    def _unpack(reply) -> tuple[str, bytes]:
        # Validate the HTTP reply and turn it into (filename, bytes).
        if reply.status_code == 403:
            raise FetchError("下载被阻止,请手动下载")
        reply.raise_for_status()
        body = reply.content
        if not body:
            raise FetchError("下载结果为空")
        resolved_name = _derive_filename(reply, reply.url or url, filename_hint)
        _validate_epub_bytes(
            body,
            filename=resolved_name,
            content_type=reply.headers.get("content-type", ""),
        )
        if resolved_name.lower().endswith(".epub"):
            return resolved_name, body
        return f"{resolved_name}.epub", body

    if cloudscraper is not None:
        try:
            session = cloudscraper.create_scraper(
                browser={"browser": "chrome", "platform": "windows", "mobile": False},
                delay=10,
            )
            reply = session.get(
                url,
                headers={
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                    "accept": "*/*",
                },
                timeout=120,
                allow_redirects=True,
            )
            return _unpack(reply)
        except FetchError:
            raise
        except Exception:
            pass  # Fall through to requests

    # Fallback to plain requests
    reply = requests.get(
        url,
        headers={"user-agent": config.fetch_user_agent},
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    return _unpack(reply)
295
+
296
+
297
def pick_best_candidate(query: str, candidates: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the highest-scoring candidate with a score of at least 0.45, or None.

    When several candidates share the top score, the one that appears first
    in ``candidates`` wins.
    """
    winner: dict[str, Any] | None = None
    winning_score: float | None = None
    for entry in candidates:
        entry_score = _score_candidate(query, entry)
        if not entry_score >= 0.45:
            continue
        # Strict comparison keeps the earliest candidate among equal scores.
        if winning_score is None or entry_score > winning_score:
            winner, winning_score = entry, entry_score
    return winner
307
+
308
+
309
def search_standard_ebooks(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Standard Ebooks and return download candidates for ``query``.

    Fetches the search page, then visits up to six linked book pages and
    keeps those that expose an EPUB download link.

    Raises:
        requests.RequestException: If the search page or a book page request fails.
    """
    search_url = config.standard_ebooks_search_url
    listing = requests.get(
        search_url,
        params={"query": query},
        headers={"user-agent": config.fetch_user_agent},
        timeout=_provider_timeout(config),
    )
    listing.raise_for_status()

    results: list[dict[str, Any]] = []
    book_paths = _unique_matches(r'href="(/ebooks/[^"/]+/[^"/]+)"', listing.text)
    for book_path in book_paths[:6]:
        page = requests.get(
            urljoin(search_url, book_path),
            headers={"user-agent": config.fetch_user_agent},
            timeout=_provider_timeout(config),
        )
        page.raise_for_status()

        epub_path = _pick_standard_ebooks_download(page.text)
        if not epub_path:
            continue

        # Fall back to the URL slug when the page has no usable <title>.
        book_title = _extract_html_title(page.text) or _title_from_book_path(book_path)
        absolute_download = urljoin(search_url, epub_path)
        results.append(
            {
                "provider": "standard_ebooks",
                "title": book_title,
                "author": _author_from_book_path(book_path),
                "language": "en",
                "download_url": _normalize_download_url(absolute_download),
                "filename": PurePosixPath(epub_path).name,
            }
        )
    return results
344
+
345
+
346
def search_project_gutenberg(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Project Gutenberg and return download candidates for ``query``.

    Fetches the search page, then visits up to five book detail pages and
    keeps those that link to an EPUB file.

    Raises:
        requests.RequestException: If the search page or a detail page request fails.
    """
    listing = requests.get(
        config.project_gutenberg_search_url,
        params={"query": query},
        headers={"user-agent": config.fetch_user_agent},
        timeout=_provider_timeout(config),
    )
    listing.raise_for_status()

    results: list[dict[str, Any]] = []
    for ebook_id in _unique_matches(r'href="/ebooks/(\d+)"', listing.text)[:5]:
        page_url = f"https://www.gutenberg.org/ebooks/{ebook_id}"
        page = requests.get(
            page_url,
            headers={"user-agent": config.fetch_user_agent},
            timeout=_provider_timeout(config),
        )
        page.raise_for_status()
        page_html = page.text

        epub_link = _pick_gutenberg_epub(page_html)
        if not epub_link:
            continue

        # Fall back to a synthetic file name when the link path has no final segment.
        link_name = PurePosixPath(urlparse(epub_link).path).name
        results.append(
            {
                "provider": "project_gutenberg",
                "title": _extract_html_title(page_html) or f"Project Gutenberg {ebook_id}",
                "author": "",
                "language": "en",
                "download_url": urljoin(page_url, epub_link),
                "filename": link_name or f"pg{ebook_id}.epub",
            }
        )
    return results
380
+
381
+
382
def search_internet_archive(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search the Internet Archive and return EPUB download candidates.

    Runs an advanced-search query for English texts whose title matches
    ``query``, then fetches each result's metadata to find an ``.epub`` file.

    Raises:
        requests.RequestException: If the search or a metadata request fails.
        ValueError: If a response body is not valid JSON.
    """
    search_params = {
        "q": f"title:({query}) AND mediatype:(texts) AND (language:(english) OR language:(eng) OR language:(en))",
        "fl[]": ["identifier", "title", "creator", "language", "downloads", "format"],
        "sort[]": "downloads desc",
        "rows": 8,
        "page": 1,
        "output": "json",
    }
    listing = requests.get(
        config.internet_archive_advancedsearch_url,
        params=search_params,
        headers={"user-agent": config.fetch_user_agent},
        timeout=_provider_timeout(config),
    )
    listing.raise_for_status()
    records = listing.json().get("response", {}).get("docs", [])

    results: list[dict[str, Any]] = []
    for record in records:
        if not _is_english(record.get("language")):
            continue

        # "format" may come back as a single string or a list of strings.
        available = record.get("format") or []
        if isinstance(available, str):
            available = [available]
        if not any(str(entry).strip().lower() == "epub" for entry in available):
            continue

        item_id = str(record.get("identifier") or "").strip()
        if not item_id:
            continue

        details = requests.get(
            config.internet_archive_metadata_url_template.format(identifier=item_id),
            headers={"user-agent": config.fetch_user_agent},
            timeout=_provider_timeout(config),
        )
        details.raise_for_status()
        epub_name = _pick_archive_epub_filename(details.json())
        if not epub_name:
            continue

        results.append(
            {
                "provider": "internet_archive",
                "title": str(record.get("title") or ""),
                "author": _first_text(record.get("creator")),
                "language": _first_text(record.get("language")),
                "downloads": int(record.get("downloads") or 0),
                "download_url": f"https://archive.org/download/{item_id}/{quote(epub_name)}",
                "filename": epub_name,
            }
        )
    return results
435
+
436
+
437
def search_src_a(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_a for EPUB files matching ``query``.

    Uses cloudscraper to fetch the search page and BeautifulSoup to parse it.
    Returns an empty list when src_a is not configured or when either
    optional dependency is missing.

    Args:
        config: Application configuration; reads ``src_a_search_url`` and
            ``src_a_base_url``.
        query: Book title to search for.

    Returns:
        Candidate dicts with ``provider``, ``title``, ``author``, ``language``,
        ``download_url``, ``filename``, ``filesize`` and ``year``.

    Raises:
        FetchError: If the search page is missing/too short, or any other
            error occurs while fetching or parsing (the original exception
            is chained as the cause).
    """
    if not config.src_a_search_url or not config.src_a_base_url:
        return []

    if cloudscraper is None or BeautifulSoup is None:
        return []

    candidates: list[dict[str, Any]] = []
    search_url = config.src_a_search_url
    # Strip a trailing slash so the download URL never contains "//slow_download".
    base_url = config.src_a_base_url.rstrip("/")

    try:
        scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-language": "en-US,en;q=0.9",
        }
        response = scraper.get(
            f"{search_url}?q={quote(query)}",
            headers=headers,
            timeout=60,
        )
        response.raise_for_status()

        html = response.text
        if not html or len(html) < 500:
            raise FetchError("搜索页面加载失败")

        soup = BeautifulSoup(html, "lxml")

        # Find all links containing /md5/ - these are book entry links
        md5_links = soup.find_all("a", href=lambda h: h and "/md5/" in h)

        seen_md5: set[str] = set()
        for link in md5_links:
            link_text = (link.get_text() or "").strip()
            # Keep only links with empty text or the literal "Save" label.
            if link_text and link_text != "Save":
                continue

            href = link.get("href", "")
            md5_match = re.search(r"/md5/([a-f0-9]+)", href)
            if not md5_match:
                continue
            md5 = md5_match.group(1)
            if md5 in seen_md5:
                continue

            # Find the parent container to get context
            container = link.find_parent("div")
            if not container:
                container = link.parent
            if not container:
                continue

            container_text = container.get_text(separator="\n") or ""

            # Check if this container has an EPUB file
            epub_match = re.search(r"([\w./-]+\.epub)", container_text, re.IGNORECASE)
            if not epub_match:
                continue

            epub_path = epub_match.group(1)
            seen_md5.add(md5)

            # Extract metadata: English [en] · EPUB · 1.2MB · 2020
            meta_match = re.search(
                r"English\s*\[([^\]]+)\]\s*[·•]\s*([A-Z]+)\s*[·•]\s*([\d.]+(?:MB|GB))\s*[·•]\s*(\d{4})",
                container_text,
            )
            filesize = meta_match.group(3) if meta_match else ""
            year = meta_match.group(4) if meta_match else ""

            # Extract title and author from container text lines: the first
            # line that is not a file name, a bare number, or a metadata/label
            # line is taken as the title, the next short one as the author.
            lines = [raw.strip() for raw in container_text.split("\n") if raw.strip()]
            title = ""
            author = ""
            for line in lines:
                if ".epub" in line.lower():
                    continue
                if re.match(r"^[\d.,]+$", line):
                    continue
                if re.match(r"^(English|Save|\d+)", line):
                    continue
                if not title:
                    title = line[:200]
                elif not author and len(line) < 100:
                    author = line
                    break

            candidates.append({
                "provider": "src_a",
                "title": title,
                "author": author,
                "language": "en",
                "download_url": f"{base_url}/slow_download/{md5}/0/3",
                "filename": epub_path.rsplit("/", 1)[-1],
                "filesize": filesize,
                "year": year,
            })

    except FetchError:
        raise
    except Exception as exc:
        raise FetchError(f"搜索失败: {str(exc)[:100]}") from exc

    return candidates
558
+
559
+
560
def search_src_b(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_b for EPUB files matching ``query``.

    Uses cloudscraper to load the search page, then opens up to fifteen
    linked book pages and keeps those whose text mentions "epub". Returns
    an empty list when src_b is not configured or when cloudscraper or
    BeautifulSoup is not installed.

    Raises:
        FetchError: If the search page is missing/too short, has no book
            links, or the search request fails for any other reason.
    """
    if not config.src_b_base_url:
        return []
    if cloudscraper is None or BeautifulSoup is None:
        return []

    root = config.src_b_base_url.rstrip("/")
    found: list[dict[str, Any]] = []

    try:
        session = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
        request_headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-language": "en-US,en;q=0.9",
        }

        # Search page
        listing = session.get(f"{root}/s/{quote(query)}", headers=request_headers, timeout=60)
        listing.raise_for_status()

        listing_html = listing.text
        if not listing_html or len(listing_html) < 200:
            raise FetchError("搜索页面加载失败")

        # Book entries are the anchors whose href contains /book/
        anchors = BeautifulSoup(listing_html, "lxml").find_all(
            "a", href=lambda h: h and "/book/" in h
        )
        if not anchors:
            raise FetchError("未找到任何书籍")

        # Keep the first occurrence of each href that has a meaningful label
        visited: set[str] = set()
        entries: list[tuple[str, str]] = []
        for anchor in anchors:
            target = anchor.get("href", "")
            if not target or target in visited:
                continue
            visited.add(target)
            label = (anchor.get_text() or "").strip()
            if label and len(label) > 3:
                entries.append((target, label))

        # Visit each book page to check for EPUB format
        for target, label in entries[:15]:
            page_url = target if target.startswith("http") else f"{root}{target}"

            try:
                page = session.get(page_url, headers=request_headers, timeout=30)
                page.raise_for_status()
                page_html = page.text
                page_text = BeautifulSoup(page_html, "lxml").get_text(separator="\n") or ""

                if "epub" not in page_text.lower():
                    continue

                # Author is whatever follows an "Author"/"Authors" label on its line
                author_hit = re.search(r"Author[s]?[:\s]*([^\n]+)", page_text, re.IGNORECASE)
                writer = author_hit.group(1).strip()[:100] if author_hit else ""

                # Prefer the /dl/ link; otherwise fall back to the book page URL
                dl_hit = re.search(r"/dl/([a-zA-Z0-9]+)", page_html)
                link = f"{root}/dl/{dl_hit.group(1)}" if dl_hit else page_url

                # Title is the first line of the link label, or its first 100 characters
                heading = label.split("\n")[0].strip() or label[:100]

                found.append({
                    "provider": "src_b",
                    "title": heading,
                    "author": writer,
                    "language": "en",
                    "download_url": link,
                    "filename": "",
                    "filesize": "",
                })
            except Exception:
                # A single broken book page must not abort the whole search.
                continue

    except FetchError:
        raise
    except Exception as exc:
        raise FetchError(f"搜索失败: {str(exc)[:100]}")

    return found
665
+
666
+
667
+ def _validate_epub_bytes(content: bytes, *, filename: str, content_type: str) -> None:
668
+ if len(content) < 4 or not content.startswith(b"PK"):
669
+ raise FetchError("下载内容不是 EPUB")
670
+
671
+ try:
672
+ with zipfile.ZipFile(BytesIO(content)) as archive:
673
+ mimetype = archive.read("mimetype").decode("utf-8", errors="replace").strip()
674
+ except (KeyError, zipfile.BadZipFile) as exc:
675
+ raise FetchError("下载内容不是 EPUB") from exc
676
+
677
+ if mimetype != "application/epub+zip":
678
+ raise FetchError("下载内容不是 EPUB")
679
+
680
+ lowered_content_type = content_type.lower()
681
+ if filename.lower().endswith(".epub"):
682
+ return
683
+ if "application/epub+zip" in lowered_content_type:
684
+ return
685
+
686
+
687
def _derive_filename(response: requests.Response, url: str, filename_hint: str) -> str:
    """Choose a file name for a download.

    Sources are tried in order: the caller's ``filename_hint``, the
    ``Content-Disposition`` response header, then the last path segment of
    ``url``. Falls back to ``"downloaded_book.epub"`` when none yields a name.
    """

    def _clean(raw: str) -> str:
        # Percent-decode, keep only the final path component, then normalize.
        return normalize_source_filename(PurePosixPath(unquote(raw)).name, default_extension=".epub")

    supplied = str(filename_hint or "").strip()
    if supplied:
        return _clean(supplied)

    header_value = response.headers.get("content-disposition", "")
    found = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?', header_value, flags=re.IGNORECASE)
    if found:
        return _clean(found.group(1).strip())

    tail = PurePosixPath(unquote(urlparse(url).path)).name
    if not tail:
        return "downloaded_book.epub"
    return normalize_source_filename(tail, default_extension=".epub")
705
+
706
+
707
def _score_candidate(query: str, candidate: dict[str, Any]) -> float:
    """Score how well ``candidate`` matches ``query``; higher is better.

    Returns ``-1.0`` for candidates without a download URL or whose language
    is not English. Otherwise the score is the title similarity ratio (plus
    0.25 for an exact normalized match) plus small bonuses for the provider,
    an ``.epub`` file name or URL, and the download count.
    """
    if not candidate.get("download_url") or not _is_english(candidate.get("language")):
        return -1.0

    wanted = _normalize_text(query)
    offered = _normalize_text(candidate.get("title", ""))
    score = SequenceMatcher(None, wanted, offered).ratio()
    if wanted and offered and wanted == offered:
        score += 0.25

    provider_weights = {
        "standard_ebooks": 0.08,
        "project_gutenberg": 0.05,
        "internet_archive": 0.03,
        "src_a": 0.04,
        "src_b": 0.05,
    }
    score += provider_weights.get(candidate.get("provider"), 0.0)

    file_name = str(candidate.get("filename") or "").lower()
    link = str(candidate.get("download_url") or "").lower()
    if file_name.endswith(".epub") or link.endswith(".epub"):
        score += 0.05
    else:
        score += 0.0

    # Popularity bonus grows logarithmically and is capped at 0.05.
    popularity = max(int(candidate.get("downloads") or 0), 0)
    if popularity:
        score += min(math.log10(popularity + 1) / 20, 0.05)
    else:
        score += 0.0
    return score
732
+
733
+
734
+ def _looks_like_url(value: str) -> bool:
735
+ parsed = urlparse(value)
736
+ return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
737
+
738
+
739
+ def _normalize_text(value: str) -> str:
740
+ lowered = re.sub(r"[^a-z0-9]+", " ", str(value or "").lower())
741
+ return " ".join(lowered.split())
742
+
743
+
744
def _is_english(value: Any) -> bool:
    """Return True when ``value`` denotes English.

    Lists, tuples and sets count as English if any element does. Other
    values are normalized and looked up in ``ENGLISH_CODES``.
    """
    if isinstance(value, (list, tuple, set)):
        return any(_is_english(item) for item in value)
    normalized = _normalize_text(str(value or ""))
    if normalized in ENGLISH_CODES:
        return True
    # Normalization turns "-" into a space, so hyphenated codes such as
    # "en-us" arrive here as "en us"; restore the hyphen before the lookup.
    return normalized.replace(" ", "-") in ENGLISH_CODES
749
+
750
+
751
+ def _unique_matches(pattern: str, text: str) -> list[str]:
752
+ results: list[str] = []
753
+ for match in re.findall(pattern, text):
754
+ value = match.strip()
755
+ if value and value not in results:
756
+ results.append(value)
757
+ return results
758
+
759
+
760
def _pick_standard_ebooks_download(html: str) -> str:
    """Pick an EPUB download path from a Standard Ebooks book page.

    Prefers the first link that is neither a ``.kepub.epub`` nor an
    ``_advanced.epub`` build; otherwise returns the first EPUB link, or ``""``
    when the page has none.
    """
    found = _unique_matches(r'href="(/ebooks/[^"]+/downloads/[^"]+?\.epub)"', html)
    plain = next(
        (
            href
            for href in found
            if ".kepub.epub" not in href.lower() and "_advanced.epub" not in href.lower()
        ),
        None,
    )
    if plain is not None:
        return plain
    return found[0] if found else ""
768
+
769
+
770
def _pick_gutenberg_epub(html: str) -> str:
    """Return the first EPUB link (``.epub`` or ``.epub.images``) found in ``html``, or ``""``."""
    found = _unique_matches(r'href="([^"]+\.epub(?:\.images)?)"', html)
    return next(
        (href for href in found if href.lower().endswith(".epub") or ".epub." in href.lower()),
        "",
    )
777
+
778
+
779
+ def _pick_archive_epub_filename(metadata: dict[str, Any]) -> str:
780
+ for item in metadata.get("files", []) or []:
781
+ name = str(item.get("name") or "")
782
+ if name.lower().endswith(".epub"):
783
+ return name
784
+ return ""
785
+
786
+
787
+ def _extract_html_title(html: str) -> str:
788
+ title_match = re.search(r"<title>\s*([^<]+?)\s*</title>", html, flags=re.IGNORECASE)
789
+ if not title_match:
790
+ return ""
791
+ title = re.sub(r"\s+", " ", title_match.group(1)).strip()
792
+ title = re.sub(r"\s*[-|]\s*Standard Ebooks.*$", "", title, flags=re.IGNORECASE)
793
+ title = re.sub(r"\s*[-|]\s*Project Gutenberg.*$", "", title, flags=re.IGNORECASE)
794
+ title = re.sub(r",\s*by\s+.+$", "", title, flags=re.IGNORECASE)
795
+ return title
796
+
797
+
798
+ def _author_from_book_path(path: str) -> str:
799
+ parts = [part for part in path.strip("/").split("/") if part]
800
+ if len(parts) < 3:
801
+ return ""
802
+ return " ".join(part.capitalize() for part in parts[1].split("-"))
803
+
804
+
805
+ def _title_from_book_path(path: str) -> str:
806
+ parts = [part for part in path.strip("/").split("/") if part]
807
+ if len(parts) < 3:
808
+ return ""
809
+ return " ".join(part.capitalize() for part in parts[2].split("-"))
810
+
811
+
812
+ def _first_text(value: Any) -> str:
813
+ if isinstance(value, (list, tuple)):
814
+ return str(value[0]) if value else ""
815
+ return str(value or "")
816
+
817
+
818
+ def _provider_timeout(config: AppConfig) -> int:
819
+ return max(5, min(int(config.fetch_timeout_seconds), 10))
820
+
821
+
822
+ def _normalize_download_url(url: str) -> str:
823
+ parsed = urlparse(url)
824
+ if parsed.netloc.endswith("standardebooks.org") and "/downloads/" in parsed.path:
825
+ params = dict(parse_qsl(parsed.query, keep_blank_values=True))
826
+ params.setdefault("source", "download")
827
+ return urlunparse(parsed._replace(query=urlencode(params)))
828
+ return url