Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

fromozu committed about 1 month ago

Commit

4e2658f

verified ·

1 Parent(s): b194eda

Revert: remove Playwright fallback

Browse files

Files changed (1) hide show

hf_backend/fetcher.py +978 -1009

hf_backend/fetcher.py CHANGED Viewed

@@ -1,1009 +1,978 @@
-from __future__ import annotations
-import math
-import re
-import time
-import zipfile
-from difflib import SequenceMatcher
-from io import BytesIO
-from pathlib import PurePosixPath
-from typing import Any
-from urllib.parse import parse_qsl, quote, unquote, urlencode, urljoin, urlparse, urlunparse
-import requests
-try:
-    import cloudscraper
-except ImportError:
-    cloudscraper = None
-try:
-    from bs4 import BeautifulSoup
-except ImportError:
-    BeautifulSoup = None
-try:
-    from playwright.sync_api import sync_playwright
-except ImportError:
-    sync_playwright = None
-from hf_backend.config import AppConfig
-from hf_backend.filename_utils import normalize_source_filename
class FetchError(RuntimeError):
    """Raised when a book cannot be searched for, downloaded, or validated as an EPUB."""
# User-facing message: "No usable English EPUB found; please provide a direct link".
USER_FACING_NOT_FOUND = "未找到可用的英文 EPUB，请提供直链"

# Lower-case language tags/names accepted as English
# (presumably consulted by _is_english, which is defined elsewhere in this file).
ENGLISH_CODES = {"en", "eng", "en-us", "en-gb", "english"}
def fetch_book_input(config: AppConfig, query: str) -> dict[str, Any]:
    """Turn *query* (a direct EPUB URL or a book title) into downloaded EPUB data.

    A URL-looking query is downloaded directly. Anything else is searched on
    the two configured providers, the hits are ranked with ``_score_candidate``
    and downloaded best-first until one succeeds.

    Returns:
        A dict with ``filename``, ``content``, ``origin``, ``provider``,
        ``query`` and ``download_url`` (plus ``title``/``author`` for title
        searches).

    Raises:
        FetchError: the query is empty, nothing was found, or every attempted
            download failed.
    """
    cleaned = str(query or "").strip()
    if not cleaned:
        raise FetchError("请输入书名或 EPUB 下载链接")

    if _looks_like_url(cleaned):
        direct_name, direct_bytes = download_epub_from_url(config, cleaned)
        return {
            "filename": normalize_source_filename(direct_name, default_extension=".epub"),
            "content": direct_bytes,
            "origin": "link_fetch",
            "provider": "direct_link",
            "query": cleaned,
            "download_url": cleaned,
        }

    # Title search: only the two src_a / src_b providers are consulted.
    found: list[dict[str, Any]] = []
    search_failure = None
    for search in (search_src_a, search_src_b):
        try:
            found.extend(search(config, cleaned))
        except (FetchError, requests.RequestException, ValueError) as exc:
            search_failure = exc

    if not found:
        if search_failure:
            detail = str(search_failure)
            if "src_a" in detail:
                raise FetchError("未找到可用的英文 EPUB，请尝试提供直链或使用其他书名")
            raise FetchError(f"搜索失败：{detail[:100]}")
        raise FetchError(USER_FACING_NOT_FOUND)

    # Best score first; ties keep their discovery order.
    ranked = sorted(
        ((item, _score_candidate(cleaned, item)) for item in found),
        key=lambda pair: pair[1],
        reverse=True,
    )
    best_score = ranked[0][1] if ranked else 0.0
    # A fallback candidate must reach at least this fraction of the best score.
    min_ratio = 0.6

    download_failure = None
    for item, item_score in ranked:
        # Once the top match is decent, stop before drifting to unrelated books.
        if best_score > 0.5 and item_score < best_score * min_ratio:
            break
        source = item.get("provider", "")
        try:
            name, payload = download_epub_from_url(
                config,
                item["download_url"],
                filename_hint=item.get("filename", ""),
                provider=source,
            )
            return {
                "filename": normalize_source_filename(name, default_extension=".epub"),
                "content": payload,
                "origin": "title_fetch",
                "provider": source,
                "query": cleaned,
                "title": item.get("title", ""),
                "author": item.get("author", ""),
                "download_url": item["download_url"],
            }
        except (FetchError, requests.RequestException, ValueError) as exc:
            download_failure = exc

    raise FetchError(str(download_failure) if download_failure else USER_FACING_NOT_FOUND)
def download_epub_from_url(
    config: AppConfig,
    url: str,
    *,
    filename_hint: str = "",
    provider: str = "",
) -> tuple[str, bytes]:
    """Download an EPUB from *url* and return ``(filename, content)``.

    The request is delegated to the src_a / src_b downloader when *provider*
    names one of them, or when *url* starts with that provider's configured
    base URL and contains its download path marker. Every other URL is
    fetched with a plain ``requests.get``.

    Raises:
        FetchError: the body is empty or is not a valid EPUB.
        requests.RequestException: the plain HTTP request failed.
    """

    def _served_by(base_url: str, marker: str) -> bool:
        # Case-insensitive prefix match against the provider base plus path marker.
        if not base_url:
            return False
        lowered = url.lower()
        return lowered.startswith(base_url.lower()) and marker in lowered

    if provider == "src_a" or _served_by(config.src_a_base_url, "/slow_download/"):
        return _download_from_src_a(config, url, filename_hint)
    if provider == "src_b" or _served_by(config.src_b_base_url, "/dl/"):
        return _download_from_src_b(config, url, filename_hint)

    response = requests.get(
        _normalize_download_url(url),
        headers={"user-agent": config.fetch_user_agent},
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    response.raise_for_status()
    payload = response.content
    if not payload:
        raise FetchError("下载结果为空")

    name = _derive_filename(response, response.url or url, filename_hint)
    _validate_epub_bytes(
        payload,
        filename=name,
        content_type=response.headers.get("content-type", ""),
    )
    if not name.lower().endswith(".epub"):
        name = f"{name}.epub"
    return name, payload
# Desktop Chrome user agent sent to Libgen and the src_a mirrors.
_SRC_A_BROWSER_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)


def _src_a_epub_from_response(response, fallback_url: str, filename_hint: str) -> tuple[str, bytes]:
    """Validate a response body as EPUB and return ``(filename, content)``; raises FetchError otherwise."""
    filename = _derive_filename(response, response.url or fallback_url, filename_hint)
    _validate_epub_bytes(
        response.content,
        filename=filename,
        content_type=response.headers.get("content-type", ""),
    )
    if not filename.lower().endswith(".epub"):
        filename = f"{filename}.epub"
    return filename, response.content


def _src_a_try_libgen(config: AppConfig, md5: str, filename_hint: str) -> tuple[str, bytes] | None:
    """Follow the "GET" link on Libgen's ads page for *md5*; return the EPUB or None if no link/body."""
    headers = {"user-agent": _SRC_A_BROWSER_UA}
    ads_response = requests.get(
        f"https://libgen.li/ads.php?md5={md5}", headers=headers, timeout=15
    )
    ads_response.raise_for_status()
    get_link = BeautifulSoup(ads_response.text, "lxml").find("a", string=re.compile("GET"))
    get_href = get_link.get("href", "") if get_link else ""
    if not get_href:
        return None
    download_url = f"https://libgen.li/{get_href.lstrip('/')}"
    response = requests.get(
        download_url,
        headers=headers,
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    response.raise_for_status()
    if not response.content:
        return None
    return _src_a_epub_from_response(response, download_url, filename_hint)


def _src_a_try_mirrors(
    base_url: str,
    kind: str,
    md5: str,
    server_ids: range,
    timeout: float,
    filename_hint: str,
    errors: list,
) -> tuple[str, bytes] | None:
    """Try ``{base_url}/{kind}/{md5}/0/{id}`` for each id via cloudscraper; first valid EPUB wins."""
    try:
        scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
    except Exception as exc:  # best-effort: a broken scraper just skips this stage
        errors.append(exc)
        return None
    headers = {"user-agent": _SRC_A_BROWSER_UA, "accept": "*/*"}
    for server_id in server_ids:
        mirror_url = f"{base_url}/{kind}/{md5}/0/{server_id}"
        try:
            response = scraper.get(
                mirror_url, headers=headers, timeout=timeout, allow_redirects=True
            )
            # Tiny bodies are rejected before validation (presumably error or
            # challenge pages rather than books).
            if response.status_code != 200 or len(response.content) <= 1000:
                continue
            return _src_a_epub_from_response(response, mirror_url, filename_hint)
        except Exception as exc:  # best-effort: move on to the next server
            errors.append(exc)
    return None


def _download_from_src_a(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """Download an EPUB identified by the md5 embedded in a src_a ``/slow_download/`` URL.

    Sources are tried in order, each one best-effort:

    1. Libgen (plain ``requests``; needs BeautifulSoup),
    2. src_a ``fast_download`` servers 0-5 (needs cloudscraper),
    3. src_a ``slow_download`` servers 0-3 (needs cloudscraper, 180 s timeout),
    4. src_a ``slow_download`` servers 0-2 through the Playwright helper.

    Stages 2-4 are skipped when ``config.src_a_base_url`` is empty.

    Args:
        config: application config; ``src_a_base_url`` and
            ``fetch_timeout_seconds`` are read.
        url: URL containing ``/slow_download/{md5}``.
        filename_hint: preferred filename, may be empty.

    Returns:
        ``(filename, content)`` of the first download that validates as EPUB.

    Raises:
        FetchError: no md5 could be extracted, or every source failed (the
            last underlying error, if any, is attached as ``__cause__``).
    """
    # URL format: .../slow_download/{md5}/0/3. The router matches URLs
    # case-insensitively, so accept upper-case hex here too.
    md5_match = re.search(r"/slow_download/([a-f0-9]+)", url, flags=re.IGNORECASE)
    if not md5_match:
        raise FetchError("无法从 URL 提取 md5")
    md5 = md5_match.group(1).lower()

    errors: list = []

    # --- Attempt 1: Libgen ---
    if BeautifulSoup is not None:
        try:
            result = _src_a_try_libgen(config, md5, filename_hint)
            if result is not None:
                return result
        except Exception as exc:  # fall through to the src_a mirrors
            errors.append(exc)

    # The provider hint alone can route here, so the base URL may be unset.
    base_url = (config.src_a_base_url or "").rstrip("/")

    if base_url and cloudscraper is not None:
        # --- Attempt 2: fast_download, then Attempt 3: slow_download ---
        for kind, server_ids, timeout in (
            ("fast_download", range(6), config.fetch_timeout_seconds),
            ("slow_download", range(4), 180),
        ):
            result = _src_a_try_mirrors(
                base_url, kind, md5, server_ids, timeout, filename_hint, errors
            )
            if result is not None:
                return result

    # --- Attempt 4: Playwright headless browser ---
    if base_url and sync_playwright is not None:
        for server_id in range(3):
            slow_url = f"{base_url}/slow_download/{md5}/0/{server_id}"
            try:
                content = _download_with_playwright(slow_url)
                if content and len(content) > 1000:
                    _validate_epub_bytes(
                        content=content,
                        filename=filename_hint or "book.epub",
                        content_type="",
                    )
                    if filename_hint and filename_hint.lower().endswith(".epub"):
                        fname = filename_hint
                    else:
                        fname = f"{filename_hint or 'book'}.epub"
                    return fname, content
            except Exception as exc:  # best-effort: try the next server
                errors.append(exc)

    raise FetchError("所有下载方式均失败（Libgen 和 Anna's Archive）") from (
        errors[-1] if errors else None
    )
def _download_with_playwright(url: str, *, wait_seconds: int = 50) -> bytes | None:
    """Open *url* in headless Chromium and try to capture the file it offers.

    The page is polled about once per second for up to *wait_seconds*
    iterations. On each pass every ``a[href]`` link that looks like a download
    (``get.php`` in the href, an ``.epub`` href, or "download" in the link
    text) is clicked; if the click does not start a browser download within
    5 s, the link target is fetched through the browser context's request API
    instead.

    Args:
        url: page to open (a src_a ``slow_download`` page in current callers).
        wait_seconds: maximum number of one-second polling iterations.

    Returns:
        The downloaded bytes, or ``None`` when Playwright is unavailable,
        nothing could be captured, or any browser error occurred.
    """
    if sync_playwright is None:
        return None
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            page = context.new_page()
            page.goto(url, timeout=60_000, wait_until="domcontentloaded")
            for _ in range(wait_seconds):
                # wait_for_timeout keeps Playwright's event loop running;
                # time.sleep would block it.
                page.wait_for_timeout(1_000)
                current = page.url
                if current != url and "slow_download" not in current and "fast_download" not in current:
                    # The page navigated away from the download page: stop
                    # polling. No further fetch is attempted, so None is returned.
                    break
                try:
                    for link in page.query_selector_all("a[href]"):
                        href = link.get_attribute("href") or ""
                        text = (link.text_content() or "").strip().lower()
                        looks_like_download = (
                            "get.php" in href
                            or href.endswith(".epub")
                            or ("download" in text and href and href != "#")
                        )
                        if not looks_like_download:
                            continue
                        try:
                            with page.expect_download(timeout=5_000) as dl_info:
                                link.click()
                            dl_path = dl_info.value.path()
                            if dl_path:
                                with open(dl_path, "rb") as f:
                                    return f.read()
                        except Exception:
                            # The click did not start a download: fetch the
                            # link directly, resolving relative hrefs against
                            # the page the link was found on.
                            resp = context.request.get(urljoin(current, href), timeout=120_000)
                            if resp.status == 200:
                                body = resp.body()
                                if len(body) > 1000:
                                    return body
                except Exception:
                    pass  # best-effort: retry on the next polling iteration
        except Exception:
            pass  # best-effort: any browser failure is reported as None
        finally:
            browser.close()
    return None
def _download_from_src_b(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """Fetch an EPUB from a src_b download URL and return ``(filename, content)``.

    cloudscraper is tried first when it is installed. If that attempt fails
    with anything other than ``FetchError``, the download is retried with
    plain ``requests``.

    Raises:
        FetchError: HTTP 403, an empty body, or a body that is not an EPUB.
        requests.RequestException: the plain-requests fallback failed.
    """

    def _unpack(response) -> tuple[str, bytes]:
        # Shared post-processing for both transports.
        if response.status_code == 403:
            raise FetchError("下载被阻止，请手动下载")
        response.raise_for_status()
        payload = response.content
        if not payload:
            raise FetchError("下载结果为空")
        name = _derive_filename(response, response.url or url, filename_hint)
        _validate_epub_bytes(
            payload,
            filename=name,
            content_type=response.headers.get("content-type", ""),
        )
        if not name.lower().endswith(".epub"):
            name = f"{name}.epub"
        return name, payload

    if cloudscraper is not None:
        try:
            session = cloudscraper.create_scraper(
                browser={"browser": "chrome", "platform": "windows", "mobile": False},
                delay=10,
            )
            browser_headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                "accept": "*/*",
            }
            return _unpack(
                session.get(url, headers=browser_headers, timeout=120, allow_redirects=True)
            )
        except FetchError:
            raise
        except Exception:
            pass  # any other failure: retry below with plain requests

    return _unpack(
        requests.get(
            url,
            headers={"user-agent": config.fetch_user_agent},
            timeout=config.fetch_timeout_seconds,
            allow_redirects=True,
        )
    )
def pick_best_candidate(query: str, candidates: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the candidate with the highest score of at least 0.45, or ``None``.

    When several candidates share the top score, the earliest one wins.
    """
    winner = None
    winner_score = None
    for entry in candidates:
        entry_score = _score_candidate(query, entry)
        if not entry_score >= 0.45:
            continue
        # Strict comparison keeps the first of equally scored candidates.
        if winner_score is None or entry_score > winner_score:
            winner, winner_score = entry, entry_score
    return winner
def search_standard_ebooks(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Standard Ebooks and return candidates for the first six result pages.

    Each ``/ebooks/<author>/<title>`` link on the search page is opened, and a
    candidate is produced only when ``_pick_standard_ebooks_download`` finds a
    download path on the detail page.

    Raises:
        requests.RequestException: the search request or a detail request failed.
    """

    def _get(target: str, **extra):
        reply = requests.get(
            target,
            headers={"user-agent": config.fetch_user_agent},
            timeout=_provider_timeout(config),
            **extra,
        )
        reply.raise_for_status()
        return reply

    listing = _get(config.standard_ebooks_search_url, params={"query": query})
    book_paths = _unique_matches(r'href="(/ebooks/[^"/]+/[^"/]+)"', listing.text)

    results: list[dict[str, Any]] = []
    for book_path in book_paths[:6]:
        detail_html = _get(urljoin(config.standard_ebooks_search_url, book_path)).text
        epub_path = _pick_standard_ebooks_download(detail_html)
        if not epub_path:
            continue
        entry = {
            "provider": "standard_ebooks",
            "title": _extract_html_title(detail_html) or _title_from_book_path(book_path),
            "author": _author_from_book_path(book_path),
            "language": "en",
            "download_url": _normalize_download_url(
                urljoin(config.standard_ebooks_search_url, epub_path),
            ),
            "filename": PurePosixPath(epub_path).name,
        }
        results.append(entry)
    return results
def search_project_gutenberg(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Project Gutenberg and return candidates for the first five book ids.

    Book ids are taken from ``/ebooks/<id>`` links on the search page. A
    candidate is produced only when ``_pick_gutenberg_epub`` finds an EPUB
    path on the book's detail page.

    Raises:
        requests.RequestException: the search request or a detail request failed.
    """

    def _get(target: str, **extra):
        reply = requests.get(
            target,
            headers={"user-agent": config.fetch_user_agent},
            timeout=_provider_timeout(config),
            **extra,
        )
        reply.raise_for_status()
        return reply

    listing = _get(config.project_gutenberg_search_url, params={"query": query})
    found_ids = _unique_matches(r'href="/ebooks/(\d+)"', listing.text)

    results: list[dict[str, Any]] = []
    for book_id in found_ids[:5]:
        detail_url = f"https://www.gutenberg.org/ebooks/{book_id}"
        page_html = _get(detail_url).text
        epub_path = _pick_gutenberg_epub(page_html)
        if not epub_path:
            continue
        entry = {
            "provider": "project_gutenberg",
            "title": _extract_html_title(page_html) or f"Project Gutenberg {book_id}",
            "author": "",
            "language": "en",
            "download_url": urljoin(detail_url, epub_path),
            "filename": PurePosixPath(urlparse(epub_path).path).name or f"pg{book_id}.epub",
        }
        results.append(entry)
    return results
def search_internet_archive(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search the Internet Archive for English texts with an EPUB file.

    Runs an advanced search (up to 8 rows, most-downloaded first), keeps the
    documents that are English and list an ``epub`` format, then reads each
    item's metadata to pick the EPUB filename.

    Raises:
        requests.RequestException: the search request or a metadata request failed.
    """
    search_params = {
        "q": f"title:({query}) AND mediatype:(texts) AND (language:(english) OR language:(eng) OR language:(en))",
        "fl[]": ["identifier", "title", "creator", "language", "downloads", "format"],
        "sort[]": "downloads desc",
        "rows": 8,
        "page": 1,
        "output": "json",
    }
    search_reply = requests.get(
        config.internet_archive_advancedsearch_url,
        params=search_params,
        headers={"user-agent": config.fetch_user_agent},
        timeout=_provider_timeout(config),
    )
    search_reply.raise_for_status()
    documents = search_reply.json().get("response", {}).get("docs", [])

    results: list[dict[str, Any]] = []
    for doc in documents:
        if not _is_english(doc.get("language")):
            continue

        # "format" may be a single string or a list of strings.
        listed_formats = doc.get("format") or []
        if isinstance(listed_formats, str):
            listed_formats = [listed_formats]
        has_epub = any(str(fmt).strip().lower() == "epub" for fmt in listed_formats)
        if not has_epub:
            continue

        identifier = str(doc.get("identifier") or "").strip()
        if not identifier:
            continue

        meta_reply = requests.get(
            config.internet_archive_metadata_url_template.format(identifier=identifier),
            headers={"user-agent": config.fetch_user_agent},
            timeout=_provider_timeout(config),
        )
        meta_reply.raise_for_status()
        epub_name = _pick_archive_epub_filename(meta_reply.json())
        if not epub_name:
            continue

        entry = {
            "provider": "internet_archive",
            "title": str(doc.get("title") or ""),
            "author": _first_text(doc.get("creator")),
            "language": _first_text(doc.get("language")),
            "downloads": int(doc.get("downloads") or 0),
            "download_url": f"https://archive.org/download/{identifier}/{quote(epub_name)}",
            "filename": epub_name,
        }
        results.append(entry)
    return results
def search_src_a(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_a for EPUB files matching *query*.

    The search page is fetched with cloudscraper and parsed with
    BeautifulSoup. Every ``/md5/`` link whose text is empty or "Save" and
    whose enclosing container mentions an ``.epub`` path becomes one
    candidate, deduplicated by md5.

    Returns:
        Candidate dicts with ``provider``, ``title``, ``author``,
        ``language``, ``download_url``, ``filename``, ``filesize`` and
        ``year``. Empty when the provider is not configured or when
        cloudscraper / BeautifulSoup are not installed.

    Raises:
        FetchError: the search page could not be loaded or parsed; the
            underlying exception is attached as ``__cause__``.
    """
    if not config.src_a_search_url or not config.src_a_base_url:
        return []
    if cloudscraper is None or BeautifulSoup is None:
        return []

    # Strip the trailing slash so download URLs match the form built by the downloader.
    base_url = config.src_a_base_url.rstrip("/")
    candidates: list[dict[str, Any]] = []
    try:
        scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-language": "en-US,en;q=0.9",
        }
        response = scraper.get(
            f"{config.src_a_search_url}?q={quote(query)}",
            headers=headers,
            timeout=60,
        )
        response.raise_for_status()
        html = response.text
        if not html or len(html) < 500:
            raise FetchError("搜索页面加载失败")

        soup = BeautifulSoup(html, "lxml")
        seen_md5: set[str] = set()
        # Links containing /md5/ are the book entry links.
        for link in soup.find_all("a", href=lambda h: h and "/md5/" in h):
            link_text = (link.get_text() or "").strip()
            # Only links with no text or the text "Save" are used.
            if link_text and link_text != "Save":
                continue
            md5_match = re.search(r"/md5/([a-f0-9]+)", link.get("href", ""))
            if not md5_match:
                continue
            md5 = md5_match.group(1)
            if md5 in seen_md5:
                continue

            # The enclosing container carries the file path and metadata text.
            container = link.find_parent("div") or link.parent
            if not container:
                continue
            container_text = container.get_text(separator="\n") or ""

            epub_match = re.search(r"([\w./-]+\.epub)", container_text, re.IGNORECASE)
            if not epub_match:
                continue
            epub_path = epub_match.group(1)
            seen_md5.add(md5)

            # Metadata line looks like: English [en] · EPUB · 1.2MB · 2020
            meta_match = re.search(
                r"English\s*\[([^\]]+)\]\s*[·•]\s*([A-Z]+)\s*[·•]\s*([\d.]+(?:MB|GB))\s*[·•]\s*(\d{4})",
                container_text,
            )
            filesize = meta_match.group(3) if meta_match else ""
            year = meta_match.group(4) if meta_match else ""

            # First usable text line is the title; the next short one is the author.
            title = ""
            author = ""
            for raw_line in container_text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                if ".epub" in line.lower():
                    continue
                if re.match(r"^[\d.,]+$", line):
                    continue
                if re.match(r"^(English|Save|\d+)", line):
                    continue
                if not title:
                    title = line[:200]
                elif not author and len(line) < 100:
                    author = line
                    break

            candidates.append({
                "provider": "src_a",
                "title": title,
                "author": author,
                "language": "en",
                "download_url": f"{base_url}/slow_download/{md5}/0/3",
                "filename": epub_path.split("/")[-1],
                "filesize": filesize,
                "year": year,
            })
    except FetchError:
        raise
    except Exception as exc:
        # Keep the original failure reachable for debugging.
        raise FetchError(f"搜索失败: {str(exc)[:100]}") from exc
    return candidates
def search_src_b(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_b for EPUB candidates, trying the configured domain before two alternates.

    The first domain whose search completes without error supplies the result
    (even when that result is empty).

    Returns:
        Candidate dicts; empty when ``src_b_base_url`` is not configured or
        when cloudscraper / BeautifulSoup are not installed.

    Raises:
        FetchError, requests.RequestException, ValueError: the last error seen
            when every domain failed.
    """
    if not config.src_b_base_url:
        return []
    if cloudscraper is None or BeautifulSoup is None:
        return []

    primary = config.src_b_base_url.rstrip("/")
    domains = [primary]
    for alternate in ("https://z-lib.is", "https://z-library.se"):
        # Do not retry the primary domain under its alternate spelling.
        if alternate.rstrip("/") != primary:
            domains.append(alternate)

    failure = None
    for domain in domains:
        try:
            return _search_src_b_at_domain(domain, query)
        except (FetchError, requests.RequestException, ValueError) as exc:
            failure = exc
    if failure:
        raise failure
    return []
def _search_src_b_at_domain(base: str, query: str) -> list[dict[str, Any]]:
    """Search one src_b domain (*base*) and return EPUB candidates.

    The ``/s/<query>`` results page is scanned for ``/book/`` links. Up to 15
    of those book pages are opened; a page whose text mentions "epub" yields
    one candidate. Failures on individual book pages are skipped.

    Raises:
        FetchError: the results page is too short or contains no book links.
        requests.RequestException: the results page request failed.
    """
    session = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "mobile": False},
        delay=10,
    )
    browser_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-language": "en-US,en;q=0.9",
    }

    listing = session.get(f"{base}/s/{quote(query)}", headers=browser_headers, timeout=60)
    listing.raise_for_status()
    listing_html = listing.text
    if not listing_html or len(listing_html) < 200:
        raise FetchError("搜索页面加载失败")

    anchors = BeautifulSoup(listing_html, "lxml").find_all(
        "a", href=lambda h: h and "/book/" in h
    )
    if not anchors:
        raise FetchError("未找到任何书籍")

    # One (href, link text) pair per distinct href; links with very short text
    # are dropped but still count as seen.
    visited: set[str] = set()
    entries: list[tuple[str, str]] = []
    for anchor in anchors:
        target = anchor.get("href", "")
        if not target or target in visited:
            continue
        visited.add(target)
        label = (anchor.get_text() or "").strip()
        if label and len(label) > 3:
            entries.append((target, label))

    results: list[dict[str, Any]] = []
    for target, label in entries[:15]:
        page_url = target if target.startswith("http") else f"{base}{target}"
        try:
            page = session.get(page_url, headers=browser_headers, timeout=30)
            page.raise_for_status()
            page_html = page.text
            page_text = BeautifulSoup(page_html, "lxml").get_text(separator="\n") or ""
            if "epub" not in page_text.lower():
                continue

            author_found = re.search(r"Author[s]?[:\s]*([^\n]+)", page_text, re.IGNORECASE)
            writer = author_found.group(1).strip()[:100] if author_found else ""

            # Prefer the /dl/<id> link; otherwise fall back to the book page URL.
            dl_found = re.search(r"/dl/([a-zA-Z0-9]+)", page_html)
            dl_id = dl_found.group(1) if dl_found else ""
            link_url = f"{base}/dl/{dl_id}" if dl_id else page_url

            heading = label.split("\n")[0].strip() or label[:100]
            results.append({
                "provider": "src_b",
                "title": heading,
                "author": writer,
                "language": "en",
                "download_url": link_url,
                "filename": "",
                "filesize": "",
            })
        except Exception:
            continue  # best-effort: skip book pages that fail to load or parse
    return results
-def _validate_epub_bytes(content: bytes, *, filename: str, content_type: str) -> None:
-    """Raise FetchError unless *content* is a ZIP archive that declares the EPUB media type.
-    The decision is made from the payload alone: it must start with the ZIP
-    magic bytes and contain a ``mimetype`` member whose text is
-    ``application/epub+zip``.
-    Args:
-        content: Raw downloaded bytes.
-        filename: Kept for call-site compatibility; does not affect the result.
-        content_type: Kept for call-site compatibility; does not affect the result.
-    Raises:
-        FetchError: If the payload is not a readable EPUB container.
-    """
-    if len(content) < 4 or not content.startswith(b"PK"):
-        raise FetchError("下载内容不是 EPUB")
-    try:
-        with zipfile.ZipFile(BytesIO(content)) as archive:
-            mimetype = archive.read("mimetype").decode("utf-8", errors="replace").strip()
-    except (KeyError, zipfile.BadZipFile) as exc:
-        # KeyError: no "mimetype" member; BadZipFile: not a usable ZIP archive.
-        raise FetchError("下载内容不是 EPUB") from exc
-    if mimetype != "application/epub+zip":
-        raise FetchError("下载内容不是 EPUB")
-def _derive_filename(response: requests.Response, url: str, filename_hint: str) -> str:
-    """Choose a filename for a downloaded book.
-    Sources are tried in order: the caller's hint, the response's
-    ``Content-Disposition`` header, the last path segment of *url*, and
-    finally the fixed name ``downloaded_book.epub``.
-    """
-    def _clean(raw: str) -> str:
-        # Percent-decode, keep only the final path component, then normalise.
-        return normalize_source_filename(PurePosixPath(unquote(raw)).name, default_extension=".epub")
-    stripped_hint = str(filename_hint or "").strip()
-    if stripped_hint:
-        return _clean(stripped_hint)
-    header_value = response.headers.get("content-disposition", "")
-    found = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?', header_value, flags=re.IGNORECASE)
-    if found:
-        return _clean(found.group(1).strip())
-    basename = PurePosixPath(unquote(urlparse(url).path)).name
-    if not basename:
-        return "downloaded_book.epub"
-    return normalize_source_filename(basename, default_extension=".epub")
-def _score_candidate(query: str, candidate: dict[str, Any]) -> float:
-    """Score how well *candidate* matches *query*; higher is better.
-    Returns ``-1.0`` when the candidate has no download URL, is not English,
-    or when either the query or the title has no words after normalisation.
-    Otherwise the score is a title-similarity value plus small bonuses for
-    the provider, an ``.epub`` filename/URL, and the download count.
-    """
-    if not (candidate.get("download_url") and _is_english(candidate.get("language"))):
-        return -1.0
-    query_text = _normalize_text(query)
-    title_text = _normalize_text(candidate.get("title", ""))
-    query_tokens = set(query_text.split())
-    title_tokens = set(title_text.split())
-    if not (query_tokens and title_tokens):
-        return -1.0
-    shared = query_tokens & title_tokens
-    combined = query_tokens | title_tokens
-    # Jaccard similarity: shared words over all distinct words.
-    jaccard = len(shared) / len(combined) if combined else 0
-    # Character-level similarity, which is sensitive to word order.
-    order_ratio = SequenceMatcher(None, query_text, title_text).ratio()
-    score = jaccard * 0.7 + order_ratio * 0.3
-    if query_text == title_text:
-        score += 1.0  # exact match
-    elif query_tokens <= title_tokens:
-        score += 0.3  # every query word appears in the title
-    elif not shared:
-        score -= 0.3  # no word in common at all
-    provider_weights = {
-        "src_a": 0.1,
-        "src_b": 0.08,
-    }
-    provider_bonus = provider_weights.get(candidate.get("provider"), 0.0)
-    name = str(candidate.get("filename") or "").lower()
-    link = str(candidate.get("download_url") or "").lower()
-    epub_bonus = 0.05 if (name.endswith(".epub") or link.endswith(".epub")) else 0.0
-    download_count = max(int(candidate.get("downloads") or 0), 0)
-    if download_count:
-        popularity_bonus = min(math.log10(download_count + 1) / 20, 0.05)
-    else:
-        popularity_bonus = 0.0
-    return score + provider_bonus + epub_bonus + popularity_bonus
-def _looks_like_url(value: str) -> bool:
-    """Return True when *value* parses as an http(s) URL with a network location."""
-    parts = urlparse(value)
-    if parts.scheme not in {"http", "https"}:
-        return False
-    return bool(parts.netloc)
-def _normalize_text(value: str) -> str:
-    """Lower-case *value* and keep only runs of ``a-z``/``0-9``, joined by single spaces."""
-    lowered = str(value or "").lower()
-    return " ".join(re.findall(r"[a-z0-9]+", lowered))
-def _is_english(value: Any) -> bool:
-    """Return True when *value* names the English language.
-    A list, tuple or set counts as English when any of its items does.
-    Anything else is converted to text, normalised with ``_normalize_text``
-    and compared against ``ENGLISH_CODES``.
-    """
-    if isinstance(value, (list, tuple, set)):
-        return any(_is_english(item) for item in value)
-    normalized = _normalize_text(str(value or ""))
-    # _normalize_text replaces "-" with a space, so hyphenated codes such as
-    # "en-us" must go through the same normalisation before being compared;
-    # otherwise they can never match.
-    return normalized in {_normalize_text(code) for code in ENGLISH_CODES}
-def _unique_matches(pattern: str, text: str) -> list[str]:
-    """Return the stripped, non-empty matches of *pattern* in *text*, without duplicates.
-    The order of first appearance is kept.
-    """
-    stripped = (found.strip() for found in re.findall(pattern, text))
-    # dict keys keep insertion order, which gives an ordered de-duplication.
-    return list(dict.fromkeys(item for item in stripped if item))
-def _pick_standard_ebooks_download(html: str) -> str:
-    """Pick an EPUB download path from a Standard Ebooks book page.
-    The first link that is neither a ``.kepub.epub`` nor an
-    ``_advanced.epub`` build wins; if every link is one of those, the first
-    link is returned; with no links at all the result is ``""``.
-    """
-    found = _unique_matches(r'href="(/ebooks/[^"]+/downloads/[^"]+?\.epub)"', html)
-    if not found:
-        return ""
-    skipped_markers = (".kepub.epub", "_advanced.epub")
-    return next(
-        (link for link in found if not any(marker in link.lower() for marker in skipped_markers)),
-        found[0],
-    )
-def _pick_gutenberg_epub(html: str) -> str:
-    """Return the first ``.epub`` / ``.epub.images`` href found in *html*, or ``""``."""
-    hrefs = _unique_matches(r'href="([^"]+\.epub(?:\.images)?)"', html)
-    return next(
-        (href for href in hrefs if href.lower().endswith(".epub") or ".epub." in href.lower()),
-        "",
-    )
-def _pick_archive_epub_filename(metadata: dict[str, Any]) -> str:
-    """Return the name of the first entry in ``metadata["files"]`` ending in ``.epub``, or ``""``."""
-    entries = metadata.get("files", []) or []
-    names = (str(entry.get("name") or "") for entry in entries)
-    return next((name for name in names if name.lower().endswith(".epub")), "")
-def _extract_html_title(html: str) -> str:
-    """Return the text of the ``<title>`` element with site and author suffixes removed.
-    Whitespace is collapsed, then a trailing "- Standard Ebooks ...",
-    "- Project Gutenberg ..." (``-`` or ``|`` separator) and ", by <author>"
-    are stripped in that order. Returns ``""`` when there is no title.
-    """
-    found = re.search(r"<title>\s*([^<]+?)\s*</title>", html, flags=re.IGNORECASE)
-    if not found:
-        return ""
-    title = re.sub(r"\s+", " ", found.group(1)).strip()
-    suffix_patterns = (
-        r"\s*[-|]\s*Standard Ebooks.*$",
-        r"\s*[-|]\s*Project Gutenberg.*$",
-        r",\s*by\s+.+$",
-    )
-    for suffix_pattern in suffix_patterns:
-        title = re.sub(suffix_pattern, "", title, flags=re.IGNORECASE)
-    return title
-def _author_from_book_path(path: str) -> str:
-    """Turn the second segment of a ``/x/author-name/title`` path into "Author Name".
-    Returns ``""`` when the path has fewer than three non-empty segments.
-    """
-    segments = list(filter(None, path.strip("/").split("/")))
-    if len(segments) >= 3:
-        return " ".join(map(str.capitalize, segments[1].split("-")))
-    return ""
-def _title_from_book_path(path: str) -> str:
-    """Turn the third segment of a ``/x/author/book-title`` path into "Book Title".
-    Returns ``""`` when the path has fewer than three non-empty segments.
-    """
-    segments = list(filter(None, path.strip("/").split("/")))
-    if len(segments) >= 3:
-        return " ".join(map(str.capitalize, segments[2].split("-")))
-    return ""
-def _first_text(value: Any) -> str:
-    """Return *value* as text, using only the first item of a list or tuple.
-    Empty sequences and falsy scalars give ``""``.
-    """
-    if not isinstance(value, (list, tuple)):
-        return str(value or "")
-    return str(value[0]) if value else ""
-def _provider_timeout(config: AppConfig) -> int:
-    """Return ``config.fetch_timeout_seconds`` as an int clamped to the range 5..10."""
-    configured = int(config.fetch_timeout_seconds)
-    if configured > 10:
-        return 10
-    if configured < 5:
-        return 5
-    return configured
-def _normalize_download_url(url: str) -> str:
-    """Add ``source=download`` to Standard Ebooks download URLs that lack a ``source`` parameter.
-    Any other URL is returned unchanged.
-    """
-    parts = urlparse(url)
-    is_standard_ebooks_download = (
-        parts.netloc.endswith("standardebooks.org") and "/downloads/" in parts.path
-    )
-    if not is_standard_ebooks_download:
-        return url
-    query = dict(parse_qsl(parts.query, keep_blank_values=True))
-    if "source" not in query:
-        query["source"] = "download"
-    return urlunparse(parts._replace(query=urlencode(query)))

+from __future__ import annotations
+import math
+import re
+import time
+import zipfile
+from difflib import SequenceMatcher
+from io import BytesIO
+from pathlib import PurePosixPath
+from typing import Any
+from urllib.parse import parse_qsl, quote, unquote, urlencode, urljoin, urlparse, urlunparse
+import requests
+try:
+    import cloudscraper
+except ImportError:
+    cloudscraper = None
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    BeautifulSoup = None
+from hf_backend.config import AppConfig
+from hf_backend.filename_utils import normalize_source_filename
+class FetchError(RuntimeError):
+    """Raised when a book cannot be searched for, downloaded, or validated as an EPUB."""
+    pass
+# Message raised by fetch_book_input when no usable candidate could be found or downloaded.
+USER_FACING_NOT_FOUND = "未找到可用的英文 EPUB，请提供直链"
+# Language labels that are accepted as English.
+ENGLISH_CODES = {
+    "en",
+    "eng",
+    "en-us",
+    "en-gb",
+    "english",
+}
+def fetch_book_input(config: AppConfig, query: str) -> dict[str, Any]:
+    """Resolve *query* (a direct EPUB URL or a book title) to downloaded EPUB bytes.
+    A URL is downloaded directly. A title is searched on src_a and src_b;
+    the results are ranked with ``_score_candidate`` and downloaded from
+    best to worst until one succeeds.
+    Returns:
+        A dict with ``filename``, ``content``, ``origin``, ``provider``,
+        ``query`` and ``download_url`` (plus ``title`` and ``author`` for
+        title searches).
+    Raises:
+        FetchError: If the query is empty, nothing was found, or every
+            attempted download failed.
+    """
+    normalized_query = str(query or "").strip()
+    if not normalized_query:
+        raise FetchError("请输入书名或 EPUB 下载链接")
+    if _looks_like_url(normalized_query):
+        direct_name, direct_bytes = download_epub_from_url(config, normalized_query)
+        return {
+            "filename": normalize_source_filename(direct_name, default_extension=".epub"),
+            "content": direct_bytes,
+            "origin": "link_fetch",
+            "provider": "direct_link",
+            "query": normalized_query,
+            "download_url": normalized_query,
+        }
+    # Failures of these kinds skip to the next provider / candidate.
+    recoverable = (FetchError, requests.RequestException, ValueError)
+    found: list[dict[str, Any]] = []
+    search_failure = None
+    # Only use Anna's Archive and Z-Library
+    for search in (search_src_a, search_src_b):
+        try:
+            found.extend(search(config, normalized_query))
+        except recoverable as exc:
+            search_failure = exc
+    if not found:
+        if search_failure:
+            detail = str(search_failure)
+            if "src_a" in detail:
+                raise FetchError("未找到可用的英文 EPUB，请尝试提供直链或使用其他书名")
+            raise FetchError(f"搜索失败：{detail[:100]}")
+        raise FetchError(USER_FACING_NOT_FOUND)
+    # Best score first; the sort is stable, so ties keep provider order.
+    ranked = sorted(
+        ((item, _score_candidate(normalized_query, item)) for item in found),
+        key=lambda pair: pair[1],
+        reverse=True,
+    )
+    best_score = ranked[0][1] if ranked else 0.0
+    # A fallback candidate must reach at least this fraction of the best score.
+    fallback_min_ratio = 0.6
+    download_failure = None
+    for candidate, score in ranked:
+        # Stop rather than fall back to a book too different from the top match.
+        if best_score > 0.5 and score < best_score * fallback_min_ratio:
+            break
+        source = candidate.get("provider", "")
+        try:
+            filename, content = download_epub_from_url(
+                config,
+                candidate["download_url"],
+                filename_hint=candidate.get("filename", ""),
+                provider=source,
+            )
+            return {
+                "filename": normalize_source_filename(filename, default_extension=".epub"),
+                "content": content,
+                "origin": "title_fetch",
+                "provider": source,
+                "query": normalized_query,
+                "title": candidate.get("title", ""),
+                "author": candidate.get("author", ""),
+                "download_url": candidate["download_url"],
+            }
+        except recoverable as exc:
+            download_failure = exc
+    raise FetchError(str(download_failure) if download_failure else USER_FACING_NOT_FOUND)
+def download_epub_from_url(
+    config: AppConfig,
+    url: str,
+    *,
+    filename_hint: str = "",
+    provider: str = "",
+) -> tuple[str, bytes]:
+    """Download *url* and return ``(filename, content)`` for a validated EPUB.
+    src_a / src_b links (by *provider* hint, or by base URL plus path
+    marker) go to their dedicated downloaders; anything else is fetched
+    with a plain ``requests.get``.
+    Raises:
+        FetchError: If the response body is empty or is not an EPUB.
+    """
+    def _matches(base_url: str, marker: str) -> bool:
+        # True when url starts with the configured base and contains the marker.
+        if not base_url:
+            return False
+        lowered = url.lower()
+        return lowered.startswith(base_url.lower()) and marker in lowered
+    if provider == "src_a" or _matches(config.src_a_base_url, "/slow_download/"):
+        return _download_from_src_a(config, url, filename_hint)
+    if provider == "src_b" or _matches(config.src_b_base_url, "/dl/"):
+        return _download_from_src_b(config, url, filename_hint)
+    response = requests.get(
+        _normalize_download_url(url),
+        headers={"user-agent": config.fetch_user_agent},
+        timeout=config.fetch_timeout_seconds,
+        allow_redirects=True,
+    )
+    response.raise_for_status()
+    payload = response.content
+    if not payload:
+        raise FetchError("下载结果为空")
+    filename = _derive_filename(response, response.url or url, filename_hint)
+    _validate_epub_bytes(payload, filename=filename, content_type=response.headers.get("content-type", ""))
+    final_name = filename if filename.lower().endswith(".epub") else f"{filename}.epub"
+    return final_name, payload
+def _download_from_src_a(
+    config: AppConfig,
+    url: str,
+    filename_hint: str,
+) -> tuple[str, bytes]:
+    """
+    Download an EPUB for an Anna's Archive ``slow_download`` URL.
+    Mirrors are tried in order: Libgen (plain ``requests``; only when
+    BeautifulSoup is installed), then the Anna's Archive ``fast_download``
+    and ``slow_download`` endpoints through cloudscraper (only when it is
+    installed). A mirror whose response is not a valid EPUB is skipped.
+    Args:
+        config: Supplies ``src_a_base_url`` and ``fetch_timeout_seconds``.
+        url: Link of the form ``.../slow_download/{md5}/...``.
+        filename_hint: Preferred filename, passed to ``_derive_filename``.
+    Returns:
+        ``(filename, content)``; the filename always ends in ``.epub``.
+    Raises:
+        FetchError: If no md5 can be extracted from *url*, or if every
+            mirror failed.
+    """
+    # Extract md5 from URL (format: .../slow_download/{md5}/0/3)
+    md5_match = re.search(r"/slow_download/([a-f0-9]+)", url)
+    if not md5_match:
+        raise FetchError("无法从 URL 提取 md5")
+    md5 = md5_match.group(1)
+    browser_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+    def _as_epub(response: Any, request_url: str) -> tuple[str, bytes]:
+        # Validate the payload and make sure the filename carries ".epub".
+        filename = _derive_filename(response, response.url or request_url, filename_hint)
+        _validate_epub_bytes(response.content, filename=filename, content_type=response.headers.get("content-type", ""))
+        if not filename.lower().endswith(".epub"):
+            filename = f"{filename}.epub"
+        return filename, response.content
+    # --- Attempt 1: Libgen (fast, unprotected) ---
+    if BeautifulSoup is not None:
+        try:
+            libgen_headers = {"user-agent": browser_user_agent}
+            ads_response = requests.get(
+                f"https://libgen.li/ads.php?md5={md5}",
+                headers=libgen_headers,
+                timeout=15,
+            )
+            ads_response.raise_for_status()
+            ads_soup = BeautifulSoup(ads_response.text, "lxml")
+            get_link = ads_soup.find("a", string=re.compile("GET"))
+            get_href = get_link.get("href", "") if get_link else ""
+            if get_href:
+                download_url = f"https://libgen.li/{get_href.lstrip('/')}"
+                response = requests.get(
+                    download_url,
+                    headers=libgen_headers,
+                    timeout=config.fetch_timeout_seconds,
+                    allow_redirects=True,
+                )
+                response.raise_for_status()
+                if response.content:
+                    return _as_epub(response, download_url)
+        except Exception:
+            pass  # Deliberate best effort: any Libgen failure falls through to Anna's Archive
+    # --- Attempts 2 and 3: Anna's Archive fast_download, then slow_download (cloudscraper) ---
+    if cloudscraper is not None:
+        base_url = config.src_a_base_url.rstrip("/")
+        scraper_headers = {
+            "user-agent": browser_user_agent,
+            "accept": "*/*",
+        }
+        # (endpoint, number of servers to try, request timeout in seconds)
+        endpoints = (
+            ("fast_download", 6, config.fetch_timeout_seconds),
+            ("slow_download", 4, 180),  # slow servers may impose a wait
+        )
+        for endpoint, server_count, timeout in endpoints:
+            for server_id in range(server_count):
+                mirror_url = f"{base_url}/{endpoint}/{md5}/0/{server_id}"
+                try:
+                    scraper = cloudscraper.create_scraper(
+                        browser={"browser": "chrome", "platform": "windows", "mobile": False},
+                        delay=10,
+                    )
+                    response = scraper.get(
+                        mirror_url,
+                        headers=scraper_headers,
+                        timeout=timeout,
+                        allow_redirects=True,
+                    )
+                    if response.status_code == 200 and len(response.content) > 1000:
+                        return _as_epub(response, mirror_url)
+                except Exception:
+                    continue  # Transport error or non-EPUB payload: try the next server
+    # The Playwright fallback was removed together with its import, so report
+    # the failure as a FetchError that callers already handle instead of
+    # touching the no-longer-defined sync_playwright.
+    raise FetchError("下载失败，请尝试提供 EPUB 直链")
+def _download_from_src_b(
+    config: AppConfig,
+    url: str,
+    filename_hint: str,
+) -> tuple[str, bytes]:
+    """
+    Fetch an EPUB from a src_b download URL.
+    cloudscraper is tried first when it is installed. A FetchError from that
+    attempt is propagated; any other failure falls back to ``requests.get``.
+    Raises:
+        FetchError: On HTTP 403, an empty body, or a payload that is not an EPUB.
+    """
+    def _finish(response: Any) -> tuple[str, bytes]:
+        # Shared post-processing for both transports.
+        if response.status_code == 403:
+            raise FetchError("下载被阻止，请手动下载")
+        response.raise_for_status()
+        payload = response.content
+        if not payload:
+            raise FetchError("下载结果为空")
+        name = _derive_filename(response, response.url or url, filename_hint)
+        _validate_epub_bytes(payload, filename=name, content_type=response.headers.get("content-type", ""))
+        final_name = name if name.lower().endswith(".epub") else f"{name}.epub"
+        return final_name, payload
+    if cloudscraper is not None:
+        try:
+            session = cloudscraper.create_scraper(
+                browser={"browser": "chrome", "platform": "windows", "mobile": False},
+                delay=10,
+            )
+            browser_headers = {
+                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
+                "accept": "*/*",
+            }
+            return _finish(
+                session.get(
+                    url,
+                    headers=browser_headers,
+                    timeout=120,
+                    allow_redirects=True,
+                )
+            )
+        except FetchError:
+            raise
+        except Exception:
+            pass  # Fall through to requests
+    # Fallback to plain requests
+    return _finish(
+        requests.get(
+            url,
+            headers={"user-agent": config.fetch_user_agent},
+            timeout=config.fetch_timeout_seconds,
+            allow_redirects=True,
+        )
+    )
+def pick_best_candidate(query: str, candidates: list[dict[str, Any]]) -> dict[str, Any] | None:
+    """Return the highest-scoring candidate whose score is at least 0.45, or None.
+    When several candidates share the top score, the earliest one wins.
+    """
+    winner = None
+    winning_score = 0.0
+    for candidate in candidates:
+        score = _score_candidate(query, candidate)
+        if score < 0.45:
+            continue
+        # Strict ">" keeps the first of several equally scored candidates.
+        if winner is None or score > winning_score:
+            winner, winning_score = candidate, score
+    return winner
+def search_standard_ebooks(config: AppConfig, query: str) -> list[dict[str, Any]]:
+    """Search Standard Ebooks for *query* and return EPUB candidates.
+    At most the first six book links of the result page are followed; a
+    book page without an EPUB download link is skipped. HTTP errors
+    propagate from ``raise_for_status``.
+    """
+    search_url = config.standard_ebooks_search_url
+    request_headers = {"user-agent": config.fetch_user_agent}
+    listing = requests.get(
+        search_url,
+        params={"query": query},
+        headers=request_headers,
+        timeout=_provider_timeout(config),
+    )
+    listing.raise_for_status()
+    book_paths = _unique_matches(r'href="(/ebooks/[^"/]+/[^"/]+)"', listing.text)
+    results: list[dict[str, Any]] = []
+    for book_path in book_paths[:6]:
+        detail = requests.get(
+            urljoin(search_url, book_path),
+            headers=request_headers,
+            timeout=_provider_timeout(config),
+        )
+        detail.raise_for_status()
+        epub_path = _pick_standard_ebooks_download(detail.text)
+        if not epub_path:
+            continue
+        entry = {
+            "provider": "standard_ebooks",
+            "title": _extract_html_title(detail.text) or _title_from_book_path(book_path),
+            "author": _author_from_book_path(book_path),
+            "language": "en",
+            "download_url": _normalize_download_url(urljoin(search_url, epub_path)),
+            "filename": PurePosixPath(epub_path).name,
+        }
+        results.append(entry)
+    return results
+def search_project_gutenberg(config: AppConfig, query: str) -> list[dict[str, Any]]:
+    """Search Project Gutenberg for *query* and return EPUB candidates.
+    At most the first five book ids of the result page are followed; a book
+    page without an EPUB link is skipped. HTTP errors propagate from
+    ``raise_for_status``.
+    """
+    request_headers = {"user-agent": config.fetch_user_agent}
+    listing = requests.get(
+        config.project_gutenberg_search_url,
+        params={"query": query},
+        headers=request_headers,
+        timeout=_provider_timeout(config),
+    )
+    listing.raise_for_status()
+    results: list[dict[str, Any]] = []
+    for book_id in _unique_matches(r'href="/ebooks/(\d+)"', listing.text)[:5]:
+        page_url = f"https://www.gutenberg.org/ebooks/{book_id}"
+        page = requests.get(
+            page_url,
+            headers=request_headers,
+            timeout=_provider_timeout(config),
+        )
+        page.raise_for_status()
+        page_html = page.text
+        epub_href = _pick_gutenberg_epub(page_html)
+        if not epub_href:
+            continue
+        entry = {
+            "provider": "project_gutenberg",
+            "title": _extract_html_title(page_html) or f"Project Gutenberg {book_id}",
+            "author": "",
+            "language": "en",
+            "download_url": urljoin(page_url, epub_href),
+            "filename": PurePosixPath(urlparse(epub_href).path).name or f"pg{book_id}.epub",
+        }
+        results.append(entry)
+    return results
+def search_internet_archive(config: AppConfig, query: str) -> list[dict[str, Any]]:
+    """Search the Internet Archive for English texts titled *query* that offer an EPUB.
+    Each hit that lists an "epub" format has its item metadata fetched to
+    find the actual ``.epub`` filename; hits without one are skipped. HTTP
+    errors propagate from ``raise_for_status``.
+    """
+    request_headers = {"user-agent": config.fetch_user_agent}
+    search_params = {
+        "q": f"title:({query}) AND mediatype:(texts) AND (language:(english) OR language:(eng) OR language:(en))",
+        "fl[]": ["identifier", "title", "creator", "language", "downloads", "format"],
+        "sort[]": "downloads desc",
+        "rows": 8,
+        "page": 1,
+        "output": "json",
+    }
+    search_response = requests.get(
+        config.internet_archive_advancedsearch_url,
+        params=search_params,
+        headers=request_headers,
+        timeout=_provider_timeout(config),
+    )
+    search_response.raise_for_status()
+    hits = search_response.json().get("response", {}).get("docs", [])
+    results: list[dict[str, Any]] = []
+    for hit in hits:
+        if not _is_english(hit.get("language")):
+            continue
+        raw_formats = hit.get("format") or []
+        # A single format may arrive as a bare string instead of a list.
+        format_names = [raw_formats] if isinstance(raw_formats, str) else raw_formats
+        if not any(str(name).strip().lower() == "epub" for name in format_names):
+            continue
+        identifier = str(hit.get("identifier") or "").strip()
+        if not identifier:
+            continue
+        item_response = requests.get(
+            config.internet_archive_metadata_url_template.format(identifier=identifier),
+            headers=request_headers,
+            timeout=_provider_timeout(config),
+        )
+        item_response.raise_for_status()
+        epub_name = _pick_archive_epub_filename(item_response.json())
+        if not epub_name:
+            continue
+        entry = {
+            "provider": "internet_archive",
+            "title": str(hit.get("title") or ""),
+            "author": _first_text(hit.get("creator")),
+            "language": _first_text(hit.get("language")),
+            "downloads": int(hit.get("downloads") or 0),
+            "download_url": f"https://archive.org/download/{identifier}/{quote(epub_name)}",
+            "filename": epub_name,
+        }
+        results.append(entry)
+    return results
+def search_src_a(config: AppConfig, query: str) -> list[dict[str, Any]]:
+    """
+    Search src_a for EPUB files matching the query.
+    Uses cloudscraper to bypass DDoS-Guard protection.
+    Returns an empty list when the search/base URL is not configured, or
+    when cloudscraper or BeautifulSoup is not installed.
+    Raises:
+        FetchError: If the search page cannot be loaded or parsed; the
+            underlying exception is chained as the cause.
+    """
+    if not config.src_a_search_url or not config.src_a_base_url:
+        return []
+    if cloudscraper is None or BeautifulSoup is None:
+        return []
+    candidates: list[dict[str, Any]] = []
+    search_url = config.src_a_search_url
+    # Strip a trailing slash so the download URL never contains "//slow_download/".
+    base_url = config.src_a_base_url.rstrip("/")
+    try:
+        scraper = cloudscraper.create_scraper(
+            browser={"browser": "chrome", "platform": "windows", "mobile": False},
+            delay=10,
+        )
+        headers = {
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "accept-language": "en-US,en;q=0.9",
+        }
+        response = scraper.get(
+            f"{search_url}?q={quote(query)}",
+            headers=headers,
+            timeout=60,
+        )
+        response.raise_for_status()
+        html = response.text
+        if not html or len(html) < 500:
+            raise FetchError("搜索页面加载失败")
+        soup = BeautifulSoup(html, "lxml")
+        # Find all links containing /md5/ - these are book entry links
+        md5_links = soup.find_all("a", href=lambda h: h and "/md5/" in h)
+        seen_md5 = set()
+        for link in md5_links:
+            link_text = (link.get_text() or "").strip()
+            # Only links with empty text or the text "Save" are used as entry anchors
+            if link_text and link_text != "Save":
+                continue
+            href = link.get("href", "")
+            md5_match = re.search(r"/md5/([a-f0-9]+)", href)
+            if not md5_match:
+                continue
+            md5 = md5_match.group(1)
+            if md5 in seen_md5:
+                continue
+            # Find the parent container to get context
+            container = link.find_parent("div")
+            if not container:
+                container = link.parent
+            if not container:
+                continue
+            container_text = container.get_text(separator="\n") or ""
+            # Check if this container has an EPUB file
+            epub_match = re.search(r"([\w./-]+\.epub)", container_text, re.IGNORECASE)
+            if not epub_match:
+                continue
+            epub_path = epub_match.group(1)
+            seen_md5.add(md5)
+            # Extract metadata: English [en] · EPUB · 1.2MB · 2020
+            meta_match = re.search(
+                r"English\s*\[([^\]]+)\]\s*[·•]\s*([A-Z]+)\s*[·•]\s*([\d.]+(?:MB|GB))\s*[·•]\s*(\d{4})",
+                container_text,
+            )
+            filesize = meta_match.group(3) if meta_match else ""
+            year = meta_match.group(4) if meta_match else ""
+            # Extract title and author from container text lines
+            lines = [line.strip() for line in container_text.split("\n") if line.strip()]
+            title = ""
+            author = ""
+            for line in lines:
+                if ".epub" in line.lower():
+                    continue
+                if re.match(r"^[\d.,]+$", line):
+                    continue
+                if re.match(r"^(English|Save|\d+)", line):
+                    continue
+                # First remaining line is taken as the title, the next short one as the author.
+                if not title:
+                    title = line[:200]
+                elif not author and len(line) < 100:
+                    author = line
+                    break
+            download_url = f"{base_url}/slow_download/{md5}/0/3"
+            filename = epub_path.rsplit("/", 1)[-1]
+            candidates.append({
+                "provider": "src_a",
+                "title": title,
+                "author": author,
+                "language": "en",
+                "download_url": download_url,
+                "filename": filename,
+                "filesize": filesize,
+                "year": year,
+            })
+    except FetchError:
+        raise
+    except Exception as exc:
+        # Chain the cause so the original traceback is not lost.
+        raise FetchError(f"搜索失败: {str(exc)[:100]}") from exc
+    return candidates
+def search_src_b(config: AppConfig, query: str) -> list[dict[str, Any]]:
+    """
+    Look up EPUB candidates for ``query`` on src_b.
+    Returns an empty list when no src_b base URL is configured or when the
+    optional ``cloudscraper`` / ``BeautifulSoup`` imports are unavailable.
+    The configured base URL is tried first, then the hard-coded fallback
+    mirrors. The first mirror that answers without raising decides the
+    result; if every mirror raises, the last error seen is re-raised.
+    """
+    if not config.src_b_base_url or cloudscraper is None or BeautifulSoup is None:
+        return []
+    configured_base = config.src_b_base_url.rstrip("/")
+    mirrors = [configured_base]
+    for fallback in ("https://z-lib.is", "https://z-library.se"):
+        # Do not query the configured mirror a second time.
+        if fallback.rstrip("/") != configured_base:
+            mirrors.append(fallback)
+    failure = None
+    for mirror in mirrors:
+        try:
+            return _search_src_b_at_domain(mirror, query)
+        except (FetchError, requests.RequestException, ValueError) as exc:
+            # Remember the failure and move on to the next mirror.
+            failure = exc
+    if failure:
+        raise failure
+    return []
+def _search_src_b_at_domain(base: str, query: str) -> list[dict[str, Any]]:
+    """Collect EPUB candidates from the single Z-Library mirror rooted at ``base``.
+    Fetches the search results page, gathers the unique ``/book/`` links and
+    opens at most 15 of the linked book pages. A candidate is recorded for
+    every book page whose text mentions "epub". Raises FetchError when the
+    search page is empty/too short or contains no book links; a failure on an
+    individual book page only skips that book.
+    """
+    session = cloudscraper.create_scraper(
+        browser={"browser": "chrome", "platform": "windows", "mobile": False},
+        delay=10,
+    )
+    request_headers = {
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
+        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "accept-language": "en-US,en;q=0.9",
+    }
+    # Step 1: load the search results page.
+    listing = session.get(f"{base}/s/{quote(query)}", headers=request_headers, timeout=60)
+    listing.raise_for_status()
+    listing_html = listing.text
+    if not listing_html or len(listing_html) < 200:
+        raise FetchError("搜索页面加载失败")
+    anchors = BeautifulSoup(listing_html, "lxml").find_all("a", href=lambda h: h and "/book/" in h)
+    if not anchors:
+        raise FetchError("未找到任何书籍")
+    # Step 2: keep the first occurrence of every href; only anchors whose
+    # label is longer than three characters are followed up.
+    visited: set[str] = set()
+    entries: list[tuple[str, str]] = []
+    for anchor in anchors:
+        target = anchor.get("href", "")
+        if not target or target in visited:
+            continue
+        visited.add(target)
+        label = (anchor.get_text() or "").strip()
+        if len(label) > 3:
+            entries.append((target, label))
+    # Step 3: open the book pages and keep those that mention EPUB.
+    results: list[dict[str, Any]] = []
+    for target, label in entries[:15]:
+        page_url = target if target.startswith("http") else f"{base}{target}"
+        try:
+            page = session.get(page_url, headers=request_headers, timeout=30)
+            page.raise_for_status()
+            page_html = page.text
+            page_text = BeautifulSoup(page_html, "lxml").get_text(separator="\n") or ""
+            if "epub" not in page_text.lower():
+                continue
+            author_hit = re.search(r"Author[s]?[:\s]*([^\n]+)", page_text, re.IGNORECASE)
+            author = author_hit.group(1).strip()[:100] if author_hit else ""
+            dl_hit = re.search(r"/dl/([a-zA-Z0-9]+)", page_html)
+            # Without a /dl/ id the book page itself is used as the download URL.
+            download_url = f"{base}/dl/{dl_hit.group(1)}" if dl_hit else page_url
+            # First line of the link label, falling back to a truncated label.
+            heading = label.split("\n")[0].strip() or label[:100]
+            results.append({
+                "provider": "src_b",
+                "title": heading,
+                "author": author,
+                "language": "en",
+                "download_url": download_url,
+                "filename": "",
+                "filesize": "",
+            })
+        except Exception:
+            # Best effort: one broken book page must not abort the whole search.
+            continue
+    return results
+def _validate_epub_bytes(content: bytes, *, filename: str, content_type: str) -> None:
+    """Raise FetchError unless ``content`` is a ZIP archive declaring the EPUB mimetype.
+    The payload must start with the ZIP magic ``PK``, be readable by
+    ``zipfile`` and contain a ``mimetype`` member whose stripped text equals
+    ``application/epub+zip``.
+    ``filename`` and ``content_type`` are kept for interface compatibility
+    only. The previous trailing checks on them returned ``None`` on every
+    path, so they never influenced the outcome and could only fail with an
+    AttributeError when one of the values was ``None``.
+    Raises:
+        FetchError: if any of the checks above fails.
+    """
+    if len(content) < 4 or not content.startswith(b"PK"):
+        raise FetchError("下载内容不是 EPUB")
+    try:
+        with zipfile.ZipFile(BytesIO(content)) as archive:
+            mimetype = archive.read("mimetype").decode("utf-8", errors="replace").strip()
+    except (KeyError, zipfile.BadZipFile) as exc:
+        # KeyError: no "mimetype" member; BadZipFile: not a usable ZIP archive.
+        raise FetchError("下载内容不是 EPUB") from exc
+    if mimetype != "application/epub+zip":
+        raise FetchError("下载内容不是 EPUB")
+def _derive_filename(response: requests.Response, url: str, filename_hint: str) -> str:
+    """Choose a filename for a downloaded book.
+    Sources are consulted in this order and the first usable one wins:
+    the caller-supplied ``filename_hint``, the response's
+    ``content-disposition`` header, and the last path segment of ``url``.
+    When none of them yields a name, ``"downloaded_book.epub"`` is returned.
+    """
+    def _basename(raw: str) -> str:
+        # Percent-decode, then keep only the final path component.
+        return PurePosixPath(unquote(raw)).name
+    hint = str(filename_hint or "").strip()
+    if hint:
+        return normalize_source_filename(_basename(hint), default_extension=".epub")
+    header = response.headers.get("content-disposition", "")
+    found = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?', header, flags=re.IGNORECASE)
+    if found:
+        return normalize_source_filename(
+            _basename(found.group(1).strip()),
+            default_extension=".epub",
+        )
+    from_url = _basename(urlparse(url).path)
+    if not from_url:
+        return "downloaded_book.epub"
+    return normalize_source_filename(from_url, default_extension=".epub")
+def _score_candidate(query: str, candidate: dict[str, Any]) -> float:
+    """Rank ``candidate`` against ``query``; higher is better.
+    Returns ``-1.0`` for candidates that cannot be used: no download URL, a
+    non-English language, or an empty normalized query/title. Otherwise the
+    score is a title similarity (70% word-set Jaccard overlap, 30% character
+    sequence ratio, adjusted by an exact-match / all-words bonus or a
+    no-overlap penalty) plus small provider, ``.epub`` and download-count bonuses.
+    """
+    if not (candidate.get("download_url") and _is_english(candidate.get("language"))):
+        return -1.0
+    query_text = _normalize_text(query)
+    title_text = _normalize_text(candidate.get("title", ""))
+    query_tokens = set(query_text.split())
+    title_tokens = set(title_text.split())
+    if not (query_tokens and title_tokens):
+        return -1.0
+    shared = query_tokens & title_tokens
+    union = query_tokens | title_tokens
+    # Word overlap relative to all distinct words.
+    jaccard = len(shared) / len(union) if union else 0
+    # Character-level similarity, which also reflects word order.
+    ordering = SequenceMatcher(None, query_text, title_text).ratio()
+    title_score = jaccard * 0.7 + ordering * 0.3
+    if query_text == title_text:
+        # Exact title match.
+        title_score += 1.0
+    elif query_tokens <= title_tokens:
+        # Every query word occurs in the title.
+        title_score += 0.3
+    elif not shared:
+        # No word in common at all.
+        title_score -= 0.3
+    provider_bonus = {
+        "src_a": 0.1,
+        "src_b": 0.08,
+    }.get(candidate.get("provider"), 0.0)
+    names = (
+        str(candidate.get("filename") or "").lower(),
+        str(candidate.get("download_url") or "").lower(),
+    )
+    epub_bonus = 0.05 if any(name.endswith(".epub") for name in names) else 0.0
+    downloads = max(int(candidate.get("downloads") or 0), 0)
+    if downloads:
+        downloads_bonus = min(math.log10(downloads + 1) / 20, 0.05)
+    else:
+        downloads_bonus = 0.0
+    return title_score + provider_bonus + epub_bonus + downloads_bonus
+def _looks_like_url(value: str) -> bool:
+    """Return True when ``value`` parses as an absolute http(s) URL with a host.
+    Input that ``urlparse`` rejects with ValueError (for example an
+    unbalanced ``[`` in the host part) is reported as "not a URL" instead of
+    propagating the error, so free-text input never crashes this check.
+    """
+    try:
+        parsed = urlparse(value)
+    except ValueError:
+        return False
+    return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
+def _normalize_text(value: str) -> str:
+    """Lowercase ``value`` and reduce it to single-space-separated runs of ``a-z``/``0-9``.
+    Falsy input (``None``, ``""``) yields an empty string.
+    """
+    pieces = re.split(r"[^a-z0-9]+", str(value or "").lower())
+    # Splitting leaves empty strings at the edges; drop them before joining.
+    return " ".join(piece for piece in pieces if piece)
+def _is_english(value: Any) -> bool:
+    """Return True when ``value`` (or any item of a collection) names English.
+    The value is normalized with ``_normalize_text`` and looked up in
+    ``ENGLISH_CODES``. Because that normalization turns separators such as
+    ``-`` and ``_`` into spaces, region-tagged codes like ``en-US`` or
+    ``en_GB`` become ``"en us"`` / ``"en gb"``; they are therefore also
+    checked in re-hyphenated form so hyphenated entries in ``ENGLISH_CODES``
+    can match.
+    """
+    if isinstance(value, (list, tuple, set, frozenset)):
+        return any(_is_english(item) for item in value)
+    normalized = _normalize_text(str(value or ""))
+    if normalized in ENGLISH_CODES:
+        return True
+    return normalized.replace(" ", "-") in ENGLISH_CODES
+def _unique_matches(pattern: str, text: str) -> list[str]:
+    """Return the distinct, non-empty, stripped matches of ``pattern`` in ``text``.
+    Matches keep the order of their first appearance.
+    """
+    stripped = (hit.strip() for hit in re.findall(pattern, text))
+    # dict keys preserve insertion order, giving an ordered de-duplication.
+    return list(dict.fromkeys(item for item in stripped if item))
+def _pick_standard_ebooks_download(html: str) -> str:
+    """Pick an EPUB download path from a Standard Ebooks page.
+    The first ``/ebooks/.../downloads/...epub`` link that is neither a
+    ``.kepub.epub`` nor an ``_advanced.epub`` variant is preferred. If only
+    such variants exist the first link is returned, and ``""`` when the page
+    has no matching link at all.
+    """
+    found = _unique_matches(r'href="(/ebooks/[^"]+/downloads/[^"]+?\.epub)"', html)
+    if not found:
+        return ""
+    plain = [
+        path
+        for path in found
+        if ".kepub.epub" not in path.lower() and "_advanced.epub" not in path.lower()
+    ]
+    return plain[0] if plain else found[0]
+def _pick_gutenberg_epub(html: str) -> str:
+    """Return the first ``.epub`` / ``.epub.images`` href found in ``html``, or ``""``."""
+    hrefs = _unique_matches(r'href="([^"]+\.epub(?:\.images)?)"', html)
+    return next(
+        (
+            href
+            for href in hrefs
+            if href.lower().endswith(".epub") or ".epub." in href.lower()
+        ),
+        "",
+    )
+def _pick_archive_epub_filename(metadata: dict[str, Any]) -> str:
+    """Return the name of the first entry in ``metadata["files"]`` ending in ``.epub``.
+    The comparison ignores case. A missing or empty ``files`` list, or one
+    without an EPUB entry, yields ``""``.
+    """
+    names = (str(entry.get("name") or "") for entry in metadata.get("files", []) or [])
+    return next((name for name in names if name.lower().endswith(".epub")), "")
+def _extract_html_title(html: str) -> str:
+    """Extract a cleaned book title from the ``<title>`` element of ``html``.
+    Whitespace runs are collapsed, then a trailing "- Standard Ebooks ..." or
+    "- Project Gutenberg ..." site suffix and a trailing ", by <author>" part
+    are removed. Returns ``""`` when no ``<title>`` element is found.
+    """
+    found = re.search(r"<title>\s*([^<]+?)\s*</title>", html, flags=re.IGNORECASE)
+    if found is None:
+        return ""
+    cleaned = re.sub(r"\s+", " ", found.group(1)).strip()
+    # Applied in this order: site suffixes first, then the author part.
+    for trailing in (
+        r"\s*[-|]\s*Standard Ebooks.*$",
+        r"\s*[-|]\s*Project Gutenberg.*$",
+        r",\s*by\s+.+$",
+    ):
+        cleaned = re.sub(trailing, "", cleaned, flags=re.IGNORECASE)
+    return cleaned
+def _author_from_book_path(path: str) -> str:
+    """Turn the second segment of a slash-separated book path into a display name.
+    Hyphen-separated words are capitalized and joined with spaces. Paths
+    with fewer than three non-empty segments yield ``""``.
+    """
+    segments = [segment for segment in path.strip("/").split("/") if segment]
+    if len(segments) >= 3:
+        return " ".join(map(str.capitalize, segments[1].split("-")))
+    return ""
+def _title_from_book_path(path: str) -> str:
+    """Turn the third segment of a slash-separated book path into a display title.
+    Hyphen-separated words are capitalized and joined with spaces. Paths
+    with fewer than three non-empty segments yield ``""``.
+    """
+    segments = [segment for segment in path.strip("/").split("/") if segment]
+    if len(segments) >= 3:
+        return " ".join(map(str.capitalize, segments[2].split("-")))
+    return ""
+def _first_text(value: Any) -> str:
+    """Return ``value`` as text, using only the first item of a list or tuple.
+    Empty sequences and falsy scalars yield ``""``.
+    """
+    if not isinstance(value, (list, tuple)):
+        return str(value or "")
+    if not value:
+        return ""
+    return str(value[0])
+def _provider_timeout(config: AppConfig) -> int:
+    """Return ``config.fetch_timeout_seconds`` as an int clamped to the range 5..10."""
+    seconds = int(config.fetch_timeout_seconds)
+    if seconds < 5:
+        return 5
+    if seconds > 10:
+        return 10
+    return seconds
+def _normalize_download_url(url: str) -> str:
+    """Add ``source=download`` to Standard Ebooks download URLs that lack a ``source`` parameter.
+    Only URLs whose host ends with ``standardebooks.org`` and whose path
+    contains ``/downloads/`` are rewritten; every other URL is returned
+    unchanged.
+    """
+    parts = urlparse(url)
+    is_se_download = parts.netloc.endswith("standardebooks.org") and "/downloads/" in parts.path
+    if not is_se_download:
+        return url
+    query = dict(parse_qsl(parts.query, keep_blank_values=True))
+    if "source" not in query:
+        query["source"] = "download"
+    return urlunparse(parts._replace(query=urlencode(query)))