Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

fromozu committed on May 4

Commit

35b16a0

verified ·

1 Parent(s): 026edb5

Upload hf_backend/fetcher.py with huggingface_hub

Browse files

Files changed (1) hide show

hf_backend/fetcher.py +828 -0

hf_backend/fetcher.py ADDED Viewed

	@@ -0,0 +1,828 @@

+from __future__ import annotations
+import math
+import re
+import time
+import zipfile
+from difflib import SequenceMatcher
+from io import BytesIO
+from pathlib import PurePosixPath
+from typing import Any
+from urllib.parse import parse_qsl, quote, unquote, urlencode, urljoin, urlparse, urlunparse
+import requests
+try:
+    import cloudscraper
+except ImportError:
+    cloudscraper = None
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    BeautifulSoup = None
+from hf_backend.config import AppConfig
+from hf_backend.filename_utils import normalize_source_filename
class FetchError(RuntimeError):
    """Raised when a book cannot be searched for, downloaded, or validated as an EPUB."""
# Fallback FetchError message used when no candidate was found or could be downloaded.
USER_FACING_NOT_FOUND = "未找到可用的英文 EPUB，请提供直链"

# Lower-case language values that `_is_english` accepts as English.
ENGLISH_CODES = {"en", "eng", "en-us", "en-gb", "english"}
def fetch_book_input(config: AppConfig, query: str) -> dict[str, Any]:
    """Resolve *query* (a direct EPUB link or a book title) into downloaded EPUB bytes.

    A query that looks like an http(s) URL is downloaded directly. Any other
    query is sent to each search provider in turn; searching stops early once
    the best candidate collected so far scores at least 0.9. The collected
    candidates are then downloaded in descending score order until one
    succeeds.

    Returns:
        A dict with ``filename``, ``content``, ``origin``, ``provider``,
        ``query`` and ``download_url``; title searches additionally carry
        ``title`` and ``author``.

    Raises:
        FetchError: if the query is blank, no provider produced a candidate,
            or every candidate download failed.
    """
    text = str(query or "").strip()
    if not text:
        raise FetchError("请输入书名或 EPUB 下载链接")

    if _looks_like_url(text):
        direct_name, direct_bytes = download_epub_from_url(config, text)
        return {
            "filename": normalize_source_filename(direct_name, default_extension=".epub"),
            "content": direct_bytes,
            "origin": "link_fetch",
            "provider": "direct_link",
            "query": text,
            "download_url": text,
        }

    # Failures of these kinds skip the current provider / candidate instead of aborting.
    recoverable = (FetchError, requests.RequestException, ValueError)

    # Provider order: src_a first (most comprehensive), then src_b, then other sources
    providers = (
        search_src_a,
        search_src_b,
        search_standard_ebooks,
        search_project_gutenberg,
        search_internet_archive,
    )
    found: list[dict[str, Any]] = []
    search_failure = None
    for search in providers:
        try:
            found.extend(search(config, text))
        except recoverable as exc:
            search_failure = exc
            continue
        leader = pick_best_candidate(text, found)
        if leader and _score_candidate(text, leader) >= 0.9:
            break  # Good enough match; no need to query the remaining providers

    if not found:
        if search_failure is None:
            raise FetchError(USER_FACING_NOT_FOUND)
        detail = str(search_failure)
        if "src_a" in detail:
            raise FetchError("未找到可用的英文 EPUB，请尝试提供直链或使用其他书名")
        raise FetchError(f"搜索失败：{detail[:100]}")

    # Try downloads from the highest-scoring candidate down to the lowest
    ordered = sorted(found, key=lambda item: _score_candidate(text, item), reverse=True)
    download_failure = None
    for item in ordered:
        try:
            name, payload = download_epub_from_url(
                config,
                item["download_url"],
                filename_hint=item.get("filename", ""),
                provider=item.get("provider", ""),
            )
            return {
                "filename": normalize_source_filename(name, default_extension=".epub"),
                "content": payload,
                "origin": "title_fetch",
                "provider": item.get("provider", ""),
                "query": text,
                "title": item.get("title", ""),
                "author": item.get("author", ""),
                "download_url": item["download_url"],
            }
        except recoverable as exc:
            download_failure = exc  # Remember the reason and move on to the next candidate

    raise FetchError(str(download_failure) if download_failure else USER_FACING_NOT_FOUND)
def download_epub_from_url(
    config: AppConfig,
    url: str,
    *,
    filename_hint: str = "",
    provider: str = "",
) -> tuple[str, bytes]:
    """Download *url* and return ``(filename, content)`` for a validated EPUB.

    The request is delegated to the src_a / src_b downloader when *provider*
    names that source, or when the URL starts with the source's configured
    base URL and contains its download path marker. Every other URL is
    fetched with a plain ``requests.get``.

    Raises:
        FetchError: if the response body is empty or is not an EPUB.
        requests.RequestException: for HTTP or connection failures.
    """

    def _belongs_to(base_url: str, marker: str) -> bool:
        # Case-insensitive "URL is under this base and uses this download path" check
        if not base_url:
            return False
        lowered = url.lower()
        return lowered.startswith(base_url.lower()) and marker in lowered

    # Route to specialized downloaders based on provider hint or URL pattern
    if provider == "src_a" or _belongs_to(config.src_a_base_url, "/slow_download/"):
        return _download_from_src_a(config, url, filename_hint)
    if provider == "src_b" or _belongs_to(config.src_b_base_url, "/dl/"):
        return _download_from_src_b(config, url, filename_hint)

    reply = requests.get(
        _normalize_download_url(url),
        headers={"user-agent": config.fetch_user_agent},
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    reply.raise_for_status()
    payload = reply.content
    if not payload:
        raise FetchError("下载结果为空")

    name = _derive_filename(reply, reply.url or url, filename_hint)
    _validate_epub_bytes(
        payload,
        filename=name,
        content_type=reply.headers.get("content-type", ""),
    )
    if name.lower().endswith(".epub"):
        return name, payload
    return f"{name}.epub", payload
def _download_from_src_a(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """
    Download an EPUB from a src_a slow_download URL.
    Tries a cloudscraper session first when cloudscraper is installed.
    Falls back to requests.get when cloudscraper is missing or when the
    cloudscraper attempt fails with anything other than FetchError.
    """
    target = _normalize_download_url(url)

    def _unpack(reply) -> tuple[str, bytes]:
        # Shared response handling: status check, non-empty body, EPUB validation, filename
        if reply.status_code == 403:
            raise FetchError("下载被阻止，请在浏览器中打开此链接手动下载")
        reply.raise_for_status()
        payload = reply.content
        if not payload:
            raise FetchError("下载结果为空")
        name = _derive_filename(reply, reply.url or url, filename_hint)
        _validate_epub_bytes(
            payload,
            filename=name,
            content_type=reply.headers.get("content-type", ""),
        )
        if not name.lower().endswith(".epub"):
            name = f"{name}.epub"
        return name, payload

    if cloudscraper is not None:
        try:
            session = cloudscraper.create_scraper(
                browser={"browser": "chrome", "platform": "windows", "mobile": False},
                delay=10,
            )
            return _unpack(
                session.get(
                    target,
                    headers={
                        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                        "accept": "*/*",
                    },
                    timeout=120,
                    allow_redirects=True,
                )
            )
        except FetchError:
            raise  # A definite verdict (blocked / empty / not an EPUB): do not retry
        except Exception:
            pass  # Best effort only: retry below with plain requests

    return _unpack(
        requests.get(
            target,
            headers={"user-agent": config.fetch_user_agent},
            timeout=config.fetch_timeout_seconds,
            allow_redirects=True,
        )
    )
def _download_from_src_b(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """
    Download an EPUB from a src_b download URL.
    Tries a cloudscraper session first when cloudscraper is installed.
    Falls back to requests.get when cloudscraper is missing or when the
    cloudscraper attempt fails with anything other than FetchError.
    """

    def _unpack(reply) -> tuple[str, bytes]:
        # Shared response handling: status check, non-empty body, EPUB validation, filename
        if reply.status_code == 403:
            raise FetchError("下载被阻止，请手动下载")
        reply.raise_for_status()
        payload = reply.content
        if not payload:
            raise FetchError("下载结果为空")
        name = _derive_filename(reply, reply.url or url, filename_hint)
        _validate_epub_bytes(
            payload,
            filename=name,
            content_type=reply.headers.get("content-type", ""),
        )
        if not name.lower().endswith(".epub"):
            name = f"{name}.epub"
        return name, payload

    if cloudscraper is not None:
        try:
            session = cloudscraper.create_scraper(
                browser={"browser": "chrome", "platform": "windows", "mobile": False},
                delay=10,
            )
            return _unpack(
                session.get(
                    url,
                    headers={
                        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                        "accept": "*/*",
                    },
                    timeout=120,
                    allow_redirects=True,
                )
            )
        except FetchError:
            raise  # A definite verdict (blocked / empty / not an EPUB): do not retry
        except Exception:
            pass  # Best effort only: retry below with plain requests

    return _unpack(
        requests.get(
            url,
            headers={"user-agent": config.fetch_user_agent},
            timeout=config.fetch_timeout_seconds,
            allow_redirects=True,
        )
    )
def pick_best_candidate(query: str, candidates: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the highest-scoring candidate for *query*, or None if none scores at least 0.45.

    When several candidates tie for the top score, the earliest one in
    *candidates* wins.
    """
    winner = None
    winning_score = 0.0
    for entry in candidates:
        entry_score = _score_candidate(query, entry)
        if entry_score < 0.45:
            continue
        # Strictly-greater comparison keeps the first of any equally scored entries
        if winner is None or entry_score > winning_score:
            winner = entry
            winning_score = entry_score
    return winner
def search_standard_ebooks(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Standard Ebooks for *query* and return download candidates.

    Fetches the search page, then the detail page of at most the first six
    distinct ``/ebooks/<author>/<title>`` links, keeping every book whose
    detail page offers an EPUB download link.

    Raises:
        requests.RequestException: if the search page or a detail page request fails.
    """
    search_url = config.standard_ebooks_search_url
    request_headers = {"user-agent": config.fetch_user_agent}
    wait_seconds = _provider_timeout(config)

    listing = requests.get(
        search_url,
        params={"query": query},
        headers=request_headers,
        timeout=wait_seconds,
    )
    listing.raise_for_status()

    results: list[dict[str, Any]] = []
    book_paths = _unique_matches(r'href="(/ebooks/[^"/]+/[^"/]+)"', listing.text)
    for book_path in book_paths[:6]:
        page = requests.get(
            urljoin(search_url, book_path),
            headers=request_headers,
            timeout=wait_seconds,
        )
        page.raise_for_status()
        epub_path = _pick_standard_ebooks_download(page.text)
        if epub_path:
            results.append(
                {
                    "provider": "standard_ebooks",
                    # Prefer the page <title>; fall back to the URL slug
                    "title": _extract_html_title(page.text) or _title_from_book_path(book_path),
                    "author": _author_from_book_path(book_path),
                    "language": "en",
                    "download_url": _normalize_download_url(urljoin(search_url, epub_path)),
                    "filename": PurePosixPath(epub_path).name,
                }
            )
    return results
def search_project_gutenberg(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Project Gutenberg for *query* and return download candidates.

    Fetches the search page, then the detail page of at most the first five
    distinct book ids found in it, keeping every book whose detail page links
    to an EPUB file.

    Raises:
        requests.RequestException: if the search page or a detail page request fails.
    """
    request_headers = {"user-agent": config.fetch_user_agent}
    wait_seconds = _provider_timeout(config)

    listing = requests.get(
        config.project_gutenberg_search_url,
        params={"query": query},
        headers=request_headers,
        timeout=wait_seconds,
    )
    listing.raise_for_status()

    results: list[dict[str, Any]] = []
    for book_id in _unique_matches(r'href="/ebooks/(\d+)"', listing.text)[:5]:
        page_url = f"https://www.gutenberg.org/ebooks/{book_id}"
        page = requests.get(page_url, headers=request_headers, timeout=wait_seconds)
        page.raise_for_status()
        page_html = page.text
        epub_link = _pick_gutenberg_epub(page_html)
        if epub_link:
            results.append(
                {
                    "provider": "project_gutenberg",
                    "title": _extract_html_title(page_html) or f"Project Gutenberg {book_id}",
                    "author": "",
                    "language": "en",
                    "download_url": urljoin(page_url, epub_link),
                    # Last path segment of the link, or a synthetic name when it has none
                    "filename": PurePosixPath(urlparse(epub_link).path).name or f"pg{book_id}.epub",
                }
            )
    return results
def search_internet_archive(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search the Internet Archive for English texts titled *query* that offer an EPUB.

    Runs one advanced-search request (8 rows, sorted by downloads), then one
    metadata request per remaining item to find the name of its EPUB file.

    Raises:
        requests.RequestException: if the search or a metadata request fails.
        ValueError: if a response body is not valid JSON or a ``downloads``
            value cannot be converted to int.
    """
    request_headers = {"user-agent": config.fetch_user_agent}
    wait_seconds = _provider_timeout(config)

    search_params = {
        "q": f"title:({query}) AND mediatype:(texts) AND (language:(english) OR language:(eng) OR language:(en))",
        "fl[]": ["identifier", "title", "creator", "language", "downloads", "format"],
        "sort[]": "downloads desc",
        "rows": 8,
        "page": 1,
        "output": "json",
    }
    listing = requests.get(
        config.internet_archive_advancedsearch_url,
        params=search_params,
        headers=request_headers,
        timeout=wait_seconds,
    )
    listing.raise_for_status()

    results: list[dict[str, Any]] = []
    for doc in listing.json().get("response", {}).get("docs", []):
        if not _is_english(doc.get("language")):
            continue

        # "format" may be a single string or a list of strings
        formats = doc.get("format") or []
        if isinstance(formats, str):
            formats = [formats]
        if "epub" not in (str(entry).strip().lower() for entry in formats):
            continue

        identifier = str(doc.get("identifier") or "").strip()
        if not identifier:
            continue

        details = requests.get(
            config.internet_archive_metadata_url_template.format(identifier=identifier),
            headers=request_headers,
            timeout=wait_seconds,
        )
        details.raise_for_status()
        epub_name = _pick_archive_epub_filename(details.json())
        if not epub_name:
            continue

        results.append(
            {
                "provider": "internet_archive",
                "title": str(doc.get("title") or ""),
                "author": _first_text(doc.get("creator")),
                "language": _first_text(doc.get("language")),
                "downloads": int(doc.get("downloads") or 0),
                "download_url": f"https://archive.org/download/{identifier}/{quote(epub_name)}",
                "filename": epub_name,
            }
        )
    return results
def search_src_a(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """
    Scrape the src_a search page for EPUB entries matching the query.
    Returns an empty list when src_a is not configured or when cloudscraper
    or BeautifulSoup is not installed.
    Raises FetchError when the page cannot be loaded or parsed.
    """
    search_url = config.src_a_search_url
    if not search_url or not config.src_a_base_url:
        return []
    if cloudscraper is None or BeautifulSoup is None:
        return []

    # Lines of a result container that can be neither the title nor the author
    noise_patterns = (r"^[\d.,]+$", r"^(English|Save|\d+)")

    def _is_noise(line: str) -> bool:
        if ".epub" in line.lower():
            return True
        return any(re.match(pattern, line) for pattern in noise_patterns)

    results: list[dict[str, Any]] = []
    try:
        session = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
        reply = session.get(
            f"{search_url}?q={quote(query)}",
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "accept-language": "en-US,en;q=0.9",
            },
            timeout=60,
        )
        reply.raise_for_status()
        html = reply.text
        if not html or len(html) < 500:
            raise FetchError("搜索页面加载失败")

        soup = BeautifulSoup(html, "lxml")
        handled_md5 = set()
        # Every anchor whose href contains /md5/ points at a book entry
        for anchor in soup.find_all("a", href=lambda h: h and "/md5/" in h):
            # Only anchors with no text, or the text "Save", are used
            label = (anchor.get_text() or "").strip()
            if label not in ("", "Save"):
                continue

            md5_found = re.search(r"/md5/([a-f0-9]+)", anchor.get("href", ""))
            if md5_found is None:
                continue
            md5 = md5_found.group(1)
            if md5 in handled_md5:
                continue

            # The nearest enclosing <div> (or the direct parent) supplies the entry's text
            box = anchor.find_parent("div") or anchor.parent
            if not box:
                continue
            box_text = box.get_text(separator="\n") or ""

            # Skip entries whose text does not mention an .epub file
            epub_found = re.search(r"([\w./-]+\.epub)", box_text, re.IGNORECASE)
            if epub_found is None:
                continue
            handled_md5.add(md5)

            # Metadata line looks like: English [en] · EPUB · 1.2MB · 2020
            meta = re.search(
                r"English\s*\[([^\]]+)\]\s*[·•]\s*([A-Z]+)\s*[·•]\s*([\d.]+(?:MB|GB))\s*[·•]\s*(\d{4})",
                box_text,
            )
            filesize, year = (meta.group(3), meta.group(4)) if meta else ("", "")

            # Title = first usable line (capped at 200 chars);
            # author = next usable line shorter than 100 chars
            stripped_lines = (piece.strip() for piece in box_text.split("\n"))
            usable = [line for line in stripped_lines if line and not _is_noise(line)]
            title = usable[0][:200] if usable else ""
            author = next((line for line in usable[1:] if len(line) < 100), "")

            results.append({
                "provider": "src_a",
                "title": title,
                "author": author,
                "language": "en",
                "download_url": f"{config.src_a_base_url}/slow_download/{md5}/0/3",
                "filename": epub_found.group(1).rsplit("/", 1)[-1],
                "filesize": filesize,
                "year": year,
            })
    except FetchError:
        raise
    except Exception as exc:
        raise FetchError(f"搜索失败: {str(exc)[:100]}")
    return results
def search_src_b(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """
    Scrape src_b search results for books whose page mentions EPUB.
    Returns an empty list when src_b is not configured or when cloudscraper
    or BeautifulSoup is not installed.
    Raises FetchError when the search page cannot be loaded or lists no books.
    """
    if not config.src_b_base_url:
        return []
    if cloudscraper is None or BeautifulSoup is None:
        return []

    base = config.src_b_base_url.rstrip("/")
    results: list[dict[str, Any]] = []
    try:
        session = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
        request_headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-language": "en-US,en;q=0.9",
        }

        # Search page
        listing = session.get(f"{base}/s/{quote(query)}", headers=request_headers, timeout=60)
        listing.raise_for_status()
        html = listing.text
        if not html or len(html) < 200:
            raise FetchError("搜索页面加载失败")

        # Book entries are the anchors whose href contains /book/
        anchors = BeautifulSoup(html, "lxml").find_all("a", href=lambda h: h and "/book/" in h)
        if not anchors:
            raise FetchError("未找到任何书籍")

        # One (href, text) entry per distinct href; an href counts as seen
        # even when its first anchor's text is too short to keep
        visited: set[str] = set()
        entries: list[tuple[str, str]] = []
        for anchor in anchors:
            href = anchor.get("href", "")
            if not href or href in visited:
                continue
            visited.add(href)
            label = (anchor.get_text() or "").strip()
            if len(label) > 3:
                entries.append((href, label))

        # Visit at most 15 book pages to check for EPUB format
        for href, label in entries[:15]:
            book_url = href if href.startswith("http") else f"{base}{href}"
            try:
                page = session.get(book_url, headers=request_headers, timeout=30)
                page.raise_for_status()
                page_html = page.text
                page_text = BeautifulSoup(page_html, "lxml").get_text(separator="\n") or ""
                if "epub" not in page_text.lower():
                    continue

                author_found = re.search(r"Author[s]?[:\s]*([^\n]+)", page_text, re.IGNORECASE)
                author = author_found.group(1).strip()[:100] if author_found else ""

                # Use the page's /dl/<id> link when present, otherwise the book page itself
                dl_found = re.search(r"/dl/([a-zA-Z0-9]+)", page_html)
                download_url = f"{base}/dl/{dl_found.group(1)}" if dl_found else book_url

                # First line of the link text, falling back to its first 100 characters
                title = label.split("\n")[0].strip() or label[:100]

                results.append({
                    "provider": "src_b",
                    "title": title,
                    "author": author,
                    "language": "en",
                    "download_url": download_url,
                    "filename": "",
                    "filesize": "",
                })
            except Exception:
                continue  # A broken book page must not abort the whole search
    except FetchError:
        raise
    except Exception as exc:
        raise FetchError(f"搜索失败: {str(exc)[:100]}")
    return results
+def _validate_epub_bytes(content: bytes, *, filename: str, content_type: str) -> None:
+    if len(content) < 4 or not content.startswith(b"PK"):
+        raise FetchError("下载内容不是 EPUB")
+    try:
+        with zipfile.ZipFile(BytesIO(content)) as archive:
+            mimetype = archive.read("mimetype").decode("utf-8", errors="replace").strip()
+    except (KeyError, zipfile.BadZipFile) as exc:
+        raise FetchError("下载内容不是 EPUB") from exc
+    if mimetype != "application/epub+zip":
+        raise FetchError("下载内容不是 EPUB")
+    lowered_content_type = content_type.lower()
+    if filename.lower().endswith(".epub"):
+        return
+    if "application/epub+zip" in lowered_content_type:
+        return
def _derive_filename(response: requests.Response, url: str, filename_hint: str) -> str:
    """Choose a filename for a downloaded book.

    Sources are tried in order: the caller's *filename_hint*, the response's
    Content-Disposition header, the last path segment of *url*, and finally
    the fixed name ``downloaded_book.epub``.
    """

    def _clean(raw: str) -> str:
        # Percent-decode, drop any directory part, then apply the project's normalization
        return normalize_source_filename(
            PurePosixPath(unquote(raw)).name,
            default_extension=".epub",
        )

    hint = str(filename_hint or "").strip()
    if hint:
        return _clean(hint)

    header = response.headers.get("content-disposition", "")
    found = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?', header, flags=re.IGNORECASE)
    if found:
        return _clean(found.group(1).strip())

    last_segment = PurePosixPath(unquote(urlparse(url).path)).name
    if not last_segment:
        return "downloaded_book.epub"
    return normalize_source_filename(last_segment, default_extension=".epub")
def _score_candidate(query: str, candidate: dict[str, Any]) -> float:
    """Score how well *candidate* matches *query*; higher is better.

    Returns -1.0 for candidates without a download URL or without an English
    language value. Otherwise the score is the similarity ratio between the
    normalized query and title (plus 0.25 for an exact match), plus small
    bonuses for the provider, an ``.epub`` filename or URL, and download count.
    """
    if not (candidate.get("download_url") and _is_english(candidate.get("language"))):
        return -1.0

    wanted = _normalize_text(query)
    offered = _normalize_text(candidate.get("title", ""))
    title_score = SequenceMatcher(None, wanted, offered).ratio()
    if wanted and wanted == offered:
        title_score += 0.25  # Exact (normalized, non-empty) title match

    provider_weights = {
        "standard_ebooks": 0.08,
        "project_gutenberg": 0.05,
        "internet_archive": 0.03,
        "src_a": 0.04,
        "src_b": 0.05,
    }
    provider_bonus = provider_weights.get(candidate.get("provider"), 0.0)

    name_lower = str(candidate.get("filename") or "").lower()
    url_lower = str(candidate.get("download_url") or "").lower()
    if name_lower.endswith(".epub") or url_lower.endswith(".epub"):
        epub_bonus = 0.05
    else:
        epub_bonus = 0.0

    # Logarithmic popularity bonus, capped at 0.05; negative counts are treated as zero
    download_count = max(int(candidate.get("downloads") or 0), 0)
    if download_count:
        downloads_bonus = min(math.log10(download_count + 1) / 20, 0.05)
    else:
        downloads_bonus = 0.0

    return title_score + provider_bonus + epub_bonus + downloads_bonus
+def _looks_like_url(value: str) -> bool:
+    parsed = urlparse(value)
+    return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
+def _normalize_text(value: str) -> str:
+    lowered = re.sub(r"[^a-z0-9]+", " ", str(value or "").lower())
+    return " ".join(lowered.split())
def _is_english(value: Any) -> bool:
    """Return True if *value* (or any item of a list/tuple/set) names the English language.

    The value is normalized with ``_normalize_text`` and looked up in
    ``ENGLISH_CODES``. Normalization replaces punctuation with spaces
    (``"en-US"`` becomes ``"en us"``), so the hyphen-joined form is checked as
    well; otherwise the ``"en-us"`` / ``"en-gb"`` codes could never match.
    """
    if isinstance(value, (list, tuple, set)):
        return any(_is_english(item) for item in value)
    normalized = _normalize_text(str(value or ""))
    if normalized in ENGLISH_CODES:
        return True
    # Region-tagged codes such as "en-US", "en_GB" or "en us"
    return normalized.replace(" ", "-") in ENGLISH_CODES
+def _unique_matches(pattern: str, text: str) -> list[str]:
+    results: list[str] = []
+    for match in re.findall(pattern, text):
+        value = match.strip()
+        if value and value not in results:
+            results.append(value)
+    return results
def _pick_standard_ebooks_download(html: str) -> str:
    """Return the preferred EPUB download path found in a Standard Ebooks page, or "".

    The first link that is neither a ``.kepub.epub`` nor an ``_advanced.epub``
    build wins; if every link is one of those, the first link is returned.
    """
    links = _unique_matches(r'href="(/ebooks/[^"]+/downloads/[^"]+?\.epub)"', html)
    if not links:
        return ""
    unwanted_markers = (".kepub.epub", "_advanced.epub")
    regular = (
        link
        for link in links
        if not any(marker in link.lower() for marker in unwanted_markers)
    )
    return next(regular, links[0])
def _pick_gutenberg_epub(html: str) -> str:
    """Return the first EPUB link (``….epub`` or ``….epub.images``) found in *html*, or ""."""
    links = _unique_matches(r'href="([^"]+\.epub(?:\.images)?)"', html)
    return next(
        (
            link
            for link in links
            if link.lower().endswith(".epub") or ".epub." in link.lower()
        ),
        "",
    )
+def _pick_archive_epub_filename(metadata: dict[str, Any]) -> str:
+    for item in metadata.get("files", []) or []:
+        name = str(item.get("name") or "")
+        if name.lower().endswith(".epub"):
+            return name
+    return ""
+def _extract_html_title(html: str) -> str:
+    title_match = re.search(r"<title>\s*([^<]+?)\s*</title>", html, flags=re.IGNORECASE)
+    if not title_match:
+        return ""
+    title = re.sub(r"\s+", " ", title_match.group(1)).strip()
+    title = re.sub(r"\s*[-|]\s*Standard Ebooks.*$", "", title, flags=re.IGNORECASE)
+    title = re.sub(r"\s*[-|]\s*Project Gutenberg.*$", "", title, flags=re.IGNORECASE)
+    title = re.sub(r",\s*by\s+.+$", "", title, flags=re.IGNORECASE)
+    return title
+def _author_from_book_path(path: str) -> str:
+    parts = [part for part in path.strip("/").split("/") if part]
+    if len(parts) < 3:
+        return ""
+    return " ".join(part.capitalize() for part in parts[1].split("-"))
+def _title_from_book_path(path: str) -> str:
+    parts = [part for part in path.strip("/").split("/") if part]
+    if len(parts) < 3:
+        return ""
+    return " ".join(part.capitalize() for part in parts[2].split("-"))
+def _first_text(value: Any) -> str:
+    if isinstance(value, (list, tuple)):
+        return str(value[0]) if value else ""
+    return str(value or "")
+def _provider_timeout(config: AppConfig) -> int:
+    return max(5, min(int(config.fetch_timeout_seconds), 10))
+def _normalize_download_url(url: str) -> str:
+    parsed = urlparse(url)
+    if parsed.netloc.endswith("standardebooks.org") and "/downloads/" in parsed.path:
+        params = dict(parse_qsl(parsed.query, keep_blank_values=True))
+        params.setdefault("source", "download")
+        return urlunparse(parsed._replace(query=urlencode(params)))
+    return url