import asyncio
import logging
import re
from urllib.parse import urljoin, urlparse

import httpx
from bs4 import BeautifulSoup

from crawl.http_utils import fetch_with_retry, safe_text

logger = logging.getLogger(__name__)


async def fetch_generic_html(
    index_url: str,
    seen: set[str],
    link_pattern: str = r".*",
    max_pages: int = 150,
    crawl_delay: float = 0.0,
) -> tuple[list[tuple[str, str]], int]:
    """Fetch index_url, follow links matching link_pattern, return (pages, skipped_count).

    Discovered links are resolved against ``index_url``, stripped of their
    fragment, de-duplicated, and kept only if they use the ``http`` or
    ``https`` scheme and ``link_pattern`` matches (``re.search``) the
    resolved URL. The index page itself is never returned as a content page.

    Args:
        index_url: Page whose ``<a href>`` links are harvested.
        seen: URLs that must not be fetched again; each candidate found in
            this set is counted in ``skipped_count``. The set is only read.
        link_pattern: Regular expression a resolved URL must match.
        max_pages: Cap on the number of candidate links, applied in document
            order before the ``seen`` filter.
        crawl_delay: Seconds to sleep before each page fetch; a falsy value
            disables the delay.

    Returns:
        ``(pages, skipped_count)`` where ``pages`` is a list of
        ``(url, text)`` tuples for every page fetched successfully and
        ``skipped_count`` is the number of candidates already in ``seen``.
        If the index page cannot be fetched the result is ``([], 0)``.
        Pages whose fetch fails are logged and omitted; they are not counted
        as skipped.
    """
    try:
        index_resp = await fetch_with_retry(index_url)
    except (httpx.HTTPStatusError, httpx.RequestError) as exc:
        logger.warning("Generic index fetch failed for %r: %s", index_url, exc)
        return [], 0

    soup = BeautifulSoup(safe_text(index_resp), "html.parser")
    pattern = re.compile(link_pattern)
    index_bare = index_url.split("#")[0]

    seen_urls: set[str] = set()
    discovered: list[str] = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        if not href:
            continue
        try:
            # urljoin/urlparse raise ValueError on malformed hrefs (for
            # example an unmatched "[" in the host). One bad link must not
            # abort the whole crawl.
            url = urljoin(index_url, href).split("#")[0]
            scheme = urlparse(url).scheme
        except ValueError:
            logger.debug("Skipping malformed href %r on %s", href, index_url)
            continue
        # Allow-list fetchable schemes instead of prefix-matching "mailto:" /
        # "javascript:". urlparse lower-cases the scheme, so this also
        # rejects "MAILTO:", "tel:", "data:", "ftp:" and similar, which would
        # otherwise occupy max_pages slots and fail at fetch time.
        if scheme not in ("http", "https"):
            continue
        if url == index_bare or url in seen_urls:
            continue
        if pattern.search(url):
            seen_urls.add(url)
            discovered.append(url)

    # NOTE(review): the cap is applied before the `seen` filter, so when the
    # first max_pages candidates are all already seen, nothing new is
    # fetched. Kept as-is because callers may rely on the skipped count.
    discovered = discovered[:max_pages]
    logger.info("Generic source: found %d candidate pages from %s", len(discovered), index_url)

    results: list[tuple[str, str]] = []
    skipped = 0
    for url in discovered:
        if url in seen:
            skipped += 1
            continue
        if crawl_delay:
            await asyncio.sleep(crawl_delay)
        try:
            page_resp = await fetch_with_retry(url)
            results.append((url, safe_text(page_resp)))
        except (httpx.HTTPStatusError, httpx.RequestError) as exc:
            # Best effort: a single failing page is logged and skipped.
            logger.warning("Generic page fetch failed for %r: %s", url, exc)

    return results, skipped