from __future__ import annotations

from typing import Any
from urllib.parse import urljoin, urlsplit

from bs4 import BeautifulSoup

from .models import FetchResult


def parse_page(item: FetchResult) -> tuple[dict[str, Any] | None, list[str]]:
    """Extract the visible text and outgoing links from a fetched page.

    Args:
        item: The fetch result to parse. This function reads its ``html``,
            ``url`` and ``fetched_at`` attributes.

    Returns:
        A ``(record, links)`` tuple.

        ``record`` is a dict with the keys ``"text"``, ``"url"``,
        ``"domain"`` and ``"timestamp"``, or ``None`` when the item has no
        HTML or the HTML yields no text once non-content tags are removed.
        Whenever ``record`` is ``None`` the returned ``links`` list is empty.

        ``links`` holds every non-empty ``<a href>`` value resolved against
        ``item.url``, in document order, with duplicates kept. Hrefs that
        cannot be resolved because they are malformed are skipped.
    """
    if not item.html:
        return None, []

    soup = BeautifulSoup(item.html, "lxml")

    # Remove elements whose content is not page text so it does not end up
    # in the extracted text (anchors nested inside them are removed too).
    for tag in soup(["script", "style", "noscript", "svg", "iframe", "canvas"]):
        tag.decompose()

    text = soup.get_text(" ", strip=True)
    if not text:
        return None, []

    links: list[str] = []
    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href", "").strip()
        if not href:
            continue
        try:
            links.append(urljoin(item.url, href))
        except ValueError:
            # urljoin raises ValueError for malformed hrefs (for example an
            # unbalanced "[" in the host part). A single broken link should
            # not discard the whole page, so skip just that link.
            continue

    # Lower-cased hostname with leading/trailing dots stripped; empty string
    # when the URL has no hostname.
    domain = (urlsplit(item.url).hostname or "").lower().strip(".")

    record = {
        "text": text,
        "url": item.url,
        "domain": domain,
        "timestamp": item.fetched_at,
    }
    return record, links