"""Simple web‑scraping tool used by the agent. The real implementation would use :pypi:`trafilatura` (or Playwright) to fetch and clean the main article text from a URL. Here we provide a lightweight fallback that works in the HF Space without extra system dependencies. """ import os import requests from loguru import logger def scrape(url: str, timeout: int = 10) -> str: """Return the main textual content of *url*. The function tries to fetch the page with ``requests`` and then extracts a crude text representation by stripping HTML tags. If the optional ``trafilatura`` package is available it will be used for a higher‑quality extraction. """ logger.info(f"Scraping URL: {url}") try: resp = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"}) resp.raise_for_status() html = resp.text except Exception as e: logger.error(f"Failed to fetch {url}: {e}") return f"Error: could not retrieve the page ({e})" # Try to use trafilatura if installed – it gives a clean article body. try: import trafilatura text = trafilatura.extract(html) if text: return text.strip() except Exception: # Fallback: very naive tag removal. from html import unescape import re text = re.sub(r"", "", html, flags=re.DOTALL) text = re.sub(r"", "", text, flags=re.DOTALL) text = re.sub(r"<[^>]+>", " ", text) text = unescape(text) # Collapse whitespace. text = re.sub(r"\s+", " ", text).strip() return text