Spaces:
Runtime error
Runtime error
| """Simple web‑scraping tool used by the agent. | |
| Page text is extracted with :pypi:`trafilatura` when it is installed (a fuller | |
| implementation could also render pages with Playwright). Otherwise we fall back to a | |
| lightweight tag-stripping approach that works in the HF Space without extra system dependencies. | |
| """ | |
| import os | |
| import requests | |
| from loguru import logger | |
def _naive_html_to_text(markup: str) -> str:
    """Convert *markup* to plain text with crude regex-based tag removal."""
    # Imported locally so this helper does not depend on the module's
    # top-level import block.
    import re
    from html import unescape

    # Drop <script>/<style> elements together with their contents; tag names
    # are matched case-insensitively so ``<SCRIPT>`` is removed as well.
    text = re.sub(
        r"<(script|style)\b.*?</\1\s*>",
        "",
        markup,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Replace every remaining tag with a space so adjacent words stay apart.
    text = re.sub(r"<[^>]+>", " ", text)
    text = unescape(text)
    # Collapse whitespace.
    return re.sub(r"\s+", " ", text).strip()


def scrape(url: str, timeout: int = 10) -> str:
    """Return the main textual content of *url*.

    The page is fetched with ``requests``. If the optional ``trafilatura``
    package can be imported and yields non-empty text, that text is returned.
    In every other case (package missing, extraction raising, or extraction
    producing nothing) a crude regex-based tag stripper is used instead, so
    the function always returns a string.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The extracted text, or a message starting with ``"Error:"`` when the
        page could not be retrieved.
    """
    logger.info(f"Scraping URL: {url}")
    try:
        resp = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        resp.raise_for_status()
        html = resp.text
    except Exception as e:
        # Best-effort tool: report the failure as text rather than raising.
        logger.error(f"Failed to fetch {url}: {e}")
        return f"Error: could not retrieve the page ({e})"

    # Try to use trafilatura if installed – it gives a clean article body.
    extracted = None
    try:
        import trafilatura

        extracted = trafilatura.extract(html)
    except ImportError:
        logger.debug("trafilatura is not installed; using naive tag stripping")
    except Exception as e:
        # Extraction is optional, so a failure here must not abort the scrape.
        logger.warning(f"trafilatura extraction failed for {url}: {e}")

    if extracted:
        cleaned = extracted.strip()
        if cleaned:
            return cleaned

    # Fallback: very naive tag removal. Reached whenever trafilatura is
    # unavailable, fails, or returns no text.
    return _naive_html_to_text(html)