agent-dev-07-space / src /tools /web_scraper.py
OjciecTadeusz's picture
Upload src/tools/web_scraper.py with huggingface_hub
6701858 verified
"""Simple web‑scraping tool used by the agent.
The real implementation would use :pypi:`trafilatura` (or Playwright) to fetch
and clean the main article text from a URL. Here we provide a lightweight
fallback that works in the HF Space without extra system dependencies.
"""
import os
import requests
from loguru import logger
def _naive_html_to_text(html: str) -> str:
    """Crudely convert *html* to plain text using only the standard library."""
    # Imported locally so the helper carries its own dependencies.
    import re
    from html import unescape

    # HTML tag names are case-insensitive, so <SCRIPT>/<STYLE> must be matched
    # too; DOTALL lets the lazy match span multi-line script/style bodies.
    flags = re.DOTALL | re.IGNORECASE
    text = re.sub(r"<script.*?</script>", "", html, flags=flags)
    text = re.sub(r"<style.*?</style>", "", text, flags=flags)
    # Replace every remaining tag with a space so adjacent words do not merge.
    text = re.sub(r"<[^>]+>", " ", text)
    text = unescape(text)
    # Collapse whitespace.
    return re.sub(r"\s+", " ", text).strip()


def scrape(url: str, timeout: int = 10) -> str:
    """Return the main textual content of *url*.

    The page is fetched with ``requests``. If the optional ``trafilatura``
    package is importable and extracts a non-empty body, that text is
    returned. Otherwise (package missing, extraction raised, or nothing was
    extracted) a crude fallback strips the HTML tags with regular expressions.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The extracted text. This function does not raise on fetch failure;
        instead it returns a message starting with ``"Error: could not
        retrieve the page"``.
    """
    logger.info(f"Scraping URL: {url}")
    try:
        resp = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        resp.raise_for_status()
        html = resp.text
    except Exception as e:
        # Deliberately broad: callers receive an error string, never an exception.
        logger.error(f"Failed to fetch {url}: {e}")
        return f"Error: could not retrieve the page ({e})"

    # Try to use trafilatura if installed – it gives a clean article body.
    text = None
    try:
        import trafilatura

        text = trafilatura.extract(html)
    except Exception as e:
        # Missing package or extraction failure: fall through to the fallback.
        logger.debug(f"trafilatura unavailable or failed for {url}: {e}")

    if text:
        cleaned = text.strip()
        if cleaned:
            return cleaned

    # Reached when trafilatura is absent, raised, or extracted nothing, so the
    # function always returns a ``str`` (previously it could return ``None``).
    return _naive_html_to_text(html)