File size: 1,659 Bytes
6701858
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
"""Simple web‑scraping tool used by the agent.

The real implementation would use :pypi:`trafilatura` (or Playwright) to fetch
and clean the main article text from a URL.  Here we provide a lightweight
fallback that works in the HF Space without extra system dependencies.
"""
import os
import requests
from loguru import logger


def scrape(url: str, timeout: int = 10) -> str:
    """Return the main textual content of *url*.

    The page is fetched with ``requests``.  If the optional ``trafilatura``
    package is importable it is tried first for a higher-quality article
    extraction; whenever it is missing, raises, or yields no text, a crude
    fallback strips HTML tags from the raw markup instead.

    Args:
        url: Address of the page to download.
        timeout: Value passed to ``requests.get`` as its ``timeout`` argument.

    Returns:
        The extracted text.  If the page cannot be fetched (connection error,
        timeout, HTTP error status, ...) no exception is raised; a string
        starting with ``"Error: could not retrieve the page"`` is returned
        instead.
    """
    logger.info(f"Scraping URL: {url}")
    try:
        resp = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        resp.raise_for_status()
        html = resp.text
    except Exception as e:
        # Deliberately broad: callers get an error string back, never a raise.
        logger.error(f"Failed to fetch {url}: {e}")
        return f"Error: could not retrieve the page ({e})"

    # Try to use trafilatura if installed – it gives a clean article body.
    try:
        import trafilatura
        extracted = trafilatura.extract(html)
    except Exception as e:
        # Covers both "not installed" (ImportError) and extraction failures.
        logger.debug(f"trafilatura extraction unavailable ({e}); using naive fallback")
        extracted = None

    cleaned = extracted.strip() if extracted else ""
    if cleaned:
        return cleaned

    # Fallback: very naive tag removal.  This must also run when trafilatura
    # is installed but found nothing; otherwise the function would fall off
    # the end and return None despite the ``-> str`` contract.
    import re
    from html import unescape

    # Drop <script>/<style> elements together with their bodies.  Tag names
    # are matched case-insensitively because HTML tag names are not
    # case-sensitive.
    text = re.sub(
        r"<(script|style)\b.*?</\1\s*>",
        "",
        html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Replace the remaining tags with spaces so adjacent words stay separated.
    text = re.sub(r"<[^>]+>", " ", text)
    text = unescape(text)
    # Collapse whitespace.
    return re.sub(r"\s+", " ", text).strip()