Spaces:

nahArnav
/

thetruthbureau

Sleeping

File size: 2,574 Bytes

39bbca0

"""
Web Scraper for VeriLens AI (V2 - Trafilatura Engine)
Uses the modern trafilatura library to bypass bot-blockers,
strip out cookie banners, and extract pristine article text for NLP.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass

import trafilatura

logger = logging.getLogger(__name__)


@dataclass
class ScrapedArticle:
    """Container for the text and metadata extracted from a single article page."""

    # Article headline; extract_article substitutes "Unknown Title" when none is found.
    title: str
    # Main body text of the article as returned by the extractor.
    text: str
    # Individual author names; an empty list when no author metadata was found.
    authors: list[str]
    # Publication date reported by the extractor, or None when unavailable.
    # NOTE(review): presumably an ISO-style date string — confirm against trafilatura output.
    publish_date: str | None
    # The URL that was passed to extract_article.
    source_url: str


# Extracted text shorter than this (after stripping whitespace) is treated as a failed scrape.
_MIN_ARTICLE_CHARS = 50


def _split_authors(raw_author: str | None) -> list[str]:
    """Split an author string on commas/semicolons into a list of clean names."""
    if not raw_author:
        return []
    # Normalise both separators to commas so a single split handles either form.
    return [name.strip() for name in raw_author.replace(';', ',').split(',') if name.strip()]


def extract_article(url: str) -> ScrapedArticle:
    """
    Download and parse a news article from *url* using Trafilatura.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A ScrapedArticle holding the title, body text, author list,
        publication date (if any) and the originating URL.

    Raises:
        ValueError: If *url* is blank, the page cannot be fetched (the site
            may be down or blocking automated requests), or the extracted
            text is missing or shorter than 50 characters.
    """
    # Reject blank input up front so the caller gets the documented ValueError
    # rather than whatever the fetcher does with an empty string.
    if not url or not url.strip():
        raise ValueError("Could not access URL. The site may be blocking automated requests or is invalid.")

    logger.info("Attempting to scrape URL: %s", url)

    # 1. Fetch the raw HTML; fetch_url signals failure by returning None.
    downloaded = trafilatura.fetch_url(url)

    if downloaded is None:
        logger.error("Fetch failed for %s. The site may be down or actively blocking bots.", url)
        raise ValueError("Could not access URL. The site may be blocking automated requests or is invalid.")

    # 2. Extract the text and metadata.
    # Comments and tables are disabled to keep the text as pure as possible for the AI.
    extracted = trafilatura.bare_extraction(
        downloaded,
        include_comments=False,
        include_tables=False,
    )

    # Depending on the installed trafilatura version, bare_extraction may return
    # a plain dict or a Document-like object exposing as_dict(); normalise to a
    # dict so the lookups below work with either.
    if extracted is not None and not isinstance(extracted, dict):
        as_dict = getattr(extracted, "as_dict", None)
        if callable(as_dict):
            extracted = as_dict()

    # 3. Guardrail: did we actually get a meaningful amount of text?
    text = (extracted.get('text') or '') if extracted is not None else ''
    if len(text.strip()) < _MIN_ARTICLE_CHARS:
        logger.warning("Extraction failed or returned too little text for %s", url)
        raise ValueError(
            "Extracted article content is too short or empty. "
            "The URL may be a video, a paywalled article, or heavily obfuscated with JavaScript."
        )

    # 4. Clean up the metadata
    title = extracted.get('title') or "Unknown Title"
    date = extracted.get('date')

    # Trafilatura usually returns authors as a single string separated by
    # semicolons or commas, so split it into individual names.
    authors = _split_authors(extracted.get('author'))

    logger.info("Successfully scraped: '%s...' (%d characters)", title[:30], len(text))

    return ScrapedArticle(
        title=title,
        text=text,
        authors=authors,
        publish_date=date,
        source_url=url,
    )