""" Web Scraper for VeriLens AI (V2 - Trafilatura Engine) Uses the modern trafilatura library to bypass bot-blockers, strip out cookie banners, and extract pristine article text for NLP. """ from __future__ import annotations import logging from dataclasses import dataclass import trafilatura logger = logging.getLogger(__name__) @dataclass class ScrapedArticle: title: str text: str authors: list[str] publish_date: str | None source_url: str def extract_article(url: str) -> ScrapedArticle: """ Download and parse a news article from *url* using Trafilatura. Raises ValueError on failure or if the site aggressively blocks scraping. """ logger.info(f"Attempting to scrape URL: {url}") # 1. Fetch the raw HTML (Trafilatura handles redirects and headers automatically) downloaded = trafilatura.fetch_url(url) if downloaded is None: logger.error(f"Fetch failed for {url}. The site may be down or actively blocking bots.") raise ValueError("Could not access URL. The site may be blocking automated requests or is invalid.") # 2. Extract the text and metadata (bare_extraction returns a dictionary) # We disable comments and tables to keep the text as pure as possible for the AI. extracted = trafilatura.bare_extraction( downloaded, include_comments=False, include_tables=False ) # 3. Guardrail: Did we actually get text? if extracted is None or not extracted.get('text') or len(extracted.get('text', '').strip()) < 50: logger.warning(f"Extraction failed or returned too little text for {url}") raise ValueError( "Extracted article content is too short or empty. " "The URL may be a video, a paywalled article, or heavily obfuscated with JavaScript." ) # 4. Clean up the metadata title = extracted.get('title') or "Unknown Title" text = extracted.get('text', '') date = extracted.get('date') # Trafilatura usually returns authors as a single string separated by semicolons or commas raw_author = extracted.get('author') if raw_author: # Split by comma or semicolon and clean up whitespace authors = [a.strip() for a in raw_author.replace(';', ',').split(',') if a.strip()] else: authors = [] logger.info(f"Successfully scraped: '{title[:30]}...' ({len(text)} characters)") return ScrapedArticle( title=title, text=text, authors=authors, publish_date=date, source_url=url, )