Spaces:

nahArnav
/

thetruthbureau

Running

App Files Files Community

thetruthbureau / scraper.py

nahArnav

Upload 13 files

39bbca0 verified 2 months ago

raw

history blame contribute delete

2.57 kB

	"""
	Web Scraper for VeriLens AI (V2 - Trafilatura Engine)
	Uses the modern trafilatura library to bypass bot-blockers,
	strip out cookie banners, and extract pristine article text for NLP.
	"""

	from __future__ import annotations

	import logging
	from dataclasses import dataclass

	import trafilatura

	logger = logging.getLogger(__name__)


	@dataclass
	class ScrapedArticle:
	    """Structured result of one successfully scraped article.

	    Attributes:
	        title: Article headline (the scraper substitutes "Unknown Title"
	            when the extractor finds none).
	        text: Main body text of the article.
	        authors: Individual author names; empty when none were detected.
	        publish_date: Publication date as reported by the extractor, or
	            None when it could not be determined.
	        source_url: The URL the article was fetched from.
	    """

	    title: str
	    text: str
	    authors: list[str]
	    publish_date: str | None
	    source_url: str


	def _extracted_field(record, name):
	    """Read *name* from an extraction result given as a dict or as an object."""
	    if isinstance(record, dict):
	        return record.get(name)
	    # Attribute access covers object-style results; a None record yields None.
	    return getattr(record, name, None)


	def extract_article(url: str, *, min_chars: int = 50) -> ScrapedArticle:
	    """Download and parse a news article from *url* using Trafilatura.

	    Args:
	        url: Address of the article to fetch. Must be a non-blank string.
	        min_chars: Minimum length of the stripped article text; shorter
	            extractions are rejected as failed. Defaults to 50.

	    Returns:
	        A ScrapedArticle holding the title, stripped body text, author list,
	        publication date (possibly None) and the original URL.

	    Raises:
	        ValueError: If *url* is blank or not a string, if the page cannot be
	            fetched, or if the extracted text is missing or too short.
	    """
	    # Reject unusable input up front instead of handing it to the network layer.
	    if not isinstance(url, str) or not url.strip():
	        raise ValueError("A non-empty URL string is required.")

	    logger.info("Attempting to scrape URL: %s", url)

	    # 1. Fetch the raw HTML.
	    downloaded = trafilatura.fetch_url(url)
	    if not downloaded:
	        logger.error(
	            "Fetch failed for %s. The site may be down or actively blocking bots.",
	            url,
	        )
	        raise ValueError(
	            "Could not access URL. The site may be blocking automated requests or is invalid."
	        )

	    # 2. Extract text and metadata. Comments and tables are disabled to keep
	    #    the text as clean as possible for downstream NLP.
	    # NOTE(review): this code was written against a dict return value; newer
	    # trafilatura releases reportedly return an object with the same field
	    # names as attributes - confirm against the pinned version. Fields are
	    # read through _extracted_field so both shapes work.
	    extracted = trafilatura.bare_extraction(
	        downloaded,
	        include_comments=False,
	        include_tables=False,
	    )

	    # 3. Guardrail: did we actually get text? Strip once so the length check
	    #    and the returned text agree.
	    text = (_extracted_field(extracted, "text") or "").strip()
	    if len(text) < min_chars:
	        logger.warning("Extraction failed or returned too little text for %s", url)
	        raise ValueError(
	            "Extracted article content is too short or empty. "
	            "The URL may be a video, a paywalled article, or heavily obfuscated with JavaScript."
	        )

	    # 4. Clean up the metadata.
	    title = _extracted_field(extracted, "title") or "Unknown Title"
	    date = _extracted_field(extracted, "date")

	    # Authors are expected as one string separated by semicolons or commas;
	    # normalise the separators, split, and drop empty fragments.
	    raw_author = _extracted_field(extracted, "author")
	    if raw_author:
	        pieces = raw_author.replace(";", ",").split(",")
	        authors = [piece.strip() for piece in pieces if piece.strip()]
	    else:
	        authors = []

	    logger.info(
	        "Successfully scraped: '%s...' (%d characters)", title[:30], len(text)
	    )

	    return ScrapedArticle(
	        title=title,
	        text=text,
	        authors=authors,
	        publish_date=date,
	        source_url=url,
	    )