# philverify-api/ml/data_sources/rappler_scraper.py
# Author: Ryan Christian D. Deniega
# (from commit b1c84b5 — "fix: cold start 502, favicon, verify state persistence")
"""
rappler_scraper.py
------------------
Scrapes fact-check articles from Rappler's Facts First / Fact-Check sections.
(https://www.rappler.com/facts-first/ and https://www.rappler.com/newsbreak/fact-check/)
Respects robots.txt, caches results for 7 days, and never raises on failure.
"""
from __future__ import annotations
import json
import logging
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
import requests
from bs4 import BeautifulSoup
from .base import DataSource, NormalizedSample, clean_text, detect_language
# Module-level logger, namespaced to this module per project convention.
logger = logging.getLogger(__name__)

# Identifying User-Agent sent with every request; also the agent string
# evaluated against robots.txt in _robots_allows().
_UA = "PhilVerify-Research/1.0 (academic research; contact: research@philverify.ph)"
_HEADERS = {
    "User-Agent": _UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
# ---------------------------------------------------------------------------
# Verdict → label mapping
# ---------------------------------------------------------------------------
_VERDICT_MAP: dict[str, int] = {
# Likely Fake (label 2)
"FALSE": 2,
"FAKE": 2,
"MISLEADING": 2,
"DISINFORMATION": 2,
"FABRICATED": 2,
# Unverified (label 1)
"UNVERIFIED": 1,
"NEEDS MORE CONTEXT": 1,
"MISSING CONTEXT": 1,
"NEEDS CONTEXT": 1,
"PARTLY TRUE": 1,
"PARTLY FALSE": 1,
"HALF TRUE": 1,
"MIXTURE": 1,
"UNPROVEN": 1,
# Credible (label 0)
"TRUE": 0,
"ACCURATE": 0,
"CORRECT": 0,
"VERIFIED": 0,
}
_CACHE_TTL_DAYS = 7
_REQUEST_DELAY = 1.5 # seconds between requests
def _resolve_verdict(raw: str) -> Optional[int]:
"""Normalise a raw verdict string to a label int, or None if unrecognised."""
normalised = raw.strip().upper()
if normalised in _VERDICT_MAP:
return _VERDICT_MAP[normalised]
for key, label in _VERDICT_MAP.items():
if key in normalised:
return label
return None
def _robots_allows(base_url: str, path: str) -> bool:
    """Return True when robots.txt permits PhilVerify to access *path*.

    If robots.txt cannot be fetched/parsed at all, we assume access is
    permitted (best-effort, logged) rather than aborting the scrape.
    """
    robots_url = urljoin(base_url, "/robots.txt")
    parser = RobotFileParser()
    parser.set_url(robots_url)
    try:
        parser.read()
    except Exception as exc:
        logger.warning("Could not read robots.txt (%s): %s — proceeding with caution", robots_url, exc)
        return True
    target = urljoin(base_url, path)
    if parser.can_fetch(_UA, target):
        return True
    logger.warning("robots.txt disallows scraping %s", target)
    return False
def _get(url: str, timeout: int = 20) -> Optional[requests.Response]:
    """GET *url* with the project User-Agent; return None on any error."""
    try:
        response = requests.get(url, headers=_HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Covers connection errors, timeouts, and non-2xx statuses alike.
        logger.warning("GET %s failed: %s", url, exc)
        return None
    return response
def _cache_fresh(cache_path: Path) -> bool:
    """True if *cache_path* exists and was written within the TTL window."""
    if not cache_path.exists():
        return False
    # Compare the file's (UTC, tz-aware) mtime against now; whole days only.
    written_at = datetime.fromtimestamp(cache_path.stat().st_mtime, tz=timezone.utc)
    age = datetime.now(tz=timezone.utc) - written_at
    return age.days < _CACHE_TTL_DAYS
class RapplerScraper(DataSource):
    """Scrape fact-check articles from Rappler and return NormalizedSample list.

    Tries both:
      - https://www.rappler.com/facts-first/
      - https://www.rappler.com/newsbreak/fact-check/

    Respects robots.txt, throttles requests, caches results for the TTL
    window, and never raises on failure — every error path degrades to an
    empty result list.

    Parameters
    ----------
    max_pages:
        Maximum number of listing pages to iterate per section. Defaults to 10.
    """

    BASE_URL = "https://www.rappler.com"

    # Ordered list of archive sections to attempt; first one that yields articles wins.
    ARCHIVE_PATHS = [
        "/facts-first/",
        "/newsbreak/fact-check/",
    ]

    def __init__(self, max_pages: int = 10) -> None:
        self.max_pages = max_pages
        # Cache lives at <package>/data/raw/rappler_cache.json, two levels
        # above this module; the directory is created eagerly.
        self.cache_file: Path = (
            Path(__file__).parent.parent / "data" / "raw" / "rappler_cache.json"
        )
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # DataSource interface
    # ------------------------------------------------------------------
    @property
    def source_name(self) -> str:
        return "rappler_factcheck"

    def fetch(self) -> list[NormalizedSample]:
        """Fetch and return normalised samples from Rappler.

        Loads from local cache when available and fresh; otherwise scrapes
        the live site and persists results to cache.
        """
        # 1. Try cache first.
        # BUG FIX: previously a fresh-but-corrupt/empty cache returned []
        # outright, even though _load_cache logs "will re-scrape". Now an
        # empty cache load falls through to a live scrape.
        if _cache_fresh(self.cache_file):
            logger.info("Loading Rappler data from cache: %s", self.cache_file)
            cached = self._load_cache()
            if cached:
                return cached
        # 2. Respect robots.txt (check each section path)
        allowed_paths = [
            path for path in self.ARCHIVE_PATHS
            if _robots_allows(self.BASE_URL, path)
        ]
        if not allowed_paths:
            logger.error("robots.txt forbids all Rappler fact-check paths — returning []")
            return []
        logger.info("Scraping Rappler (paths: %s, max %d pages each)…", allowed_paths, self.max_pages)
        # 3. Collect article URLs across all allowed archive sections
        article_urls: list[str] = []
        for archive_path in allowed_paths:
            section_urls = self._collect_article_urls(archive_path)
            logger.info("Section %s: found %d article links", archive_path, len(section_urls))
            article_urls.extend(section_urls)
            time.sleep(_REQUEST_DELAY)
        # De-duplicate while preserving discovery order.
        unique_urls = list(dict.fromkeys(article_urls))
        if not unique_urls:
            logger.warning("No article URLs collected from Rappler — returning []")
            return []
        # 4. Scrape individual articles (politely throttled)
        samples: list[NormalizedSample] = []
        for idx, url in enumerate(unique_urls, start=1):
            logger.debug("[%d/%d] Scraping %s", idx, len(unique_urls), url)
            sample = self._scrape_article(url)
            if sample is not None:
                samples.append(sample)
            time.sleep(_REQUEST_DELAY)
        logger.info("Rappler: collected %d labelled samples", len(samples))
        # 5. Persist to cache
        if samples:
            self._save_cache(samples)
        return samples

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _collect_article_urls(self, archive_path: str) -> list[str]:
        """Return all article URLs found across paginated listing pages for *archive_path*."""
        urls: list[str] = []
        for page_num in range(1, self.max_pages + 1):
            page_urls = self._get_article_urls_from_page(archive_path, page_num)
            if not page_urls:
                # An empty page means we've walked past the last listing page.
                logger.info(
                    "No articles on page %d of %s — stopping pagination",
                    page_num,
                    archive_path,
                )
                break
            logger.info(" %s page %d: %d links", archive_path, page_num, len(page_urls))
            urls.extend(page_urls)
            time.sleep(_REQUEST_DELAY)
        return urls

    def _listing_page_candidates(self, archive_path: str, page_num: int) -> list[str]:
        """Return concrete URLs to try for a given archive path + page number.

        Page 1 is served at the bare section root; later pages try the
        WordPress-style ``/page/N/`` path first, then query-parameter
        pagination variants.
        """
        base = f"{self.BASE_URL}{archive_path}".rstrip("/")
        if page_num == 1:
            return [
                f"{base}/",  # section root *is* page 1
                f"{base}/page/1/",  # WordPress-style
                f"{base}?page=1",  # query-param style
                f"{base}?paged=1",
            ]
        # BUG FIX: the section root must NOT be a candidate for page > 1.
        # It was previously listed first for every page, so each "next page"
        # fetched the root again, re-yielded page 1's links, and pagination
        # never advanced (it just accumulated duplicates until max_pages).
        return [
            f"{base}/page/{page_num}/",
            f"{base}?page={page_num}",
            f"{base}?paged={page_num}",
        ]

    def _get_article_urls_from_page(self, archive_path: str, page_num: int) -> list[str]:
        """Fetch one listing page and return article URLs found on it.

        Tries each candidate URL in order; the first candidate that both
        loads and yields links wins. Returns [] when all candidates fail.
        """
        for url in self._listing_page_candidates(archive_path, page_num):
            resp = _get(url)
            if resp is None:
                time.sleep(0.5)
                continue
            soup = BeautifulSoup(resp.text, "lxml")
            links = self._parse_article_links(soup)
            if links:
                return links
            # If the page loaded but had no links, try next candidate
            time.sleep(0.3)
        return []

    def _parse_article_links(self, soup: BeautifulSoup) -> list[str]:
        """Extract article hrefs from a listing-page soup object.

        Selectors are ordered most-specific-first; the first selector that
        matches any nodes supplies all the links for this page.
        """
        links: list[str] = []
        selectors = [
            "article h2 a",
            "article h3 a",
            ".entry-title a",
            "h2.entry-title a",
            ".story-card__title a",
            ".article-title a",
            ".post-title a",
            "h2 a[href*='fact-check']",
            "h3 a[href*='fact-check']",
            "h2 a[href*='facts-first']",
            "h3 a[href*='facts-first']",
            "h2 a",
        ]
        for selector in selectors:
            nodes = soup.select(selector)
            if not nodes:
                continue
            for node in nodes:
                href = node.get("href", "")
                if not href:
                    continue
                if href.startswith("http"):
                    full = href
                elif href.startswith("/"):
                    full = urljoin(self.BASE_URL, href)
                else:
                    continue  # skip fragments, mailto:, javascript:, etc.
                # Only keep URLs that look like Rappler articles
                if "rappler.com" in full:
                    links.append(full)
            if links:
                break
        # De-duplicate preserving order
        return list(dict.fromkeys(links))

    def _scrape_article(self, url: str) -> Optional[NormalizedSample]:
        """Fetch a single Rappler article page and return a NormalizedSample or None.

        Returns None when the page fails to load, carries no recognisable
        verdict, or yields no usable text after cleaning.
        """
        resp = _get(url)
        if resp is None:
            return None
        soup = BeautifulSoup(resp.text, "lxml")
        # --- Verdict ---
        raw_verdict = self._extract_verdict(soup)
        if raw_verdict is None:
            logger.debug("No recognisable verdict in %s — skipping", url)
            return None
        label = _resolve_verdict(raw_verdict)
        if label is None:
            logger.debug("Unknown verdict %r at %s — skipping", raw_verdict, url)
            return None
        # --- Headline ---
        headline = ""
        h1 = soup.find("h1")
        if h1:
            headline = h1.get_text(separator=" ", strip=True)
        # --- Body / summary text (headline is the fallback) ---
        body_text = self._extract_body_text(soup) or headline
        if not body_text:
            return None
        text = clean_text(body_text)
        if not text:
            return None
        lang = detect_language(text)
        return NormalizedSample(
            text=text,
            label=label,
            source=self.source_name,
            language=lang,
            original_label=raw_verdict,
            confidence=1.0,
        )

    def _extract_verdict(self, soup: BeautifulSoup) -> Optional[str]:
        """Try several heuristics to extract the verdict string from a Rappler article.

        Heuristics are ordered most-reliable-first: dedicated verdict blocks,
        meta tags, JSON-LD ClaimReview, bold text in the body, headline, title.
        """
        # Longest keys first so compound verdicts ("PARTLY FALSE") are not
        # shadowed by their shorter substrings ("FALSE") in keyword scans.
        keys_by_specificity = sorted(_VERDICT_MAP, key=len, reverse=True)
        # 1. Dedicated verdict / rating blocks — Rappler uses coloured label boxes
        verdict_selectors = [
            ".verdict",
            ".rating",
            ".label",
            ".fact-check-label",
            ".fc-label",
            "[class*='verdict']",
            "[class*='rating']",
            "[class*='label-']",
            ".wp-block-group",
            ".rappler-verdict",
        ]
        for sel in verdict_selectors:
            for node in soup.select(sel):
                raw = node.get_text(separator=" ", strip=True)
                if _resolve_verdict(raw) is not None:
                    return raw.strip()
        # 2. Open Graph / Twitter card meta (Rappler often embeds verdict in og:description)
        for meta in soup.find_all("meta"):
            content = meta.get("content", "")
            if not content:
                continue
            upper = content.upper()
            for key in keys_by_specificity:
                if re.search(r"\b" + re.escape(key) + r"\b", upper):
                    return key
        # 3. Structured data / JSON-LD (schema.org ClaimReview).
        # BUG FIX: the old code set the candidate node to {} whenever the
        # payload used an "@graph" list (and ignored top-level lists), so
        # ClaimReview verdicts in those common shapes were never found.
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string or "{}")
            except (json.JSONDecodeError, AttributeError):
                continue
            if isinstance(data, list):
                nodes = data
            elif isinstance(data, dict):
                graph = data.get("@graph")
                nodes = graph if isinstance(graph, list) else [data]
            else:
                continue
            for node in nodes:
                if not isinstance(node, dict) or node.get("@type") != "ClaimReview":
                    continue
                rating = node.get("reviewRating")
                if not isinstance(rating, dict):
                    continue
                alt_name = rating.get("alternateName", "")
                if alt_name and _resolve_verdict(alt_name) is not None:
                    return alt_name
        # 4. Bold/strong within article body
        article_body = (
            soup.find("div", class_=lambda c: c and "article-body" in c)
            or soup.find("div", class_=lambda c: c and "entry-content" in c)
            or soup.find("div", class_=lambda c: c and "content" in c)
        )
        if article_body:
            for tag in article_body.find_all(["strong", "b", "em", "span"]):
                raw = tag.get_text(strip=True)
                if _resolve_verdict(raw) is not None:
                    return raw
        # 5. Headline heuristic (e.g. "FACT CHECK: … is FALSE")
        h1 = soup.find("h1")
        if h1:
            h1_text = h1.get_text(strip=True).upper()
            for key in keys_by_specificity:
                if re.search(r"\b" + re.escape(key) + r"\b", h1_text):
                    return key
        # 6. Page title tag
        title_tag = soup.find("title")
        if title_tag:
            title_text = title_tag.get_text(strip=True).upper()
            for key in keys_by_specificity:
                if re.search(r"\b" + re.escape(key) + r"\b", title_text):
                    return key
        return None

    def _extract_body_text(self, soup: BeautifulSoup) -> str:
        """Extract the best representative text (claim + summary) from the article.

        Falls back through: claim/summary blocks → first substantive body
        paragraph → og:description → meta description → "".
        """
        # Priority 1: claim box or summary paragraph
        claim_selectors = [
            ".claim",
            ".claim-text",
            ".fact-check-claim",
            ".article-summary",
            ".entry-summary",
            "blockquote",
        ]
        for sel in claim_selectors:
            node = soup.select_one(sel)
            if node:
                text = node.get_text(separator=" ", strip=True)
                if len(text) > 20:  # skip trivially short matches
                    return text
        # Priority 2: first substantive paragraph in article body
        body = (
            soup.find("div", class_=lambda c: c and "article-body" in c)
            or soup.find("div", class_=lambda c: c and "entry-content" in c)
            or soup.find("div", class_=lambda c: c and "content" in c)
        )
        if body:
            for p in body.find_all("p"):
                text = p.get_text(separator=" ", strip=True)
                if len(text) > 40:
                    return text
        # Priority 3: OG description
        og_desc = soup.find("meta", property="og:description")
        if og_desc:
            content = og_desc.get("content", "")
            if len(content) > 20:
                return content
        # Priority 4: meta description
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            return meta_desc.get("content", "")
        return ""

    # ------------------------------------------------------------------
    # Cache helpers
    # ------------------------------------------------------------------
    def _save_cache(self, samples: list[NormalizedSample]) -> None:
        """Serialise *samples* to the JSON cache file; log (not raise) on I/O error."""
        payload = {
            "timestamp": datetime.now(tz=timezone.utc).isoformat(),
            "source": self.source_name,
            "samples": [
                {
                    "text": s.text,
                    "label": s.label,
                    "source": s.source,
                    "language": s.language,
                    "original_label": s.original_label,
                    "confidence": s.confidence,
                }
                for s in samples
            ],
        }
        try:
            self.cache_file.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            logger.info("Rappler cache saved: %s (%d samples)", self.cache_file, len(samples))
        except OSError as exc:
            logger.error("Failed to write cache file %s: %s", self.cache_file, exc)

    def _load_cache(self) -> list[NormalizedSample]:
        """Deserialise cached samples; return [] on any read/parse failure."""
        try:
            payload = json.loads(self.cache_file.read_text(encoding="utf-8"))
            samples = [
                NormalizedSample(
                    text=item["text"],
                    label=item["label"],
                    source=item["source"],
                    language=item["language"],
                    original_label=item["original_label"],
                    confidence=item.get("confidence", 1.0),
                )
                for item in payload.get("samples", [])
            ]
            logger.info("Loaded %d samples from Rappler cache", len(samples))
            return samples
        except (OSError, json.JSONDecodeError, KeyError) as exc:
            logger.error("Cache load failed (%s): %s — will re-scrape", self.cache_file, exc)
            return []
# ---------------------------------------------------------------------------
# Quick smoke-test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
    # Keep the run small: two listing pages per section is plenty for a smoke test.
    fetched = RapplerScraper(max_pages=2).fetch()
    print(f"\nTotal samples: {len(fetched)}")
    for item in fetched[:5]:
        print(f" [{item.label}] ({item.original_label}) {item.text[:120]!r}")