# philverify-api/ml/data_sources/rappler_scraper.py
# Author: Ryan Christian D. Deniega
# (from commit b1c84b5 — "fix: cold start 502, favicon, verify state persistence")
"""
rappler_scraper.py
------------------
Scrapes fact-check articles from Rappler's Facts First / Fact-Check sections.
(https://www.rappler.com/facts-first/ and https://www.rappler.com/newsbreak/fact-check/)
Respects robots.txt, caches results for 7 days, and never raises on failure.
"""
from __future__ import annotations
import json
import logging
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
import requests
from bs4 import BeautifulSoup
from .base import DataSource, NormalizedSample, clean_text, detect_language
# Module-level logger, namespaced to this module per project convention.
logger = logging.getLogger(__name__)

# Identifying User-Agent sent with every request; also the agent string
# evaluated against robots.txt in _robots_allows().
_UA = "PhilVerify-Research/1.0 (academic research; contact: research@philverify.ph)"
_HEADERS = {
    "User-Agent": _UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
# ---------------------------------------------------------------------------
# Verdict → label mapping
# ---------------------------------------------------------------------------
_VERDICT_MAP: dict[str, int] = {
# Likely Fake (label 2)
"FALSE": 2,
"FAKE": 2,
"MISLEADING": 2,
"DISINFORMATION": 2,
"FABRICATED": 2,
# Unverified (label 1)
"UNVERIFIED": 1,
"NEEDS MORE CONTEXT": 1,
"MISSING CONTEXT": 1,
"NEEDS CONTEXT": 1,
"PARTLY TRUE": 1,
"PARTLY FALSE": 1,
"HALF TRUE": 1,
"MIXTURE": 1,
"UNPROVEN": 1,
# Credible (label 0)
"TRUE": 0,
"ACCURATE": 0,
"CORRECT": 0,
"VERIFIED": 0,
}
_CACHE_TTL_DAYS = 7
_REQUEST_DELAY = 1.5 # seconds between requests
def _resolve_verdict(raw: str) -> Optional[int]:
"""Normalise a raw verdict string to a label int, or None if unrecognised."""
normalised = raw.strip().upper()
if normalised in _VERDICT_MAP:
return _VERDICT_MAP[normalised]
for key, label in _VERDICT_MAP.items():
if key in normalised:
return label
return None
def _robots_allows(base_url: str, path: str) -> bool:
    """Return True when robots.txt permits PhilVerify to access *path*.

    If robots.txt cannot be fetched/parsed at all, we assume access is
    permitted (best-effort, logged) rather than aborting the scrape.
    """
    robots_url = urljoin(base_url, "/robots.txt")
    parser = RobotFileParser()
    parser.set_url(robots_url)
    try:
        parser.read()
    except Exception as exc:
        logger.warning("Could not read robots.txt (%s): %s — proceeding with caution", robots_url, exc)
        return True
    target = urljoin(base_url, path)
    if parser.can_fetch(_UA, target):
        return True
    logger.warning("robots.txt disallows scraping %s", target)
    return False
def _get(url: str, timeout: int = 20) -> Optional[requests.Response]:
    """GET *url* with the project User-Agent; return None on any error."""
    try:
        response = requests.get(url, headers=_HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Covers connection errors, timeouts, and non-2xx statuses alike.
        logger.warning("GET %s failed: %s", url, exc)
        return None
    return response
def _cache_fresh(cache_path: Path) -> bool:
    """True if *cache_path* exists and was written within the TTL window."""
    if not cache_path.exists():
        return False
    # Compare the file's (UTC, tz-aware) mtime against now; whole days only.
    written_at = datetime.fromtimestamp(cache_path.stat().st_mtime, tz=timezone.utc)
    age = datetime.now(tz=timezone.utc) - written_at
    return age.days < _CACHE_TTL_DAYS
class RapplerScraper(DataSource):
    """Scrape fact-check articles from Rappler and return NormalizedSample list.

    Tries both:
      - https://www.rappler.com/facts-first/
      - https://www.rappler.com/newsbreak/fact-check/

    Respects robots.txt, throttles requests, caches results for the TTL
    window, and never raises on failure — every error path degrades to an
    empty result list.

    Parameters
    ----------
    max_pages:
        Maximum number of listing pages to iterate per section. Defaults to 10.
    """

    BASE_URL = "https://www.rappler.com"

    # Ordered list of archive sections to attempt; first one that yields articles wins.
    ARCHIVE_PATHS = [
        "/facts-first/",
        "/newsbreak/fact-check/",
    ]

    def __init__(self, max_pages: int = 10) -> None:
        self.max_pages = max_pages
        # Cache lives at <package>/data/raw/rappler_cache.json, two levels
        # above this module; the directory is created eagerly.
        self.cache_file: Path = (
            Path(__file__).parent.parent / "data" / "raw" / "rappler_cache.json"
        )
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # DataSource interface
    # ------------------------------------------------------------------
    @property
    def source_name(self) -> str:
        return "rappler_factcheck"

    def fetch(self) -> list[NormalizedSample]:
        """Fetch and return normalised samples from Rappler.

        Loads from local cache when available and fresh; otherwise scrapes
        the live site and persists results to cache.
        """
        # 1. Try cache first.
        # BUG FIX: previously a fresh-but-corrupt/empty cache returned []
        # outright, even though _load_cache logs "will re-scrape". Now an
        # empty cache load falls through to a live scrape.
        if _cache_fresh(self.cache_file):
            logger.info("Loading Rappler data from cache: %s", self.cache_file)
            cached = self._load_cache()
            if cached:
                return cached
        # 2. Respect robots.txt (check each section path)
        allowed_paths = [
            path for path in self.ARCHIVE_PATHS
            if _robots_allows(self.BASE_URL, path)
        ]
        if not allowed_paths:
            logger.error("robots.txt forbids all Rappler fact-check paths — returning []")
            return []
        logger.info("Scraping Rappler (paths: %s, max %d pages each)…", allowed_paths, self.max_pages)
        # 3. Collect article URLs across all allowed archive sections
        article_urls: list[str] = []
        for archive_path in allowed_paths:
            section_urls = self._collect_article_urls(archive_path)
            logger.info("Section %s: found %d article links", archive_path, len(section_urls))
            article_urls.extend(section_urls)
            time.sleep(_REQUEST_DELAY)
        # De-duplicate while preserving discovery order.
        unique_urls = list(dict.fromkeys(article_urls))
        if not unique_urls:
            logger.warning("No article URLs collected from Rappler — returning []")
            return []
        # 4. Scrape individual articles (politely throttled)
        samples: list[NormalizedSample] = []
        for idx, url in enumerate(unique_urls, start=1):
            logger.debug("[%d/%d] Scraping %s", idx, len(unique_urls), url)
            sample = self._scrape_article(url)
            if sample is not None:
                samples.append(sample)
            time.sleep(_REQUEST_DELAY)
        logger.info("Rappler: collected %d labelled samples", len(samples))
        # 5. Persist to cache
        if samples:
            self._save_cache(samples)
        return samples

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _collect_article_urls(self, archive_path: str) -> list[str]:
        """Return all article URLs found across paginated listing pages for *archive_path*."""
        urls: list[str] = []
        for page_num in range(1, self.max_pages + 1):
            page_urls = self._get_article_urls_from_page(archive_path, page_num)
            if not page_urls:
                # An empty page means we've walked past the last listing page.
                logger.info(
                    "No articles on page %d of %s — stopping pagination",
                    page_num,
                    archive_path,
                )
                break
            logger.info(" %s page %d: %d links", archive_path, page_num, len(page_urls))
            urls.extend(page_urls)
            time.sleep(_REQUEST_DELAY)
        return urls

    def _listing_page_candidates(self, archive_path: str, page_num: int) -> list[str]:
        """Return concrete URLs to try for a given archive path + page number.

        Page 1 is served at the bare section root; later pages try the
        WordPress-style ``/page/N/`` path first, then query-parameter
        pagination variants.
        """
        base = f"{self.BASE_URL}{archive_path}".rstrip("/")
        if page_num == 1:
            return [
                f"{base}/",  # section root *is* page 1
                f"{base}/page/1/",  # WordPress-style
                f"{base}?page=1",  # query-param style
                f"{base}?paged=1",
            ]
        # BUG FIX: the section root must NOT be a candidate for page > 1.
        # It was previously listed first for every page, so each "next page"
        # fetched the root again, re-yielded page 1's links, and pagination
        # never advanced (it just accumulated duplicates until max_pages).
        return [
            f"{base}/page/{page_num}/",
            f"{base}?page={page_num}",
            f"{base}?paged={page_num}",
        ]

    def _get_article_urls_from_page(self, archive_path: str, page_num: int) -> list[str]:
        """Fetch one listing page and return article URLs found on it.

        Tries each candidate URL in order; the first candidate that both
        loads and yields links wins. Returns [] when all candidates fail.
        """
        for url in self._listing_page_candidates(archive_path, page_num):
            resp = _get(url)
            if resp is None:
                time.sleep(0.5)
                continue
            soup = BeautifulSoup(resp.text, "lxml")
            links = self._parse_article_links(soup)
            if links:
                return links
            # If the page loaded but had no links, try next candidate
            time.sleep(0.3)
        return []

    def _parse_article_links(self, soup: BeautifulSoup) -> list[str]:
        """Extract article hrefs from a listing-page soup object.

        Selectors are ordered most-specific-first; the first selector that
        matches any nodes supplies all the links for this page.
        """
        links: list[str] = []
        selectors = [
            "article h2 a",
            "article h3 a",
            ".entry-title a",
            "h2.entry-title a",
            ".story-card__title a",
            ".article-title a",
            ".post-title a",
            "h2 a[href*='fact-check']",
            "h3 a[href*='fact-check']",
            "h2 a[href*='facts-first']",
            "h3 a[href*='facts-first']",
            "h2 a",
        ]
        for selector in selectors:
            nodes = soup.select(selector)
            if not nodes:
                continue
            for node in nodes:
                href = node.get("href", "")
                if not href:
                    continue
                if href.startswith("http"):
                    full = href
                elif href.startswith("/"):
                    full = urljoin(self.BASE_URL, href)
                else:
                    continue  # skip fragments, mailto:, javascript:, etc.
                # Only keep URLs that look like Rappler articles
                if "rappler.com" in full:
                    links.append(full)
            if links:
                break
        # De-duplicate preserving order
        return list(dict.fromkeys(links))

    def _scrape_article(self, url: str) -> Optional[NormalizedSample]:
        """Fetch a single Rappler article page and return a NormalizedSample or None.

        Returns None when the page fails to load, carries no recognisable
        verdict, or yields no usable text after cleaning.
        """
        resp = _get(url)
        if resp is None:
            return None
        soup = BeautifulSoup(resp.text, "lxml")
        # --- Verdict ---
        raw_verdict = self._extract_verdict(soup)
        if raw_verdict is None:
            logger.debug("No recognisable verdict in %s — skipping", url)
            return None
        label = _resolve_verdict(raw_verdict)
        if label is None:
            logger.debug("Unknown verdict %r at %s — skipping", raw_verdict, url)
            return None
        # --- Headline ---
        headline = ""
        h1 = soup.find("h1")
        if h1:
            headline = h1.get_text(separator=" ", strip=True)
        # --- Body / summary text (headline is the fallback) ---
        body_text = self._extract_body_text(soup) or headline
        if not body_text:
            return None
        text = clean_text(body_text)
        if not text:
            return None
        lang = detect_language(text)
        return NormalizedSample(
            text=text,
            label=label,
            source=self.source_name,
            language=lang,
            original_label=raw_verdict,
            confidence=1.0,
        )

    def _extract_verdict(self, soup: BeautifulSoup) -> Optional[str]:
        """Try several heuristics to extract the verdict string from a Rappler article.

        Heuristics are ordered most-reliable-first: dedicated verdict blocks,
        meta tags, JSON-LD ClaimReview, bold text in the body, headline, title.
        """
        # Longest keys first so compound verdicts ("PARTLY FALSE") are not
        # shadowed by their shorter substrings ("FALSE") in keyword scans.
        keys_by_specificity = sorted(_VERDICT_MAP, key=len, reverse=True)
        # 1. Dedicated verdict / rating blocks — Rappler uses coloured label boxes
        verdict_selectors = [
            ".verdict",
            ".rating",
            ".label",
            ".fact-check-label",
            ".fc-label",
            "[class*='verdict']",
            "[class*='rating']",
            "[class*='label-']",
            ".wp-block-group",
            ".rappler-verdict",
        ]
        for sel in verdict_selectors:
            for node in soup.select(sel):
                raw = node.get_text(separator=" ", strip=True)
                if _resolve_verdict(raw) is not None:
                    return raw.strip()
        # 2. Open Graph / Twitter card meta (Rappler often embeds verdict in og:description)
        for meta in soup.find_all("meta"):
            content = meta.get("content", "")
            if not content:
                continue
            upper = content.upper()
            for key in keys_by_specificity:
                if re.search(r"\b" + re.escape(key) + r"\b", upper):
                    return key
        # 3. Structured data / JSON-LD (schema.org ClaimReview).
        # BUG FIX: the old code set the candidate node to {} whenever the
        # payload used an "@graph" list (and ignored top-level lists), so
        # ClaimReview verdicts in those common shapes were never found.
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string or "{}")
            except (json.JSONDecodeError, AttributeError):
                continue
            if isinstance(data, list):
                nodes = data
            elif isinstance(data, dict):
                graph = data.get("@graph")
                nodes = graph if isinstance(graph, list) else [data]
            else:
                continue
            for node in nodes:
                if not isinstance(node, dict) or node.get("@type") != "ClaimReview":
                    continue
                rating = node.get("reviewRating")
                if not isinstance(rating, dict):
                    continue
                alt_name = rating.get("alternateName", "")
                if alt_name and _resolve_verdict(alt_name) is not None:
                    return alt_name
        # 4. Bold/strong within article body
        article_body = (
            soup.find("div", class_=lambda c: c and "article-body" in c)
            or soup.find("div", class_=lambda c: c and "entry-content" in c)
            or soup.find("div", class_=lambda c: c and "content" in c)
        )
        if article_body:
            for tag in article_body.find_all(["strong", "b", "em", "span"]):
                raw = tag.get_text(strip=True)
                if _resolve_verdict(raw) is not None:
                    return raw
        # 5. Headline heuristic (e.g. "FACT CHECK: … is FALSE")
        h1 = soup.find("h1")
        if h1:
            h1_text = h1.get_text(strip=True).upper()
            for key in keys_by_specificity:
                if re.search(r"\b" + re.escape(key) + r"\b", h1_text):
                    return key
        # 6. Page title tag
        title_tag = soup.find("title")
        if title_tag:
            title_text = title_tag.get_text(strip=True).upper()
            for key in keys_by_specificity:
                if re.search(r"\b" + re.escape(key) + r"\b", title_text):
                    return key
        return None

    def _extract_body_text(self, soup: BeautifulSoup) -> str:
        """Extract the best representative text (claim + summary) from the article.

        Falls back through: claim/summary blocks → first substantive body
        paragraph → og:description → meta description → "".
        """
        # Priority 1: claim box or summary paragraph
        claim_selectors = [
            ".claim",
            ".claim-text",
            ".fact-check-claim",
            ".article-summary",
            ".entry-summary",
            "blockquote",
        ]
        for sel in claim_selectors:
            node = soup.select_one(sel)
            if node:
                text = node.get_text(separator=" ", strip=True)
                if len(text) > 20:  # skip trivially short matches
                    return text
        # Priority 2: first substantive paragraph in article body
        body = (
            soup.find("div", class_=lambda c: c and "article-body" in c)
            or soup.find("div", class_=lambda c: c and "entry-content" in c)
            or soup.find("div", class_=lambda c: c and "content" in c)
        )
        if body:
            for p in body.find_all("p"):
                text = p.get_text(separator=" ", strip=True)
                if len(text) > 40:
                    return text
        # Priority 3: OG description
        og_desc = soup.find("meta", property="og:description")
        if og_desc:
            content = og_desc.get("content", "")
            if len(content) > 20:
                return content
        # Priority 4: meta description
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            return meta_desc.get("content", "")
        return ""

    # ------------------------------------------------------------------
    # Cache helpers
    # ------------------------------------------------------------------
    def _save_cache(self, samples: list[NormalizedSample]) -> None:
        """Serialise *samples* to the JSON cache file; log (not raise) on I/O error."""
        payload = {
            "timestamp": datetime.now(tz=timezone.utc).isoformat(),
            "source": self.source_name,
            "samples": [
                {
                    "text": s.text,
                    "label": s.label,
                    "source": s.source,
                    "language": s.language,
                    "original_label": s.original_label,
                    "confidence": s.confidence,
                }
                for s in samples
            ],
        }
        try:
            self.cache_file.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            logger.info("Rappler cache saved: %s (%d samples)", self.cache_file, len(samples))
        except OSError as exc:
            logger.error("Failed to write cache file %s: %s", self.cache_file, exc)

    def _load_cache(self) -> list[NormalizedSample]:
        """Deserialise cached samples; return [] on any read/parse failure."""
        try:
            payload = json.loads(self.cache_file.read_text(encoding="utf-8"))
            samples = [
                NormalizedSample(
                    text=item["text"],
                    label=item["label"],
                    source=item["source"],
                    language=item["language"],
                    original_label=item["original_label"],
                    confidence=item.get("confidence", 1.0),
                )
                for item in payload.get("samples", [])
            ]
            logger.info("Loaded %d samples from Rappler cache", len(samples))
            return samples
        except (OSError, json.JSONDecodeError, KeyError) as exc:
            logger.error("Cache load failed (%s): %s — will re-scrape", self.cache_file, exc)
            return []
# ---------------------------------------------------------------------------
# Quick smoke-test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
    # Keep the run small: two listing pages per section is plenty for a smoke test.
    fetched = RapplerScraper(max_pages=2).fetch()
    print(f"\nTotal samples: {len(fetched)}")
    for item in fetched[:5]:
        print(f" [{item.label}] ({item.original_label}) {item.text[:120]!r}")