""" rappler_scraper.py ------------------ Scrapes fact-check articles from Rappler's Facts First / Fact-Check sections. (https://www.rappler.com/facts-first/ and https://www.rappler.com/newsbreak/fact-check/) Respects robots.txt, caches results for 7 days, and never raises on failure. """ from __future__ import annotations import json import logging import re import time from datetime import datetime, timezone from pathlib import Path from typing import Optional from urllib.parse import urljoin from urllib.robotparser import RobotFileParser import requests from bs4 import BeautifulSoup from .base import DataSource, NormalizedSample, clean_text, detect_language logger = logging.getLogger(__name__) _UA = "PhilVerify-Research/1.0 (academic research; contact: research@philverify.ph)" _HEADERS = { "User-Agent": _UA, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", } # --------------------------------------------------------------------------- # Verdict → label mapping # --------------------------------------------------------------------------- _VERDICT_MAP: dict[str, int] = { # Likely Fake (label 2) "FALSE": 2, "FAKE": 2, "MISLEADING": 2, "DISINFORMATION": 2, "FABRICATED": 2, # Unverified (label 1) "UNVERIFIED": 1, "NEEDS MORE CONTEXT": 1, "MISSING CONTEXT": 1, "NEEDS CONTEXT": 1, "PARTLY TRUE": 1, "PARTLY FALSE": 1, "HALF TRUE": 1, "MIXTURE": 1, "UNPROVEN": 1, # Credible (label 0) "TRUE": 0, "ACCURATE": 0, "CORRECT": 0, "VERIFIED": 0, } _CACHE_TTL_DAYS = 7 _REQUEST_DELAY = 1.5 # seconds between requests def _resolve_verdict(raw: str) -> Optional[int]: """Normalise a raw verdict string to a label int, or None if unrecognised.""" normalised = raw.strip().upper() if normalised in _VERDICT_MAP: return _VERDICT_MAP[normalised] for key, label in _VERDICT_MAP.items(): if key in normalised: return label return None def _robots_allows(base_url: str, path: str) -> bool: """Return True when robots.txt permits PhilVerify to access *path*.""" robots_url = urljoin(base_url, "/robots.txt") rp = RobotFileParser() rp.set_url(robots_url) try: rp.read() except Exception as exc: logger.warning("Could not read robots.txt (%s): %s — proceeding with caution", robots_url, exc) return True target = urljoin(base_url, path) allowed = rp.can_fetch(_UA, target) if not allowed: logger.warning("robots.txt disallows scraping %s", target) return allowed def _get(url: str, timeout: int = 20) -> Optional[requests.Response]: """GET *url* with the project User-Agent; return None on any error.""" try: resp = requests.get(url, headers=_HEADERS, timeout=timeout) resp.raise_for_status() return resp except requests.RequestException as exc: logger.warning("GET %s failed: %s", url, exc) return None def _cache_fresh(cache_path: Path) -> bool: """True if *cache_path* exists and was written within the TTL window.""" if not cache_path.exists(): return False mtime = datetime.fromtimestamp(cache_path.stat().st_mtime, tz=timezone.utc) age_days = (datetime.now(tz=timezone.utc) - mtime).days return age_days < _CACHE_TTL_DAYS class RapplerScraper(DataSource): """Scrape fact-check articles from Rappler and return NormalizedSample list. Tries both: - https://www.rappler.com/facts-first/ - https://www.rappler.com/newsbreak/fact-check/ Parameters ---------- max_pages: Maximum number of listing pages to iterate per section. Defaults to 10. """ BASE_URL = "https://www.rappler.com" # Ordered list of archive sections to attempt; first one that yields articles wins. ARCHIVE_PATHS = [ "/facts-first/", "/newsbreak/fact-check/", ] def __init__(self, max_pages: int = 10) -> None: self.max_pages = max_pages self.cache_file: Path = ( Path(__file__).parent.parent / "data" / "raw" / "rappler_cache.json" ) self.cache_file.parent.mkdir(parents=True, exist_ok=True) # ------------------------------------------------------------------ # DataSource interface # ------------------------------------------------------------------ @property def source_name(self) -> str: return "rappler_factcheck" def fetch(self) -> list[NormalizedSample]: """Fetch and return normalised samples from Rappler. Loads from local cache when available and fresh; otherwise scrapes the live site and persists results to cache. """ # 1. Try cache first if _cache_fresh(self.cache_file): logger.info("Loading Rappler data from cache: %s", self.cache_file) return self._load_cache() # 2. Respect robots.txt (check each section path) allowed_paths = [ path for path in self.ARCHIVE_PATHS if _robots_allows(self.BASE_URL, path) ] if not allowed_paths: logger.error("robots.txt forbids all Rappler fact-check paths — returning []") return [] logger.info("Scraping Rappler (paths: %s, max %d pages each)…", allowed_paths, self.max_pages) article_urls: list[str] = [] # 3. Collect article URLs across all allowed archive sections for archive_path in allowed_paths: section_urls = self._collect_article_urls(archive_path) logger.info("Section %s: found %d article links", archive_path, len(section_urls)) article_urls.extend(section_urls) time.sleep(_REQUEST_DELAY) # De-duplicate seen_set: set[str] = set() unique_urls: list[str] = [] for u in article_urls: if u not in seen_set: seen_set.add(u) unique_urls.append(u) if not unique_urls: logger.warning("No article URLs collected from Rappler — returning []") return [] # 4. Scrape individual articles samples: list[NormalizedSample] = [] for idx, url in enumerate(unique_urls, start=1): logger.debug("[%d/%d] Scraping %s", idx, len(unique_urls), url) sample = self._scrape_article(url) if sample is not None: samples.append(sample) time.sleep(_REQUEST_DELAY) logger.info("Rappler: collected %d labelled samples", len(samples)) # 5. Persist to cache if samples: self._save_cache(samples) return samples # ------------------------------------------------------------------ # Private helpers # ------------------------------------------------------------------ def _collect_article_urls(self, archive_path: str) -> list[str]: """Return all article URLs found across paginated listing pages for *archive_path*.""" urls: list[str] = [] for page_num in range(1, self.max_pages + 1): page_urls = self._get_article_urls_from_page(archive_path, page_num) if not page_urls: logger.info( "No articles on page %d of %s — stopping pagination", page_num, archive_path, ) break logger.info(" %s page %d: %d links", archive_path, page_num, len(page_urls)) urls.extend(page_urls) time.sleep(_REQUEST_DELAY) return urls def _listing_page_candidates(self, archive_path: str, page_num: int) -> list[str]: """Return concrete URLs to try for a given archive path + page number.""" base = f"{self.BASE_URL}{archive_path}" base = base.rstrip("/") candidates = [ f"{base}/", # page 1 root f"{base}/page/{page_num}/", # WordPress-style f"{base}?page={page_num}", # query-param style f"{base}?paged={page_num}", ] if page_num == 1: # For page 1 try root first; duplicates are fine — we break early candidates.insert(0, f"{base}/") return candidates def _get_article_urls_from_page(self, archive_path: str, page_num: int) -> list[str]: """Fetch one listing page and return article URLs found on it.""" for url in self._listing_page_candidates(archive_path, page_num): resp = _get(url) if resp is None: time.sleep(0.5) continue soup = BeautifulSoup(resp.text, "lxml") links = self._parse_article_links(soup) if links: return links # If the page loaded but had no links, try next candidate time.sleep(0.3) return [] def _parse_article_links(self, soup: BeautifulSoup) -> list[str]: """Extract article hrefs from a listing-page soup object.""" links: list[str] = [] selectors = [ "article h2 a", "article h3 a", ".entry-title a", "h2.entry-title a", ".story-card__title a", ".article-title a", ".post-title a", "h2 a[href*='fact-check']", "h3 a[href*='fact-check']", "h2 a[href*='facts-first']", "h3 a[href*='facts-first']", "h2 a", ] for selector in selectors: nodes = soup.select(selector) if not nodes: continue for node in nodes: href = node.get("href", "") if not href: continue if href.startswith("http"): full = href elif href.startswith("/"): full = urljoin(self.BASE_URL, href) else: continue # Only keep URLs that look like Rappler articles if "rappler.com" in full: links.append(full) if links: break # De-duplicate preserving order seen: set[str] = set() unique: list[str] = [] for link in links: if link not in seen: seen.add(link) unique.append(link) return unique def _scrape_article(self, url: str) -> Optional[NormalizedSample]: """Fetch a single Rappler article page and return a NormalizedSample or None.""" resp = _get(url) if resp is None: return None soup = BeautifulSoup(resp.text, "lxml") # --- Verdict --- raw_verdict = self._extract_verdict(soup) if raw_verdict is None: logger.debug("No recognisable verdict in %s — skipping", url) return None label = _resolve_verdict(raw_verdict) if label is None: logger.debug("Unknown verdict %r at %s — skipping", raw_verdict, url) return None # --- Headline --- headline = "" h1 = soup.find("h1") if h1: headline = h1.get_text(separator=" ", strip=True) # --- Body / summary text --- body_text = self._extract_body_text(soup) or headline if not body_text: return None text = clean_text(body_text) if not text: return None lang = detect_language(text) return NormalizedSample( text=text, label=label, source=self.source_name, language=lang, original_label=raw_verdict, confidence=1.0, ) def _extract_verdict(self, soup: BeautifulSoup) -> Optional[str]: """Try several heuristics to extract the verdict string from a Rappler article.""" # 1. Dedicated verdict / rating blocks — Rappler uses coloured label boxes verdict_selectors = [ ".verdict", ".rating", ".label", ".fact-check-label", ".fc-label", "[class*='verdict']", "[class*='rating']", "[class*='label-']", ".wp-block-group", ".rappler-verdict", ] for sel in verdict_selectors: for node in soup.select(sel): raw = node.get_text(separator=" ", strip=True) if _resolve_verdict(raw) is not None: return raw.strip() # 2. Open Graph / Twitter card meta (Rappler often embeds verdict in og:description) for meta in soup.find_all("meta"): content = meta.get("content", "") if not content: continue upper = content.upper() for key in _VERDICT_MAP: # Look for the verdict keyword appearing near the start or as a standalone token pattern = r"\b" + re.escape(key) + r"\b" if re.search(pattern, upper): return key # 3. Structured data / JSON-LD (some CMS setups put verdict in schema.org ClaimReview) for script in soup.find_all("script", type="application/ld+json"): try: data = json.loads(script.string or "{}") # ClaimReview schema if isinstance(data, dict): items = data if not isinstance(data.get("@graph"), list) else {} review = items if items.get("@type") == "ClaimReview" else {} rating = review.get("reviewRating", {}) alt_name = rating.get("alternateName", "") if alt_name and _resolve_verdict(alt_name) is not None: return alt_name except (json.JSONDecodeError, AttributeError): pass # 4. Bold/strong within article body article_body = ( soup.find("div", class_=lambda c: c and "article-body" in c) or soup.find("div", class_=lambda c: c and "entry-content" in c) or soup.find("div", class_=lambda c: c and "content" in c) ) if article_body: for tag in article_body.find_all(["strong", "b", "em", "span"]): raw = tag.get_text(strip=True) if _resolve_verdict(raw) is not None: return raw # 5. Headline heuristic (e.g. "FACT CHECK: … is FALSE") h1 = soup.find("h1") if h1: h1_text = h1.get_text(strip=True).upper() for key in _VERDICT_MAP: if re.search(r"\b" + re.escape(key) + r"\b", h1_text): return key # 6. Page title tag title_tag = soup.find("title") if title_tag: title_text = title_tag.get_text(strip=True).upper() for key in _VERDICT_MAP: if re.search(r"\b" + re.escape(key) + r"\b", title_text): return key return None def _extract_body_text(self, soup: BeautifulSoup) -> str: """Extract the best representative text (claim + summary) from the article.""" # Priority 1: claim box or summary paragraph claim_selectors = [ ".claim", ".claim-text", ".fact-check-claim", ".article-summary", ".entry-summary", "blockquote", ] for sel in claim_selectors: node = soup.select_one(sel) if node: text = node.get_text(separator=" ", strip=True) if len(text) > 20: return text # Priority 2: first substantive paragraph in article body body = ( soup.find("div", class_=lambda c: c and "article-body" in c) or soup.find("div", class_=lambda c: c and "entry-content" in c) or soup.find("div", class_=lambda c: c and "content" in c) ) if body: for p in body.find_all("p"): text = p.get_text(separator=" ", strip=True) if len(text) > 40: return text # Priority 3: OG description og_desc = soup.find("meta", property="og:description") if og_desc: content = og_desc.get("content", "") if len(content) > 20: return content # Priority 4: meta description meta_desc = soup.find("meta", attrs={"name": "description"}) if meta_desc: return meta_desc.get("content", "") return "" # ------------------------------------------------------------------ # Cache helpers # ------------------------------------------------------------------ def _save_cache(self, samples: list[NormalizedSample]) -> None: payload = { "timestamp": datetime.now(tz=timezone.utc).isoformat(), "source": self.source_name, "samples": [ { "text": s.text, "label": s.label, "source": s.source, "language": s.language, "original_label": s.original_label, "confidence": s.confidence, } for s in samples ], } try: self.cache_file.write_text( json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8", ) logger.info("Rappler cache saved: %s (%d samples)", self.cache_file, len(samples)) except OSError as exc: logger.error("Failed to write cache file %s: %s", self.cache_file, exc) def _load_cache(self) -> list[NormalizedSample]: try: payload = json.loads(self.cache_file.read_text(encoding="utf-8")) samples = [ NormalizedSample( text=item["text"], label=item["label"], source=item["source"], language=item["language"], original_label=item["original_label"], confidence=item.get("confidence", 1.0), ) for item in payload.get("samples", []) ] logger.info("Loaded %d samples from Rappler cache", len(samples)) return samples except (OSError, json.JSONDecodeError, KeyError) as exc: logger.error("Cache load failed (%s): %s — will re-scrape", self.cache_file, exc) return [] # --------------------------------------------------------------------------- # Quick smoke-test # --------------------------------------------------------------------------- if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s") scraper = RapplerScraper(max_pages=2) results = scraper.fetch() print(f"\nTotal samples: {len(results)}") for sample in results[:5]: print(f" [{sample.label}] ({sample.original_label}) {sample.text[:120]!r}")