| """ | |
| ml/data_sources/base.py | |
| Abstract base class and shared utilities for PhilVerify data source adapters. | |
| Provides: | |
| - NormalizedSample : canonical dataclass for all ingested samples | |
| - DataSource : ABC that every source adapter must implement | |
| - clean_text : HTML-strip + Unicode normalization + whitespace collapse | |
| - detect_language : langdetect wrapper returning "tl" / "en" / "mixed" | |
| - domain_to_credibility_score : looks up domain tier from domain_credibility.json | |
| - binary_to_three_class : maps raw dataset labels to {0, 1, 2} | |
| Label schema | |
| ------------ | |
| 0 β Credible | |
| 1 β Unverified | |
| 2 β Likely Fake | |
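
Example
-------
Assuming the package is importable as ``ml.data_sources``:

>>> from ml.data_sources.base import clean_text
>>> clean_text("<b>Hello   world</b>")
'Hello world'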
| """ | |
from __future__ import annotations

import json
import logging
import re
import unicodedata
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import ClassVar

logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Default path: ml/data_sources/ → ml/ → PhilVerify/ → domain_credibility.json
# ---------------------------------------------------------------------------
_DEFAULT_CREDIBILITY_JSON: Path = (
    Path(__file__).parent.parent.parent / "domain_credibility.json"
)

# Module-level cache so the JSON file is only read from disk once per process.
_credibility_cache: dict[str, dict] = {}
# ---------------------------------------------------------------------------
# Dataclass
# ---------------------------------------------------------------------------
@dataclass
class NormalizedSample:
| """A single article or headline normalized to PhilVerify's label schema. | |
| Attributes | |
| ---------- | |
| text: | |
| Cleaned article text or headline. | |
| label: | |
| Integer label in {0, 1, 2} (Credible / Unverified / Likely Fake). | |
| source: | |
| Dataset identifier, e.g. ``"jcblaise/fake_news_filipino"``. | |
| language: | |
| BCP-47-style language code: ``"tl"``, ``"en"``, or ``"mixed"``. | |
| original_label: | |
| The raw label string from the upstream dataset, e.g. ``"fake"``, | |
| ``"real"``, ``"pants-fire"``. Preserved for debugging / auditing. | |
| confidence: | |
| A float in [0.0, 1.0] representing how confident the label mapping is. | |
| Defaults to ``1.0`` for unambiguous remappings; use lower values for | |
| heuristic or model-assisted mappings. | |
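
    Example
    -------
    Illustrative values (the text is a placeholder):

    >>> s = NormalizedSample(
    ...     text="Kumpirmado ng ahensiya ang balita.",
    ...     label=0,
    ...     source="jcblaise/fake_news_filipino",
    ...     language="tl",
    ...     original_label="real",
    ... )
    >>> s.confidence
    1.0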
| """ | |
    text: str
    label: int
    source: str
    language: str
    original_label: str
    confidence: float = field(default=1.0)

    def __post_init__(self) -> None:
        if self.label not in {0, 1, 2}:
            raise ValueError(f"label must be 0, 1, or 2; got {self.label!r}")
        if not (0.0 <= self.confidence <= 1.0):
            raise ValueError(
                f"confidence must be in [0.0, 1.0]; got {self.confidence!r}"
            )
# ---------------------------------------------------------------------------
# Abstract base class
# ---------------------------------------------------------------------------
class DataSource(ABC):
| """Abstract base class for PhilVerify data source adapters. | |
| Subclasses must implement :meth:`fetch` and the :attr:`source_name` | |
| property. Callers should use :meth:`load`, which wraps :meth:`fetch` | |
| with logging and error handling. | |
| Class Attributes | |
| ---------------- | |
| LABEL_NAMES: | |
| Human-readable names for each integer label. | |
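
    Example
    -------
    A minimal adapter sketch (``MyLocalSource`` is hypothetical, not part of
    this module)::

        class MyLocalSource(DataSource):
            @property
            def source_name(self) -> str:
                return "local/my_dataset"

            def fetch(self) -> list[NormalizedSample]:
                raw = [("Sample headline about local news.", "real")]
                return [
                    NormalizedSample(
                        text=clean_text(text),
                        label=binary_to_three_class(lbl, None),
                        source=self.source_name,
                        language=detect_language(text),
                        original_label=lbl,
                    )
                    for text, lbl in raw
                ]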
| """ | |
    LABEL_NAMES: ClassVar[dict[int, str]] = {
        0: "Credible",
        1: "Unverified",
        2: "Likely Fake",
    }

    # -- Abstract interface --------------------------------------------------
    @property
    @abstractmethod
    def source_name(self) -> str:
        """A stable, unique identifier for this data source.

        Recommended format: ``"<owner>/<dataset>"`` for HuggingFace datasets,
        or a descriptive slug for scraped / local sources.

        Example: ``"jcblaise/fake_news_filipino"``
        """
    @abstractmethod
    def fetch(self) -> list[NormalizedSample]:
        """Download or load raw data and return normalized samples.

        This method may perform network I/O and should not swallow exceptions;
        error handling is the responsibility of :meth:`load`.

        Returns
        -------
        list[NormalizedSample]
            Every sample extracted from this source after normalization.
        """
    # -- Concrete helpers ----------------------------------------------------
    def load(self) -> list[NormalizedSample]:
        """Call :meth:`fetch`, log progress, and handle errors gracefully.

        Returns an empty list (rather than raising) if fetching fails, so that
        a single broken source does not abort a multi-source pipeline.

        Returns
        -------
        list[NormalizedSample]
            Normalized samples, or ``[]`` on failure.
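
        Example
        -------
        Typical multi-source usage (``MyLocalSource`` is the hypothetical
        adapter sketched in the class docstring)::

            all_samples: list[NormalizedSample] = []
            for src in (MyLocalSource(),):
                all_samples.extend(src.load())  # a broken source adds []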
| """ | |
        logger.info("Loading data source: %s", self.source_name)
        try:
            samples = self.fetch()
            logger.info(
                "Loaded %d samples from %s", len(samples), self.source_name
            )
            return samples
        except Exception:  # noqa: BLE001
            logger.warning(
                "Failed to load data source '%s'. Returning empty list.",
                self.source_name,
                exc_info=True,
            )
            return []
# ---------------------------------------------------------------------------
# NLP utility functions
# ---------------------------------------------------------------------------
_HTML_TAG_RE = re.compile(r"<[^>]+>", re.UNICODE)
_WHITESPACE_RE = re.compile(r"\s+", re.UNICODE)
_MIN_TEXT_LENGTH = 10
def clean_text(text: str) -> str:
    """Clean article text for downstream tokenization.

    Steps applied in order:

    1. Strip HTML / XML tags with a regex (no third-party HTML parser needed).
    2. Normalize Unicode to NFC (handles combining characters, full-width
       glyphs, etc.).
    3. Collapse consecutive whitespace characters (spaces, tabs, newlines) to
       a single ASCII space.
    4. Strip leading and trailing whitespace.
    5. Return an empty string if the result is shorter than
       ``_MIN_TEXT_LENGTH`` (10) characters, to avoid feeding near-empty
       strings to the model.

    Parameters
    ----------
    text:
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str
        Cleaned text, or ``""`` if the cleaned result is too short.
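
    Example
    -------
    >>> clean_text("<p>Breaking   news: Senate   passes bill</p>")
    'Breaking news: Senate passes bill'
    >>> clean_text("<br>")
    ''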
| """ | |
    if not text:
        return ""

    # 1. Remove HTML tags
    cleaned = _HTML_TAG_RE.sub(" ", text)
    # 2. Unicode NFC normalization
    cleaned = unicodedata.normalize("NFC", cleaned)
    # 3. Collapse whitespace
    cleaned = _WHITESPACE_RE.sub(" ", cleaned)
    # 4. Strip edges
    cleaned = cleaned.strip()
    # 5. Minimum length guard
    if len(cleaned) < _MIN_TEXT_LENGTH:
        return ""
    return cleaned
def detect_language(text: str) -> str:
    """Detect the primary language of *text*.

    Uses ``langdetect`` (which must be installed in the environment).

    Returns
    -------
    str
        ``"tl"`` for Filipino/Tagalog, ``"en"`` for English,
        ``"mixed"`` for any other detected language or on detection failure.
| """ | |
    try:
        from langdetect import detect  # type: ignore[import-untyped]
        from langdetect.lang_detect_exception import (  # type: ignore[import-untyped]
            LangDetectException,
        )
    except ImportError:
        logger.warning(
            "langdetect is not installed; defaulting language to 'mixed'."
        )
        return "mixed"

    try:
        lang = detect(text)
    except LangDetectException:
        return "mixed"
    if lang == "tl":
        return "tl"
    if lang == "en":
        return "en"
    return "mixed"
def domain_to_credibility_score(
    domain: str,
    credibility_json_path: Path = _DEFAULT_CREDIBILITY_JSON,
) -> int:
    """Look up a domain's credibility tier score.

    Reads ``domain_credibility.json`` (cached after the first call) and maps
    the domain to a numeric score:

    +---------+-------+----------------------------+
    | Tier    | Score | Meaning                    |
    +=========+=======+============================+
    | tier1   | 100   | High-credibility outlet    |
    +---------+-------+----------------------------+
    | tier2   | 50    | Mainstream / mid-tier      |
    +---------+-------+----------------------------+
    | tier3   | 25    | Low-credibility            |
    +---------+-------+----------------------------+
    | tier4   | 0     | Known misinformation site  |
    +---------+-------+----------------------------+
    | unknown | 50    | Domain not found (default) |
    +---------+-------+----------------------------+

    Parameters
    ----------
    domain:
        Bare domain name, e.g. ``"rappler.com"``.
    credibility_json_path:
        Path to ``domain_credibility.json``. Defaults to the file at the
        PhilVerify project root.

    Returns
    -------
    int
        Credibility score for the domain.
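
    Example
    -------
    The JSON file is expected to map tier names to lists of domains (shape
    inferred from the lookup below; these entries are illustrative)::

        {"tier1": ["rappler.com"], "tier4": ["fakenews.example"]}

    Assuming ``rappler.com`` appears under ``tier1``::

        domain_to_credibility_score("rappler.com")      # 100
        domain_to_credibility_score("no-such-site.ph")  # 50 (unknown default)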
| """ | |
    cache_key = str(credibility_json_path)
    if cache_key not in _credibility_cache:
        try:
            with credibility_json_path.open(encoding="utf-8") as fh:
                _credibility_cache[cache_key] = json.load(fh)
        except (FileNotFoundError, json.JSONDecodeError):
            logger.warning(
                "Could not load domain_credibility.json from %s; "
                "all domains will receive a default score of 50.",
                credibility_json_path,
            )
            _credibility_cache[cache_key] = {}

    data: dict = _credibility_cache[cache_key]
    tier_scores: dict[str, int] = {
        "tier1": 100,
        "tier2": 50,
        "tier3": 25,
        "tier4": 0,
    }
    for tier, score in tier_scores.items():
        tier_domains: list[str] = data.get(tier, [])
        if domain in tier_domains:
            return score
    # Domain not found → treat as tier2 / unknown
    return 50
_FAKE_LABELS: frozenset[str] = frozenset(
    {"fake", "0", "FALSE", "false", "pants-fire"}
)
_TRUE_LABELS: frozenset[str] = frozenset({"real", "1", "TRUE", "true"})


def binary_to_three_class(
    raw_label: str,
    domain: str | None,
    credibility_json_path: Path = _DEFAULT_CREDIBILITY_JSON,
) -> int:
    """Map a raw dataset label string to PhilVerify's three-class schema.

    Label mapping rules
    -------------------
    * ``"fake"`` / ``"0"`` / ``"FALSE"`` / ``"false"`` / ``"pants-fire"``
      → **2** (Likely Fake)
    * ``"real"`` / ``"1"`` / ``"TRUE"`` / ``"true"``
      → credibility-aware decision:

      - domain score >= 40 → **0** (Credible; tier1 or mainstream source)
      - domain score < 40  → **1** (Unverified; low-credibility domain)
    * ``"mostly-true"``
      → **0** (Credible)
    * ``"half-true"`` / ``"barely-true"``
      → **1** (Unverified)
    * *anything else*
      → **1** (Unverified, safe default)

    Parameters
    ----------
    raw_label:
        The label string exactly as it appears in the upstream dataset.
    domain:
        The publisher domain used for credibility lookup when the raw label
        indicates truth. Pass ``None`` to skip domain lookup (score defaults
        to 50).
    credibility_json_path:
        Path to ``domain_credibility.json``.

    Returns
    -------
    int
        An integer in ``{0, 1, 2}``.
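
    Example
    -------
    Illustrative calls (``rappler.com`` is assumed to be a tier1 domain in
    ``domain_credibility.json``)::

        binary_to_three_class("fake", None)           # 2 (Likely Fake)
        binary_to_three_class("real", "rappler.com")  # 0 (Credible)
        binary_to_three_class("half-true", None)      # 1 (Unverified)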
| """ | |
    if raw_label in _FAKE_LABELS:
        return 2
    if raw_label in _TRUE_LABELS:
        if domain:
            score = domain_to_credibility_score(domain, credibility_json_path)
        else:
            score = 50  # neutral default when no domain is available
        if score >= 40:
            return 0  # Credible (tier1 or mainstream source)
        return 1  # Unverified (low-credibility domain)
    if raw_label == "mostly-true":
        return 0
    if raw_label in {"half-true", "barely-true"}:
        return 1
    # Default: treat as Unverified
    return 1