Spaces:

SemiAutomat1c
/

philverify-api

Running

philverify-api / ml /data_sources /base.py

Ryan Christian D. Deniega

fix: cold start 502, favicon, verify state persistence

b1c84b5 19 days ago

12.2 kB

	"""
	ml/data_sources/base.py
	Abstract base class and shared utilities for PhilVerify data source adapters.

	Provides:
	- NormalizedSample : canonical dataclass for all ingested samples
	- DataSource : ABC that every source adapter must implement
	- clean_text : HTML-strip + Unicode normalization + whitespace collapse
	- detect_language : langdetect wrapper returning "tl" / "en" / "mixed"
	- domain_to_credibility_score : looks up domain tier from domain_credibility.json
	- binary_to_three_class : maps raw dataset labels to {0, 1, 2}

	Label schema
	------------
	0 → Credible
	1 → Unverified
	2 → Likely Fake
	"""

	from __future__ import annotations

	import json
	import logging
	import re
	import unicodedata
	from abc import ABC, abstractmethod
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import ClassVar

	logger = logging.getLogger(__name__)

	# ---------------------------------------------------------------------------
	# Default path: ml/data_sources/ → ml/ → PhilVerify/ → domain_credibility.json
	# ---------------------------------------------------------------------------
	_DEFAULT_CREDIBILITY_JSON: Path = (
	Path(__file__).parent.parent.parent / "domain_credibility.json"
	)

	# Module-level cache so the JSON file is only read from disk once per process.
	_credibility_cache: dict[str, dict] = {}


	# ---------------------------------------------------------------------------
	# Dataclass
	# ---------------------------------------------------------------------------


	@dataclass
	class NormalizedSample:
	"""A single article or headline normalized to PhilVerify's label schema.

	Attributes
	----------
	text:
	Cleaned article text or headline.
	label:
	Integer label in {0, 1, 2} (Credible / Unverified / Likely Fake).
	source:
	Dataset identifier, e.g. ``"jcblaise/fake_news_filipino"``.
	language:
	BCP-47-style language code: ``"tl"``, ``"en"``, or ``"mixed"``.
	original_label:
	The raw label string from the upstream dataset, e.g. ``"fake"``,
	``"real"``, ``"pants-fire"``. Preserved for debugging / auditing.
	confidence:
	A float in [0.0, 1.0] representing how confident the label mapping is.
	Defaults to ``1.0`` for unambiguous remappings; use lower values for
	heuristic or model-assisted mappings.
	"""

	text: str
	label: int
	source: str
	language: str
	original_label: str
	confidence: float = field(default=1.0)

	def __post_init__(self) -> None:
	if self.label not in {0, 1, 2}:
	raise ValueError(
	f"label must be 0, 1, or 2; got {self.label!r}"
	)
	if not (0.0 <= self.confidence <= 1.0):
	raise ValueError(
	f"confidence must be in [0.0, 1.0]; got {self.confidence!r}"
	)


	# ---------------------------------------------------------------------------
	# Abstract base class
	# ---------------------------------------------------------------------------


	class DataSource(ABC):
	"""Abstract base class for PhilVerify data source adapters.

	Subclasses must implement :meth:`fetch` and the :attr:`source_name`
	property. Callers should use :meth:`load`, which wraps :meth:`fetch`
	with logging and error handling.

	Class Attributes
	----------------
	LABEL_NAMES:
	Human-readable names for each integer label.
	"""

	LABEL_NAMES: ClassVar[dict[int, str]] = {
	0: "Credible",
	1: "Unverified",
	2: "Likely Fake",
	}

	# -- Abstract interface --------------------------------------------------

	@property
	@abstractmethod
	def source_name(self) -> str:
	"""A stable, unique identifier for this data source.

	Recommended format: ``"<owner>/<dataset>"`` for HuggingFace datasets,
	or a descriptive slug for scraped / local sources.

	Example: ``"jcblaise/fake_news_filipino"``
	"""

	@abstractmethod
	def fetch(self) -> list[NormalizedSample]:
	"""Download or load raw data and return normalized samples.

	This method may perform network I/O and should not swallow exceptions;
	error handling is the responsibility of :meth:`load`.

	Returns
	-------
	list[NormalizedSample]
	Every sample extracted from this source after normalization.
	"""

	# -- Concrete helpers ----------------------------------------------------

	def load(self) -> list[NormalizedSample]:
	"""Call :meth:`fetch`, log progress, and handle errors gracefully.

	Returns an empty list (rather than raising) if fetching fails, so that
	a single broken source does not abort a multi-source pipeline.

	Returns
	-------
	list[NormalizedSample]
	Normalized samples, or ``[]`` on failure.
	"""
	logger.info("Loading data source: %s", self.source_name)
	try:
	samples = self.fetch()
	logger.info(
	"Loaded %d samples from %s", len(samples), self.source_name
	)
	return samples
	except Exception: # noqa: BLE001
	logger.warning(
	"Failed to load data source '%s'. Returning empty list.",
	self.source_name,
	exc_info=True,
	)
	return []


	# ---------------------------------------------------------------------------
	# NLP utility functions
	# ---------------------------------------------------------------------------


	_HTML_TAG_RE = re.compile(r"<[^>]+>", re.UNICODE)
	_WHITESPACE_RE = re.compile(r"\s+", re.UNICODE)
	_MIN_TEXT_LENGTH = 10


	def clean_text(text: str) -> str:
	"""Clean article text for downstream tokenization.

	Steps applied in order:

	1. Strip HTML / XML tags with a regex (no third-party HTML parser needed).
	2. Normalize Unicode to NFC (handles combining characters, full-width
	glyphs, etc.).
	3. Collapse consecutive whitespace characters (spaces, tabs, newlines) to
	a single ASCII space.
	4. Strip leading and trailing whitespace.
	5. Return an empty string if the result is shorter than 10 characters
	(avoids feeding near-empty strings to the model).

	Parameters
	----------
	text:
	Raw text, possibly containing HTML markup.

	Returns
	-------
	str
	Cleaned text, or ``""`` if the cleaned result is too short.
	"""
	if not text:
	return ""

	# 1. Remove HTML tags
	cleaned = _HTML_TAG_RE.sub(" ", text)

	# 2. Unicode NFC normalization
	cleaned = unicodedata.normalize("NFC", cleaned)

	# 3. Collapse whitespace
	cleaned = _WHITESPACE_RE.sub(" ", cleaned)

	# 4. Strip edges
	cleaned = cleaned.strip()

	# 5. Minimum length guard
	if len(cleaned) < _MIN_TEXT_LENGTH:
	return ""

	return cleaned


	def detect_language(text: str) -> str:
	"""Detect the primary language of text.

	Uses ``langdetect`` (which must be installed in the environment).

	Returns
	-------
	str
	``"tl"`` for Filipino/Tagalog, ``"en"`` for English,
	``"mixed"`` for any other detected language or on detection failure.
	"""
	try:
	from langdetect import detect # type: ignore[import-untyped]
	from langdetect.lang_detect_exception import ( # type: ignore[import-untyped]
	LangDetectException,
	)

	try:
	lang = detect(text)
	if lang == "tl":
	return "tl"
	if lang == "en":
	return "en"
	return "mixed"
	except LangDetectException:
	return "mixed"

	except ImportError:
	logger.warning(
	"langdetect is not installed; defaulting language to 'mixed'."
	)
	return "mixed"


	def domain_to_credibility_score(
	domain: str,
	credibility_json_path: Path = _DEFAULT_CREDIBILITY_JSON,
	) -> int:
	"""Look up a domain's credibility tier score.

	Reads ``domain_credibility.json`` (cached after the first call) and maps
	the domain to a numeric score:

	+---------+-------+---------------------------+
	\| Tier \| Score \| Meaning \|
	+=========+=======+===========================+
	\| tier1 \| 100 \| High-credibility outlet \|
	+---------+-------+---------------------------+
	\| tier2 \| 50 \| Mainstream / mid-tier \|
	+---------+-------+---------------------------+
	\| tier3 \| 25 \| Low-credibility \|
	+---------+-------+---------------------------+
	\| tier4 \| 0 \| Known misinformation site \|
	+---------+-------+---------------------------+
	\| unknown \| 50 \| Domain not found (default)\|
	+---------+-------+---------------------------+

	Parameters
	----------
	domain:
	Bare domain name, e.g. ``"rappler.com"``.
	credibility_json_path:
	Path to ``domain_credibility.json``. Defaults to the file at the
	PhilVerify project root.

	Returns
	-------
	int
	Credibility score for the domain.
	"""
	cache_key = str(credibility_json_path)

	if cache_key not in _credibility_cache:
	try:
	with credibility_json_path.open(encoding="utf-8") as fh:
	_credibility_cache[cache_key] = json.load(fh)
	except (FileNotFoundError, json.JSONDecodeError):
	logger.warning(
	"Could not load domain_credibility.json from %s; "
	"all domains will receive a default score of 50.",
	credibility_json_path,
	)
	_credibility_cache[cache_key] = {}

	data: dict = _credibility_cache[cache_key]

	tier_scores: dict[str, int] = {
	"tier1": 100,
	"tier2": 50,
	"tier3": 25,
	"tier4": 0,
	}

	for tier, score in tier_scores.items():
	tier_domains: list[str] = data.get(tier, [])
	if domain in tier_domains:
	return score

	# Domain not found → treat as tier2 / unknown
	return 50


	def binary_to_three_class(
	raw_label: str,
	domain: str \| None,
	credibility_json_path: Path = _DEFAULT_CREDIBILITY_JSON,
	) -> int:
	"""Map a raw dataset label string to PhilVerify's three-class schema.

	Label mapping rules
	-------------------
	* ``"fake"`` / ``"0"`` / ``"FALSE"`` / ``"pants-fire"`` / ``"false"``
	→ 2 (Likely Fake)

	* ``"real"`` / ``"1"`` / ``"TRUE"`` / ``"true"``
	→ credibility-aware decision:

	- domain score ≥ 75 → 0 (Credible)
	- domain score ≥ 40 → 0 (Credible, mainstream source)
	- domain score < 40 → 1 (Unverified, low-credibility domain)

	* ``"mostly-true"``
	→ 0 (Credible)

	* ``"half-true"`` / ``"barely-true"``
	→ 1 (Unverified)

	* anything else
	→ 1 (Unverified, safe default)

	Parameters
	----------
	raw_label:
	The label string exactly as it appears in the upstream dataset.
	domain:
	The publisher domain used for credibility lookup when the raw label
	indicates truth. Pass ``None`` to skip domain lookup (score → 50).
	credibility_json_path:
	Path to ``domain_credibility.json``.

	Returns
	-------
	int
	An integer in ``{0, 1, 2}``.
	"""
	_FAKE_LABELS: frozenset[str] = frozenset(
	{"fake", "0", "FALSE", "pants-fire", "false"}
	)
	_TRUE_LABELS: frozenset[str] = frozenset({"real", "1", "TRUE", "true"})

	if raw_label in _FAKE_LABELS:
	return 2

	if raw_label in _TRUE_LABELS:
	if domain:
	score = domain_to_credibility_score(domain, credibility_json_path)
	else:
	score = 50 # neutral default when no domain is available

	if score >= 75:
	return 0 # Credible
	if score >= 40:
	return 0 # Credible — mainstream source
	return 1 # Unverified — low-credibility domain

	if raw_label == "mostly-true":
	return 0

	if raw_label in {"half-true", "barely-true"}:
	return 1

	# Default: treat as Unverified
	return 1