Spaces:

SemiAutomat1c
/

philverify-api

Running

File size: 5,615 Bytes

6c9b8f1

"""
PhilVerify — TF-IDF + Logistic Regression Baseline Classifier (Layer 1)
Seed dataset of 30 labeled PH news headlines (10 per class).
Replaced by fine-tuned XLM-RoBERTa in Phase 10.
"""
import os
import logging
import pickle
from dataclasses import dataclass, field
from pathlib import Path

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Location of the pickled vectorizer + classifier: <this file's dir>/models/tfidf_model.pkl
MODEL_PATH = Path(__file__).parent.joinpath("models", "tfidf_model.pkl")

# ── Seed dataset (30 samples — 10 per class) ──────────────────────────────────
# Headlines are grouped per class below; SEED_DATA pairs each headline with
# its integer label, in class order.
# Labels: 0=Credible, 1=Unverified, 2=Fake (displayed as "Likely Fake" by
# TFIDFClassifier._LABELS).
_CREDIBLE_HEADLINES = (
    "DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila",
    "Rappler: Supreme Court upholds Comelec ruling on disqualification case",
    "GMA News: PNP arrests 12 suspects in Bulacan drug bust",
    "Philippine Star: GDP growth slows to 5.3% in Q3 says BSP",
    "Inquirer: Senate passes revised anti-terrorism bill on third reading",
    "Manila Bulletin: Typhoon Carina leaves P2B damage in Isabela province",
    "ABS-CBN News: Marcos signs executive order on agricultural modernization",
    "DOF confirms revenue collection targets met for fiscal year 2025",
    "DSWD distributes relief packs to 10,000 families in Cotabato",
    "PhilStar: Meralco rate hike of P0.18 per kilowatt-hour approved by ERC",
)

_UNVERIFIED_HEADLINES = (
    "SHOCKING: Politician caught taking selfie during Senate hearing",
    "VIRAL: Celebrity spotted at secret meeting with government official",
    "BREAKING: 'Anonymous source' says president planning cabinet reshuffle",
    "Rumor has it: New tax policy to affect OFW remittances starting 2026",
    "CLAIM: Government hiding true COVID-19 death count from public",
    "Unconfirmed: Military says there are 500 rebels still in Mindanao",
    "REPORT: Certain barangay officials accepting bribes according to residents",
    "Alleged: Shipment of smuggled goods found in Manila port last week",
    "CLAIM: New mandatory vaccine policy for all government employees",
    "Source says: Manila Water to increase rates by 20% next month",
)

_FAKE_HEADLINES = (
    "GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!",
    "TOTOO BA? Marcos nagsabi na libreng kuryente na simula bukas!",
    "SHOCKING TRUTH: Bill Gates microchip found in COVID vaccine in Cebu!",
    "WATCH: Senator caught stealing money in Senate vault - full video",
    "CONFIRMED: Philippines to become 51st state of the United States in 2026!",
    "KATOTOHANAN: DOH secretly poisoning water supply to control population",
    "EXPOSED: Duterte has secret family in Davao that government is hiding",
    "100% TOTOO: Garlic cures COVID-19, doctors don't want you to know this!",
    "GALING NG PILIPINAS: Filipino scientist discovers cure for cancer, suppressed by big pharma",
    "BREAKING: Entire Luzon to experience 3-day total blackout next week, says NGCP",
)

# The position of each group in this tuple is its integer label (0, 1, 2).
SEED_DATA = [
    (headline, label)
    for label, headlines in enumerate(
        (_CREDIBLE_HEADLINES, _UNVERIFIED_HEADLINES, _FAKE_HEADLINES)
    )
    for headline in headlines
]


@dataclass
class Layer1Result:
    """Outcome of the Layer 1 (TF-IDF baseline) classification of one text."""

    # Human-readable label. TFIDFClassifier.predict takes it from its _LABELS
    # table, so the values it produces are
    # "Credible" | "Unverified" | "Likely Fake".
    verdict: str
    # Probability of the predicted class as a percentage (0.0 – 100.0);
    # predict rounds it to one decimal place.
    confidence: float
    # Vocabulary terms from the input with the highest non-zero TF-IDF weight
    # (predict returns at most five). Defaults to a fresh empty list.
    triggered_features: list[str] = field(default_factory=list)


class TFIDFClassifier:
    """
    TF-IDF + Logistic Regression baseline (Layer 1).

    ``train()`` loads the persisted model from ``MODEL_PATH`` when a usable
    one exists; otherwise it fits on ``SEED_DATA`` and tries to persist the
    result. ``predict()`` calls ``train()`` lazily on first use.
    """

    # Maps the integer class labels used in SEED_DATA to user-facing verdicts.
    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    # Errors that mean "the file on disk is not a usable model": I/O problems,
    # truncated or corrupt pickles, classes that can no longer be imported,
    # and payloads that are not the expected {"vectorizer", "clf"} dict.
    _LOAD_ERRORS = (
        OSError,
        EOFError,
        pickle.UnpicklingError,
        AttributeError,
        ImportError,
        IndexError,
        KeyError,
        TypeError,
    )

    def __init__(self):
        # Both stay None until train() (or _load()) has populated them.
        self._vectorizer = None
        self._clf = None

    def train(self) -> None:
        """Make the model ready: load it from disk, or fit it on the seed data.

        If ``MODEL_PATH`` exists and can be loaded, training is skipped. If
        the file exists but cannot be loaded (for example, it was truncated
        by an interrupted write), a warning is logged and the model is
        retrained, which also overwrites the bad file.

        Persisting the freshly trained model is best-effort: an ``OSError``
        while writing is logged and the in-memory model is used anyway.
        """
        if MODEL_PATH.exists():
            try:
                self._load()
            except self._LOAD_ERRORS as exc:
                logger.warning(
                    "Could not load TF-IDF model from %s (%s); retraining",
                    MODEL_PATH,
                    exc,
                )
            else:
                return

        # Imported lazily so scikit-learn is only needed when a fit is required.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression

        texts, labels = zip(*SEED_DATA)
        vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=1000,
            sublinear_tf=True,
        )
        X = vectorizer.fit_transform(texts)
        clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        clf.fit(X, labels)

        # Publish both parts together, only after fitting has succeeded.
        self._vectorizer = vectorizer
        self._clf = clf

        try:
            self._save()
        except OSError as exc:
            logger.warning(
                "Could not persist TF-IDF model to %s (%s); using in-memory model",
                MODEL_PATH,
                exc,
            )
        else:
            logger.info("TF-IDF model trained and saved to %s", MODEL_PATH)

    def _save(self) -> None:
        """Write the fitted model to ``MODEL_PATH`` without exposing partial files.

        The pickle is written to a temporary sibling file and then moved into
        place with ``os.replace``, so a crash mid-write cannot leave a
        truncated file at ``MODEL_PATH``.

        Raises:
            OSError: if the directory or file cannot be written.
        """
        MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
        # The PID keeps concurrent processes from writing the same temp file.
        tmp_path = MODEL_PATH.with_name(f"{MODEL_PATH.name}.{os.getpid()}.tmp")
        try:
            with open(tmp_path, "wb") as f:
                pickle.dump({"vectorizer": self._vectorizer, "clf": self._clf}, f)
            os.replace(tmp_path, MODEL_PATH)
        finally:
            # No-op after a successful replace; removes the leftover otherwise.
            tmp_path.unlink(missing_ok=True)

    def _load(self) -> None:
        """Load the vectorizer and classifier from ``MODEL_PATH``.

        Raises one of ``_LOAD_ERRORS`` if the file is missing, unreadable, or
        does not contain the expected payload. Instance state is left
        untouched in that case.
        """
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file. MODEL_PATH must only ever point at a file this application
        # wrote itself; never load a model file from an untrusted source.
        with open(MODEL_PATH, "rb") as f:
            data = pickle.load(f)
        # Read both entries before assigning either, so a payload missing one
        # key cannot leave the instance half-initialised.
        vectorizer, clf = data["vectorizer"], data["clf"]
        self._vectorizer = vectorizer
        self._clf = clf
        logger.info("TF-IDF model loaded from %s", MODEL_PATH)

    def predict(self, text: str) -> Layer1Result:
        """Classify ``text`` and report which terms carried the most weight.

        Args:
            text: The headline or passage to classify.

        Returns:
            A ``Layer1Result`` holding the verdict from ``_LABELS``, the
            predicted class probability as a percentage rounded to one
            decimal, and up to five vocabulary terms with the highest
            non-zero TF-IDF score in ``text``.
        """
        if self._vectorizer is None or self._clf is None:
            self.train()

        X = self._vectorizer.transform([text])
        pred_label = int(self._clf.predict(X)[0])
        proba = self._clf.predict_proba(X)[0]
        confidence = round(float(max(proba)) * 100, 1)
        verdict = self._LABELS[pred_label]

        # Extract top TF-IDF features as human-readable triggers.
        feature_names = self._vectorizer.get_feature_names_out()
        tfidf_scores = X.toarray()[0]
        # Ascending argsort: take the last five, then reverse for highest-first.
        top_indices = tfidf_scores.argsort()[-5:][::-1]
        # Drop zero-weight entries (terms that do not occur in the text).
        triggered = [feature_names[i] for i in top_indices if tfidf_scores[i] > 0]

        return Layer1Result(
            verdict=verdict,
            confidence=confidence,
            triggered_features=triggered,
        )