File size: 5,615 Bytes
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
PhilVerify — TF-IDF + Logistic Regression Baseline Classifier (Layer 1)
Seed dataset of 30 labeled PH news headlines (10 per class).
Replaced by fine-tuned XLM-RoBERTa in Phase 10.
"""
import os
import logging
import pickle
from dataclasses import dataclass, field
from pathlib import Path

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Location of the pickled {"vectorizer", "clf"} payload, in a "models"
# directory next to this source file.
MODEL_PATH = Path(__file__).parent / "models" / "tfidf_model.pkl"

# ── Seed dataset (30 samples — 10 per class) ──────────────────────────────────
# Each entry is a (headline, label) tuple.
# Labels: 0=Credible, 1=Unverified, 2=Fake
# (TFIDFClassifier._LABELS reports class 2 to callers as "Likely Fake".)
SEED_DATA = [
    # Credible (0)
    ("DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila", 0),
    ("Rappler: Supreme Court upholds Comelec ruling on disqualification case", 0),
    ("GMA News: PNP arrests 12 suspects in Bulacan drug bust", 0),
    ("Philippine Star: GDP growth slows to 5.3% in Q3 says BSP", 0),
    ("Inquirer: Senate passes revised anti-terrorism bill on third reading", 0),
    ("Manila Bulletin: Typhoon Carina leaves P2B damage in Isabela province", 0),
    ("ABS-CBN News: Marcos signs executive order on agricultural modernization", 0),
    ("DOF confirms revenue collection targets met for fiscal year 2025", 0),
    ("DSWD distributes relief packs to 10,000 families in Cotabato", 0),
    ("PhilStar: Meralco rate hike of P0.18 per kilowatt-hour approved by ERC", 0),

    # Unverified (1)
    ("SHOCKING: Politician caught taking selfie during Senate hearing", 1),
    ("VIRAL: Celebrity spotted at secret meeting with government official", 1),
    ("BREAKING: 'Anonymous source' says president planning cabinet reshuffle", 1),
    ("Rumor has it: New tax policy to affect OFW remittances starting 2026", 1),
    ("CLAIM: Government hiding true COVID-19 death count from public", 1),
    ("Unconfirmed: Military says there are 500 rebels still in Mindanao", 1),
    ("REPORT: Certain barangay officials accepting bribes according to residents", 1),
    ("Alleged: Shipment of smuggled goods found in Manila port last week", 1),
    ("CLAIM: New mandatory vaccine policy for all government employees", 1),
    ("Source says: Manila Water to increase rates by 20% next month", 1),

    # Fake (2)
    ("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!", 2),
    ("TOTOO BA? Marcos nagsabi na libreng kuryente na simula bukas!", 2),
    ("SHOCKING TRUTH: Bill Gates microchip found in COVID vaccine in Cebu!", 2),
    ("WATCH: Senator caught stealing money in Senate vault - full video", 2),
    ("CONFIRMED: Philippines to become 51st state of the United States in 2026!", 2),
    ("KATOTOHANAN: DOH secretly poisoning water supply to control population", 2),
    ("EXPOSED: Duterte has secret family in Davao that government is hiding", 2),
    ("100% TOTOO: Garlic cures COVID-19, doctors don't want you to know this!", 2),
    ("GALING NG PILIPINAS: Filipino scientist discovers cure for cancer, suppressed by big pharma", 2),
    ("BREAKING: Entire Luzon to experience 3-day total blackout next week, says NGCP", 2),
]


@dataclass
class Layer1Result:
    """Outcome of one Layer 1 (TF-IDF baseline) classification."""

    # Verdict label; TFIDFClassifier emits "Credible" | "Unverified" | "Likely Fake"
    verdict: str
    # Probability of the winning class as a percentage, 0.0 – 100.0
    confidence: float
    # Vocabulary terms with the highest non-zero TF-IDF weight in the input
    triggered_features: list[str] = field(default_factory=list)


class TFIDFClassifier:
    """
    TF-IDF + Logistic Regression baseline.

    ``train()`` loads the persisted model from ``MODEL_PATH`` when one exists;
    otherwise, or when the persisted file cannot be read back, it fits on
    ``SEED_DATA`` and saves the result to disk.
    ``predict()`` calls ``train()`` lazily on first use.
    """

    # Class index -> verdict string returned to callers.
    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    # How many of the highest-weighted TF-IDF terms are reported as triggers.
    _TOP_FEATURES = 5

    # Exceptions that mark the persisted payload as unusable: the ones the
    # pickle documentation lists for unpickling, plus KeyError/TypeError for a
    # payload that is not the expected {"vectorizer", "clf"} dict.
    _LOAD_ERRORS = (
        pickle.UnpicklingError,
        AttributeError,
        EOFError,
        ImportError,
        IndexError,
        KeyError,
        TypeError,
    )

    def __init__(self):
        """Create an untrained classifier; the model is set up by ``train()``."""
        self._vectorizer = None
        self._clf = None

    def train(self) -> None:
        """Load the persisted model, or fit on the seed data and persist it.

        If a file exists at ``MODEL_PATH`` it is loaded and no training
        happens. A file that cannot be unpickled into the expected payload is
        logged as a warning and replaced by a freshly trained model instead of
        making every call fail.

        Raises:
            OSError: if the model file cannot be read or written.
            ImportError: if scikit-learn is unavailable when fitting.
        """
        if MODEL_PATH.exists():
            try:
                self._load()
            except self._LOAD_ERRORS as exc:
                logger.warning(
                    "Could not load TF-IDF model from %s (%s: %s); retraining",
                    MODEL_PATH,
                    type(exc).__name__,
                    exc,
                )
            else:
                return

        # scikit-learn is imported here rather than at module level.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression

        texts, labels = zip(*SEED_DATA)
        self._vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=1000,
            sublinear_tf=True,
        )
        X = self._vectorizer.fit_transform(texts)
        self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        self._clf.fit(X, labels)

        MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(MODEL_PATH, "wb") as f:
            pickle.dump({"vectorizer": self._vectorizer, "clf": self._clf}, f)
        logger.info("TF-IDF model trained and saved to %s", MODEL_PATH)

    def _load(self) -> None:
        """Restore the vectorizer and classifier from ``MODEL_PATH``.

        Raises:
            OSError: if the file cannot be opened.
            Exception: whatever ``pickle.load`` raises for a bad file, or
                ``KeyError``/``TypeError`` when the payload is not a dict
                holding both ``"vectorizer"`` and ``"clf"``.
        """
        # NOTE(security): unpickling can execute arbitrary code. MODEL_PATH
        # must only ever point at a file produced by train().
        with open(MODEL_PATH, "rb") as f:
            data = pickle.load(f)

        # Read both entries before assigning either, so a malformed payload
        # cannot leave the instance with a vectorizer but no classifier.
        vectorizer, clf = data["vectorizer"], data["clf"]
        self._vectorizer = vectorizer
        self._clf = clf
        logger.info("TF-IDF model loaded from %s", MODEL_PATH)

    def predict(self, text: str) -> Layer1Result:
        """Classify ``text`` and report the terms that carried the most weight.

        Args:
            text: The headline or passage to classify.

        Returns:
            A ``Layer1Result`` with the verdict label, the winning class
            probability as a percentage rounded to one decimal, and the
            highest-weighted TF-IDF terms found in ``text`` (empty when no
            vocabulary term occurs in it).
        """
        # Both parts are required; check both so a partially initialised
        # instance is (re)trained rather than failing on a None attribute.
        if self._vectorizer is None or self._clf is None:
            self.train()

        X = self._vectorizer.transform([text])
        pred_label = int(self._clf.predict(X)[0])
        proba = self._clf.predict_proba(X)[0]
        confidence = round(float(max(proba)) * 100, 1)
        verdict = self._LABELS[pred_label]

        # Extract top TF-IDF features as human-readable triggers:
        # highest weight first, skipping terms absent from the input.
        feature_names = self._vectorizer.get_feature_names_out()
        tfidf_scores = X.toarray()[0]
        top_indices = tfidf_scores.argsort()[-self._TOP_FEATURES:][::-1]
        triggered = [feature_names[i] for i in top_indices if tfidf_scores[i] > 0]

        return Layer1Result(
            verdict=verdict,
            confidence=confidence,
            triggered_features=triggered,
        )