Spaces:
Running
Running
Ryan Christian D. Deniega
feat: PhilVerify Phase 1-3 — FastAPI backend, NLP pipeline, TF-IDF classifier (23/23 tests)
6c9b8f1 | """ | |
| PhilVerify — TF-IDF + Logistic Regression Baseline Classifier (Layer 1) | |
| Seed dataset of 30 labeled PH news headlines (10 per class). | |
| Replaced by fine-tuned XLM-RoBERTa in Phase 10. | |
| """ | |
| import os | |
| import logging | |
| import pickle | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# The pickled model is stored in a "models" directory next to this file.
MODEL_PATH = Path(__file__).parent.joinpath("models", "tfidf_model.pkl")
# -- Seed dataset (30 samples, 10 per class) ----------------------------------
# Each headline group below is paired with its integer class label when
# SEED_DATA is assembled: 0=Credible, 1=Unverified, 2=Fake.
_CREDIBLE_HEADLINES = (
    "DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila",
    "Rappler: Supreme Court upholds Comelec ruling on disqualification case",
    "GMA News: PNP arrests 12 suspects in Bulacan drug bust",
    "Philippine Star: GDP growth slows to 5.3% in Q3 says BSP",
    "Inquirer: Senate passes revised anti-terrorism bill on third reading",
    "Manila Bulletin: Typhoon Carina leaves P2B damage in Isabela province",
    "ABS-CBN News: Marcos signs executive order on agricultural modernization",
    "DOF confirms revenue collection targets met for fiscal year 2025",
    "DSWD distributes relief packs to 10,000 families in Cotabato",
    "PhilStar: Meralco rate hike of P0.18 per kilowatt-hour approved by ERC",
)

_UNVERIFIED_HEADLINES = (
    "SHOCKING: Politician caught taking selfie during Senate hearing",
    "VIRAL: Celebrity spotted at secret meeting with government official",
    "BREAKING: 'Anonymous source' says president planning cabinet reshuffle",
    "Rumor has it: New tax policy to affect OFW remittances starting 2026",
    "CLAIM: Government hiding true COVID-19 death count from public",
    "Unconfirmed: Military says there are 500 rebels still in Mindanao",
    "REPORT: Certain barangay officials accepting bribes according to residents",
    "Alleged: Shipment of smuggled goods found in Manila port last week",
    "CLAIM: New mandatory vaccine policy for all government employees",
    "Source says: Manila Water to increase rates by 20% next month",
)

_FAKE_HEADLINES = (
    "GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!",
    "TOTOO BA? Marcos nagsabi na libreng kuryente na simula bukas!",
    "SHOCKING TRUTH: Bill Gates microchip found in COVID vaccine in Cebu!",
    "WATCH: Senator caught stealing money in Senate vault - full video",
    "CONFIRMED: Philippines to become 51st state of the United States in 2026!",
    "KATOTOHANAN: DOH secretly poisoning water supply to control population",
    "EXPOSED: Duterte has secret family in Davao that government is hiding",
    "100% TOTOO: Garlic cures COVID-19, doctors don't want you to know this!",
    "GALING NG PILIPINAS: Filipino scientist discovers cure for cancer, suppressed by big pharma",
    "BREAKING: Entire Luzon to experience 3-day total blackout next week, says NGCP",
)

# List of (headline, label) tuples, ordered Credible -> Unverified -> Fake.
SEED_DATA = (
    [(headline, 0) for headline in _CREDIBLE_HEADLINES]
    + [(headline, 1) for headline in _UNVERIFIED_HEADLINES]
    + [(headline, 2) for headline in _FAKE_HEADLINES]
)
@dataclass
class Layer1Result:
    """Outcome of a single Layer 1 (TF-IDF baseline) classification.

    The ``@dataclass`` decorator is required: it generates the keyword
    ``__init__`` that ``TFIDFClassifier.predict`` relies on and turns the
    ``field(default_factory=list)`` declaration into a fresh empty list per
    instance.
    """

    # Verdict string; TFIDFClassifier emits "Credible", "Unverified" or
    # "Likely Fake".
    verdict: str
    # Confidence expressed as a percentage, 0.0 - 100.0.
    confidence: float
    # Feature names reported as triggers for the verdict; empty by default.
    triggered_features: list[str] = field(default_factory=list)
class TFIDFClassifier:
    """TF-IDF + Logistic Regression baseline classifier.

    ``train()`` loads the model persisted at ``MODEL_PATH`` when one exists
    and is readable; otherwise it fits a new model on ``SEED_DATA`` and saves
    it. ``predict()`` calls ``train()`` itself when no model is loaded yet,
    so callers do not have to train explicitly.
    """

    # Maps the integer class labels used in SEED_DATA to verdict strings.
    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    # Exceptions that mean the persisted file is unusable (truncated, corrupt,
    # written by incompatible code, or not the expected dict). The pickle
    # documentation lists these as errors unpickling may raise; KeyError and
    # TypeError cover a payload without the expected keys/shape.
    _LOAD_ERRORS = (
        pickle.UnpicklingError,
        EOFError,
        AttributeError,
        ImportError,
        IndexError,
        KeyError,
        TypeError,
    )

    def __init__(self):
        """Create a classifier with no model loaded yet."""
        self._vectorizer = None
        self._clf = None

    def train(self) -> None:
        """Make the model ready: load it from disk, or fit it on the seed data.

        If ``MODEL_PATH`` exists and loads cleanly, training is skipped.
        If the file exists but cannot be loaded, a warning is logged and the
        model is refitted and the file rewritten, instead of failing every
        later prediction.

        Raises:
            OSError: if the model file cannot be read or written.
        """
        if MODEL_PATH.exists():
            try:
                self._load()
            except self._LOAD_ERRORS as exc:
                logger.warning(
                    "Could not load TF-IDF model from %s (%s); retraining",
                    MODEL_PATH,
                    exc,
                )
            else:
                return

        # Imported lazily so scikit-learn is only needed when fitting.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression

        texts, labels = zip(*SEED_DATA)
        vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=1000,
            sublinear_tf=True,
        )
        X = vectorizer.fit_transform(texts)
        clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        clf.fit(X, labels)

        # Publish both parts together, only after fitting succeeded, so the
        # instance is never left with an unfitted or half-set model.
        self._vectorizer, self._clf = vectorizer, clf

        # Write to a sibling temp file and rename it into place so an
        # interrupted write cannot leave a truncated model at MODEL_PATH.
        MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = MODEL_PATH.with_name(MODEL_PATH.name + ".tmp")
        with open(tmp_path, "wb") as f:
            pickle.dump({"vectorizer": self._vectorizer, "clf": self._clf}, f)
        tmp_path.replace(MODEL_PATH)
        logger.info("TF-IDF model trained and saved to %s", MODEL_PATH)

    def _load(self) -> None:
        """Load the vectorizer and classifier pickled at ``MODEL_PATH``.

        Instance state is only updated once both entries have been read, so a
        failed load leaves the classifier in its previous state.

        Raises:
            OSError: if the file cannot be opened.
            KeyError: if the payload lacks the "vectorizer" or "clf" entry.
            Other unpickling errors (see ``_LOAD_ERRORS``) for corrupt files.
        """
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file. MODEL_PATH must only ever hold files written by train(); do
        # not point it at externally supplied data.
        with open(MODEL_PATH, "rb") as f:
            data = pickle.load(f)
        vectorizer = data["vectorizer"]
        clf = data["clf"]
        self._vectorizer, self._clf = vectorizer, clf
        logger.info("TF-IDF model loaded from %s", MODEL_PATH)

    def predict(self, text: str) -> "Layer1Result":
        """Classify *text* and report the verdict, confidence and triggers.

        Args:
            text: The text to classify.

        Returns:
            A ``Layer1Result`` whose ``verdict`` comes from ``_LABELS``,
            whose ``confidence`` is the highest class probability as a
            percentage rounded to one decimal, and whose
            ``triggered_features`` holds up to five features of *text* with
            a non-zero TF-IDF score, highest score first.
        """
        # Either part missing means no usable model is loaded yet.
        if self._vectorizer is None or self._clf is None:
            self.train()

        X = self._vectorizer.transform([text])
        pred_label = int(self._clf.predict(X)[0])
        proba = self._clf.predict_proba(X)[0]
        confidence = round(float(max(proba)) * 100, 1)
        verdict = self._LABELS[pred_label]

        # Extract top TF-IDF features as human-readable triggers.
        feature_names = self._vectorizer.get_feature_names_out()
        tfidf_scores = X.toarray()[0]
        # argsort is ascending: take the last five and reverse them.
        top_indices = tfidf_scores.argsort()[-5:][::-1]
        # Drop zero scores (features absent from the text); str() yields
        # plain Python strings rather than array scalar strings.
        triggered = [
            str(feature_names[i]) for i in top_indices if tfidf_scores[i] > 0
        ]

        return Layer1Result(
            verdict=verdict,
            confidence=confidence,
            triggered_features=triggered,
        )