Spaces:

SemiAutomat1c
/

philverify-api

Running

philverify-api / ml /tfidf_classifier.py

Ryan Christian D. Deniega

feat: PhilVerify Phase 1-3 — FastAPI backend, NLP pipeline, TF-IDF classifier (23/23 tests)

6c9b8f1 20 days ago

5.62 kB

	"""
	PhilVerify — TF-IDF + Logistic Regression Baseline Classifier (Layer 1)
	Seed dataset of 30 labeled PH news headlines (10 per class).
	Replaced by fine-tuned XLM-RoBERTa in Phase 10.
	"""
	import os
	import logging
	import pickle
	from dataclasses import dataclass, field
	from pathlib import Path

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# File the fitted vectorizer + classifier are pickled to / loaded from:
	# "models/tfidf_model.pkl" inside the directory that contains this module.
	MODEL_PATH = Path(__file__).parent / "models" / "tfidf_model.pkl"

	# ── Seed dataset (30 samples — 10 per class) ──────────────────────────────────
	# Each entry is a (headline, label) pair; headlines mix English and Filipino.
	# Labels: 0=Credible, 1=Unverified, 2=Fake
	# (TFIDFClassifier._LABELS turns these integers into the verdict strings.)
	SEED_DATA: list[tuple[str, int]] = [
	    # Credible (0)
	    ("DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila", 0),
	    ("Rappler: Supreme Court upholds Comelec ruling on disqualification case", 0),
	    ("GMA News: PNP arrests 12 suspects in Bulacan drug bust", 0),
	    ("Philippine Star: GDP growth slows to 5.3% in Q3 says BSP", 0),
	    ("Inquirer: Senate passes revised anti-terrorism bill on third reading", 0),
	    ("Manila Bulletin: Typhoon Carina leaves P2B damage in Isabela province", 0),
	    ("ABS-CBN News: Marcos signs executive order on agricultural modernization", 0),
	    ("DOF confirms revenue collection targets met for fiscal year 2025", 0),
	    ("DSWD distributes relief packs to 10,000 families in Cotabato", 0),
	    ("PhilStar: Meralco rate hike of P0.18 per kilowatt-hour approved by ERC", 0),

	    # Unverified (1)
	    ("SHOCKING: Politician caught taking selfie during Senate hearing", 1),
	    ("VIRAL: Celebrity spotted at secret meeting with government official", 1),
	    ("BREAKING: 'Anonymous source' says president planning cabinet reshuffle", 1),
	    ("Rumor has it: New tax policy to affect OFW remittances starting 2026", 1),
	    ("CLAIM: Government hiding true COVID-19 death count from public", 1),
	    ("Unconfirmed: Military says there are 500 rebels still in Mindanao", 1),
	    ("REPORT: Certain barangay officials accepting bribes according to residents", 1),
	    ("Alleged: Shipment of smuggled goods found in Manila port last week", 1),
	    ("CLAIM: New mandatory vaccine policy for all government employees", 1),
	    ("Source says: Manila Water to increase rates by 20% next month", 1),

	    # Fake (2)
	    ("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!", 2),
	    ("TOTOO BA? Marcos nagsabi na libreng kuryente na simula bukas!", 2),
	    ("SHOCKING TRUTH: Bill Gates microchip found in COVID vaccine in Cebu!", 2),
	    ("WATCH: Senator caught stealing money in Senate vault - full video", 2),
	    ("CONFIRMED: Philippines to become 51st state of the United States in 2026!", 2),
	    ("KATOTOHANAN: DOH secretly poisoning water supply to control population", 2),
	    ("EXPOSED: Duterte has secret family in Davao that government is hiding", 2),
	    ("100% TOTOO: Garlic cures COVID-19, doctors don't want you to know this!", 2),
	    ("GALING NG PILIPINAS: Filipino scientist discovers cure for cancer, suppressed by big pharma", 2),
	    ("BREAKING: Entire Luzon to experience 3-day total blackout next week, says NGCP", 2),
	]


	@dataclass
	class Layer1Result:
	    """Outcome of one Layer 1 (TF-IDF baseline) classification."""

	    # Verdict string taken from TFIDFClassifier._LABELS:
	    # "Credible" | "Unverified" | "Likely Fake"
	    verdict: str
	    # Probability of the predicted class as a percentage, 0.0 – 100.0,
	    # rounded to one decimal place by TFIDFClassifier.predict().
	    confidence: float
	    # Terms of the input with the highest non-zero TF-IDF weight
	    # (at most five when filled in by TFIDFClassifier.predict()).
	    triggered_features: list[str] = field(default_factory=list)


	class TFIDFClassifier:
	    """
	    TF-IDF + Logistic Regression baseline (Layer 1).

	    ``train()`` loads the model persisted at ``MODEL_PATH`` when a usable one
	    exists; otherwise it fits on ``SEED_DATA`` and tries to save the result.
	    ``predict()`` calls ``train()`` lazily on first use, so constructing an
	    instance is cheap and does no I/O.
	    """

	    # Integer class label (as used in SEED_DATA) -> verdict string for callers.
	    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

	    # Upper bound on the number of terms reported in ``triggered_features``.
	    _MAX_TRIGGERS = 5

	    def __init__(self):
	        """Create an untrained classifier; the model is set by ``train()``."""
	        self._vectorizer = None
	        self._clf = None

	    def train(self) -> None:
	        """Make the model available, loading it from disk or fitting it.

	        If ``MODEL_PATH`` exists and can be unpickled, that model is used and
	        no training happens. If the file is missing or unusable (truncated,
	        corrupt, wrong structure, incompatible classes), the model is fitted
	        on ``SEED_DATA`` and written back to ``MODEL_PATH``.

	        A failure to write the file is logged and otherwise ignored, because
	        the freshly fitted in-memory model is still fully usable.
	        """
	        if MODEL_PATH.exists():
	            try:
	                self._load()
	                return
	            except (
	                OSError,
	                EOFError,
	                pickle.UnpicklingError,
	                AttributeError,
	                ImportError,
	                IndexError,
	                KeyError,
	                TypeError,
	            ) as exc:
	                # A bad cache file must not take the classifier down for
	                # good: fall through and rebuild it from the seed data.
	                logger.warning(
	                    "Could not load TF-IDF model from %s (%r); retraining",
	                    MODEL_PATH,
	                    exc,
	                )

	        # Imported here rather than at module level, so scikit-learn is only
	        # needed when a model actually has to be fitted.
	        from sklearn.feature_extraction.text import TfidfVectorizer
	        from sklearn.linear_model import LogisticRegression

	        texts, labels = zip(*SEED_DATA)
	        vectorizer = TfidfVectorizer(
	            ngram_range=(1, 2),
	            max_features=1000,
	            sublinear_tf=True,
	        )
	        features = vectorizer.fit_transform(texts)
	        clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
	        clf.fit(features, labels)

	        # Publish both parts together, only after fitting succeeded.
	        self._vectorizer, self._clf = vectorizer, clf
	        self._save()

	    def _save(self) -> None:
	        """Pickle the fitted model to ``MODEL_PATH``; log instead of raising."""
	        try:
	            MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
	            with open(MODEL_PATH, "wb") as f:
	                pickle.dump({"vectorizer": self._vectorizer, "clf": self._clf}, f)
	        except OSError as exc:
	            # E.g. a read-only filesystem. A partially written file is
	            # tolerated: train() retrains when the file cannot be loaded.
	            logger.warning(
	                "TF-IDF model trained but could not be saved to %s: %s",
	                MODEL_PATH,
	                exc,
	            )
	        else:
	            logger.info("TF-IDF model trained and saved to %s", MODEL_PATH)

	    def _load(self) -> None:
	        """Load vectorizer and classifier from ``MODEL_PATH``.

	        Raises whatever ``open``/``pickle.load`` raise, or ``KeyError`` /
	        ``TypeError`` when the pickled object is not the expected dict. On
	        failure the instance's current model is left untouched.
	        """
	        # SECURITY: unpickling can execute arbitrary code. MODEL_PATH must
	        # only ever hold files written by this class, never untrusted data.
	        with open(MODEL_PATH, "rb") as f:
	            data = pickle.load(f)
	        # Read both entries before assigning either, so a missing key cannot
	        # leave a vectorizer without its classifier.
	        vectorizer, clf = data["vectorizer"], data["clf"]
	        self._vectorizer, self._clf = vectorizer, clf
	        logger.info("TF-IDF model loaded from %s", MODEL_PATH)

	    def predict(self, text: str) -> Layer1Result:
	        """Classify one piece of text.

	        Args:
	            text: The headline or claim to classify.

	        Returns:
	            A ``Layer1Result`` holding the verdict string, the probability of
	            the predicted class as a percentage rounded to one decimal, and
	            up to ``_MAX_TRIGGERS`` terms of ``text`` with the highest
	            non-zero TF-IDF weight (an empty list when none of the text is
	            in the model vocabulary).
	        """
	        if self._vectorizer is None or self._clf is None:
	            self.train()

	        X = self._vectorizer.transform([text])

	        # One predict_proba call yields both verdict and confidence; columns
	        # are ordered like clf.classes_, so the confidence always belongs to
	        # the reported verdict.
	        proba = self._clf.predict_proba(X)[0]
	        best = int(proba.argmax())
	        pred_label = int(self._clf.classes_[best])
	        confidence = round(float(proba[best]) * 100, 1)
	        verdict = self._LABELS[pred_label]

	        # Extract top TF-IDF features as human-readable triggers
	        feature_names = self._vectorizer.get_feature_names_out()
	        tfidf_scores = X.toarray()[0]
	        # Sort descending first, then cut: unlike a [-k:] slice this also
	        # behaves when the limit is 0.
	        top_indices = tfidf_scores.argsort()[::-1][: self._MAX_TRIGGERS]
	        # Zero-weight entries are vocabulary terms absent from the input.
	        triggered = [feature_names[i] for i in top_indices if tfidf_scores[i] > 0]

	        return Layer1Result(
	            verdict=verdict,
	            confidence=confidence,
	            triggered_features=triggered,
	        )