philverify-api / ml /naive_bayes_classifier.py
Ryan Christian D. Deniega
feat: extension button placement, text extraction, OCR display + ML improvements
c78c2c1
"""
PhilVerify β€” TF-IDF + Naive Bayes Classifier (Layer 1)
MultinomialNB with TF-IDF features. Trains on the provided sample split so that
eval comparisons are fair (same train/val split as transformer models).
Supports optional WordNet lemmatization to measure its effect on Filipino/Taglish text.
"""
import logging
logger = logging.getLogger(__name__)
def _lemmatize_tokens(tokens: list[str]) -> list[str]:
"""
Lemmatize tokens with POS-aware WordNet lemmatization.
Downloads required NLTK data on first call. Falls back to identity on any error.
Note: WordNet is English-biased β€” Tagalog tokens are returned unchanged.
"""
try:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
for resource, path in [
("wordnet", "corpora/wordnet"),
("averaged_perceptron_tagger_eng", "taggers/averaged_perceptron_tagger_eng"),
]:
try:
nltk.data.find(path)
except LookupError:
nltk.download(resource, quiet=True)
def _wn_pos(tag: str) -> str:
if tag.startswith("J"):
return wordnet.ADJ
if tag.startswith("V"):
return wordnet.VERB
if tag.startswith("R"):
return wordnet.ADV
return wordnet.NOUN
lemmatizer = WordNetLemmatizer()
tagged = nltk.pos_tag(tokens)
return [lemmatizer.lemmatize(w, _wn_pos(t)) for w, t in tagged]
except Exception as exc:
logger.debug("Lemmatization skipped (%s) β€” returning raw tokens", exc)
return tokens
# Import shared result type
from ml.tfidf_classifier import Layer1Result # noqa: E402
class NaiveBayesClassifier:
"""
TF-IDF + MultinomialNB classifier. Same predict() interface as TFIDFClassifier.
Args:
train_samples: list[Sample] from ml.dataset. If None, uses the full 100-sample dataset.
lemmatize: apply WordNet lemmatization before vectorization.
"""
_LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
def __init__(self, train_samples=None, lemmatize: bool = False):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
self._lemmatize = lemmatize
if train_samples is None:
from ml.dataset import get_dataset
train_samples = get_dataset()
texts = [self._preprocess(s.text) for s in train_samples]
labels = [s.label for s in train_samples]
self._vectorizer = TfidfVectorizer(
ngram_range=(1, 2),
max_features=1000,
sublinear_tf=True,
)
X = self._vectorizer.fit_transform(texts)
self._clf = MultinomialNB(alpha=1.0)
self._clf.fit(X, labels)
logger.info(
"NaiveBayesClassifier trained on %d samples (lemmatize=%s)",
len(texts), lemmatize,
)
def _preprocess(self, text: str) -> str:
text = text.lower()
if self._lemmatize:
return " ".join(_lemmatize_tokens(text.split()))
return text
def predict(self, text: str) -> Layer1Result:
processed = self._preprocess(text)
X = self._vectorizer.transform([processed])
pred_label = int(self._clf.predict(X)[0])
proba = self._clf.predict_proba(X)[0]
confidence = round(float(max(proba)) * 100, 1)
verdict = self._LABELS[pred_label]
feature_names = self._vectorizer.get_feature_names_out()
tfidf_scores = X.toarray()[0]
top_idx = tfidf_scores.argsort()[-5:][::-1]
triggered = [feature_names[i] for i in top_idx if tfidf_scores[i] > 0]
return Layer1Result(verdict=verdict, confidence=confidence, triggered_features=triggered)