# philverify-api/ml/lda_analysis.py
"""
PhilVerify — LDA Topic Analysis + LDA Feature Classifier (Layer 1)
Two responsibilities:
1. run_topic_analysis(samples, n_topics, n_top_words)
   Fits LDA on the training texts, prints the top-N words per topic and the
   mean topic distribution per class (Credible / Unverified / Likely Fake).
Call directly to explore what topics the model discovers.
2. LDAFeatureClassifier
Concatenates LDA topic distribution features with TF-IDF features and feeds
the combined vector into LogisticRegression. Same predict() interface as
TFIDFClassifier — slots directly into eval.py.
Usage:
python -m ml.lda_analysis # standalone topic analysis
python -m ml.eval # compare LDAFeatureClassifier against others
"""
import logging
import numpy as np
import scipy.sparse as sp
from ml.dataset import LABEL_NAMES, get_split
from ml.naive_bayes_classifier import _lemmatize_tokens
from ml.tfidf_classifier import Layer1Result
logger = logging.getLogger(__name__)
_LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
# Human-readable labels for each LDA topic (1-indexed).
# Assigned by inspecting run_topic_analysis() output on the 100-sample PH dataset.
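# NOTE: these assignments hold only for the default fit (n_topics=5,
# random_state=42); refitting with different settings requires re-inspection.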
TOPIC_LABELS: dict[int, str] = {
1: "Health & Conspiracy",
2: "Breaking News",
3: "Crime & Law",
4: "Politics & Government",
5: "Filipino Current Events",
}
# ── Standalone topic analysis ──────────────────────────────────────────────────
def run_topic_analysis(
samples,
n_topics: int = 5,
n_top_words: int = 10,
) -> None:
"""
Fit LDA on samples and print:
- Top-N words per topic
- Mean topic distribution per class label
"""
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
texts = [s.text.lower() for s in samples]
labels = [s.label for s in samples]
# LDA requires raw counts (not TF-IDF)
vectorizer = CountVectorizer(max_features=500, stop_words="english")
X = vectorizer.fit_transform(texts)
vocab = vectorizer.get_feature_names_out()
lda = LatentDirichletAllocation(
n_components=n_topics, random_state=42, max_iter=30, learning_method="batch"
)
doc_topics = lda.fit_transform(X) # (n_samples, n_topics)
print(f"\n{'='*62}")
print(f" LDA Topic Analysis ({n_topics} topics, {len(samples)} samples)")
print(f"{'='*62}")
for i, topic_vec in enumerate(lda.components_):
top_idx = topic_vec.argsort()[-n_top_words:][::-1]
top_words = [vocab[j] for j in top_idx]
print(f"\n Topic {i + 1}: {', '.join(top_words)}")
    print("\n Per-class dominant topics:")
for label_id, label_name in sorted(LABEL_NAMES.items()):
class_idx = [i for i, l in enumerate(labels) if l == label_id]
if not class_idx:
continue
mean_dist = doc_topics[class_idx].mean(axis=0)
top2 = mean_dist.argsort()[-2:][::-1]
topic_str = " ".join(f"T{d+1}:{mean_dist[d]:.2f}" for d in top2)
print(f" {label_name:<14} {topic_str}")
# ── LDA Feature Classifier ─────────────────────────────────────────────────────
class LDAFeatureClassifier:
"""
    LDA topic distribution + TF-IDF features → LogisticRegression.
Feature vector = sparse_hstack([tfidf_features, lda_topic_distribution])
Args:
train_samples: list[Sample]. If None, uses the full 100-sample dataset.
n_topics: number of LDA topics (default 5).
lemmatize: apply WordNet lemmatization before vectorization.
"""
def __init__(self, train_samples=None, n_topics: int = 5, lemmatize: bool = False):
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
self._lemmatize = lemmatize
self._n_topics = n_topics
if train_samples is None:
from ml.dataset import get_dataset
train_samples = get_dataset()
texts = [self._preprocess(s.text) for s in train_samples]
labels = [s.label for s in train_samples]
# TF-IDF part
self._tfidf = TfidfVectorizer(
ngram_range=(1, 2), max_features=1000, sublinear_tf=True
)
X_tfidf = self._tfidf.fit_transform(texts)
# LDA part (requires raw counts)
self._count_vec = CountVectorizer(max_features=500)
X_counts = self._count_vec.fit_transform(texts)
self._lda = LatentDirichletAllocation(
n_components=n_topics, random_state=42, max_iter=30, learning_method="batch"
)
X_lda = self._lda.fit_transform(X_counts) # dense (n_samples, n_topics)
        # Combine: sparse TF-IDF + dense LDA → sparse
X_combined = sp.hstack([X_tfidf, sp.csr_matrix(X_lda)])
self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
self._clf.fit(X_combined, labels)
logger.info(
"LDAFeatureClassifier trained on %d samples (n_topics=%d, lemmatize=%s)",
len(texts), n_topics, lemmatize,
)
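    # The combined feature matrix has up to 1000 TF-IDF columns plus n_topics
    # LDA columns; LogisticRegression is fit on the full stacked matrix.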
def _preprocess(self, text: str) -> str:
text = text.lower()
if self._lemmatize:
return " ".join(_lemmatize_tokens(text.split()))
return text
def predict(self, text: str) -> Layer1Result:
processed = self._preprocess(text)
X_tfidf = self._tfidf.transform([processed])
X_counts = self._count_vec.transform([processed])
X_lda = self._lda.transform(X_counts) # (1, n_topics)
X_combined = sp.hstack([X_tfidf, sp.csr_matrix(X_lda)])
pred_label = int(self._clf.predict(X_combined)[0])
proba = self._clf.predict_proba(X_combined)[0]
confidence = round(float(max(proba)) * 100, 1)
verdict = _LABELS[pred_label]
# Top TF-IDF features
feature_names = self._tfidf.get_feature_names_out()
tfidf_scores = X_tfidf.toarray()[0]
top_idx = tfidf_scores.argsort()[-4:][::-1]
triggered = [feature_names[i] for i in top_idx if tfidf_scores[i] > 0]
# Prepend dominant topic label
dominant_topic = int(X_lda[0].argmax()) + 1
triggered.insert(0, f"lda_topic_{dominant_topic}")
return Layer1Result(
verdict=verdict,
confidence=confidence,
triggered_features=triggered[:5],
)
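    # predict() returns triggered_features of the form
    # ["lda_topic_<k>", "<tfidf term>", ...] with at most 5 entries.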
def get_topic_info(self, text: str) -> dict:
"""
Infer the dominant LDA topic for a new text.
Returns label (human-assigned), top 6 defining words, and confidence
(the probability mass on the dominant topic, 0–100%).
"""
processed = self._preprocess(text)
X_counts = self._count_vec.transform([processed])
X_lda = self._lda.transform(X_counts) # (1, n_topics)
topic_idx = int(X_lda[0].argmax())
confidence = round(float(X_lda[0][topic_idx]) * 100, 1)
vocab = self._count_vec.get_feature_names_out()
topic_vec = self._lda.components_[topic_idx]
top_words = [vocab[i] for i in topic_vec.argsort()[-6:][::-1]]
label = TOPIC_LABELS.get(topic_idx + 1, f"Topic {topic_idx + 1}")
return {"label": label, "top_words": top_words, "confidence": confidence}
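# get_topic_info() return shape (illustrative values; actual contents depend
# on the fitted model and TOPIC_LABELS):
#     {"label": "Politics & Government", "top_words": ["...", ...], "confidence": 73.4}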
# ── Direct run ─────────────────────────────────────────────────────────────────
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="LDA topic analysis on PhilVerify dataset")
parser.add_argument("--n-topics", type=int, default=5)
parser.add_argument("--n-top-words", type=int, default=10)
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
train_samples, _ = get_split(seed=args.seed)
run_topic_analysis(train_samples, n_topics=args.n_topics, n_top_words=args.n_top_words)