# philverify-api/ml/lda_analysis.py
"""
PhilVerify — LDA Topic Analysis + LDA Feature Classifier (Layer 1)
Two responsibilities:
1. run_topic_analysis(samples, n_topics, n_top_words)
   Fits LDA on the training texts, prints the top-N words per topic and the
   mean topic distribution per class (Credible / Unverified / Likely Fake).
Call directly to explore what topics the model discovers.
2. LDAFeatureClassifier
Concatenates LDA topic distribution features with TF-IDF features and feeds
the combined vector into LogisticRegression. Same predict() interface as
TFIDFClassifier — slots directly into eval.py.
Usage:
python -m ml.lda_analysis # standalone topic analysis
python -m ml.eval # compare LDAFeatureClassifier against others
"""
import logging
import numpy as np
import scipy.sparse as sp
from ml.dataset import LABEL_NAMES, get_split
from ml.naive_bayes_classifier import _lemmatize_tokens
from ml.tfidf_classifier import Layer1Result
logger = logging.getLogger(__name__)
_LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
# Human-readable labels for each LDA topic (1-indexed).
# Assigned by inspecting run_topic_analysis() output on the 100-sample PH dataset.
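# NOTE: these assignments hold only for the default fit (n_topics=5,
# random_state=42); refitting with different settings requires re-inspection.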
TOPIC_LABELS: dict[int, str] = {
1: "Health & Conspiracy",
2: "Breaking News",
3: "Crime & Law",
4: "Politics & Government",
5: "Filipino Current Events",
}
# ── Standalone topic analysis ──────────────────────────────────────────────────
def run_topic_analysis(
samples,
n_topics: int = 5,
n_top_words: int = 10,
) -> None:
"""
Fit LDA on samples and print:
- Top-N words per topic
- Mean topic distribution per class label
"""
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
texts = [s.text.lower() for s in samples]
labels = [s.label for s in samples]
# LDA requires raw counts (not TF-IDF)
vectorizer = CountVectorizer(max_features=500, stop_words="english")
X = vectorizer.fit_transform(texts)
vocab = vectorizer.get_feature_names_out()
lda = LatentDirichletAllocation(
n_components=n_topics, random_state=42, max_iter=30, learning_method="batch"
)
doc_topics = lda.fit_transform(X) # (n_samples, n_topics)
print(f"\n{'='*62}")
print(f" LDA Topic Analysis ({n_topics} topics, {len(samples)} samples)")
print(f"{'='*62}")
for i, topic_vec in enumerate(lda.components_):
top_idx = topic_vec.argsort()[-n_top_words:][::-1]
top_words = [vocab[j] for j in top_idx]
print(f"\n Topic {i + 1}: {', '.join(top_words)}")
    print("\n Per-class dominant topics:")
for label_id, label_name in sorted(LABEL_NAMES.items()):
class_idx = [i for i, l in enumerate(labels) if l == label_id]
if not class_idx:
continue
mean_dist = doc_topics[class_idx].mean(axis=0)
top2 = mean_dist.argsort()[-2:][::-1]
topic_str = " ".join(f"T{d+1}:{mean_dist[d]:.2f}" for d in top2)
print(f" {label_name:<14} {topic_str}")
# ── LDA Feature Classifier ─────────────────────────────────────────────────────
class LDAFeatureClassifier:
"""
    LDA topic distribution + TF-IDF features → LogisticRegression.
Feature vector = sparse_hstack([tfidf_features, lda_topic_distribution])
Args:
train_samples: list[Sample]. If None, uses the full 100-sample dataset.
n_topics: number of LDA topics (default 5).
lemmatize: apply WordNet lemmatization before vectorization.
"""
def __init__(self, train_samples=None, n_topics: int = 5, lemmatize: bool = False):
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
self._lemmatize = lemmatize
self._n_topics = n_topics
if train_samples is None:
from ml.dataset import get_dataset
train_samples = get_dataset()
texts = [self._preprocess(s.text) for s in train_samples]
labels = [s.label for s in train_samples]
# TF-IDF part
self._tfidf = TfidfVectorizer(
ngram_range=(1, 2), max_features=1000, sublinear_tf=True
)
X_tfidf = self._tfidf.fit_transform(texts)
# LDA part (requires raw counts)
self._count_vec = CountVectorizer(max_features=500)
X_counts = self._count_vec.fit_transform(texts)
self._lda = LatentDirichletAllocation(
n_components=n_topics, random_state=42, max_iter=30, learning_method="batch"
)
X_lda = self._lda.fit_transform(X_counts) # dense (n_samples, n_topics)
        # Combine: sparse TF-IDF + dense LDA → sparse
X_combined = sp.hstack([X_tfidf, sp.csr_matrix(X_lda)])
self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
self._clf.fit(X_combined, labels)
logger.info(
"LDAFeatureClassifier trained on %d samples (n_topics=%d, lemmatize=%s)",
len(texts), n_topics, lemmatize,
)
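    # The combined feature matrix has up to 1000 TF-IDF columns plus n_topics
    # LDA columns; LogisticRegression is fit on the full stacked matrix.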
def _preprocess(self, text: str) -> str:
text = text.lower()
if self._lemmatize:
return " ".join(_lemmatize_tokens(text.split()))
return text
def predict(self, text: str) -> Layer1Result:
processed = self._preprocess(text)
X_tfidf = self._tfidf.transform([processed])
X_counts = self._count_vec.transform([processed])
X_lda = self._lda.transform(X_counts) # (1, n_topics)
X_combined = sp.hstack([X_tfidf, sp.csr_matrix(X_lda)])
pred_label = int(self._clf.predict(X_combined)[0])
proba = self._clf.predict_proba(X_combined)[0]
confidence = round(float(max(proba)) * 100, 1)
verdict = _LABELS[pred_label]
# Top TF-IDF features
feature_names = self._tfidf.get_feature_names_out()
tfidf_scores = X_tfidf.toarray()[0]
top_idx = tfidf_scores.argsort()[-4:][::-1]
triggered = [feature_names[i] for i in top_idx if tfidf_scores[i] > 0]
# Prepend dominant topic label
dominant_topic = int(X_lda[0].argmax()) + 1
triggered.insert(0, f"lda_topic_{dominant_topic}")
return Layer1Result(
verdict=verdict,
confidence=confidence,
triggered_features=triggered[:5],
)
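    # predict() returns triggered_features of the form
    # ["lda_topic_<k>", "<tfidf term>", ...] with at most 5 entries.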
def get_topic_info(self, text: str) -> dict:
"""
Infer the dominant LDA topic for a new text.
Returns label (human-assigned), top 6 defining words, and confidence
(the probability mass on the dominant topic, 0–100%).
"""
processed = self._preprocess(text)
X_counts = self._count_vec.transform([processed])
X_lda = self._lda.transform(X_counts) # (1, n_topics)
topic_idx = int(X_lda[0].argmax())
confidence = round(float(X_lda[0][topic_idx]) * 100, 1)
vocab = self._count_vec.get_feature_names_out()
topic_vec = self._lda.components_[topic_idx]
top_words = [vocab[i] for i in topic_vec.argsort()[-6:][::-1]]
label = TOPIC_LABELS.get(topic_idx + 1, f"Topic {topic_idx + 1}")
return {"label": label, "top_words": top_words, "confidence": confidence}
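# get_topic_info() return shape (illustrative values; actual contents depend
# on the fitted model and TOPIC_LABELS):
#     {"label": "Politics & Government", "top_words": ["...", ...], "confidence": 73.4}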
# ── Direct run ─────────────────────────────────────────────────────────────────
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="LDA topic analysis on PhilVerify dataset")
parser.add_argument("--n-topics", type=int, default=5)
parser.add_argument("--n-top-words", type=int, default=10)
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
train_samples, _ = get_split(seed=args.seed)
run_topic_analysis(train_samples, n_topics=args.n_topics, n_top_words=args.n_top_words)