"""
PhilVerify — LDA Topic Analysis + LDA Feature Classifier (Layer 1)

Two responsibilities:
  1. run_topic_analysis(samples, n_topics)
       Fits LDA on training texts, prints top-N words per topic and the
       dominant topic distribution per class (Credible / Unverified / Likely Fake).
       Call directly to explore what topics the model discovers.

  2. LDAFeatureClassifier
       Concatenates LDA topic distribution features with TF-IDF features
       and feeds the combined vector into LogisticRegression.
       Same predict() interface as TFIDFClassifier — slots directly into eval.py.

Usage:
    python -m ml.lda_analysis          # standalone topic analysis
    python -m ml.eval                  # compare LDAFeatureClassifier against others
"""
import logging

import numpy as np
import scipy.sparse as sp

from ml.dataset import LABEL_NAMES, get_split
from ml.naive_bayes_classifier import _lemmatize_tokens
from ml.tfidf_classifier import Layer1Result

logger = logging.getLogger(__name__)

# Verdict strings keyed by integer class label; LDAFeatureClassifier.predict()
# looks the predicted label up here to fill Layer1Result.verdict.
_LABELS: dict[int, str] = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

# Human-readable labels for each LDA topic (1-indexed).
# Assigned by inspecting run_topic_analysis() output on the 100-sample PH dataset.
TOPIC_LABELS: dict[int, str] = {
    1: "Health & Conspiracy",
    2: "Breaking News",
    3: "Crime & Law",
    4: "Politics & Government",
    5: "Filipino Current Events",
}


# ── Standalone topic analysis ──────────────────────────────────────────────────

def run_topic_analysis(
    samples,
    n_topics: int = 5,
    n_top_words: int = 10,
) -> None:
    """
    Fit LDA on samples and print:
      - Top-N words per topic
      - Mean topic distribution per class label (two strongest topics)

    Args:
        samples:     iterable of objects exposing ``.text`` and ``.label``.
        n_topics:    number of LDA components to fit (must be >= 1).
        n_top_words: how many words to print per topic (must be >= 1).

    Raises:
        ValueError: if ``samples`` is empty, or ``n_topics`` / ``n_top_words``
            is not positive.
    """
    # Validate before doing any work: an empty corpus otherwise fails inside
    # the vectorizer with an unrelated message, and n_top_words == 0 would
    # slice ``[-0:]`` and print the *entire* vocabulary for each topic.
    samples = list(samples)
    if not samples:
        raise ValueError("run_topic_analysis requires at least one sample")
    if n_topics < 1:
        raise ValueError(f"n_topics must be >= 1, got {n_topics}")
    if n_top_words < 1:
        raise ValueError(f"n_top_words must be >= 1, got {n_top_words}")

    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer

    texts = [s.text.lower() for s in samples]
    labels = [s.label for s in samples]

    # LDA requires raw counts (not TF-IDF)
    vectorizer = CountVectorizer(max_features=500, stop_words="english")
    X = vectorizer.fit_transform(texts)
    vocab = vectorizer.get_feature_names_out()

    lda = LatentDirichletAllocation(
        n_components=n_topics, random_state=42, max_iter=30, learning_method="batch"
    )
    doc_topics = lda.fit_transform(X)  # (n_samples, n_topics)

    print(f"\n{'='*62}")
    print(f" LDA Topic Analysis ({n_topics} topics, {len(samples)} samples)")
    print(f"{'='*62}")

    for i, topic_vec in enumerate(lda.components_):
        # Highest-weight words first.
        top_idx = topic_vec.argsort()[::-1][:n_top_words]
        top_words = [vocab[j] for j in top_idx]
        print(f"\n Topic {i + 1}: {', '.join(top_words)}")

    print("\n Per-class dominant topics:")
    for label_id, label_name in sorted(LABEL_NAMES.items()):
        class_idx = [i for i, label in enumerate(labels) if label == label_id]
        if not class_idx:
            # No sample of this class in the given split; nothing to average.
            continue
        mean_dist = doc_topics[class_idx].mean(axis=0)
        top2 = mean_dist.argsort()[-2:][::-1]
        topic_str = " ".join(f"T{d+1}:{mean_dist[d]:.2f}" for d in top2)
        print(f" {label_name:<14} {topic_str}")


# ── LDA Feature Classifier ─────────────────────────────────────────────────────

class LDAFeatureClassifier:
    """
    LDA topic distribution + TF-IDF features → LogisticRegression.

    Feature vector = sparse_hstack([tfidf_features, lda_topic_distribution])

    Args:
        train_samples: iterable of samples exposing ``.text`` and ``.label``.
                       If None, uses ``ml.dataset.get_dataset()``.
        n_topics:      number of LDA topics (default 5, must be >= 1).
        lemmatize:     lemmatize whitespace-split tokens before vectorization.

    Raises:
        ValueError: if ``n_topics`` is not positive or there are no training
            samples.
    """

    def __init__(self, train_samples=None, n_topics: int = 5, lemmatize: bool = False):
        if n_topics < 1:
            raise ValueError(f"n_topics must be >= 1, got {n_topics}")

        self._lemmatize = lemmatize
        self._n_topics = n_topics

        if train_samples is None:
            from ml.dataset import get_dataset
            train_samples = get_dataset()
        # Materialise once: the samples are walked twice (texts and labels).
        train_samples = list(train_samples)
        if not train_samples:
            raise ValueError("LDAFeatureClassifier requires at least one training sample")

        from sklearn.decomposition import LatentDirichletAllocation
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        from sklearn.linear_model import LogisticRegression

        texts = [self._preprocess(s.text) for s in train_samples]
        labels = [s.label for s in train_samples]

        # TF-IDF part
        self._tfidf = TfidfVectorizer(
            ngram_range=(1, 2), max_features=1000, sublinear_tf=True
        )
        X_tfidf = self._tfidf.fit_transform(texts)

        # LDA part (requires raw counts)
        self._count_vec = CountVectorizer(max_features=500)
        X_counts = self._count_vec.fit_transform(texts)
        self._lda = LatentDirichletAllocation(
            n_components=n_topics, random_state=42, max_iter=30, learning_method="batch"
        )
        X_lda = self._lda.fit_transform(X_counts)  # dense (n_samples, n_topics)

        # Combine: sparse TF-IDF + dense LDA → sparse
        X_combined = sp.hstack([X_tfidf, sp.csr_matrix(X_lda)])

        self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        self._clf.fit(X_combined, labels)
        logger.info(
            "LDAFeatureClassifier trained on %d samples (n_topics=%d, lemmatize=%s)",
            len(texts), n_topics, lemmatize,
        )

    def _preprocess(self, text: str) -> str:
        """Lower-case ``text`` and, when enabled, lemmatize its whitespace tokens."""
        text = text.lower()
        if self._lemmatize:
            return " ".join(_lemmatize_tokens(text.split()))
        return text

    def _topic_distribution(self, processed: str):
        """Return the LDA topic distribution, shape (1, n_topics), for preprocessed text."""
        X_counts = self._count_vec.transform([processed])
        return self._lda.transform(X_counts)

    def predict(self, text: str) -> "Layer1Result":
        """
        Classify ``text`` and report the features that drove the decision.

        Returns:
            Layer1Result with the verdict string, the winning class probability
            as a percentage rounded to one decimal, and up to five triggered
            features: the dominant LDA topic tag followed by the highest-scoring
            TF-IDF terms present in the text.
        """
        processed = self._preprocess(text)
        X_tfidf = self._tfidf.transform([processed])
        X_lda = self._topic_distribution(processed)  # (1, n_topics)
        X_combined = sp.hstack([X_tfidf, sp.csr_matrix(X_lda)])

        pred_label = int(self._clf.predict(X_combined)[0])
        proba = self._clf.predict_proba(X_combined)[0]
        confidence = round(float(proba.max()) * 100, 1)
        verdict = _LABELS[pred_label]

        # Top TF-IDF features (only those actually present in the text)
        feature_names = self._tfidf.get_feature_names_out()
        tfidf_scores = X_tfidf.toarray()[0]
        top_idx = tfidf_scores.argsort()[-4:][::-1]
        triggered = [feature_names[i] for i in top_idx if tfidf_scores[i] > 0]

        # Prepend dominant topic label
        dominant_topic = int(X_lda[0].argmax()) + 1
        triggered.insert(0, f"lda_topic_{dominant_topic}")

        return Layer1Result(
            verdict=verdict,
            confidence=confidence,
            triggered_features=triggered[:5],
        )

    def get_topic_info(self, text: str) -> dict:
        """
        Infer the dominant LDA topic for a new text.

        Returns a dict with:
          - ``label``: the human-assigned name from TOPIC_LABELS when this model
            has the same number of topics as TOPIC_LABELS, otherwise the generic
            ``"Topic N"`` (1-indexed).
          - ``top_words``: the 6 highest-weight vocabulary words of that topic.
          - ``confidence``: probability mass on the dominant topic, 0–100%.
        """
        X_lda = self._topic_distribution(self._preprocess(text))  # (1, n_topics)

        topic_idx = int(X_lda[0].argmax())
        confidence = round(float(X_lda[0][topic_idx]) * 100, 1)

        vocab = self._count_vec.get_feature_names_out()
        topic_vec = self._lda.components_[topic_idx]
        top_words = [vocab[i] for i in topic_vec.argsort()[-6:][::-1]]

        # The curated names describe a 5-topic fit; topic indices of a model
        # fitted with another topic count do not correspond to them, so fall
        # back to the generic name rather than attach a misleading label.
        # NOTE(review): run_topic_analysis() removes English stop words while
        # this classifier's CountVectorizer does not, so even a 5-topic model
        # may not line up with TOPIC_LABELS — confirm against a fresh fit.
        generic = f"Topic {topic_idx + 1}"
        if self._n_topics == len(TOPIC_LABELS):
            label = TOPIC_LABELS.get(topic_idx + 1, generic)
        else:
            label = generic
        return {"label": label, "top_words": top_words, "confidence": confidence}


# ── Direct run ─────────────────────────────────────────────────────────────────

def main() -> None:
    """Parse CLI options and run the topic analysis on the first part of get_split()."""
    import argparse

    parser = argparse.ArgumentParser(description="LDA topic analysis on PhilVerify dataset")
    parser.add_argument("--n-topics", type=int, default=5)
    parser.add_argument("--n-top-words", type=int, default=10)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    train_samples, _ = get_split(seed=args.seed)
    run_topic_analysis(train_samples, n_topics=args.n_topics, n_top_words=args.n_top_words)


if __name__ == "__main__":
    main()