AdhyaSuman's picture
Initial commit with Git LFS for large files
11c72a2
# dynamic_topic_quality.py
import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from backend.evaluation.CoherenceModel_ttc import CoherenceModel_ttc
from typing import List, Dict
class TopicQualityAssessor:
    """
    Calculates various quality metrics for dynamic topic models from in-memory data.

    This class provides methods to compute:
    - Temporal Topic Coherence (TTC)
    - Temporal Topic Smoothness (TTS)
    - Temporal Topic Quality (TTQ)
    - Yearly Topic Coherence (TC)
    - Yearly Topic Diversity (TD)
    - Yearly Topic Quality (TQ)

    Note:
        Nothing is cached: every ``get_*_score`` method rebuilds its underlying
        DataFrame (and therefore re-runs the coherence models). Use
        ``get_dtq_summary`` or keep the DataFrame when several scores are needed.
    """

    def __init__(self, topics: List[List[List[str]]], train_texts: List[List[str]], topn: int, coherence_type: str):
        """
        Initializes the TopicQualityAssessor with data in memory.

        Args:
            topics (List[List[List[str]]]): A nested list of topics with structure (T, K, W),
                where T is time slices, K is topics, and W is words.
            train_texts (List[List[str]]): A list of tokenized documents for the reference corpus.
            topn (int): Number of top words per topic to consider for calculations.
            coherence_type (str): The type of coherence to calculate (e.g., 'c_npmi', 'c_v').

        Raises:
            ValueError: If 'topn' is not positive, or if 'topics' does not form a
                3-dimensional array (e.g. because it is ragged).
        """
        # Overlap ratios are divided by topn, so a non-positive value can never
        # produce a meaningful score; fail early instead of deep inside a metric.
        if topn <= 0:
            raise ValueError(f"'topn' must be a positive integer. Got {topn}.")

        # 1. Set texts and dictionary
        self.texts = train_texts
        self.dictionary = Dictionary(self.texts)

        # 2. Process topics
        # User provides topics as (T, K, W) -> List[timestamps][topics][words]
        # Internal representation for temporal evolution is (K, T, W)
        topics_array_T_K_W = np.array(topics, dtype=object)
        if topics_array_T_K_W.ndim != 3:
            raise ValueError(f"Input 'topics' must be a 3-dimensional list/array. Got {topics_array_T_K_W.ndim} dimensions.")
        self.total_topics = topics_array_T_K_W.transpose(1, 0, 2)  # Shape: (K, T, W)

        # 3. Get dimensions
        self.K, self.T, _ = self.total_topics.shape

        # 4. Create topic groups for smoothness calculation: for every topic chain,
        #    the list of [topic at t, topic at t+1] pairs (empty when T == 1).
        groups = [
            [
                [self.total_topics[k, t].tolist(), self.total_topics[k, t + 1].tolist()]
                for t in range(self.T - 1)
            ]
            for k in range(self.K)
        ]
        self.group_topics = np.array(groups, dtype=object)

        # 5. Create yearly topics (T, K, W) for TC/TD calculation
        self.yearly_topics = self.total_topics.transpose(1, 0, 2)

        # 6. Set parameters
        self.topn = topn
        self.coherence_type = coherence_type

    def _compute_coherence(self, topics: List[List[str]]) -> List[float]:
        """
        Scores each topic with gensim's CoherenceModel against the reference texts.

        Args:
            topics (List[List[str]]): Topics given as lists of words.

        Returns:
            List[float]: The result of ``get_coherence_per_topic()`` for 'topics'.
        """
        cm = CoherenceModel(
            topics=topics, texts=self.texts, dictionary=self.dictionary,
            coherence=self.coherence_type, topn=self.topn
        )
        return cm.get_coherence_per_topic()

    def _compute_coherence_ttc(self, topics: List[List[str]]) -> List[float]:
        """
        Scores each topic with the project's CoherenceModel_ttc variant.

        Called with the same arguments as ``_compute_coherence``; how the variant
        differs is defined in backend.evaluation.CoherenceModel_ttc, not here.

        Args:
            topics (List[List[str]]): Topics given as lists of words.

        Returns:
            List[float]: The result of ``get_coherence_per_topic()`` for 'topics'.
        """
        cm = CoherenceModel_ttc(
            topics=topics, texts=self.texts, dictionary=self.dictionary,
            coherence=self.coherence_type, topn=self.topn
        )
        return cm.get_coherence_per_topic()

    def _topic_smoothness(self, topics: List[List[str]]) -> float:
        """
        Returns the mean pairwise top-word overlap among the given topics.

        For every topic, the number of words its top-`topn` set shares with each
        other topic's top-`topn` set is divided by `topn` and averaged over the
        other topics; the result is the mean of these per-topic averages.

        Args:
            topics: Topics given as sequences of words (lists or 1-D arrays).

        Returns:
            float: The overlap score, or 1.0 when fewer than two topics are given.
        """
        num_topics = len(topics)
        if num_topics <= 1:
            return 1.0  # Or 0.0, depending on definition. A single topic has no other topic to be dissimilar to.

        # Build each top-word set once instead of once per compared pair.
        word_sets = [set(topic[:self.topn]) for topic in topics]

        scores = []
        for i, base_set in enumerate(word_sets):
            overlaps = [
                len(base_set & other_set) / self.topn
                for j, other_set in enumerate(word_sets)
                if j != i
            ]
            scores.append(sum(overlaps) / len(overlaps))
        return float(sum(scores) / num_topics)

    def get_ttq_dataframe(self) -> pd.DataFrame:
        """
        Computes and returns a DataFrame with detailed TTQ metrics per topic chain.

        Returns:
            pd.DataFrame: One row per topic with the columns 'topic_idx',
            'temporal_coherence' (list of per-time-slice scores),
            'temporal_smoothness' (list of scores for consecutive time-slice pairs),
            'avg_temporal_coherence', 'avg_temporal_smoothness' and 'ttq_product'
            (product of the two averages). With a single time slice there are no
            pairs, so the smoothness average and the product are NaN.
        """
        all_coh_scores, avg_coh_scores = [], []
        all_smooth_scores, avg_smooth_scores = [], []
        for k in range(self.K):
            coh_per_topic = self._compute_coherence_ttc(self.total_topics[k].tolist())
            all_coh_scores.append(coh_per_topic)
            avg_coh_scores.append(float(np.mean(coh_per_topic)))

            pair_scores = [self._topic_smoothness(pair) for pair in self.group_topics[k]]
            all_smooth_scores.append(pair_scores)
            # np.mean([]) also yields NaN but emits a RuntimeWarning; be explicit.
            avg_smooth_scores.append(float(np.mean(pair_scores)) if pair_scores else float('nan'))

        df = pd.DataFrame({
            'topic_idx': list(range(self.K)),
            'temporal_coherence': all_coh_scores,
            'temporal_smoothness': all_smooth_scores,
            'avg_temporal_coherence': avg_coh_scores,
            'avg_temporal_smoothness': avg_smooth_scores
        })
        df['ttq_product'] = df['avg_temporal_coherence'] * df['avg_temporal_smoothness']
        return df

    def get_tq_dataframe(self) -> pd.DataFrame:
        """
        Computes and returns a DataFrame with detailed TQ metrics per time slice.

        Returns:
            pd.DataFrame: One row per time slice with the columns 'year' (the
            time-slice index), 'all_coherence' (list of per-topic scores),
            'avg_coherence', 'diversity' (1 minus the top-word overlap among the
            slice's topics) and 'tq_product' (avg_coherence * diversity).
        """
        all_coh, avg_coh, div = [], [], []
        for t in range(self.T):
            yearly_t_topics = self.yearly_topics[t].tolist()
            coh_per_topic = self._compute_coherence(yearly_t_topics)
            all_coh.append(coh_per_topic)
            avg_coh.append(float(np.mean(coh_per_topic)))
            div.append(1 - self._topic_smoothness(yearly_t_topics))

        df = pd.DataFrame({
            'year': list(range(self.T)),
            'all_coherence': all_coh,
            'avg_coherence': avg_coh,
            'diversity': div
        })
        df['tq_product'] = df['avg_coherence'] * df['diversity']
        return df

    def get_ttc_score(self) -> float:
        """Calculates the overall Temporal Topic Coherence (TTC)."""
        ttq_df = self.get_ttq_dataframe()
        return ttq_df['avg_temporal_coherence'].mean()

    def get_tts_score(self) -> float:
        """Calculates the overall Temporal Topic Smoothness (TTS)."""
        ttq_df = self.get_ttq_dataframe()
        return ttq_df['avg_temporal_smoothness'].mean()

    def get_ttq_score(self) -> float:
        """Calculates the overall Temporal Topic Quality (TTQ)."""
        ttq_df = self.get_ttq_dataframe()
        return ttq_df['ttq_product'].mean()

    def get_tc_score(self) -> float:
        """Calculates the overall yearly Topic Coherence (TC)."""
        tq_df = self.get_tq_dataframe()
        return tq_df['avg_coherence'].mean()

    def get_td_score(self) -> float:
        """Calculates the overall yearly Topic Diversity (TD)."""
        tq_df = self.get_tq_dataframe()
        return tq_df['diversity'].mean()

    def get_tq_score(self) -> float:
        """Calculates the overall yearly Topic Quality (TQ)."""
        tq_df = self.get_tq_dataframe()
        return tq_df['tq_product'].mean()

    def get_dtq_summary(self) -> Dict[str, float]:
        """
        Computes all dynamic topic quality metrics and returns them in a dictionary.

        Each DataFrame is built once and reused for its three scores.

        Returns:
            Dict[str, float]: The keys 'TTC', 'TTS', 'TTQ', 'TC', 'TD' and 'TQ',
            each mapped to the mean of the corresponding DataFrame column.
        """
        ttq_df = self.get_ttq_dataframe()
        tq_df = self.get_tq_dataframe()
        summary = {
            'TTC': ttq_df['avg_temporal_coherence'].mean(),
            'TTS': ttq_df['avg_temporal_smoothness'].mean(),
            'TTQ': ttq_df['ttq_product'].mean(),
            'TC': tq_df['avg_coherence'].mean(),
            'TD': tq_df['diversity'].mean(),
            'TQ': tq_df['tq_product'].mean()
        }
        return summary