Spaces:
Running
Running
| # dynamic_topic_quality.py | |
| import numpy as np | |
| import pandas as pd | |
| from gensim.corpora.dictionary import Dictionary | |
| from gensim.models.coherencemodel import CoherenceModel | |
| from backend.evaluation.CoherenceModel_ttc import CoherenceModel_ttc | |
| from typing import List, Dict | |
class TopicQualityAssessor:
    """
    Calculates various quality metrics for dynamic topic models from in-memory data.

    This class provides methods to compute:
    - Temporal Topic Coherence (TTC)
    - Temporal Topic Smoothness (TTS)
    - Temporal Topic Quality (TTQ)
    - Yearly Topic Coherence (TC)
    - Yearly Topic Diversity (TD)
    - Yearly Topic Quality (TQ)

    Every ``get_*`` method recomputes its DataFrame from scratch; nothing is
    cached on the instance. Use ``get_dtq_summary`` when several aggregate
    scores are needed at once.
    """

    def __init__(self, topics: List[List[List[str]]], train_texts: List[List[str]], topn: int, coherence_type: str):
        """
        Initializes the TopicQualityAssessor with data in memory.

        Args:
            topics (List[List[List[str]]]): A nested list of topics with structure (T, K, W),
                where T is time slices, K is topics, and W is words. Every topic must
                have the same number of words, otherwise the nested list does not
                form a 3-dimensional array and a ValueError is raised.
            train_texts (List[List[str]]): A list of tokenized documents for the reference corpus.
            topn (int): Number of top words per topic to consider for calculations.
            coherence_type (str): The type of coherence to calculate (e.g., 'c_npmi', 'c_v').

        Raises:
            ValueError: If ``topics`` does not convert to a 3-dimensional array.
        """
        # 1. Reference corpus and the gensim dictionary built from it.
        self.texts = train_texts
        self.dictionary = Dictionary(self.texts)

        # 2. Topics arrive as (T, K, W); the temporal metrics need each topic's
        #    evolution over time, so the internal layout is (K, T, W).
        topics_array_T_K_W = np.array(topics, dtype=object)
        if topics_array_T_K_W.ndim != 3:
            raise ValueError(f"Input 'topics' must be a 3-dimensional list/array. Got {topics_array_T_K_W.ndim} dimensions.")
        self.total_topics = topics_array_T_K_W.transpose(1, 0, 2)  # Shape: (K, T, W)

        # 3. Dimensions: K topic chains, T time slices.
        self.K, self.T, _ = self.total_topics.shape

        # 4. For every topic chain, pair each time slice with its successor.
        #    These consecutive pairs are what the smoothness metric compares.
        #    With T == 1 every chain gets an empty list of pairs.
        groups = [
            [
                [self.total_topics[k, t].tolist(), self.total_topics[k, t + 1].tolist()]
                for t in range(self.T - 1)
            ]
            for k in range(self.K)
        ]
        self.group_topics = np.array(groups, dtype=object)

        # 5. Per-time-slice view (T, K, W) used by the TC/TD calculation.
        self.yearly_topics = self.total_topics.transpose(1, 0, 2)

        # 6. Parameters shared by all metric computations.
        self.topn = topn
        self.coherence_type = coherence_type

    def _compute_coherence(self, topics: List[List[str]]) -> List[float]:
        """
        Returns one coherence score per topic using gensim's CoherenceModel.

        The model is built against the reference texts and dictionary stored on
        the instance, with the configured coherence type and ``topn``.
        """
        cm = CoherenceModel(
            topics=topics, texts=self.texts, dictionary=self.dictionary,
            coherence=self.coherence_type, topn=self.topn
        )
        return cm.get_coherence_per_topic()

    def _compute_coherence_ttc(self, topics: List[List[str]]) -> List[float]:
        """
        Returns one coherence score per topic using the project's CoherenceModel_ttc.

        Called with the same arguments as ``_compute_coherence``. How
        CoherenceModel_ttc differs from gensim's model is defined in
        backend.evaluation and is not visible from this class.
        """
        cm = CoherenceModel_ttc(
            topics=topics, texts=self.texts, dictionary=self.dictionary,
            coherence=self.coherence_type, topn=self.topn
        )
        return cm.get_coherence_per_topic()

    def _topic_smoothness(self, topics: List[List[str]]) -> float:
        """
        Returns the mean pairwise top-word overlap among ``topics``.

        For each topic, the overlap with every other topic is the number of
        shared words among their first ``topn`` words divided by ``topn``. These
        overlaps are averaged per topic and then across topics. The result is
        1.0 when all top-n sets are identical and 0.0 when they are disjoint.

        Args:
            topics: A sequence of word sequences (lists or 1-D arrays).

        Returns:
            The average overlap as a float. With zero or one topic there is
            nothing to compare against and 1.0 is returned by convention.
        """
        n_topics = len(topics)
        if n_topics <= 1:
            return 1.0

        # Build each topic's top-n word set once instead of once per pair.
        top_sets = [set(topic[:self.topn]) for topic in topics]

        scores = []
        for i, base_set in enumerate(top_sets):
            overlaps = [
                len(base_set & other_set) / self.topn
                for j, other_set in enumerate(top_sets)
                if j != i
            ]
            scores.append(sum(overlaps) / len(overlaps))
        return float(sum(scores) / n_topics)

    def get_ttq_dataframe(self) -> pd.DataFrame:
        """
        Computes and returns a DataFrame with detailed TTQ metrics per topic chain.

        Columns:
            topic_idx: Index of the topic chain (0..K-1).
            temporal_coherence: Per-entry coherence scores of the chain.
            temporal_smoothness: Smoothness of each consecutive time-slice pair.
            avg_temporal_coherence: Mean of ``temporal_coherence``.
            avg_temporal_smoothness: Mean of ``temporal_smoothness`` (NaN when
                there is only one time slice and therefore no pairs).
            ttq_product: Product of the two averages.
        """
        all_coh_scores, avg_coh_scores = [], []
        all_smooth_scores, avg_smooth_scores = [], []
        for k in range(self.K):
            coh_per_topic = self._compute_coherence_ttc(self.total_topics[k].tolist())
            all_coh_scores.append(coh_per_topic)
            avg_coh_scores.append(float(np.mean(coh_per_topic)))

            pair_scores = [self._topic_smoothness(pair) for pair in self.group_topics[k]]
            all_smooth_scores.append(pair_scores)
            # np.mean([]) yields NaN but also emits a RuntimeWarning; return the
            # same NaN explicitly when there are no consecutive pairs.
            avg_smooth_scores.append(float(np.mean(pair_scores)) if pair_scores else float('nan'))

        df = pd.DataFrame({
            'topic_idx': list(range(self.K)),
            'temporal_coherence': all_coh_scores,
            'temporal_smoothness': all_smooth_scores,
            'avg_temporal_coherence': avg_coh_scores,
            'avg_temporal_smoothness': avg_smooth_scores
        })
        df['ttq_product'] = df['avg_temporal_coherence'] * df['avg_temporal_smoothness']
        return df

    def get_tq_dataframe(self) -> pd.DataFrame:
        """
        Computes and returns a DataFrame with detailed TQ metrics per time slice.

        Columns:
            year: Index of the time slice (0..T-1).
            all_coherence: Coherence score of every topic in the slice.
            avg_coherence: Mean of ``all_coherence``.
            diversity: One minus the mean pairwise top-word overlap of the
                slice's topics.
            tq_product: Product of ``avg_coherence`` and ``diversity``.
        """
        all_coh, avg_coh, div = [], [], []
        for t in range(self.T):
            yearly_t_topics = self.yearly_topics[t].tolist()
            coh_per_topic = self._compute_coherence(yearly_t_topics)
            all_coh.append(coh_per_topic)
            avg_coh.append(float(np.mean(coh_per_topic)))
            # Diversity is the complement of the overlap between topics.
            div.append(1 - self._topic_smoothness(yearly_t_topics))

        df = pd.DataFrame({
            'year': list(range(self.T)),
            'all_coherence': all_coh,
            'avg_coherence': avg_coh,
            'diversity': div
        })
        df['tq_product'] = df['avg_coherence'] * df['diversity']
        return df

    def get_ttc_score(self) -> float:
        """Calculates the overall Temporal Topic Coherence (TTC)."""
        ttq_df = self.get_ttq_dataframe()
        return ttq_df['avg_temporal_coherence'].mean()

    def get_tts_score(self) -> float:
        """Calculates the overall Temporal Topic Smoothness (TTS)."""
        ttq_df = self.get_ttq_dataframe()
        return ttq_df['avg_temporal_smoothness'].mean()

    def get_ttq_score(self) -> float:
        """Calculates the overall Temporal Topic Quality (TTQ)."""
        ttq_df = self.get_ttq_dataframe()
        return ttq_df['ttq_product'].mean()

    def get_tc_score(self) -> float:
        """Calculates the overall yearly Topic Coherence (TC)."""
        tq_df = self.get_tq_dataframe()
        return tq_df['avg_coherence'].mean()

    def get_td_score(self) -> float:
        """Calculates the overall yearly Topic Diversity (TD)."""
        tq_df = self.get_tq_dataframe()
        return tq_df['diversity'].mean()

    def get_tq_score(self) -> float:
        """Calculates the overall yearly Topic Quality (TQ)."""
        tq_df = self.get_tq_dataframe()
        return tq_df['tq_product'].mean()

    def get_dtq_summary(self) -> Dict[str, float]:
        """
        Computes all dynamic topic quality metrics and returns them in a dictionary.

        Each DataFrame is built exactly once, so this is cheaper than calling
        the six individual ``get_*_score`` methods.

        Returns:
            A dict with keys 'TTC', 'TTS', 'TTQ', 'TC', 'TD' and 'TQ', each the
            mean of the corresponding DataFrame column.
        """
        ttq_df = self.get_ttq_dataframe()
        tq_df = self.get_tq_dataframe()
        summary = {
            'TTC': ttq_df['avg_temporal_coherence'].mean(),
            'TTS': ttq_df['avg_temporal_smoothness'].mean(),
            'TTQ': ttq_df['ttq_product'].mean(),
            'TC': tq_df['avg_coherence'].mean(),
            'TD': tq_df['diversity'].mean(),
            'TQ': tq_df['tq_product'].mean()
        }
        return summary