# dynamic_topic_quality.py
import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from backend.evaluation.CoherenceModel_ttc import CoherenceModel_ttc
from typing import List, Dict
class TopicQualityAssessor:
    """
    Calculates various quality metrics for dynamic topic models from in-memory data.

    This class provides methods to compute:
    - Temporal Topic Coherence (TTC)
    - Temporal Topic Smoothness (TTS)
    - Temporal Topic Quality (TTQ)
    - Yearly Topic Coherence (TC)
    - Yearly Topic Diversity (TD)
    - Yearly Topic Quality (TQ)

    Every ``get_*_score`` method rebuilds its underlying DataFrame (and therefore
    re-runs the coherence computation) on each call. When several scores are
    needed, prefer ``get_dtq_summary``, which builds each DataFrame only once.
    """

    def __init__(self, topics: List[List[List[str]]], train_texts: List[List[str]], topn: int, coherence_type: str):
        """
        Initializes the TopicQualityAssessor with data in memory.

        Args:
            topics (List[List[List[str]]]): A nested list of topics with structure (T, K, W),
                where T is time slices, K is topics, and W is words. Every topic must
                list the same number of words, otherwise the input cannot be read as a
                3-dimensional array.
            train_texts (List[List[str]]): A list of tokenized documents for the reference corpus.
            topn (int): Number of top words per topic to consider for calculations.
            coherence_type (str): The type of coherence to calculate (e.g., 'c_npmi', 'c_v').

        Raises:
            ValueError: If ``topn`` is not positive, or if ``topics`` is not 3-dimensional.
        """
        # topn is the denominator of the overlap ratio in _topic_smoothness, so a
        # non-positive value would divide by zero or produce meaningless scores.
        if topn <= 0:
            raise ValueError(f"'topn' must be a positive integer. Got {topn}.")

        # 1. Set texts and dictionary
        self.texts = train_texts
        self.dictionary = Dictionary(self.texts)

        # 2. Process topics
        # User provides topics as (T, K, W) -> List[timestamps][topics][words]
        # Internal representation for temporal evolution is (K, T, W)
        topics_array_T_K_W = np.array(topics, dtype=object)
        if topics_array_T_K_W.ndim != 3:
            raise ValueError(f"Input 'topics' must be a 3-dimensional list/array. Got {topics_array_T_K_W.ndim} dimensions.")
        self.total_topics = topics_array_T_K_W.transpose(1, 0, 2)  # Shape: (K, T, W)

        # 3. Get dimensions
        self.K, self.T, _ = self.total_topics.shape

        # 4. Create topic groups for smoothness calculation: for every topic k, the
        #    pairs (slice t, slice t+1) of its word lists. Empty per topic when T == 1.
        groups = [
            [
                [self.total_topics[k, t].tolist(), self.total_topics[k, t + 1].tolist()]
                for t in range(self.T - 1)
            ]
            for k in range(self.K)
        ]
        self.group_topics = np.array(groups, dtype=object)

        # 5. Create yearly topics (T, K, W) for TC/TD calculation
        self.yearly_topics = self.total_topics.transpose(1, 0, 2)

        # 6. Set parameters
        self.topn = topn
        self.coherence_type = coherence_type

    @staticmethod
    def _mean_or_nan(values) -> float:
        """Return the arithmetic mean of ``values`` as a float, or NaN when it is empty.

        ``np.mean`` of an empty sequence also yields NaN but emits a RuntimeWarning;
        handling the empty case explicitly keeps the result and drops the warning.
        """
        if len(values) == 0:
            return float("nan")
        return float(np.mean(values))

    def _compute_coherence(self, topics: List[List[str]]) -> List[float]:
        """Return one coherence score per topic using gensim's ``CoherenceModel``.

        The model is built from the reference texts, the dictionary derived from
        them, the configured coherence type and ``topn``.
        """
        cm = CoherenceModel(
            topics=topics, texts=self.texts, dictionary=self.dictionary,
            coherence=self.coherence_type, topn=self.topn
        )
        return cm.get_coherence_per_topic()

    def _compute_coherence_ttc(self, topics: List[List[str]]) -> List[float]:
        """Return one coherence score per topic using the project's ``CoherenceModel_ttc``.

        Called with the same arguments as ``_compute_coherence``; how the temporal
        variant differs internally is defined in ``backend.evaluation``.
        """
        cm = CoherenceModel_ttc(
            topics=topics, texts=self.texts, dictionary=self.dictionary,
            coherence=self.coherence_type, topn=self.topn
        )
        return cm.get_coherence_per_topic()

    def _topic_smoothness(self, topics: List[List[str]]) -> float:
        """Return the mean pairwise top-word overlap among ``topics``.

        For every topic, the overlap with each other topic is the number of shared
        words among their first ``topn`` words divided by ``topn``; these are
        averaged per topic and then across topics. The result is 1.0 for identical
        top-word sets and 0.0 for disjoint ones.

        Args:
            topics: Word lists (or array rows) to compare with each other.

        Returns:
            The average overlap, or 1.0 when fewer than two topics are given.
        """
        K = len(topics)
        if K <= 1:
            # A single topic has no other topic to be dissimilar to.
            return 1.0

        # Build each top-word set once instead of once per comparison.
        top_word_sets = [set(topic[:self.topn]) for topic in topics]

        scores = []
        for i, base_set in enumerate(top_word_sets):
            overlaps = [
                len(base_set & other_set) / self.topn
                for j, other_set in enumerate(top_word_sets)
                if j != i
            ]
            scores.append(sum(overlaps) / len(overlaps))
        return float(sum(scores) / K)

    def get_ttq_dataframe(self) -> pd.DataFrame:
        """Computes and returns a DataFrame with detailed TTQ metrics per topic chain.

        Columns: ``topic_idx``, ``temporal_coherence`` (per-slice scores),
        ``temporal_smoothness`` (one score per consecutive slice pair),
        ``avg_temporal_coherence``, ``avg_temporal_smoothness`` and ``ttq_product``
        (product of the two averages). With a single time slice there are no
        consecutive pairs, so the smoothness average and the product are NaN.
        """
        all_coh_scores, avg_coh_scores = [], []
        all_smooth_scores, avg_smooth_scores = [], []
        for k in range(self.K):
            # Coherence of topic k's word lists across all time slices.
            coh_per_topic = self._compute_coherence_ttc(self.total_topics[k].tolist())
            all_coh_scores.append(coh_per_topic)
            avg_coh_scores.append(self._mean_or_nan(coh_per_topic))

            # Overlap between each pair of consecutive slices of topic k.
            pair_scores = [self._topic_smoothness(pair) for pair in self.group_topics[k]]
            all_smooth_scores.append(pair_scores)
            avg_smooth_scores.append(self._mean_or_nan(pair_scores))

        df = pd.DataFrame({
            'topic_idx': list(range(self.K)),
            'temporal_coherence': all_coh_scores,
            'temporal_smoothness': all_smooth_scores,
            'avg_temporal_coherence': avg_coh_scores,
            'avg_temporal_smoothness': avg_smooth_scores
        })
        df['ttq_product'] = df['avg_temporal_coherence'] * df['avg_temporal_smoothness']
        return df

    def get_tq_dataframe(self) -> pd.DataFrame:
        """Computes and returns a DataFrame with detailed TQ metrics per time slice.

        Columns: ``year`` (the time-slice index), ``all_coherence`` (per-topic
        scores), ``avg_coherence``, ``diversity`` (1 minus the mean pairwise
        top-word overlap of the slice's topics) and ``tq_product``.
        """
        all_coh, avg_coh, div = [], [], []
        for t in range(self.T):
            yearly_t_topics = self.yearly_topics[t].tolist()
            coh_per_topic = self._compute_coherence(yearly_t_topics)
            all_coh.append(coh_per_topic)
            avg_coh.append(self._mean_or_nan(coh_per_topic))
            div.append(1 - self._topic_smoothness(yearly_t_topics))

        df = pd.DataFrame({
            'year': list(range(self.T)),
            'all_coherence': all_coh,
            'avg_coherence': avg_coh,
            'diversity': div
        })
        df['tq_product'] = df['avg_coherence'] * df['diversity']
        return df

    def get_ttc_score(self) -> float:
        """Calculates the overall Temporal Topic Coherence (TTC)."""
        ttq_df = self.get_ttq_dataframe()
        return ttq_df['avg_temporal_coherence'].mean()

    def get_tts_score(self) -> float:
        """Calculates the overall Temporal Topic Smoothness (TTS)."""
        ttq_df = self.get_ttq_dataframe()
        return ttq_df['avg_temporal_smoothness'].mean()

    def get_ttq_score(self) -> float:
        """Calculates the overall Temporal Topic Quality (TTQ)."""
        ttq_df = self.get_ttq_dataframe()
        return ttq_df['ttq_product'].mean()

    def get_tc_score(self) -> float:
        """Calculates the overall yearly Topic Coherence (TC)."""
        tq_df = self.get_tq_dataframe()
        return tq_df['avg_coherence'].mean()

    def get_td_score(self) -> float:
        """Calculates the overall yearly Topic Diversity (TD)."""
        tq_df = self.get_tq_dataframe()
        return tq_df['diversity'].mean()

    def get_tq_score(self) -> float:
        """Calculates the overall yearly Topic Quality (TQ)."""
        tq_df = self.get_tq_dataframe()
        return tq_df['tq_product'].mean()

    def get_dtq_summary(self) -> Dict[str, float]:
        """
        Computes all dynamic topic quality metrics and returns them in a dictionary.

        Each of the two DataFrames is built exactly once. The keys are
        'TTC', 'TTS', 'TTQ', 'TC', 'TD' and 'TQ'; every value is the mean of the
        corresponding DataFrame column.
        """
        ttq_df = self.get_ttq_dataframe()
        tq_df = self.get_tq_dataframe()
        summary = {
            'TTC': ttq_df['avg_temporal_coherence'].mean(),
            'TTS': ttq_df['avg_temporal_smoothness'].mean(),
            'TTQ': ttq_df['ttq_product'].mean(),
            'TC': tq_df['avg_coherence'].mean(),
            'TD': tq_df['diversity'].mean(),
            'TQ': tq_df['tq_product'].mean()
        }
        return summary