Spaces:

AdhyaSuman
/

DTECT

Running

App Files Files Community

DTECT / backend /evaluation /eval.py

AdhyaSuman

Initial commit with Git LFS for large files

11c72a2 9 months ago

raw

history blame contribute delete

7.62 kB

	# eval.py - dynamic topic quality metrics (formerly dynamic_topic_quality.py)
	import numpy as np
	import pandas as pd
	from gensim.corpora.dictionary import Dictionary
	from gensim.models.coherencemodel import CoherenceModel
	from backend.evaluation.CoherenceModel_ttc import CoherenceModel_ttc
	from typing import List, Dict

	class TopicQualityAssessor:
	    """
	    Calculates various quality metrics for dynamic topic models from in-memory data.

	    This class provides methods to compute:
	    - Temporal Topic Coherence (TTC)
	    - Temporal Topic Smoothness (TTS)
	    - Temporal Topic Quality (TTQ)
	    - Yearly Topic Coherence (TC)
	    - Yearly Topic Diversity (TD)
	    - Yearly Topic Quality (TQ)

	    The "temporal" metrics are computed per topic chain (one topic followed
	    across all time slices); the "yearly" metrics are computed per time slice
	    (all topics of one slice). Every public getter recomputes its DataFrame
	    from scratch, so prefer ``get_dtq_summary`` when several scores are needed.
	    """

	    def __init__(self, topics: List[List[List[str]]], train_texts: List[List[str]], topn: int, coherence_type: str):
	        """
	        Initializes the TopicQualityAssessor with data in memory.

	        Args:
	            topics (List[List[List[str]]]): A nested list of topics with structure (T, K, W),
	                where T is time slices, K is topics, and W is words. All word lists
	                must have the same length, otherwise the input is not 3-dimensional.
	            train_texts (List[List[str]]): A list of tokenized documents for the reference corpus.
	            topn (int): Number of top words per topic to consider for calculations.
	                Must be positive.
	            coherence_type (str): The type of coherence to calculate (e.g., 'c_npmi', 'c_v').

	        Raises:
	            ValueError: If ``topn`` is not positive, or if ``topics`` does not
	                convert to a 3-dimensional array.
	        """
	        # Validate the cheap arguments first so bad input fails before the
	        # dictionary over the whole reference corpus is built.
	        # topn is used as a divisor in _topic_smoothness, so 0 would crash
	        # there and a negative value would silently produce nonsense.
	        if topn <= 0:
	            raise ValueError(f"'topn' must be a positive integer. Got {topn}.")

	        # User provides topics as (T, K, W) -> List[timestamps][topics][words]
	        topics_array_T_K_W = np.array(topics, dtype=object)
	        if topics_array_T_K_W.ndim != 3:
	            raise ValueError(f"Input 'topics' must be a 3-dimensional list/array. Got {topics_array_T_K_W.ndim} dimensions.")

	        # 1. Set texts and dictionary
	        self.texts = train_texts
	        self.dictionary = Dictionary(self.texts)

	        # 2. Internal representation for temporal evolution is (K, T, W)
	        self.total_topics = topics_array_T_K_W.transpose(1, 0, 2)

	        # 3. Get dimensions
	        self.K, self.T, _ = self.total_topics.shape

	        # 4. Create topic groups for smoothness calculation: for every topic
	        #    chain, the list of [topic at t, topic at t+1] pairs.
	        groups = [
	            [
	                [self.total_topics[k, t].tolist(), self.total_topics[k, t + 1].tolist()]
	                for t in range(self.T - 1)
	            ]
	            for k in range(self.K)
	        ]
	        self.group_topics = np.array(groups, dtype=object)

	        # 5. Create yearly topics (T, K, W) for TC/TD calculation
	        self.yearly_topics = self.total_topics.transpose(1, 0, 2)

	        # 6. Set parameters
	        self.topn = topn
	        self.coherence_type = coherence_type

	    def _compute_coherence(self, topics: List[List[str]]) -> List[float]:
	        """
	        Returns the per-topic coherence of ``topics`` from gensim's CoherenceModel.

	        The model is built over the reference texts and dictionary held by this
	        instance, using the configured coherence type and ``topn``.
	        """
	        cm = CoherenceModel(
	            topics=topics, texts=self.texts, dictionary=self.dictionary,
	            coherence=self.coherence_type, topn=self.topn
	        )
	        return cm.get_coherence_per_topic()

	    def _compute_coherence_ttc(self, topics: List[List[str]]) -> List[float]:
	        """
	        Returns the per-topic coherence of ``topics`` from CoherenceModel_ttc.

	        Same call shape as ``_compute_coherence`` but uses the project's
	        CoherenceModel_ttc variant.
	        NOTE(review): how CoherenceModel_ttc differs from gensim's model is not
	        visible from this file - confirm against its implementation.
	        """
	        cm = CoherenceModel_ttc(
	            topics=topics, texts=self.texts, dictionary=self.dictionary,
	            coherence=self.coherence_type, topn=self.topn
	        )
	        return cm.get_coherence_per_topic()

	    def _topic_smoothness(self, topics: List[List[str]]) -> float:
	        """
	        Returns the mean pairwise top-word overlap between the given topics.

	        For every topic, the overlap with each other topic is
	        ``|top words shared| / topn``; these are averaged per topic and then
	        across topics. The divisor is always ``topn``, even when a topic has
	        fewer than ``topn`` words.

	        Args:
	            topics: Word lists (or 1-D arrays of words) to compare.

	        Returns:
	            float: Overlap score in [0, 1]; 1.0 when there are fewer than two topics.
	        """
	        K = len(topics)
	        if K <= 1:
	            # A single topic has no other topic to be dissimilar to.
	            return 1.0

	        # Build each topic's top-word set once instead of once per pair.
	        word_sets = [set(topic[:self.topn]) for topic in topics]

	        scores = []
	        for i, base_set in enumerate(word_sets):
	            overlaps = [
	                len(base_set & other_set) / self.topn
	                for j, other_set in enumerate(word_sets)
	                if j != i
	            ]
	            scores.append(sum(overlaps) / len(overlaps))
	        return float(sum(scores) / K)

	    def get_ttq_dataframe(self) -> pd.DataFrame:
	        """
	        Computes and returns a DataFrame with detailed TTQ metrics per topic chain.

	        Returns:
	            pd.DataFrame: One row per topic chain with columns ``topic_idx``,
	            ``temporal_coherence`` (list of scores from CoherenceModel_ttc),
	            ``temporal_smoothness`` (list of overlap scores for consecutive
	            time-slice pairs), their averages, and ``ttq_product``
	            (average coherence times average smoothness). With a single time
	            slice there are no consecutive pairs and the smoothness average
	            (and therefore ``ttq_product``) is NaN.
	        """
	        all_coh_scores, avg_coh_scores = [], []
	        for k in range(self.K):
	            coh_per_topic = self._compute_coherence_ttc(self.total_topics[k].tolist())
	            all_coh_scores.append(coh_per_topic)
	            avg_coh_scores.append(float(np.mean(coh_per_topic)))

	        all_smooth_scores, avg_smooth_scores = [], []
	        for k in range(self.K):
	            pair_scores = [self._topic_smoothness(pair) for pair in self.group_topics[k]]
	            all_smooth_scores.append(pair_scores)
	            # np.mean([]) would also give NaN but emits a RuntimeWarning.
	            avg_smooth_scores.append(float(np.mean(pair_scores)) if pair_scores else float("nan"))

	        df = pd.DataFrame({
	            'topic_idx': list(range(self.K)),
	            'temporal_coherence': all_coh_scores,
	            'temporal_smoothness': all_smooth_scores,
	            'avg_temporal_coherence': avg_coh_scores,
	            'avg_temporal_smoothness': avg_smooth_scores
	        })
	        df['ttq_product'] = df['avg_temporal_coherence'] * df['avg_temporal_smoothness']
	        return df

	    def get_tq_dataframe(self) -> pd.DataFrame:
	        """
	        Computes and returns a DataFrame with detailed TQ metrics per time slice.

	        Returns:
	            pd.DataFrame: One row per time slice with columns ``year`` (the
	            0-based time-slice index, not a calendar year), ``all_coherence``
	            (per-topic coherence list), ``avg_coherence``, ``diversity``
	            (1 minus the mean pairwise top-word overlap of the slice's topics)
	            and ``tq_product`` (average coherence times diversity). A slice
	            with a single topic gets diversity 0.0 because its overlap score
	            is defined as 1.0.
	        """
	        all_coh, avg_coh, div = [], [], []
	        for t in range(self.T):
	            yearly_t_topics = self.yearly_topics[t].tolist()
	            coh_per_topic = self._compute_coherence(yearly_t_topics)
	            all_coh.append(coh_per_topic)
	            avg_coh.append(float(np.mean(coh_per_topic)))
	            div.append(1 - self._topic_smoothness(yearly_t_topics))

	        df = pd.DataFrame({
	            'year': list(range(self.T)),
	            'all_coherence': all_coh,
	            'avg_coherence': avg_coh,
	            'diversity': div
	        })
	        df['tq_product'] = df['avg_coherence'] * df['diversity']
	        return df

	    def get_ttc_score(self) -> float:
	        """Calculates the overall Temporal Topic Coherence (TTC)."""
	        ttq_df = self.get_ttq_dataframe()
	        return float(ttq_df['avg_temporal_coherence'].mean())

	    def get_tts_score(self) -> float:
	        """Calculates the overall Temporal Topic Smoothness (TTS)."""
	        ttq_df = self.get_ttq_dataframe()
	        return float(ttq_df['avg_temporal_smoothness'].mean())

	    def get_ttq_score(self) -> float:
	        """Calculates the overall Temporal Topic Quality (TTQ)."""
	        ttq_df = self.get_ttq_dataframe()
	        return float(ttq_df['ttq_product'].mean())

	    def get_tc_score(self) -> float:
	        """Calculates the overall yearly Topic Coherence (TC)."""
	        tq_df = self.get_tq_dataframe()
	        return float(tq_df['avg_coherence'].mean())

	    def get_td_score(self) -> float:
	        """Calculates the overall yearly Topic Diversity (TD)."""
	        tq_df = self.get_tq_dataframe()
	        return float(tq_df['diversity'].mean())

	    def get_tq_score(self) -> float:
	        """Calculates the overall yearly Topic Quality (TQ)."""
	        tq_df = self.get_tq_dataframe()
	        return float(tq_df['tq_product'].mean())

	    def get_dtq_summary(self) -> Dict[str, float]:
	        """
	        Computes all dynamic topic quality metrics and returns them in a dictionary.

	        Each DataFrame is built exactly once here, so this is the cheapest way
	        to obtain several scores together.

	        Returns:
	            Dict[str, float]: Keys 'TTC', 'TTS', 'TTQ', 'TC', 'TD', 'TQ', each
	            the column mean of the corresponding per-chain / per-slice metric.
	        """
	        ttq_df = self.get_ttq_dataframe()
	        tq_df = self.get_tq_dataframe()
	        summary = {
	            'TTC': float(ttq_df['avg_temporal_coherence'].mean()),
	            'TTS': float(ttq_df['avg_temporal_smoothness'].mean()),
	            'TTQ': float(ttq_df['ttq_product'].mean()),
	            'TC': float(tq_df['avg_coherence'].mean()),
	            'TD': float(tq_df['diversity'].mean()),
	            'TQ': float(tq_df['tq_product'].mean())
	        }
	        return summary