# cjen1008's picture
# NLP Topic Analysis v0.0 (April 2026) — initial HF Space commit
# a882be6
"""
Type definitions and model configuration for the topic analysis pipeline.
"""
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional
class ModelType(str, Enum):
    """Identifiers for the topic-modelling backends known to the pipeline.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (``ModelType.LDA == "lda"``) and can be recovered from that
    value with ``ModelType("lda")``.
    """

    BERTOPIC_MINI = "bertopic_mini"    # BERTopic + all-MiniLM-L6-v2 (fast)
    BERTOPIC_MPNET = "bertopic_mpnet"  # BERTopic + all-mpnet-base-v2 (quality)
    LSI = "lsi"                        # Latent Semantic Indexing (gensim)
    HDP = "hdp"                        # Hierarchical Dirichlet Process (gensim)
    LDA = "lda"                        # Latent Dirichlet Allocation (gensim)
    NMF = "nmf"                        # Non-negative Matrix Factorization (sklearn)
# Static metadata for every ModelType member.
#
# Each entry carries:
#   "display"     - short human-readable model name
#   "description" - one-line summary of the model
#   "type"        - backend family: "bertopic", "gensim" or "sklearn"
# The two BERTopic entries additionally carry:
#   "embedding_model" - sentence-transformers model identifier
#
# Keys are ModelType members; because ModelType subclasses str, a lookup with
# the plain string value (e.g. SUPPORTED_MODELS["lda"]) finds the same entry.
SUPPORTED_MODELS: Dict[str, Dict[str, str]] = {
    ModelType.BERTOPIC_MINI: {
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "display": "BERTopic (MiniLM)",
        "description": "Fast transformer-based topic discovery",
        "type": "bertopic",
    },
    ModelType.BERTOPIC_MPNET: {
        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
        "display": "BERTopic (MPNet)",
        "description": "Higher quality transformer-based topic discovery",
        "type": "bertopic",
    },
    ModelType.LSI: {
        "display": "LSI",
        "description": "Latent Semantic Indexing — SVD on TF-IDF, fast and deterministic",
        "type": "gensim",
    },
    ModelType.HDP: {
        "display": "HDP",
        "description": "Hierarchical Dirichlet Process — Bayesian, auto topic count",
        "type": "gensim",
    },
    ModelType.LDA: {
        "display": "LDA",
        "description": "Latent Dirichlet Allocation — gensim corpus, interpretable",
        "type": "gensim",
    },
    ModelType.NMF: {
        "display": "NMF",
        "description": "Non-negative Matrix Factorization — TF-IDF, good for short texts",
        "type": "sklearn",
    },
}
# Maps a long human-readable label to the ModelType it selects.
# NOTE(review): presumably these labels are the choices shown in a UI selector;
# lookups are exact-match, so any caller that hard-codes a label must use the
# same em dash ("—") spelling as the keys below — verify against callers.
MODEL_LABEL_TO_TYPE: Dict[str, ModelType] = {
    "BERTopic (MiniLM) — fast transformer": ModelType.BERTOPIC_MINI,
    "BERTopic (MPNet) — quality transformer": ModelType.BERTOPIC_MPNET,
    "LSI — latent semantic indexing": ModelType.LSI,
    "HDP — auto topic count": ModelType.HDP,
    "LDA — gensim corpus": ModelType.LDA,
    "NMF — matrix factorization": ModelType.NMF,
}
@dataclass
class TopicInfo:
    """Summary of a single topic.

    NOTE(review): ``keywords`` and ``scores`` look like parallel lists (one
    weight per keyword) — confirm against the code that builds them.
    """

    topic_id: int
    keywords: List[str]  # top words for this topic
    scores: List[float]  # keyword weights
    doc_count: int       # number of docs assigned to this topic
@dataclass
class DocumentResult:
    """Topic assignment for a single input document."""

    doc_id: int
    text: str
    topic_id: int
    topic_keywords: List[str]
    probability: float  # confidence of assignment
@dataclass
class TopicResult:
    """Complete output of one topic-model run: topics plus per-document results."""

    model_type: str
    num_topics: int
    topics: List[TopicInfo]
    documents: List[DocumentResult]
    outlier_count: int  # docs assigned to topic -1 (BERTopic noise)