"""
Type definitions and model configuration for the topic analysis pipeline.
"""
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional
class ModelType(str, Enum):
    """Identifiers for the topic-modelling backends known to the pipeline.

    Members also subclass ``str``, so each one compares equal to its plain
    string value (for example ``ModelType.LDA == "lda"``).
    """

    # BERTopic variants, distinguished by their sentence-embedding model.
    BERTOPIC_MINI = "bertopic_mini"  # all-MiniLM-L6-v2 (fast)
    BERTOPIC_MPNET = "bertopic_mpnet"  # all-mpnet-base-v2 (quality)

    # gensim models.
    LSI = "lsi"  # Latent Semantic Indexing
    HDP = "hdp"  # Hierarchical Dirichlet Process
    LDA = "lda"  # Latent Dirichlet Allocation

    # sklearn model.
    NMF = "nmf"  # Non-negative Matrix Factorization
# Static metadata for every supported model, keyed by ModelType member.
# Each entry carries "display", "description" and "type"; the two BERTopic
# entries additionally carry "embedding_model".
# NOTE(review): the "β" inside the description strings looks like a
# mis-decoded dash -- confirm against the file's original encoding.
SUPPORTED_MODELS: Dict[str, Dict] = {
    # --- BERTopic (transformer embeddings) ---
    ModelType.BERTOPIC_MINI: {
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "display": "BERTopic (MiniLM)",
        "description": "Fast transformer-based topic discovery",
        "type": "bertopic",
    },
    ModelType.BERTOPIC_MPNET: {
        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
        "display": "BERTopic (MPNet)",
        "description": "Higher quality transformer-based topic discovery",
        "type": "bertopic",
    },
    # --- gensim ---
    ModelType.LSI: {
        "display": "LSI",
        "description": "Latent Semantic Indexing β SVD on TF-IDF, fast and deterministic",
        "type": "gensim",
    },
    ModelType.HDP: {
        "display": "HDP",
        "description": "Hierarchical Dirichlet Process β Bayesian, auto topic count",
        "type": "gensim",
    },
    ModelType.LDA: {
        "display": "LDA",
        "description": "Latent Dirichlet Allocation β gensim corpus, interpretable",
        "type": "gensim",
    },
    # --- sklearn ---
    ModelType.NMF: {
        "display": "NMF",
        "description": "Non-negative Matrix Factorization β TF-IDF, good for short texts",
        "type": "sklearn",
    },
}
# Lookup from a human-readable label string to the matching ModelType member.
# Presumably these labels are what a UI selector shows -- verify against the
# caller before editing any key.
# NOTE(review): the "β" inside the labels looks like a mis-decoded dash --
# confirm against the file's original encoding.
MODEL_LABEL_TO_TYPE: Dict[str, str] = {
    # BERTopic
    "BERTopic (MiniLM) β fast transformer": ModelType.BERTOPIC_MINI,
    "BERTopic (MPNet) β quality transformer": ModelType.BERTOPIC_MPNET,
    # gensim
    "LSI β latent semantic indexing": ModelType.LSI,
    "HDP β auto topic count": ModelType.HDP,
    "LDA β gensim corpus": ModelType.LDA,
    # sklearn
    "NMF β matrix factorization": ModelType.NMF,
}
@dataclass
class TopicInfo:
    """Summary of one discovered topic.

    Attributes:
        topic_id: Integer identifier of the topic.
        keywords: Top words for this topic.
        scores: Keyword weights.
        doc_count: Number of documents assigned to this topic.
    """

    topic_id: int
    keywords: List[str]
    scores: List[float]
    doc_count: int
@dataclass
class DocumentResult:
    """Topic assignment for a single document.

    Attributes:
        doc_id: Integer identifier of the document.
        text: The document text.
        topic_id: Identifier of the topic the document was assigned to.
        topic_keywords: Keywords of the assigned topic.
        probability: Confidence of the assignment.
    """

    doc_id: int
    text: str
    topic_id: int
    topic_keywords: List[str]
    probability: float
@dataclass
class TopicResult:
    """Complete output of one topic-model run.

    Attributes:
        model_type: Identifier of the model that produced the result.
        num_topics: Number of topics.
        topics: One ``TopicInfo`` per topic.
        documents: One ``DocumentResult`` per document.
        outlier_count: Number of documents assigned to topic -1
            (BERTopic noise).
    """

    model_type: str
    num_topics: int
    topics: List[TopicInfo]
    documents: List[DocumentResult]
    outlier_count: int
|