""" Type definitions and model configuration for the topic analysis pipeline. """ from dataclasses import dataclass, field from enum import Enum from typing import Dict, List, Optional class ModelType(str, Enum): BERTOPIC_MINI = "bertopic_mini" # BERTopic + all-MiniLM-L6-v2 (fast) BERTOPIC_MPNET = "bertopic_mpnet" # BERTopic + all-mpnet-base-v2 (quality) LSI = "lsi" # Latent Semantic Indexing (gensim) HDP = "hdp" # Hierarchical Dirichlet Process (gensim) LDA = "lda" # Latent Dirichlet Allocation (gensim) NMF = "nmf" # Non-negative Matrix Factorization (sklearn) SUPPORTED_MODELS: Dict[str, Dict] = { ModelType.BERTOPIC_MINI: { "embedding_model": "sentence-transformers/all-MiniLM-L6-v2", "display": "BERTopic (MiniLM)", "description": "Fast transformer-based topic discovery", "type": "bertopic", }, ModelType.BERTOPIC_MPNET: { "embedding_model": "sentence-transformers/all-mpnet-base-v2", "display": "BERTopic (MPNet)", "description": "Higher quality transformer-based topic discovery", "type": "bertopic", }, ModelType.LSI: { "display": "LSI", "description": "Latent Semantic Indexing — SVD on TF-IDF, fast and deterministic", "type": "gensim", }, ModelType.HDP: { "display": "HDP", "description": "Hierarchical Dirichlet Process — Bayesian, auto topic count", "type": "gensim", }, ModelType.LDA: { "display": "LDA", "description": "Latent Dirichlet Allocation — gensim corpus, interpretable", "type": "gensim", }, ModelType.NMF: { "display": "NMF", "description": "Non-negative Matrix Factorization — TF-IDF, good for short texts", "type": "sklearn", }, } MODEL_LABEL_TO_TYPE: Dict[str, str] = { "BERTopic (MiniLM) — fast transformer": ModelType.BERTOPIC_MINI, "BERTopic (MPNet) — quality transformer": ModelType.BERTOPIC_MPNET, "LSI — latent semantic indexing": ModelType.LSI, "HDP — auto topic count": ModelType.HDP, "LDA — gensim corpus": ModelType.LDA, "NMF — matrix factorization": ModelType.NMF, } @dataclass class TopicInfo: topic_id: int keywords: List[str] # top words for this topic scores: List[float] # keyword weights doc_count: int # number of docs assigned to this topic @dataclass class DocumentResult: doc_id: int text: str topic_id: int topic_keywords: List[str] probability: float # confidence of assignment @dataclass class TopicResult: model_type: str num_topics: int topics: List[TopicInfo] documents: List[DocumentResult] outlier_count: int # docs assigned to topic -1 (BERTopic noise)