| """ |
| Type definitions and model configuration for the topic analysis pipeline. |
| """ |
|
|
| from dataclasses import dataclass, field |
| from enum import Enum |
| from typing import Dict, List, Optional |
|
|
|
|
class ModelType(str, Enum):
    """Identifiers of the topic models this module knows about.

    ``str`` is mixed in, so every member compares equal to (and can be used
    in place of) its plain string value, e.g. ``ModelType.LDA == "lda"``.
    """

    # BERTopic variants.
    BERTOPIC_MINI = "bertopic_mini"
    BERTOPIC_MPNET = "bertopic_mpnet"

    # Non-BERTopic models.
    LSI = "lsi"
    HDP = "hdp"
    LDA = "lda"
    NMF = "nmf"
|
|
|
|
# Registry of the supported models, keyed by ModelType member (members are
# also plain strings, so lookups by the raw value such as "lda" work too).
#
# Every entry provides:
#   "display"      short human-readable model name
#   "description"  one-line summary of the model
#   "type"         backend family: "bertopic", "gensim" or "sklearn"
# BERTopic entries additionally provide:
#   "embedding_model"  sentence-transformers model identifier
#
# The descriptions contain an em dash (U+2014); this file must stay UTF-8
# encoded, otherwise the dash degrades into stray characters such as "β".
SUPPORTED_MODELS: Dict[str, Dict[str, str]] = {
    ModelType.BERTOPIC_MINI: {
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "display": "BERTopic (MiniLM)",
        "description": "Fast transformer-based topic discovery",
        "type": "bertopic",
    },
    ModelType.BERTOPIC_MPNET: {
        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
        "display": "BERTopic (MPNet)",
        "description": "Higher quality transformer-based topic discovery",
        "type": "bertopic",
    },
    ModelType.LSI: {
        "display": "LSI",
        "description": "Latent Semantic Indexing — SVD on TF-IDF, fast and deterministic",
        "type": "gensim",
    },
    ModelType.HDP: {
        "display": "HDP",
        "description": "Hierarchical Dirichlet Process — Bayesian, auto topic count",
        "type": "gensim",
    },
    ModelType.LDA: {
        "display": "LDA",
        "description": "Latent Dirichlet Allocation — gensim corpus, interpretable",
        "type": "gensim",
    },
    ModelType.NMF: {
        "display": "NMF",
        "description": "Non-negative Matrix Factorization — TF-IDF, good for short texts",
        "type": "sklearn",
    },
}
|
|
# Maps a human-readable model label to its ModelType member.
#
# NOTE(review): the "β" in each label looks like a mis-encoded dash
# (presumably an em dash). The keys are kept verbatim because callers may
# look entries up by the exact label text - confirm against the call sites
# before repairing the character.
MODEL_LABEL_TO_TYPE: Dict[str, str] = {
    # BERTopic variants
    "BERTopic (MiniLM) β fast transformer": ModelType.BERTOPIC_MINI,
    "BERTopic (MPNet) β quality transformer": ModelType.BERTOPIC_MPNET,
    # Non-BERTopic models
    "LSI β latent semantic indexing": ModelType.LSI,
    "HDP β auto topic count": ModelType.HDP,
    "LDA β gensim corpus": ModelType.LDA,
    "NMF β matrix factorization": ModelType.NMF,
}
|
|
|
|
@dataclass
class TopicInfo:
    """Plain record describing a single topic."""

    # Numeric identifier of the topic.
    topic_id: int
    # Keyword strings associated with the topic.
    keywords: List[str]
    # Numeric scores; presumably one per entry in `keywords`, in the same
    # order - confirm against the code that builds this record.
    scores: List[float]
    # Presumably the number of documents assigned to this topic - confirm.
    doc_count: int
|
|
|
|
@dataclass
class DocumentResult:
    """Plain record holding the topic outcome for a single document."""

    # Numeric identifier of the document.
    doc_id: int
    # The document text.
    text: str
    # Identifier of the topic recorded for this document.
    topic_id: int
    # Keyword strings of that topic.
    topic_keywords: List[str]
    # Presumably the model's confidence in the assignment - confirm the
    # expected range with the code that fills this in.
    probability: float
|
|
|
|
@dataclass
class TopicResult:
    """Plain record bundling the output of one topic-model run."""

    # Model identifier as a string; presumably one of the ModelType
    # values - confirm against the producer.
    model_type: str
    # Number of topics reported for the run.
    num_topics: int
    # One TopicInfo record per topic.
    topics: List[TopicInfo]
    # One DocumentResult record per document.
    documents: List[DocumentResult]
    # Presumably the count of documents not assigned to any topic - confirm.
    outlier_count: int
|
|