File size: 3,136 Bytes
a882be6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Type definitions and model configuration for the topic analysis pipeline.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional


class ModelType(str, Enum):
    """Identifiers for the topic-model backends handled by the pipeline.

    Every member is also a ``str``, so it compares equal to its lowercase
    value (for example ``ModelType.LDA == "lda"``) and can be rebuilt from
    that value with ``ModelType("lda")``.
    """

    # BERTopic variants, distinguished by their sentence-embedding model.
    BERTOPIC_MINI = "bertopic_mini"  # all-MiniLM-L6-v2 (fast)
    BERTOPIC_MPNET = "bertopic_mpnet"  # all-mpnet-base-v2 (quality)

    # gensim models.
    LSI = "lsi"  # Latent Semantic Indexing
    HDP = "hdp"  # Hierarchical Dirichlet Process
    LDA = "lda"  # Latent Dirichlet Allocation

    # sklearn model.
    NMF = "nmf"  # Non-negative Matrix Factorization


# Static metadata for every supported model, keyed by ModelType member
# (members are str instances, so plain string values also work as lookup keys).
#
# Each entry provides:
#   "display"         - short human-readable name
#   "description"     - one-line summary of the model
#   "type"            - backend family: "bertopic", "gensim" or "sklearn"
#   "embedding_model" - sentence-transformers model id (BERTopic entries only)
#
# The descriptions use a real em dash; the previous text contained a
# mis-decoded byte sequence in its place.
SUPPORTED_MODELS: Dict[str, Dict[str, str]] = {
    ModelType.BERTOPIC_MINI: {
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "display": "BERTopic (MiniLM)",
        "description": "Fast transformer-based topic discovery",
        "type": "bertopic",
    },
    ModelType.BERTOPIC_MPNET: {
        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
        "display": "BERTopic (MPNet)",
        "description": "Higher quality transformer-based topic discovery",
        "type": "bertopic",
    },
    ModelType.LSI: {
        "display": "LSI",
        "description": "Latent Semantic Indexing — SVD on TF-IDF, fast and deterministic",
        "type": "gensim",
    },
    ModelType.HDP: {
        "display": "HDP",
        "description": "Hierarchical Dirichlet Process — Bayesian, auto topic count",
        "type": "gensim",
    },
    ModelType.LDA: {
        "display": "LDA",
        "description": "Latent Dirichlet Allocation — gensim corpus, interpretable",
        "type": "gensim",
    },
    ModelType.NMF: {
        "display": "NMF",
        "description": "Non-negative Matrix Factorization — TF-IDF, good for short texts",
        "type": "sklearn",
    },
}

# Lookup from a display label to the ModelType member it stands for.
# NOTE(review): the "β€”" sequence inside each key looks like a mis-decoded
# em dash. The keys are kept verbatim because callers may look entries up by
# these exact strings; confirm against the callers before normalising them.
MODEL_LABEL_TO_TYPE: Dict[str, str] = {
    "BERTopic (MiniLM)  β€” fast transformer": ModelType.BERTOPIC_MINI,
    "BERTopic (MPNet)   β€” quality transformer": ModelType.BERTOPIC_MPNET,
    "LSI                β€” latent semantic indexing": ModelType.LSI,
    "HDP                β€” auto topic count": ModelType.HDP,
    "LDA                β€” gensim corpus": ModelType.LDA,
    "NMF                β€” matrix factorization": ModelType.NMF,
}


@dataclass
class TopicInfo:
    """Summary of a single topic.

    Attributes:
        topic_id: Integer identifier of the topic.
        keywords: Top words for this topic.
        scores: Keyword weights (presumably one per entry in ``keywords``;
            confirm against the producer).
        doc_count: Number of documents assigned to this topic.
    """

    topic_id: int
    keywords: List[str]
    scores: List[float]
    doc_count: int


@dataclass
class DocumentResult:
    """Topic assignment recorded for a single document.

    Attributes:
        doc_id: Integer identifier of the document.
        text: The document text.
        topic_id: Identifier of the topic the document was assigned to.
        topic_keywords: Keywords of the assigned topic.
        probability: Confidence of the assignment.
    """

    doc_id: int
    text: str
    topic_id: int
    topic_keywords: List[str]
    probability: float


@dataclass
class TopicResult:
    """Complete output of one topic-model run.

    Attributes:
        model_type: Identifier of the model that produced the result.
        num_topics: Number of topics.
        topics: One ``TopicInfo`` per topic.
        documents: One ``DocumentResult`` per document.
        outlier_count: Number of documents assigned to topic -1
            (BERTopic noise).
    """

    model_type: str
    num_topics: int
    topics: List[TopicInfo]
    documents: List[DocumentResult]
    outlier_count: int