Spaces:

cjen1008
/

NLP-Topic-Analysis-Apr-2026

Sleeping

NLP-Topic-Analysis-Apr-2026

File size: 3,136 Bytes

a882be6

"""
Type definitions and model configuration for the topic analysis pipeline.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional


class ModelType(str, Enum):
    """Identifiers for the topic-modelling backends known to the pipeline.

    Members mix in ``str``, so each one compares equal to its plain string
    value (for example ``ModelType.LDA == "lda"``).
    """

    # BERTopic + all-MiniLM-L6-v2 (fast)
    BERTOPIC_MINI = "bertopic_mini"

    # BERTopic + all-mpnet-base-v2 (quality)
    BERTOPIC_MPNET = "bertopic_mpnet"

    # Latent Semantic Indexing (gensim)
    LSI = "lsi"

    # Hierarchical Dirichlet Process (gensim)
    HDP = "hdp"

    # Latent Dirichlet Allocation (gensim)
    LDA = "lda"

    # Non-negative Matrix Factorization (sklearn)
    NMF = "nmf"


# Static per-model configuration, keyed by ModelType member.
# Every entry carries "display", "description" and "type"; only the two
# BERTopic entries additionally carry "embedding_model".
SUPPORTED_MODELS: Dict[str, Dict] = {
    # --- BERTopic variants -------------------------------------------------
    ModelType.BERTOPIC_MINI: {
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "display": "BERTopic (MiniLM)",
        "description": "Fast transformer-based topic discovery",
        "type": "bertopic",
    },
    ModelType.BERTOPIC_MPNET: {
        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
        "display": "BERTopic (MPNet)",
        "description": "Higher quality transformer-based topic discovery",
        "type": "bertopic",
    },
    # --- gensim models -----------------------------------------------------
    ModelType.LSI: {
        "display": "LSI",
        "description": "Latent Semantic Indexing — SVD on TF-IDF, fast and deterministic",
        "type": "gensim",
    },
    ModelType.HDP: {
        "display": "HDP",
        "description": "Hierarchical Dirichlet Process — Bayesian, auto topic count",
        "type": "gensim",
    },
    ModelType.LDA: {
        "display": "LDA",
        "description": "Latent Dirichlet Allocation — gensim corpus, interpretable",
        "type": "gensim",
    },
    # --- sklearn models ----------------------------------------------------
    ModelType.NMF: {
        "display": "NMF",
        "description": "Non-negative Matrix Factorization — TF-IDF, good for short texts",
        "type": "sklearn",
    },
}

# Lookup from a human-readable label to its ModelType member.
# The runs of spaces inside each label are part of the key and must be kept
# exactly as written: lookups are by exact string match.
MODEL_LABEL_TO_TYPE: Dict[str, str] = {
    "BERTopic (MiniLM)  — fast transformer": ModelType.BERTOPIC_MINI,
    "BERTopic (MPNet)   — quality transformer": ModelType.BERTOPIC_MPNET,
    "LSI                — latent semantic indexing": ModelType.LSI,
    "HDP                — auto topic count": ModelType.HDP,
    "LDA                — gensim corpus": ModelType.LDA,
    "NMF                — matrix factorization": ModelType.NMF,
}


@dataclass
class TopicInfo:
    """Description of a single topic: id, keywords, weights and doc count."""

    topic_id: int

    # Top words for this topic.
    keywords: List[str]

    # Keyword weights.
    scores: List[float]

    # Number of docs assigned to this topic.
    doc_count: int


@dataclass
class DocumentResult:
    """Topic assignment recorded for a single document."""

    doc_id: int
    text: str
    topic_id: int
    topic_keywords: List[str]

    # Confidence of assignment.
    probability: float


@dataclass
class TopicResult:
    """Aggregate output of one topic-model run: topics plus per-document rows."""

    model_type: str
    num_topics: int
    topics: List[TopicInfo]
    documents: List[DocumentResult]

    # Docs assigned to topic -1 (BERTopic noise).
    outlier_count: int