Spaces:

cjen1008
/

NLP-Topic-Analysis-Apr-2026

Running

App Files Files Community

NLP-Topic-Analysis-Apr-2026 / src /models.py

cjen1008

NLP Topic Analysis v0.0 (April 2026) — initial HF Space commit

a882be6 16 days ago

raw

history blame contribute delete

3.14 kB

	"""
	Type definitions and model configuration for the topic analysis pipeline.
	"""

	from dataclasses import dataclass, field
	from enum import Enum
	from typing import Dict, List, Optional


	class ModelType(str, Enum):
	    """Identifiers for the topic-modelling backends known to the pipeline.

	    Mixing in ``str`` makes each member compare equal to its raw value
	    (``ModelType.LDA == "lda"``) and lets ``ModelType("lda")`` resolve a
	    member from that value. Use ``.value`` when the plain string itself is
	    needed: ``str(member)`` yields the qualified name (``"ModelType.LDA"``).
	    """

	    BERTOPIC_MINI = "bertopic_mini"    # BERTopic + all-MiniLM-L6-v2 (fast)
	    BERTOPIC_MPNET = "bertopic_mpnet"  # BERTopic + all-mpnet-base-v2 (quality)
	    LSI = "lsi"                        # Latent Semantic Indexing (gensim)
	    HDP = "hdp"                        # Hierarchical Dirichlet Process (gensim)
	    LDA = "lda"                        # Latent Dirichlet Allocation (gensim)
	    NMF = "nmf"                        # Non-negative Matrix Factorization (sklearn)


	# Per-model metadata, keyed by ModelType member (members are also plain
	# strings, so the raw value such as "lda" finds the same entry).
	# Every entry carries "display", "description" and "type"; only the BERTopic
	# entries additionally carry "embedding_model".
	SUPPORTED_MODELS: Dict[str, Dict] = {
	    # --- transformer-based (type "bertopic") ---
	    ModelType.BERTOPIC_MINI: {
	        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
	        "display": "BERTopic (MiniLM)",
	        "description": "Fast transformer-based topic discovery",
	        "type": "bertopic",
	    },
	    ModelType.BERTOPIC_MPNET: {
	        "embedding_model": "sentence-transformers/all-mpnet-base-v2",
	        "display": "BERTopic (MPNet)",
	        "description": "Higher quality transformer-based topic discovery",
	        "type": "bertopic",
	    },
	    # --- classical models (type "gensim") ---
	    ModelType.LSI: {
	        "display": "LSI",
	        "description": "Latent Semantic Indexing — SVD on TF-IDF, fast and deterministic",
	        "type": "gensim",
	    },
	    ModelType.HDP: {
	        "display": "HDP",
	        "description": "Hierarchical Dirichlet Process — Bayesian, auto topic count",
	        "type": "gensim",
	    },
	    ModelType.LDA: {
	        "display": "LDA",
	        "description": "Latent Dirichlet Allocation — gensim corpus, interpretable",
	        "type": "gensim",
	    },
	    # --- classical models (type "sklearn") ---
	    ModelType.NMF: {
	        "display": "NMF",
	        "description": "Non-negative Matrix Factorization — TF-IDF, good for short texts",
	        "type": "sklearn",
	    },
	}

	# Human-readable label -> ModelType member, one label per member.
	# NOTE(review): presumably these are the strings offered in the app's model
	# selector — confirm against the UI code before rewording any label.
	MODEL_LABEL_TO_TYPE: Dict[str, str] = {
	    # BERTopic variants
	    "BERTopic (MiniLM) — fast transformer": ModelType.BERTOPIC_MINI,
	    "BERTopic (MPNet) — quality transformer": ModelType.BERTOPIC_MPNET,
	    # gensim / sklearn models
	    "LSI — latent semantic indexing": ModelType.LSI,
	    "HDP — auto topic count": ModelType.HDP,
	    "LDA — gensim corpus": ModelType.LDA,
	    "NMF — matrix factorization": ModelType.NMF,
	}


	@dataclass
	class TopicInfo:
	    """Summary of one discovered topic.

	    NOTE(review): ``keywords`` and ``scores`` look like parallel lists (one
	    weight per keyword) — confirm against the code that builds them.
	    """

	    topic_id: int
	    keywords: List[str]  # top words for this topic
	    scores: List[float]  # keyword weights
	    doc_count: int       # number of docs assigned to this topic


	@dataclass
	class DocumentResult:
	    """Topic assignment for a single input document."""

	    doc_id: int
	    text: str
	    topic_id: int
	    topic_keywords: List[str]
	    probability: float  # confidence of assignment


	@dataclass
	class TopicResult:
	    """Outcome of one topic-model run: per-topic and per-document results."""

	    model_type: str
	    num_topics: int
	    topics: List[TopicInfo]
	    documents: List[DocumentResult]
	    outlier_count: int  # docs assigned to topic -1 (BERTopic noise)