# bertopic/topic_modeling.py
# (Provenance note — Hugging Face upload header: user Mars203020,
#  "Upload 17 files", commit b7b041e, verified. Kept as a comment so the
#  module remains valid Python.)
# topic_modeling.py
import random
from typing import List, Optional

import pandas as pd
from bertopic import BERTopic
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
def perform_topic_modeling(
    docs: List[str],
    language: str = "english",
    nr_topics=None,
    remove_stopwords_bertopic: bool = False,  # If True, strip stopwords inside BERTopic's vectorizer
    custom_stopwords: Optional[List[str]] = None,
):
    """
    Performs topic modeling on a list of documents.

    Args:
        docs (List[str]): A list of documents. Stopwords should be INCLUDED for
            best results — the embedding model benefits from full sentences;
            stopword removal (if requested) happens only in the c-TF-IDF
            vectorizer stage.
        language (str): Language for the BERTopic model ('english', 'multilingual').
        nr_topics: The number of topics to find ("auto" or an int).
        remove_stopwords_bertopic (bool): If True, stopwords will be removed
            internally by BERTopic via a custom CountVectorizer.
        custom_stopwords (Optional[List[str]]): Extra stopwords to remove in
            addition to scikit-learn's English list (only used when
            remove_stopwords_bertopic is True).

    Returns:
        tuple: (topic_model, topics, probs, coherence_score) where
            coherence_score is a c_v coherence float, or None if it could not
            be computed.
    """
    vectorizer_model = None  # Default: let BERTopic use its own vectorizer
    if remove_stopwords_bertopic:
        stop_words_list = []
        if language == "english":
            # Built-in English stopword list from scikit-learn
            from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
            stop_words_list = list(ENGLISH_STOP_WORDS)
        # Add any custom stopwords provided by the user
        if custom_stopwords:
            stop_words_list.extend(custom_stopwords)
        # Only create a vectorizer if there's a list of stopwords to use
        if stop_words_list:
            vectorizer_model = CountVectorizer(stop_words=stop_words_list)

    # FIX: the original had an if/else on language whose two branches were
    # identical (both passed `language` through unchanged) — collapsed to one call.
    topic_model = BERTopic(
        language=language, nr_topics=nr_topics, vectorizer_model=vectorizer_model
    )

    # 'docs' should still contain stopwords here so the embedding model sees
    # natural text.
    topics, probs = topic_model.fit_transform(docs)

    # --- Calculate Coherence Score ---
    # Sample documents for faster coherence calculation; 2000 docs gives a
    # sufficiently accurate estimate. NOTE: sampling is unseeded, so the score
    # can vary slightly between runs on large corpora.
    max_coherence_docs = 2000
    if len(docs) > max_coherence_docs:
        sample_docs = random.sample(docs, max_coherence_docs)
    else:
        sample_docs = docs

    tokenized_docs = [word_tokenize(doc) for doc in sample_docs]
    dictionary = Dictionary(tokenized_docs)

    # FIX: topic words must exist in the dictionary built from the *sampled*
    # docs, otherwise gensim's CoherenceModel raises and the score is silently
    # lost. Filter each topic's words to the dictionary vocabulary, and drop
    # topics left empty by the filter. (The unused doc2bow corpus was removed:
    # c_v coherence works from `texts`, not a BoW corpus.)
    vocabulary = set(dictionary.token2id)
    topics_for_coherence = []
    for topic_id in sorted(topic_model.get_topics()):
        if topic_id == -1:  # -1 is BERTopic's outlier topic; skip it
            continue
        words = [word for word, _ in topic_model.get_topic(topic_id) if word in vocabulary]
        if words:
            topics_for_coherence.append(words)

    coherence_score = None
    if topics_for_coherence and tokenized_docs:
        try:
            coherence_model = CoherenceModel(
                topics=topics_for_coherence,
                texts=tokenized_docs,
                dictionary=dictionary,
                coherence='c_v'
            )
            coherence_score = coherence_model.get_coherence()
        except Exception as e:
            # Best-effort: coherence is diagnostic only, so report and continue
            print(f"Could not calculate coherence score: {e}")

    return topic_model, topics, probs, coherence_score