# bertopic/topic_modeling.py
# (Provenance note — Hugging Face upload header: user Mars203020,
#  "Upload 17 files", commit b7b041e, verified. Kept as a comment so the
#  module remains valid Python.)
# topic_modeling.py
import random
from typing import List, Optional

import pandas as pd
from bertopic import BERTopic
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
def perform_topic_modeling(
    docs: List[str],
    language: str = "english",
    nr_topics=None,
    remove_stopwords_bertopic: bool = False,  # If True, strip stopwords inside BERTopic's vectorizer
    custom_stopwords: Optional[List[str]] = None,
):
    """
    Performs topic modeling on a list of documents.

    Args:
        docs (List[str]): A list of documents. Stopwords should be INCLUDED for
            best results — the embedding model benefits from full sentences;
            stopword removal (if requested) happens only in the c-TF-IDF
            vectorizer stage.
        language (str): Language for the BERTopic model ('english', 'multilingual').
        nr_topics: The number of topics to find ("auto" or an int).
        remove_stopwords_bertopic (bool): If True, stopwords will be removed
            internally by BERTopic via a custom CountVectorizer.
        custom_stopwords (Optional[List[str]]): Extra stopwords to remove in
            addition to scikit-learn's English list (only used when
            remove_stopwords_bertopic is True).

    Returns:
        tuple: (topic_model, topics, probs, coherence_score) where
            coherence_score is a c_v coherence float, or None if it could not
            be computed.
    """
    vectorizer_model = None  # Default: let BERTopic use its own vectorizer
    if remove_stopwords_bertopic:
        stop_words_list = []
        if language == "english":
            # Built-in English stopword list from scikit-learn
            from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
            stop_words_list = list(ENGLISH_STOP_WORDS)
        # Add any custom stopwords provided by the user
        if custom_stopwords:
            stop_words_list.extend(custom_stopwords)
        # Only create a vectorizer if there's a list of stopwords to use
        if stop_words_list:
            vectorizer_model = CountVectorizer(stop_words=stop_words_list)

    # FIX: the original had an if/else on language whose two branches were
    # identical (both passed `language` through unchanged) — collapsed to one call.
    topic_model = BERTopic(
        language=language, nr_topics=nr_topics, vectorizer_model=vectorizer_model
    )

    # 'docs' should still contain stopwords here so the embedding model sees
    # natural text.
    topics, probs = topic_model.fit_transform(docs)

    # --- Calculate Coherence Score ---
    # Sample documents for faster coherence calculation; 2000 docs gives a
    # sufficiently accurate estimate. NOTE: sampling is unseeded, so the score
    # can vary slightly between runs on large corpora.
    max_coherence_docs = 2000
    if len(docs) > max_coherence_docs:
        sample_docs = random.sample(docs, max_coherence_docs)
    else:
        sample_docs = docs

    tokenized_docs = [word_tokenize(doc) for doc in sample_docs]
    dictionary = Dictionary(tokenized_docs)

    # FIX: topic words must exist in the dictionary built from the *sampled*
    # docs, otherwise gensim's CoherenceModel raises and the score is silently
    # lost. Filter each topic's words to the dictionary vocabulary, and drop
    # topics left empty by the filter. (The unused doc2bow corpus was removed:
    # c_v coherence works from `texts`, not a BoW corpus.)
    vocabulary = set(dictionary.token2id)
    topics_for_coherence = []
    for topic_id in sorted(topic_model.get_topics()):
        if topic_id == -1:  # -1 is BERTopic's outlier topic; skip it
            continue
        words = [word for word, _ in topic_model.get_topic(topic_id) if word in vocabulary]
        if words:
            topics_for_coherence.append(words)

    coherence_score = None
    if topics_for_coherence and tokenized_docs:
        try:
            coherence_model = CoherenceModel(
                topics=topics_for_coherence,
                texts=tokenized_docs,
                dictionary=dictionary,
                coherence='c_v'
            )
            coherence_score = coherence_model.get_coherence()
        except Exception as e:
            # Best-effort: coherence is diagnostic only, so report and continue
            print(f"Could not calculate coherence score: {e}")

    return topic_model, topics, probs, coherence_score