Spaces:
Sleeping
Sleeping
| # topic_modeling.py | |
| import random | |
| import pandas as pd | |
| from bertopic import BERTopic | |
| from gensim.corpora import Dictionary | |
| from gensim.models import CoherenceModel | |
| from nltk.tokenize import word_tokenize | |
| from typing import List | |
| from sklearn.feature_extraction.text import CountVectorizer | |
def _build_stopword_vectorizer(language, custom_stopwords):
    """Build the CountVectorizer used to filter stopwords, or return None.

    Args:
        language (str): Only "english" contributes a built-in stopword list
            (scikit-learn's ENGLISH_STOP_WORDS); any other value relies on
            ``custom_stopwords`` alone.
        custom_stopwords (List[str]): Extra stopwords, may be None or empty.

    Returns:
        A CountVectorizer configured with the combined stopword list, or None
        when the combined list is empty.
    """
    stop_words = []
    if language == "english":
        # Imported lazily: only needed when English stopword removal is requested.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        stop_words.extend(ENGLISH_STOP_WORDS)
    if custom_stopwords:
        # CountVectorizer lowercases text before it filters stopwords, so an
        # entry such as "The" or " foo " would otherwise never match a token.
        stop_words.extend(
            word.strip().lower()
            for word in custom_stopwords
            if word and word.strip()
        )
    if not stop_words:
        return None
    return CountVectorizer(stop_words=stop_words)


def _estimate_coherence(topic_model, docs, max_docs=2000):
    """Estimate the c_v coherence of a fitted topic model on a document sample.

    Args:
        topic_model: A fitted BERTopic model.
        docs (List[str]): The documents the model was fitted on.
        max_docs (int): Upper bound on the number of documents used; larger
            inputs are randomly sampled down to this size to keep the
            computation fast.

    Returns:
        The coherence score, or None when there is nothing to score or gensim
        fails (the failure is printed, not raised).
    """
    sample_docs = random.sample(docs, max_docs) if len(docs) > max_docs else docs

    # Topic words come out of a lowercasing CountVectorizer, so the reference
    # texts are lowercased as well; otherwise capitalized occurrences would not
    # count as matches.
    tokenized_docs = [
        [token.lower() for token in word_tokenize(doc)] for doc in sample_docs
    ]
    dictionary = Dictionary(tokenized_docs)
    vocabulary = dictionary.token2id

    topics_for_coherence = []
    for topic_id, word_scores in sorted(topic_model.get_topics().items()):
        if topic_id == -1:
            # -1 is BERTopic's outlier bucket, not a real topic.
            continue
        # Keep only words present in the sampled vocabulary. This drops the
        # empty-string padding of small topics and words that the sample or
        # tokenizer did not produce, which gensim cannot map to ids.
        words = [word for word, _ in word_scores if word in vocabulary]
        if words:
            topics_for_coherence.append(words)

    if not topics_for_coherence or not tokenized_docs:
        return None

    try:
        coherence_model = CoherenceModel(
            topics=topics_for_coherence,
            texts=tokenized_docs,
            dictionary=dictionary,
            coherence='c_v'
        )
        return coherence_model.get_coherence()
    except Exception as e:
        # Deliberately best-effort: a failed coherence estimate must not
        # discard an otherwise successfully fitted model.
        print(f"Could not calculate coherence score: {e}")
        return None


def perform_topic_modeling(
    docs: List[str],
    language: str = "english",
    nr_topics=None,
    remove_stopwords_bertopic: bool = False,
    custom_stopwords: List[str] = None
):
    """
    Performs topic modeling on a list of documents.

    Args:
        docs (List[str]): A list of documents. Stopwords should be INCLUDED for best results,
            because the same texts are fed to the embedding model.
        language (str): Language for the BERTopic model ('english', 'multilingual').
        nr_topics: The number of topics to find ("auto" or an int); None is passed
            through to BERTopic unchanged.
        remove_stopwords_bertopic (bool): If True, stopwords will be removed internally by
            BERTopic through a custom CountVectorizer. The built-in stopword list is only
            applied when language is 'english'.
        custom_stopwords (List[str]): A list of custom stopwords to use. Only taken into
            account when remove_stopwords_bertopic is True; entries are stripped and
            lowercased before use.

    Returns:
        tuple: BERTopic model, topics, probabilities, and coherence score. The coherence
        score is None when it could not be calculated.

    Raises:
        ValueError: If docs is None or empty.
    """
    if docs is None or len(docs) == 0:
        raise ValueError("docs must contain at least one document")

    # Default to no custom vectorizer unless stopword removal was requested.
    vectorizer_model = None
    if remove_stopwords_bertopic:
        vectorizer_model = _build_stopword_vectorizer(language, custom_stopwords)

    topic_model = BERTopic(
        language=language,
        nr_topics=nr_topics,
        vectorizer_model=vectorizer_model,
    )

    # The 'docs' passed here should contain stopwords for the embedding model to work best
    topics, probs = topic_model.fit_transform(docs)

    coherence_score = _estimate_coherence(topic_model, docs)
    return topic_model, topics, probs, coherence_score