File size: 3,551 Bytes
b7b041e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# topic_modeling.py

import random
from typing import List, Optional

import pandas as pd
from bertopic import BERTopic
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

def perform_topic_modeling(
    docs: List[str],
    language: str = "english",
    nr_topics=None,
    remove_stopwords_bertopic: bool = False,
    custom_stopwords: Optional[List[str]] = None,
    max_coherence_docs: Optional[int] = 2000,
):
    """
    Fit a BERTopic model on ``docs`` and estimate its c_v topic coherence.

    Args:
        docs (List[str]): Raw documents. Stopwords should be left IN the text;
            they are only removed from the topic representation, and only
            when ``remove_stopwords_bertopic`` is True.
        language (str): Language passed to BERTopic ('english',
            'multilingual', ...).
        nr_topics: Number of topics to reduce to ("auto", an int, or None
            for no reduction). Forwarded to BERTopic unchanged.
        remove_stopwords_bertopic (bool): If True, a ``CountVectorizer`` with
            a stopword list is handed to BERTopic. The list is scikit-learn's
            English list (only when ``language == "english"``) plus
            ``custom_stopwords``.
        custom_stopwords (List[str] | None): Extra stopwords. Only used when
            ``remove_stopwords_bertopic`` is True.
        max_coherence_docs (int | None): Maximum number of documents used for
            the coherence estimate; a random sample of this size is drawn
            when ``docs`` is larger. ``None`` uses every document. Must be
            non-negative.

    Returns:
        tuple: ``(topic_model, topics, probs, coherence_score)``.
        ``coherence_score`` is ``None`` when there is nothing to score or the
        coherence calculation fails.
    """
    vectorizer_model = None  # None lets BERTopic fall back to its default vectorizer

    if remove_stopwords_bertopic:
        stop_words = set()
        if language == "english":
            # Start from scikit-learn's built-in English stopword list.
            from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
            stop_words.update(ENGLISH_STOP_WORDS)

        if custom_stopwords:
            # CountVectorizer lowercases text before stopword filtering, so a
            # mixed-case custom stopword would otherwise never match.
            stop_words.update(word.lower() for word in custom_stopwords)

        # Only build a vectorizer when there is something to filter out.
        if stop_words:
            vectorizer_model = CountVectorizer(stop_words=sorted(stop_words))

    # `language` is forwarded as-is, so "multilingual" needs no special case.
    topic_model = BERTopic(
        language=language,
        nr_topics=nr_topics,
        vectorizer_model=vectorizer_model,
    )

    # The embedding step works on the raw text, stopwords included.
    topics, probs = topic_model.fit_transform(docs)

    # --- Calculate Coherence Score ---
    # Coherence is estimated on a random sample to keep the runtime bounded.
    if max_coherence_docs is not None and len(docs) > max_coherence_docs:
        sample_docs = random.sample(docs, max_coherence_docs)
    else:
        sample_docs = docs

    # Lowercase the reference tokens: the topic words come from a
    # CountVectorizer (lowercase=True by default), so case must agree for
    # topic words to be found in the reference texts.
    tokenized_docs = [
        [token.lower() for token in word_tokenize(doc)] for doc in sample_docs
    ]
    dictionary = Dictionary(tokenized_docs)
    known_tokens = dictionary.token2id

    topic_words = topic_model.get_topics()
    topics_for_coherence = []
    for topic_id in sorted(topic_words):
        if topic_id == -1:
            # -1 is BERTopic's outlier bucket, not a real topic.
            continue
        # Keep only words that occur in the sampled texts. This also drops
        # empty-string placeholders; a word that never appears in the
        # reference texts has no co-occurrence statistics to score.
        words = [word for word, _ in topic_words[topic_id] if word in known_tokens]
        if words:
            topics_for_coherence.append(words)

    coherence_score = None
    if topics_for_coherence and tokenized_docs:
        try:
            coherence_model = CoherenceModel(
                topics=topics_for_coherence,
                texts=tokenized_docs,
                dictionary=dictionary,
                coherence='c_v'
            )
            coherence_score = coherence_model.get_coherence()
        except Exception as e:
            # Coherence is a best-effort diagnostic; never fail the whole run.
            print(f"Could not calculate coherence score: {e}")

    return topic_model, topics, probs, coherence_score