File size: 5,972 Bytes
19b102a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import numpy as np
from itertools import chain
from typing import List

from scipy import sparse
from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import CountVectorizer


class OnlineCountVectorizer(CountVectorizer):
    """ An online variant of the CountVectorizer with updating vocabulary.

    At each `.partial_fit`, its vocabulary is updated based on any OOV words
    it might find. Then, `.update_bow` can be used to track and update
    the Bag-of-Words representation. These functions are separated such that
    the vectorizer can be used in iteration without updating the Bag-of-Words
    representation, which can speed up the fitting process. However, the
    `.update_bow` function is used in BERTopic to track changes in the
    topic representations and allow for decay.

    This class inherits its parameters and attributes from:
        `sklearn.feature_extraction.text.CountVectorizer`

    Arguments:
        decay: A value between [0, 1] to weight the percentage of frequencies
               the previous bag-of-words should be decreased. For example,
               a value of `.1` will decrease the frequencies in the bag-of-words
               matrix with 10% at each iteration.
        delete_min_df: Delete words at each iteration from its vocabulary
                       that are below a minimum frequency.
                       This will keep the resulting bag-of-words matrix small
                       such that it does not explode in size with increasing
                       vocabulary. If `decay` is None then this equals `min_df`.
        **kwargs: Set of parameters inherited from:
                  `sklearn.feature_extraction.text.CountVectorizer`
                  In practice, this means that you can still use parameters
                  from the original CountVectorizer, like `stop_words` and
                  `ngram_range`.

    Attributes:
        X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation

    Examples:

    ```python
    from bertopic.vectorizers import OnlineCountVectorizer
    vectorizer = OnlineCountVectorizer(stop_words="english")

    for index, doc in enumerate(my_docs):
        vectorizer.partial_fit(doc)

        # Update and clean the bow every 100 iterations:
        if index % 100 == 0:
            X = vectorizer.update_bow()
    ```

    To use the model in BERTopic:

    ```python
    from bertopic import BERTopic
    from bertopic.vectorizers import OnlineCountVectorizer

    vectorizer_model = OnlineCountVectorizer(stop_words="english")
    topic_model = BERTopic(vectorizer_model=vectorizer_model)
    ```

    References:
        Adapted from: https://github.com/idoshlomo/online_vectorizers
    """
    def __init__(self,
                 decay: float = None,
                 delete_min_df: float = None,
                 **kwargs):
        self.decay = decay
        self.delete_min_df = delete_min_df
        super().__init__(**kwargs)

    def partial_fit(self, raw_documents: List[str]) -> "OnlineCountVectorizer":
        """ Perform a partial fit and update vocabulary with OOV tokens

        Arguments:
            raw_documents: A list of documents

        Returns:
            self: The fitted vectorizer
        """
        # First call: no vocabulary yet, so fall back to a regular fit
        if not hasattr(self, 'vocabulary_'):
            return self.fit(raw_documents)

        analyzer = self.build_analyzer()
        analyzed_documents = [analyzer(doc) for doc in raw_documents]
        new_tokens = set(chain.from_iterable(analyzed_documents))
        oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))

        if oov_tokens:
            # Append OOV tokens after the current highest index so that the
            # existing token -> column mapping stays stable
            max_index = max(self.vocabulary_.values())
            oov_vocabulary = dict(zip(oov_tokens, range(max_index + 1, max_index + 1 + len(oov_tokens))))
            self.vocabulary_.update(oov_vocabulary)

        return self

    def update_bow(self, raw_documents: List[str]) -> csr_matrix:
        """ Create or update the bag-of-words matrix

        Update the bag-of-words matrix by adding the newly transformed
        documents. This may add empty columns if new words are found and/or
        add empty rows if new topics are found.

        During this process, the previous bag-of-words matrix might be
        decayed if `self.decay` has been set during init. Similarly, words
        that do not exceed `self.delete_min_df` are removed from its
        vocabulary and bag-of-words matrix.

        Arguments:
            raw_documents: A list of documents

        Returns:
            X_: Bag-of-words matrix
        """
        if hasattr(self, "X_"):
            X = self.transform(raw_documents)

            # Add empty columns if new words are found.
            # Force CSR output: hstack/vstack default to COO, which does not
            # support the column slicing done in `_clean_bow`.
            columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
            self.X_ = sparse.hstack([self.X_, columns], format="csr")

            # Add empty rows if new topics are found
            rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
            self.X_ = sparse.vstack([self.X_, rows], format="csr")

            # Decay of BoW matrix: down-weight all previous frequencies
            if self.decay is not None:
                self.X_ = self.X_ * (1 - self.decay)

            self.X_ += X
        else:
            # First call: the transform itself is the initial BoW matrix
            self.X_ = self.transform(raw_documents)

        if self.delete_min_df is not None:
            self._clean_bow()

        return self.X_

    def _clean_bow(self) -> None:
        """ Remove words that do not exceed `self.delete_min_df` """
        # Only keep columns whose summed frequency reaches the threshold
        frequencies = np.asarray(self.X_.sum(axis=0)).ravel()
        indices = np.where(frequencies >= self.delete_min_df)[0]
        self.X_ = self.X_[:, indices]

        # Re-map the surviving words to their new (compacted) column indices
        index_to_word = {v: k for k, v in self.vocabulary_.items()}
        self.vocabulary_ = {index_to_word[index]: i for i, index in enumerate(indices)}