from itertools import chain
from typing import List, Optional

import numpy as np
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
|
class OnlineCountVectorizer(CountVectorizer):
    """ An online variant of the CountVectorizer with updating vocabulary.

    At each `.partial_fit`, its vocabulary is updated based on any OOV words
    it might find. Then, `.update_bow` can be used to track and update
    the Bag-of-Words representation. These functions are separated such that
    the vectorizer can be used in iteration without updating the Bag-of-Words
    representation, which might speed up the fitting process. However, the
    `.update_bow` function is used in BERTopic to track changes in the
    topic representations and allow for decay.

    This class inherits its parameters and attributes from:
    `sklearn.feature_extraction.text.CountVectorizer`

    Arguments:
        decay: A value between [0, 1] to weight the percentage of frequencies
               the previous bag-of-words should be decreased. For example,
               a value of `.1` will decrease the frequencies in the bag-of-words
               matrix with 10% at each iteration.
        delete_min_df: Delete words at each iteration from its vocabulary
                       that are below a minimum frequency.
                       This will keep the resulting bag-of-words matrix small
                       such that it does not explode in size with increasing
                       vocabulary. If `decay` is None then this equals `min_df`.
        **kwargs: Set of parameters inherited from:
                  `sklearn.feature_extraction.text.CountVectorizer`
                  In practice, this means that you can still use parameters
                  from the original CountVectorizer, like `stop_words` and
                  `ngram_range`.

    Attributes:
        X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation

    Examples:

    ```python
    from bertopic.vectorizers import OnlineCountVectorizer
    vectorizer = OnlineCountVectorizer(stop_words="english")

    for index, doc in enumerate(my_docs):
        vectorizer.partial_fit(doc)

        # Update and clean the bow every 100 iterations:
        if index % 100 == 0:
            X = vectorizer.update_bow()
    ```

    To use the model in BERTopic:

    ```python
    from bertopic import BERTopic
    from bertopic.vectorizers import OnlineCountVectorizer

    vectorizer_model = OnlineCountVectorizer(stop_words="english")
    topic_model = BERTopic(vectorizer_model=vectorizer_model)
    ```

    References:
        Adapted from: https://github.com/idoshlomo/online_vectorizers
    """
    def __init__(self,
                 decay: Optional[float] = None,
                 delete_min_df: Optional[float] = None,
                 **kwargs):
        self.decay = decay
        self.delete_min_df = delete_min_df
        super().__init__(**kwargs)

    def partial_fit(self, raw_documents: List[str]) -> "OnlineCountVectorizer":
        """ Perform a partial fit and update vocabulary with OOV tokens

        Arguments:
            raw_documents: A list of documents

        Returns:
            self: The (partially) fitted vectorizer
        """
        # First call: no vocabulary yet, so a regular fit builds it
        if not hasattr(self, 'vocabulary_'):
            return self.fit(raw_documents)

        # Tokenize with the configured analyzer and collect tokens
        # that are not yet part of the vocabulary
        analyzer = self.build_analyzer()
        analyzed_documents = [analyzer(doc) for doc in raw_documents]
        new_tokens = set(chain.from_iterable(analyzed_documents))
        oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))

        # Assign fresh, contiguous indices to the OOV tokens so the
        # vocabulary keeps mapping onto consecutive matrix columns
        if oov_tokens:
            max_index = max(self.vocabulary_.values())
            oov_vocabulary = dict(zip(oov_tokens,
                                      range(max_index + 1, max_index + 1 + len(oov_tokens))))
            self.vocabulary_.update(oov_vocabulary)

        return self

    def update_bow(self, raw_documents: List[str]) -> csr_matrix:
        """ Create or update the bag-of-words matrix

        Update the bag-of-words matrix by adding the newly transformed
        documents. This may add empty columns if new words are found and/or
        add empty rows if new topics are found.

        During this process, the previous bag-of-words matrix might be
        decayed if `self.decay` has been set during init. Similarly, words
        that do not exceed `self.delete_min_df` are removed from its
        vocabulary and bag-of-words matrix.

        Arguments:
            raw_documents: A list of documents

        Returns:
            X_: Bag-of-words matrix
        """
        if hasattr(self, "X_"):
            X = self.transform(raw_documents)

            # Pad with empty columns for any words added to the vocabulary
            # since the previous update. `format="csr"` is explicit because
            # hstack/vstack return COO by default, which does not support
            # the column slicing performed in `._clean_bow`.
            columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
            self.X_ = sparse.hstack([self.X_, columns], format="csr")

            # Pad with empty rows for any newly found topics
            rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
            self.X_ = sparse.vstack([self.X_, rows], format="csr")

            # Decay the previous frequencies before adding the new counts
            if self.decay is not None:
                self.X_ = self.X_ * (1 - self.decay)

            self.X_ += X
        else:
            self.X_ = self.transform(raw_documents)

        if self.delete_min_df is not None:
            self._clean_bow()

        return self.X_

    def _clean_bow(self) -> None:
        """ Remove words that do not exceed `self.delete_min_df` """
        # Only keep the columns whose summed frequency meets the threshold
        indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1]
        self.X_ = self.X_[:, indices]

        # Rebuild the vocabulary so every kept word maps onto its new,
        # contiguous column position
        index_to_word = {index: word for word, index in self.vocabulary_.items()}
        self.vocabulary_ = {index_to_word[index]: new_index
                            for new_index, index in enumerate(indices)}
|