import numpy as np
from itertools import chain
from typing import List
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
class OnlineCountVectorizer(CountVectorizer):
    """ An online variant of the CountVectorizer with updating vocabulary.

    At each `.partial_fit`, its vocabulary is updated based on any OOV words
    it might find. Then, `.update_bow` can be used to track and update
    the Bag-of-Words representation. These functions are separated such that
    the vectorizer can be used in iteration without updating the Bag-of-Words
    representation, which might speed up the fitting process. However, the
    `.update_bow` function is used in BERTopic to track changes in the
    topic representations and allow for decay.

    This class inherits its parameters and attributes from:
    `sklearn.feature_extraction.text.CountVectorizer`

    Arguments:
        decay: A value between [0, 1] to weight the percentage of frequencies
               the previous bag-of-words should be decreased. For example,
               a value of `.1` will decrease the frequencies in the bag-of-words
               matrix with 10% at each iteration.
        delete_min_df: Delete words at each iteration from its vocabulary
                       that are below a minimum frequency.
                       This will keep the resulting bag-of-words matrix small
                       such that it does not explode in size with increasing
                       vocabulary. If `decay` is None then this equals `min_df`.
        **kwargs: Set of parameters inherited from:
                  `sklearn.feature_extraction.text.CountVectorizer`
                  In practice, this means that you can still use parameters
                  from the original CountVectorizer, like `stop_words` and
                  `ngram_range`.

    Attributes:
        X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation

    Examples:

    ```python
    from bertopic.vectorizers import OnlineCountVectorizer
    vectorizer = OnlineCountVectorizer(stop_words="english")

    for index, doc in enumerate(my_docs):
        vectorizer.partial_fit(doc)

        # Update and clean the bow every 100 iterations:
        if index % 100 == 0:
            X = vectorizer.update_bow()
    ```

    To use the model in BERTopic:

    ```python
    from bertopic import BERTopic
    from bertopic.vectorizers import OnlineCountVectorizer

    vectorizer_model = OnlineCountVectorizer(stop_words="english")
    topic_model = BERTopic(vectorizer_model=vectorizer_model)
    ```

    References:
        Adapted from: https://github.com/idoshlomo/online_vectorizers
    """
    def __init__(self,
                 decay: float = None,
                 delete_min_df: float = None,
                 **kwargs):
        self.decay = decay
        self.delete_min_df = delete_min_df
        super().__init__(**kwargs)

    def partial_fit(self, raw_documents: List[str]) -> "OnlineCountVectorizer":
        """ Perform a partial fit and update vocabulary with OOV tokens

        Arguments:
            raw_documents: A list of documents

        Returns:
            self: The fitted vectorizer with an updated vocabulary
        """
        # First call: no vocabulary yet, so defer to the regular `fit`
        # (which also returns `self`)
        if not hasattr(self, 'vocabulary_'):
            return self.fit(raw_documents)

        analyzer = self.build_analyzer()
        analyzed_documents = [analyzer(doc) for doc in raw_documents]
        new_tokens = set(chain.from_iterable(analyzed_documents))
        oov_tokens = new_tokens.difference(self.vocabulary_.keys())

        if oov_tokens:
            # Append OOV tokens after the current highest index so the
            # column positions of existing words remain stable
            max_index = max(self.vocabulary_.values())
            oov_vocabulary = {token: max_index + 1 + i
                              for i, token in enumerate(oov_tokens)}
            self.vocabulary_.update(oov_vocabulary)

        return self

    def update_bow(self, raw_documents: List[str]) -> csr_matrix:
        """ Create or update the bag-of-words matrix

        Update the bag-of-words matrix by adding the newly transformed
        documents. This may add empty columns if new words are found and/or
        add empty rows if new topics are found.

        During this process, the previous bag-of-words matrix might be
        decayed if `self.decay` has been set during init. Similarly, words
        that do not exceed `self.delete_min_df` are removed from its
        vocabulary and bag-of-words matrix.

        Arguments:
            raw_documents: A list of documents

        Returns:
            X_: Bag-of-words matrix
        """
        if hasattr(self, "X_"):
            X = self.transform(raw_documents)

            # Add empty columns if new words are found.
            # NOTE: `sparse.hstack`/`vstack` return COO matrices, which do not
            # support the column slicing `_clean_bow` relies on — convert
            # explicitly to CSR to keep `self.X_` consistently sliceable.
            columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
            self.X_ = sparse.hstack([self.X_, columns]).tocsr()

            # Add empty rows if new topics are found
            rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
            self.X_ = sparse.vstack([self.X_, rows]).tocsr()

            # Decay of BoW matrix before adding the new frequencies
            if self.decay is not None:
                self.X_ = self.X_ * (1 - self.decay)

            self.X_ += X
        else:
            # First call: the transformed documents are the full BoW
            self.X_ = self.transform(raw_documents)

        if self.delete_min_df is not None:
            self._clean_bow()

        return self.X_

    def _clean_bow(self) -> None:
        """ Remove words that do not exceed `self.delete_min_df` """
        # `.sum(0)` yields a (1, n_words) matrix, so `np.where(...)[1]` gives
        # the column indices of words whose total frequency is high enough
        indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1]
        self.X_ = self.X_[:, indices]

        # Re-map the surviving words to their new, compacted column indices
        inverse_vocabulary = {index: word for word, index in self.vocabulary_.items()}
        self.vocabulary_ = {inverse_vocabulary[index]: new_index
                            for new_index, index in enumerate(indices)}