|
|
import numpy as np |
|
|
from itertools import chain |
|
|
from typing import List |
|
|
|
|
|
from scipy import sparse |
|
|
from scipy.sparse import csr_matrix |
|
|
|
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
|
|
|
|
|
|
|
class OnlineCountVectorizer(CountVectorizer):
    """ An online variant of the CountVectorizer with updating vocabulary.

    At each `.partial_fit`, its vocabulary is updated based on any OOV words
    it might find. Then, `.update_bow` can be used to track and update
    the Bag-of-Words representation. These functions are separated such that
    the vectorizer can be used in iteration without updating the Bag-of-Words
    representation, which might speed up the fitting process. However, the
    `.update_bow` function is used in BERTopic to track changes in the
    topic representations and allow for decay.

    This class inherits its parameters and attributes from:
        `sklearn.feature_extraction.text.CountVectorizer`

    Arguments:
        decay: A value between [0, 1] to weight the percentage of frequencies
               the previous bag-of-words should be decreased. For example,
               a value of `.1` will decrease the frequencies in the bag-of-words
               matrix with 10% at each iteration.
        delete_min_df: Delete words at each iteration from its vocabulary
                       that are below a minimum frequency.
                       This will keep the resulting bag-of-words matrix small
                       such that it does not explode in size with increasing
                       vocabulary. If `decay` is None then this equals `min_df`.
        **kwargs: Set of parameters inherited from:
                  `sklearn.feature_extraction.text.CountVectorizer`
                  In practice, this means that you can still use parameters
                  from the original CountVectorizer, like `stop_words` and
                  `ngram_range`.

    Attributes:
        X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation

    Examples:

    ```python
    from bertopic.vectorizers import OnlineCountVectorizer
    vectorizer = OnlineCountVectorizer(stop_words="english")

    for index, doc in enumerate(my_docs):
        vectorizer.partial_fit(doc)

        # Update and clean the bow every 100 iterations:
        if index % 100 == 0:
            X = vectorizer.update_bow()
    ```

    To use the model in BERTopic:

    ```python
    from bertopic import BERTopic
    from bertopic.vectorizers import OnlineCountVectorizer

    vectorizer_model = OnlineCountVectorizer(stop_words="english")
    topic_model = BERTopic(vectorizer_model=vectorizer_model)
    ```

    References:
        Adapted from: https://github.com/idoshlomo/online_vectorizers
    """
    def __init__(self,
                 decay: float = None,
                 delete_min_df: float = None,
                 **kwargs):
        self.decay = decay
        self.delete_min_df = delete_min_df
        super().__init__(**kwargs)

    def partial_fit(self, raw_documents: List[str]) -> None:
        """ Perform a partial fit and update vocabulary with OOV tokens

        Arguments:
            raw_documents: A list of documents
        """
        # First call: no vocabulary yet, so delegate to a regular full fit
        if not hasattr(self, 'vocabulary_'):
            return self.fit(raw_documents)

        analyzer = self.build_analyzer()
        analyzed_documents = [analyzer(doc) for doc in raw_documents]
        new_tokens = set(chain.from_iterable(analyzed_documents))
        oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))

        # Append out-of-vocabulary tokens, continuing the existing index sequence
        if oov_tokens:
            max_index = max(self.vocabulary_.values())
            oov_vocabulary = dict(zip(oov_tokens,
                                      range(max_index + 1, max_index + 1 + len(oov_tokens))))
            self.vocabulary_.update(oov_vocabulary)

        return self

    def update_bow(self, raw_documents: List[str]) -> csr_matrix:
        """ Create or update the bag-of-words matrix

        Update the bag-of-words matrix by adding the newly transformed
        documents. This may add empty columns if new words are found and/or
        add empty rows if new topics are found.

        During this process, the previous bag-of-words matrix might be
        decayed if `self.decay` has been set during init. Similarly, words
        that do not exceed `self.delete_min_df` are removed from its
        vocabulary and bag-of-words matrix.

        Arguments:
            raw_documents: A list of documents

        Returns:
            X_: Bag-of-words matrix
        """
        if hasattr(self, "X_"):
            X = self.transform(raw_documents)

            # Add empty columns if new words were added to the vocabulary.
            # `.tocsr()` is needed because hstack/vstack return COO matrices,
            # whereas `X_` is documented (and sliced later) as a csr_matrix.
            columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)
            self.X_ = sparse.hstack([self.X_, columns]).tocsr()

            # Add empty rows if new topics were found
            rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)
            self.X_ = sparse.vstack([self.X_, rows]).tocsr()

            # Decay the previous frequencies before adding the new counts
            if self.decay is not None:
                self.X_ = self.X_ * (1 - self.decay)

            self.X_ += X
        else:
            # First call: the transform itself is the initial bag-of-words
            self.X_ = self.transform(raw_documents)

        if self.delete_min_df is not None:
            self._clean_bow()

        return self.X_

    def _clean_bow(self) -> None:
        """ Remove words that do not exceed `self.delete_min_df` """
        # Keep only the columns whose summed frequency meets the threshold.
        # `.sum(0)` yields a (1, n_words) matrix, so `np.where(...)[1]` gives
        # the surviving column indices.
        indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1]
        self.X_ = self.X_[:, indices]

        # Rebuild the vocabulary so each kept word maps to its new column
        # position after slicing.
        vocabulary_dict = {v: k for k, v in self.vocabulary_.items()}
        self.vocabulary_ = {vocabulary_dict[index]: i for i, index in enumerate(indices)}
|
|
|