kisejin
/

TopicModelingRepo

Model card Files Files and versions

TopicModelingRepo / BERTopic /bertopic /vectorizers /_ctfidf.py

kisejin's picture

Upload 261 files

19b102a verified almost 2 years ago

history blame contribute delete

4.32 kB

	from typing import List
	from sklearn.feature_extraction.text import TfidfTransformer
	from sklearn.preprocessing import normalize
	from sklearn.utils import check_array
	import numpy as np
	import scipy.sparse as sp


	class ClassTfidfTransformer(TfidfTransformer):
	    """
	    A class-based TF-IDF procedure using scikit-learn's TfidfTransformer as a base.

	    ![](../algorithm/c-TF-IDF.svg)

	    c-TF-IDF can best be explained as a TF-IDF formula adapted for multiple classes
	    by joining all documents per class. Thus, each class is converted to a single document
	    instead of a set of documents. The frequency of each word x is extracted
	    for each class c and is l1 normalized. This constitutes the term frequency.

	    Then, the term frequency is multiplied with IDF which is the logarithm of 1 plus
	    the average number of words per class A divided by the frequency of word x
	    across all classes.

	    Arguments:
	        bm25_weighting: Uses BM25-inspired idf-weighting procedure instead of the procedure
	                        as defined in the c-TF-IDF formula. It uses the following weighting scheme:
	                        `log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))`
	        reduce_frequent_words: Takes the square root of the bag-of-words after normalizing the matrix.
	                               Helps to reduce the impact of words that appear too frequently.
	        seed_words: Specific words that will have their idf value increased by
	                    the value of `seed_multiplier`.
	                    NOTE: This will only increase the value of words that have an exact match.
	        seed_multiplier: The value with which the idf values of the words in `seed_words`
	                         are multiplied.

	    Examples:

	    ```python
	    transformer = ClassTfidfTransformer()
	    ```
	    """
	    def __init__(self,
	                 bm25_weighting: bool = False,
	                 reduce_frequent_words: bool = False,
	                 seed_words: List[str] = None,
	                 seed_multiplier: float = 2
	                 ):
	        self.bm25_weighting = bm25_weighting
	        self.reduce_frequent_words = reduce_frequent_words
	        # `seed_words` and `seed_multiplier` are only stored on the instance;
	        # nothing in this class reads them. `fit` applies whatever `multiplier`
	        # array the caller hands it.
	        self.seed_words = seed_words
	        self.seed_multiplier = seed_multiplier
	        super().__init__()

	    def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
	        """Learn the idf vector (global term weights).

	        Arguments:
	            X: A matrix of term/token counts (rows are classes, columns are terms).
	               Dense input is accepted and converted to a sparse CSR matrix.
	            multiplier: A multiplier for increasing/decreasing certain IDF scores.
	                        It is multiplied element-wise with the idf vector, so it
	                        must broadcast against one value per term.

	        Returns:
	            self: The transformer itself. When `use_idf` is set, the idf weights
	                  are stored as a sparse diagonal matrix in `_idf_diag`.
	        """
	        X = check_array(X, accept_sparse=('csr', 'csc'))
	        if not sp.issparse(X):
	            X = sp.csr_matrix(X)
	        dtype = np.float64

	        if self.use_idf:
	            _, n_features = X.shape

	            # Calculate the frequency of words across all classes.
	            # `ravel` (instead of `squeeze`) keeps this 1-D even for a
	            # single-term vocabulary; `squeeze` would collapse it to a 0-d
	            # value, which `sp.diags` cannot use as a diagonal.
	            df = np.asarray(X.sum(axis=0)).ravel()

	            # Calculate the average number of samples as regularization
	            # (truncated to an integer).
	            avg_nr_samples = int(X.sum(axis=1).mean())

	            # BM25-inspired weighting procedure
	            if self.bm25_weighting:
	                idf = np.log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))

	            # Divide the average number of samples by the word frequency
	            # +1 is added to force values to be positive
	            else:
	                idf = np.log((avg_nr_samples / df)+1)

	            # Multiplier to increase/decrease certain idf scores
	            if multiplier is not None:
	                idf = idf * multiplier

	            # Store the idf vector on a diagonal so that `transform` can
	            # apply it to every row with one sparse matrix product.
	            self._idf_diag = sp.diags(idf, offsets=0,
	                                      shape=(n_features, n_features),
	                                      format='csr',
	                                      dtype=dtype)

	        return self

	    def transform(self, X: sp.csr_matrix):
	        """Transform a count-based matrix to c-TF-IDF

	        Arguments:
	            X (sparse matrix): A matrix of term/token counts.

	        Returns:
	            X (sparse matrix): A c-TF-IDF matrix. When `use_idf` is not set,
	                               the input is returned unchanged.
	        """
	        if self.use_idf:
	            # l1-normalize every row to obtain the term frequencies.
	            # NOTE: `copy=False` asks for in-place normalization, so the
	            # matrix passed in by the caller may be modified.
	            X = normalize(X, axis=1, norm='l1', copy=False)

	            # Dampen very frequent words by taking the square root of the
	            # stored (non-zero) term frequencies.
	            if self.reduce_frequent_words:
	                X.data = np.sqrt(X.data)

	            # Scale each term column by its idf weight learned in `fit`.
	            X = X * self._idf_diag

	        return X