| | from typing import List |
| | from sklearn.feature_extraction.text import TfidfTransformer |
| | from sklearn.preprocessing import normalize |
| | from sklearn.utils import check_array |
| | import numpy as np |
| | import scipy.sparse as sp |
| |
|
| |
|
class ClassTfidfTransformer(TfidfTransformer):
    """
    A Class-based TF-IDF procedure using scikit-learn's TfidfTransformer as a base.

    c-TF-IDF can best be explained as a TF-IDF formula adapted for multiple classes
    by joining all documents per class. Thus, each class is converted to a single document
    instead of a set of documents. The frequency of each word **x** is extracted
    for each class **c** and is **l1** normalized. This constitutes the term frequency.

    Then, the term frequency is multiplied with IDF which is the logarithm of 1 plus
    the average number of words per class **A** divided by the frequency of word **x**
    across all classes.

    Arguments:
        bm25_weighting: Uses BM25-inspired idf-weighting procedure instead of the procedure
                        as defined in the c-TF-IDF formula. It uses the following weighting scheme:
                        `log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))`
        reduce_frequent_words: Takes the square root of the bag-of-words after normalizing the matrix.
                               Helps to reduce the impact of words that appear too frequently.
        seed_words: Specific words that will have their idf value increased by
                    the value of `seed_multiplier`.
                    NOTE: This will only increase the value of words that have an exact match.
                    NOTE(review): `fit`/`transform` below never read this attribute;
                    presumably the caller turns it into the `multiplier` passed
                    to `fit` - confirm against the caller.
        seed_multiplier: The value with which the idf values of the words in `seed_words`
                         are multiplied. Only stored here, like `seed_words`.

    Examples:

    ```python
    transformer = ClassTfidfTransformer()
    ```
    """
    def __init__(self,
                 bm25_weighting: bool = False,
                 reduce_frequent_words: bool = False,
                 seed_words: List[str] = None,
                 seed_multiplier: float = 2
                 ):
        self.bm25_weighting = bm25_weighting
        self.reduce_frequent_words = reduce_frequent_words
        self.seed_words = seed_words
        self.seed_multiplier = seed_multiplier
        # The base initializer is called without arguments, so every
        # TfidfTransformer option (including `use_idf`, which gates both
        # `fit` and `transform`) keeps its scikit-learn default.
        super().__init__()

    def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
        """Learn the idf vector (global term weights).

        Arguments:
            X: A matrix of term/token counts. Dense input is accepted and
               converted to CSR.
            multiplier: A multiplier for increasing/decreasing certain IDF scores.
                        It is multiplied element-wise with the idf vector.

        Returns:
            self: The fitted transformer. When `use_idf` is truthy, the idf
                  weights are stored as a diagonal CSR matrix in `_idf_diag`.
        """
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)

        if self.use_idf:
            _, n_features = X.shape

            # Frequency of each word across all classes (column sums).
            # `ravel` is used instead of `np.squeeze` so the result stays 1-D
            # even for a single-feature vocabulary; a 0-d array would make
            # `sp.diags` below fail.
            df = np.asarray(X.sum(axis=0)).ravel()

            # Average number of counts per class (mean of the row sums),
            # truncated to an integer; used as the regularization term.
            avg_nr_samples = int(X.sum(axis=1).mean())

            if self.bm25_weighting:
                # BM25-inspired weighting procedure.
                idf = np.log(1 + ((avg_nr_samples - df + 0.5) / (df + 0.5)))
            else:
                # Divide the average number of samples by the word frequency;
                # the +1 keeps the logarithm non-negative.
                # NOTE: a column whose total count is 0 divides by zero here
                # and ends up with a non-finite idf value.
                idf = np.log((avg_nr_samples / df) + 1)

            # Optional multiplier to increase/decrease certain idf scores.
            if multiplier is not None:
                idf = idf * multiplier

            self._idf_diag = sp.diags(idf, offsets=0,
                                      shape=(n_features, n_features),
                                      format='csr',
                                      dtype=np.float64)

        return self

    def transform(self, X: sp.csr_matrix):
        """Transform a count-based matrix to c-TF-IDF

        Arguments:
            X (sparse matrix): A matrix of term/token counts.
               NOTE(review): normalization is requested with `copy=False`, so
               the caller's matrix may be modified in place - confirm callers
               do not reuse the raw counts afterwards.

        Returns:
            X (sparse matrix): A c-TF-IDF matrix. When `use_idf` is falsy the
               input is returned unchanged.
        """
        if self.use_idf:
            # l1-normalize every row so each class becomes a term-frequency
            # distribution.
            X = normalize(X, axis=1, norm='l1', copy=False)

            if self.reduce_frequent_words:
                # Square root of the stored (non-zero) entries only.
                X.data = np.sqrt(X.data)

            # Scale every column by its idf weight. `@` is a matrix product
            # for both scipy sparse matrices and sparse arrays, whereas `*`
            # is element-wise for sparse arrays.
            X = X @ self._idf_diag

        return X
| |
|