import copy import pytest import numpy as np import pandas as pd from packaging import version from scipy.sparse import csr_matrix from sklearn import __version__ as sklearn_version from sklearn.feature_extraction.text import CountVectorizer from bertopic.vectorizers import ClassTfidfTransformer @pytest.mark.parametrize('model', [('kmeans_pca_topic_model'), ('base_topic_model'), ('custom_topic_model'), ('merged_topic_model'), ('reduced_topic_model'), ('online_topic_model')]) def test_ctfidf(model, documents, request): topic_model = copy.deepcopy(request.getfixturevalue(model)) topics = topic_model.topics_ documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics}) documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) documents = topic_model._preprocess_text(documents_per_topic.Document.values) count = topic_model.vectorizer_model.fit(documents) # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0 # and will be removed in 1.2. Please use get_feature_names_out instead. if version.parse(sklearn_version) >= version.parse("1.0.0"): words = count.get_feature_names_out() else: words = count.get_feature_names() X = count.transform(documents) transformer = ClassTfidfTransformer().fit(X) c_tf_idf = transformer.transform(X) assert len(words) > 1000 assert all([isinstance(x, str) for x in words]) assert isinstance(X, csr_matrix) assert isinstance(c_tf_idf, csr_matrix) assert X.shape[0] == len(set(topics)) assert X.shape[1] == len(words) assert c_tf_idf.shape[0] == len(set(topics)) assert c_tf_idf.shape[1] == len(words) assert np.min(X) == 0 @pytest.mark.parametrize('model', [('kmeans_pca_topic_model'), ('base_topic_model'), ('custom_topic_model'), ('merged_topic_model'), ('reduced_topic_model'), ('online_topic_model')]) def test_ctfidf_custom_cv(model, documents, request): cv = CountVectorizer(ngram_range=(1, 3), stop_words="english") topic_model = copy.deepcopy(request.getfixturevalue(model)) topic_model.vectorizer_model = cv topics = topic_model.topics_ documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics}) documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) documents = topic_model._preprocess_text(documents_per_topic.Document.values) count = topic_model.vectorizer_model.fit(documents) # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0 # and will be removed in 1.2. Please use get_feature_names_out instead. if version.parse(sklearn_version) >= version.parse("1.0.0"): words = count.get_feature_names_out() else: words = count.get_feature_names() X = count.transform(documents) transformer = ClassTfidfTransformer().fit(X) c_tf_idf = transformer.transform(X) assert len(words) > 1000 assert all([isinstance(x, str) for x in words]) assert isinstance(X, csr_matrix) assert isinstance(c_tf_idf, csr_matrix) assert X.shape[0] == len(set(topics)) assert X.shape[1] == len(words) assert c_tf_idf.shape[0] == len(set(topics)) assert c_tf_idf.shape[1] == len(words) assert np.min(X) == 0