|
|
| import copy |
| import pytest |
| import numpy as np |
| import pandas as pd |
| from packaging import version |
| from scipy.sparse import csr_matrix |
| from sklearn import __version__ as sklearn_version |
| from sklearn.feature_extraction.text import CountVectorizer |
| from bertopic.vectorizers import ClassTfidfTransformer |
|
|
|
|
@pytest.mark.parametrize(
    'model',
    [
        'kmeans_pca_topic_model',
        'base_topic_model',
        'custom_topic_model',
        'merged_topic_model',
        'reduced_topic_model',
        'online_topic_model',
    ],
)
def test_ctfidf(model, documents, request):
    """Check the bag-of-words and c-TF-IDF matrices built per topic.

    For every parametrized model fixture, the documents are joined per
    topic, run through the model's own vectorizer and then through a
    freshly fitted ``ClassTfidfTransformer``. The test verifies the
    vocabulary and the type and shape of both resulting matrices.

    Args:
        model: Name of the topic-model fixture to resolve via ``request``.
        documents: Fixture holding the documents the model was fitted on.
        request: Pytest's built-in fixture, used to look up ``model``.
    """
    # Work on a copy so the fixture instance itself stays untouched.
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topics = topic_model.topics_
    n_topics = len(set(topics))

    # Build one joined document per topic.
    frame = pd.DataFrame({
        "Document": documents,
        "ID": range(len(documents)),
        "Topic": topics,
    })
    grouped = frame.groupby(['Topic'], as_index=False)
    per_topic = grouped.agg({'Document': ' '.join})
    cleaned = topic_model._preprocess_text(per_topic.Document.values)
    vectorizer = topic_model.vectorizer_model.fit(cleaned)

    # From scikit-learn 1.0.0 on, read the vocabulary through
    # get_feature_names_out; older versions use get_feature_names.
    has_names_out = version.parse(sklearn_version) >= version.parse("1.0.0")
    words = (
        vectorizer.get_feature_names_out()
        if has_names_out
        else vectorizer.get_feature_names()
    )

    # Term counts per topic, then their c-TF-IDF representation.
    X = vectorizer.transform(cleaned)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    # Vocabulary: sufficiently large and made up of strings only.
    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    # Both matrices are sparse CSR matrices.
    assert isinstance(X, csr_matrix)
    assert isinstance(c_tf_idf, csr_matrix)

    # One row per distinct topic, one column per vocabulary entry.
    assert X.shape[0] == n_topics
    assert X.shape[1] == len(words)

    assert c_tf_idf.shape[0] == n_topics
    assert c_tf_idf.shape[1] == len(words)

    # The smallest count in the bag-of-words matrix is zero.
    assert np.min(X) == 0
|
|
|
|
@pytest.mark.parametrize(
    'model',
    [
        'kmeans_pca_topic_model',
        'base_topic_model',
        'custom_topic_model',
        'merged_topic_model',
        'reduced_topic_model',
        'online_topic_model',
    ],
)
def test_ctfidf_custom_cv(model, documents, request):
    """Check the per-topic matrices when a custom CountVectorizer is used.

    For every parametrized model fixture, the model's vectorizer is
    replaced by a ``CountVectorizer`` with ``ngram_range=(1, 3)`` and
    English stop words. The documents are joined per topic, vectorized
    and passed through a freshly fitted ``ClassTfidfTransformer``. The
    test verifies the vocabulary and the type and shape of both matrices.

    Args:
        model: Name of the topic-model fixture to resolve via ``request``.
        documents: Fixture holding the documents the model was fitted on.
        request: Pytest's built-in fixture, used to look up ``model``.
    """
    # Work on a copy so swapping the vectorizer does not touch the fixture.
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topic_model.vectorizer_model = CountVectorizer(
        ngram_range=(1, 3),
        stop_words="english",
    )
    topics = topic_model.topics_
    n_topics = len(set(topics))

    # Build one joined document per topic.
    frame = pd.DataFrame({
        "Document": documents,
        "ID": range(len(documents)),
        "Topic": topics,
    })
    grouped = frame.groupby(['Topic'], as_index=False)
    per_topic = grouped.agg({'Document': ' '.join})
    cleaned = topic_model._preprocess_text(per_topic.Document.values)
    vectorizer = topic_model.vectorizer_model.fit(cleaned)

    # From scikit-learn 1.0.0 on, read the vocabulary through
    # get_feature_names_out; older versions use get_feature_names.
    has_names_out = version.parse(sklearn_version) >= version.parse("1.0.0")
    words = (
        vectorizer.get_feature_names_out()
        if has_names_out
        else vectorizer.get_feature_names()
    )

    # Term counts per topic, then their c-TF-IDF representation.
    X = vectorizer.transform(cleaned)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    # Vocabulary: sufficiently large and made up of strings only.
    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    # Both matrices are sparse CSR matrices.
    assert isinstance(X, csr_matrix)
    assert isinstance(c_tf_idf, csr_matrix)

    # One row per distinct topic, one column per vocabulary entry.
    assert X.shape[0] == n_topics
    assert X.shape[1] == len(words)

    assert c_tf_idf.shape[0] == n_topics
    assert c_tf_idf.shape[1] == len(words)

    # The smallest count in the bag-of-words matrix is zero.
    assert np.min(X) == 0
|
|