|
|
|
|
|
import copy |
|
|
import pytest |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from packaging import version |
|
|
from scipy.sparse import csr_matrix |
|
|
from sklearn import __version__ as sklearn_version |
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
|
from bertopic.vectorizers import ClassTfidfTransformer |
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('model', ['kmeans_pca_topic_model',
                                   'base_topic_model',
                                   'custom_topic_model',
                                   'merged_topic_model',
                                   'reduced_topic_model',
                                   'online_topic_model'])
def test_ctfidf(model, documents, request):
    """Build a c-TF-IDF representation from a fitted topic model and check
    the vocabulary plus the shapes/types of the resulting sparse matrices."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topics = topic_model.topics_

    # Collapse all documents that share a topic into one long string per topic
    frame = pd.DataFrame({"Document": documents,
                          "ID": range(len(documents)),
                          "Topic": topics})
    per_topic = frame.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(per_topic.Document.values)

    vectorizer = topic_model.vectorizer_model.fit(cleaned_docs)

    # sklearn >= 1.0 renamed get_feature_names to get_feature_names_out
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    X = vectorizer.transform(cleaned_docs)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    # Vocabulary should be sizeable and purely string-typed
    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    # Both the count matrix and its c-TF-IDF weighting stay sparse
    assert isinstance(X, csr_matrix)
    assert isinstance(c_tf_idf, csr_matrix)

    # One row per topic, one column per vocabulary term
    assert X.shape[0] == len(set(topics))
    assert X.shape[1] == len(words)

    assert c_tf_idf.shape[0] == len(set(topics))
    assert c_tf_idf.shape[1] == len(words)

    # Raw term counts can never be negative
    assert np.min(X) == 0
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('model', ['kmeans_pca_topic_model',
                                   'base_topic_model',
                                   'custom_topic_model',
                                   'merged_topic_model',
                                   'reduced_topic_model',
                                   'online_topic_model'])
def test_ctfidf_custom_cv(model, documents, request):
    """Same pipeline as ``test_ctfidf`` but with a user-supplied
    CountVectorizer (n-grams + English stop words) swapped into the model."""
    cv = CountVectorizer(ngram_range=(1, 3), stop_words="english")
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topic_model.vectorizer_model = cv
    topics = topic_model.topics_

    # Collapse all documents that share a topic into one long string per topic
    frame = pd.DataFrame({"Document": documents,
                          "ID": range(len(documents)),
                          "Topic": topics})
    per_topic = frame.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(per_topic.Document.values)

    vectorizer = topic_model.vectorizer_model.fit(cleaned_docs)

    # sklearn >= 1.0 renamed get_feature_names to get_feature_names_out
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    X = vectorizer.transform(cleaned_docs)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    # Vocabulary should be sizeable and purely string-typed
    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    # Both the count matrix and its c-TF-IDF weighting stay sparse
    assert isinstance(X, csr_matrix)
    assert isinstance(c_tf_idf, csr_matrix)

    # One row per topic, one column per vocabulary term
    assert X.shape[0] == len(set(topics))
    assert X.shape[1] == len(words)

    assert c_tf_idf.shape[0] == len(set(topics))
    assert c_tf_idf.shape[1] == len(words)

    # Raw term counts can never be negative
    assert np.min(X) == 0
|
|
|