File size: 3,807 Bytes
19b102a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

import copy
import pytest
import numpy as np
import pandas as pd
from packaging import version
from scipy.sparse import csr_matrix
from sklearn import __version__ as sklearn_version
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer


@pytest.mark.parametrize('model', [('kmeans_pca_topic_model'),
                                   ('base_topic_model'),
                                   ('custom_topic_model'),
                                   ('merged_topic_model'),
                                   ('reduced_topic_model'),
                                   ('online_topic_model')])
def test_ctfidf(model, documents, request):
    """Run the default c-TF-IDF pipeline on a fitted model's topics and verify
    that both the count matrix and the c-TF-IDF matrix are sparse with one row
    per topic and one column per vocabulary term."""
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topics = topic_model.topics_
    frame = pd.DataFrame({"Document": documents,
                          "ID": range(len(documents)),
                          "Topic": topics})
    # Collapse all documents of a topic into a single pseudo-document
    per_topic = frame.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(per_topic.Document.values)
    count = topic_model.vectorizer_model.fit(cleaned_docs)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()

    X = count.transform(cleaned_docs)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    n_topics = len(set(topics))

    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    assert isinstance(X, csr_matrix)
    assert isinstance(c_tf_idf, csr_matrix)

    # One row per topic, one column per vocabulary term
    assert X.shape == (n_topics, len(words))
    assert c_tf_idf.shape == (n_topics, len(words))

    # Raw counts are never negative
    assert np.min(X) == 0


@pytest.mark.parametrize('model', [('kmeans_pca_topic_model'),
                                   ('base_topic_model'),
                                   ('custom_topic_model'),
                                   ('merged_topic_model'),
                                   ('reduced_topic_model'),
                                   ('online_topic_model')])
def test_ctfidf_custom_cv(model, documents, request):
    """Same pipeline as ``test_ctfidf`` but with a user-supplied CountVectorizer
    (1-3 grams, English stop words) swapped into the topic model, verifying the
    resulting sparse matrices keep the topic-by-vocabulary shape."""
    custom_vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words="english")
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topic_model.vectorizer_model = custom_vectorizer
    topics = topic_model.topics_
    frame = pd.DataFrame({"Document": documents,
                          "ID": range(len(documents)),
                          "Topic": topics})
    # Collapse all documents of a topic into a single pseudo-document
    per_topic = frame.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(per_topic.Document.values)
    count = topic_model.vectorizer_model.fit(cleaned_docs)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()

    X = count.transform(cleaned_docs)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    n_topics = len(set(topics))

    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    assert isinstance(X, csr_matrix)
    assert isinstance(c_tf_idf, csr_matrix)

    # One row per topic, one column per vocabulary term
    assert X.shape == (n_topics, len(words))
    assert c_tf_idf.shape == (n_topics, len(words))

    # Raw counts are never negative
    assert np.min(X) == 0