# File size: 4,202 Bytes
# 19b102a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import copy
import pytest
import numpy as np
import pandas as pd


@pytest.mark.parametrize(
    'model',
    [
        'kmeans_pca_topic_model',
        'base_topic_model',
        'custom_topic_model',
        'merged_topic_model',
        'reduced_topic_model',
        'online_topic_model',
    ],
)
def test_get_topic(model, request):
    """Exercise `get_topic` for every assigned topic and for topic id 500.

    The fixture named by `model` is resolved through `request` and deep-copied
    before use. The test asserts that:

    * no lookup for a topic present in `topics_` returns the literal `False`;
    * the number of distinct topics equals the number of rows returned by
      `get_topic_info()`;
    * looking up topic id 500 yields a falsy value (500 is presumably an id
      none of the fixtures contain -- confirm against the fixtures).
    """
    fitted = copy.deepcopy(request.getfixturevalue(model))
    lookups = [fitted.get_topic(topic_id) for topic_id in set(fitted.topics_)]
    missing_lookup = fitted.get_topic(500)

    # Identity check on purpose: only the literal False counts as a failure.
    assert all(result is not False for result in lookups)

    info_rows = fitted.get_topic_info()
    assert len(lookups) == len(info_rows)
    assert not missing_lookup

@pytest.mark.parametrize(
    'model',
    [
        'kmeans_pca_topic_model',
        'base_topic_model',
        'custom_topic_model',
        'merged_topic_model',
        'reduced_topic_model',
        'online_topic_model',
    ],
)
def test_get_topics(model, request):
    """Check that `get_topics` agrees with the model's stored representations.

    The fixture named by `model` is resolved through `request` and deep-copied
    before use. The result of `get_topics()` must compare equal to
    `topic_representations_`, and it must have as many keys as there are
    distinct values in `topics_`.
    """
    fitted = copy.deepcopy(request.getfixturevalue(model))
    representations = fitted.get_topics()

    assert representations == fitted.topic_representations_

    key_count = len(representations.keys())
    distinct_topic_count = len(set(fitted.topics_))
    assert key_count == distinct_topic_count


@pytest.mark.parametrize(
    'model',
    [
        'kmeans_pca_topic_model',
        'base_topic_model',
        'custom_topic_model',
        'merged_topic_model',
        'reduced_topic_model',
        'online_topic_model',
    ],
)
def test_get_topic_freq(model, request):
    """Check `get_topic_freq` with and without a topic argument.

    The fixture named by `model` is resolved through `request` and deep-copied
    before use. The test asserts that:

    * called with a single topic, the result is not a `pd.DataFrame`;
    * called with no argument, the result is a `pd.DataFrame` with one row per
      distinct value in `topics_`;
    * the distinct values in `topics_` and the last column of
      `topic_mapper_.mappings_` contain exactly the same topic ids.
    """
    fitted = copy.deepcopy(request.getfixturevalue(model))

    # Per-topic lookups must not hand back a whole frame.
    assert not any(
        isinstance(fitted.get_topic_freq(topic_id), pd.DataFrame)
        for topic_id in set(fitted.topics_)
    )

    freq_table = fitted.get_topic_freq()
    assigned_ids = set(fitted.topics_)
    mapping_table = np.array(fitted.topic_mapper_.mappings_)
    # Only the final column of the mapping table is compared with `topics_`.
    mapper_ids = set(mapping_table[:, -1])

    assert isinstance(freq_table, pd.DataFrame)

    assert len(freq_table) == len(assigned_ids)
    # Empty difference in both directions means the two id sets are equal.
    assert not mapper_ids - assigned_ids
    assert not assigned_ids - mapper_ids


@pytest.mark.parametrize('model', [('base_topic_model'),
                                   ('custom_topic_model'),
                                   ('merged_topic_model'),
                                   ('reduced_topic_model')])
def test_get_representative_docs(model, request):
    """Check the result of `get_representative_docs()` called without arguments.

    The fixture named by `model` is resolved through `request` and deep-copied
    before use. The test asserts that the returned mapping:

    * has as many entries as `topic_sizes_`, as the distinct ids in the last
      column of `topic_mapper_.mappings_`, as the rows of `c_tf_idf_`, and as
      `topic_labels_`;
    * holds exactly three documents for every topic;
    * uses only keys that appear both in `topics_` and in the mapper's last
      column.
    """
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    all_docs = topic_model.get_representative_docs()
    unique_topics = set(topic_model.topics_)
    topics_in_mapper = set(np.array(topic_model.topic_mapper_.mappings_)[:, -1])

    # The per-topic containers on the model must all agree on the topic count.
    assert len(all_docs) == len(topic_model.topic_sizes_.keys())
    assert len(all_docs) == len(topics_in_mapper)
    assert len(all_docs) == topic_model.c_tf_idf_.shape[0]
    assert len(all_docs) == len(topic_model.topic_labels_)
    # Every topic is expected to carry exactly three representative documents.
    assert all(len(docs) == 3 for docs in all_docs.values())

    topics = set(all_docs.keys())

    # One-directional check: no key may be unknown to `topics_` or the mapper.
    assert len(topics.difference(unique_topics)) == 0
    assert len(topics.difference(topics_in_mapper)) == 0


@pytest.mark.parametrize(
    'model',
    [
        'kmeans_pca_topic_model',
        'base_topic_model',
        'custom_topic_model',
        'merged_topic_model',
        'reduced_topic_model',
        'online_topic_model',
    ],
)
def test_get_topic_info(model, request):
    """Check `get_topic_info` for the full table and for single-topic lookups.

    The fixture named by `model` is resolved through `request` and deep-copied
    before use. The test asserts that:

    * the first row's `Topic` is -1 when the model's `_outliers` attribute is
      truthy and 0 otherwise;
    * asking for any topic present in `topics_` returns exactly one row;
    * asking for topic id 200 returns no rows (200 is presumably an id none of
      the fixtures contain -- confirm against the fixtures).
    """
    fitted = copy.deepcopy(request.getfixturevalue(model))
    info = fitted.get_topic_info()

    expected_first_topic = -1 if fitted._outliers else 0
    assert info.iloc[0].Topic == expected_first_topic

    for topic_id in set(fitted.topics_):
        single_topic_rows = fitted.get_topic_info(topic_id)
        assert len(single_topic_rows) == 1

    unknown_topic_rows = fitted.get_topic_info(200)
    assert len(unknown_topic_rows) == 0