import copy
import pytest
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA, IncrementalPCA
from bertopic.vectorizers import OnlineCountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic.cluster import BaseCluster
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression
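

# Sentence-transformer embedding model shared by all fixtures; session scope keeps it loaded only once.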
@pytest.fixture(scope="session")
def embedding_model():
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model
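

# Pre-computed embeddings for the test documents, so the model fixtures can skip the encoding step.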
@pytest.fixture(scope="session")
def document_embeddings(documents, embedding_model):
    embeddings = embedding_model.encode(documents)
    return embeddings
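

# 2-dimensional UMAP reduction of the document embeddings.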
@pytest.fixture(scope="session")
def reduced_embeddings(document_embeddings):
    reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(document_embeddings)
    return reduced_embeddings
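

# First 1,000 documents of the 20 Newsgroups dataset, with headers, footers, and quotes stripped.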
@pytest.fixture(scope="session")
def documents():
    newsgroup_docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'][:1000]
    return newsgroup_docs
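

# Newsgroup labels matching the 1,000 test documents, used as supervision targets.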
@pytest.fixture(scope="session")
def targets():
    data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    y = data['target'][:1000]
    return y
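

# Default BERTopic model with probabilities enabled; only the UMAP random state and HDBSCAN minimum cluster size are adjusted.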
@pytest.fixture(scope="session")
def base_topic_model(documents, document_embeddings, embedding_model):
    model = BERTopic(embedding_model=embedding_model, calculate_probabilities=True)
    model.umap_model.random_state = 42
    model.hdbscan_model.min_cluster_size = 3
    model.fit(documents, document_embeddings)
    return model
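

# Zero-shot model that tries to match documents against a small list of predefined topics.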
@pytest.fixture(scope="session")
def zeroshot_topic_model(documents, document_embeddings, embedding_model):
    zeroshot_topic_list = ["religion", "cars", "electronics"]
    model = BERTopic(embedding_model=embedding_model, calculate_probabilities=True,
                     zeroshot_topic_list=zeroshot_topic_list, zeroshot_min_similarity=0.5)
    model.umap_model.random_state = 42
    model.hdbscan_model.min_cluster_size = 2
    model.fit(documents, document_embeddings)
    return model
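

# Model with explicitly configured UMAP and HDBSCAN sub-models and a fixed random state.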
@pytest.fixture(scope="session")
def custom_topic_model(documents, document_embeddings, embedding_model):
    umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model, calculate_probabilities=True).fit(documents, document_embeddings)
    return model
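

# Model with multiple topic representations: a KeyBERTInspired "Main" aspect and a chained KeyBERTInspired + MMR "MMR" aspect.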
@pytest.fixture(scope="session")
def representation_topic_model(documents, document_embeddings, embedding_model):
    umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    representation_model = {"Main": KeyBERTInspired(), "MMR": [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance()]}
    model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model, embedding_model=embedding_model,
                     representation_model=representation_model, calculate_probabilities=True).fit(documents, document_embeddings)
    return model
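

# Copy of the custom model with its topics reduced to 12.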
@pytest.fixture(scope="session")
def reduced_topic_model(custom_topic_model, documents):
    model = copy.deepcopy(custom_topic_model)
    model.reduce_topics(documents, nr_topics=12)
    return model
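

# Copy of the custom model with two successive rounds of topic merging applied.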
@pytest.fixture(scope="session")
def merged_topic_model(custom_topic_model, documents):
    model = copy.deepcopy(custom_topic_model)

    # First merge: combine topics 1 and 2, and topics 3 and 4
    topics_to_merge = [[1, 2],
                       [3, 4]]
    model.merge_topics(documents, topics_to_merge)

    # Second merge: combine topics 5, 6, and 7
    topics_to_merge = [[5, 6, 7]]
    model.merge_topics(documents, topics_to_merge)
    return model
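

# Model using KMeans clustering and PCA in place of HDBSCAN and UMAP.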
@pytest.fixture(scope="session")
def kmeans_pca_topic_model(documents, document_embeddings, embedding_model):
    hdbscan_model = KMeans(n_clusters=15, random_state=42)
    dim_model = PCA(n_components=5)
    model = BERTopic(hdbscan_model=hdbscan_model, umap_model=dim_model, embedding_model=embedding_model).fit(documents, document_embeddings)
    return model
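

# Supervised model: dimensionality reduction is skipped and a LogisticRegression classifier replaces the clustering step.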
@pytest.fixture(scope="session")
def supervised_topic_model(documents, document_embeddings, embedding_model, targets):
    empty_dimensionality_model = BaseDimensionalityReduction()
    clf = LogisticRegression()

    model = BERTopic(
        embedding_model=embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
    ).fit(documents, embeddings=document_embeddings, y=targets)
    return model
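

# Online/incremental model trained with partial_fit in batches of 50 documents.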
@pytest.fixture(scope="session")
def online_topic_model(documents, document_embeddings, embedding_model):
    umap_model = IncrementalPCA(n_components=5)
    cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)
    vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01)
    model = BERTopic(umap_model=umap_model, hdbscan_model=cluster_model, vectorizer_model=vectorizer_model, embedding_model=embedding_model)

    topics = []
    for index in range(0, len(documents), 50):
        model.partial_fit(documents[index: index + 50], document_embeddings[index: index + 50])
        topics.extend(model.topics_)
    model.topics_ = topics
    return model