diff --git "a/BERTopic/bertopic/_bertopic.py" "b/BERTopic/bertopic/_bertopic.py" new file mode 100644--- /dev/null +++ "b/BERTopic/bertopic/_bertopic.py" @@ -0,0 +1,4521 @@ +import yaml +import warnings +warnings.filterwarnings("ignore", category=FutureWarning) +warnings.filterwarnings("ignore", category=UserWarning) + +try: + yaml._warnings_enabled["YAMLLoadWarning"] = False +except (KeyError, AttributeError, TypeError) as e: + pass + +import re +import math +import joblib +import inspect +import collections +import numpy as np +import pandas as pd +import scipy.sparse as sp + +from tqdm import tqdm +from pathlib import Path +from packaging import version +from tempfile import TemporaryDirectory +from collections import defaultdict, Counter +from scipy.sparse import csr_matrix +from scipy.cluster import hierarchy as sch + +# Typing +import sys +if sys.version_info >= (3, 8): + from typing import Literal +else: + from typing_extensions import Literal +from typing import List, Tuple, Union, Mapping, Any, Callable, Iterable + +# Models +import hdbscan +from umap import UMAP +from sklearn.preprocessing import normalize +from sklearn import __version__ as sklearn_version +from sklearn.cluster import AgglomerativeClustering +from sklearn.metrics.pairwise import cosine_similarity +from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer + +# BERTopic +from bertopic import plotting +from bertopic.cluster import BaseCluster +from bertopic.backend import BaseEmbedder +from bertopic.representation._mmr import mmr +from bertopic.backend._utils import select_backend +from bertopic.vectorizers import ClassTfidfTransformer +from bertopic.representation import BaseRepresentation +from bertopic.dimensionality import BaseDimensionalityReduction +from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan +from bertopic._utils import ( + MyLogger, check_documents_type, check_embeddings_shape, + check_is_fitted, validate_distance_matrix +) +import bertopic._save_utils as save_utils + +# Visualization +import plotly.graph_objects as go + +logger = MyLogger("WARNING") + + +class BERTopic: + """BERTopic is a topic modeling technique that leverages BERT embeddings and + c-TF-IDF to create dense clusters allowing for easily interpretable topics + whilst keeping important words in the topic descriptions. + + The default embedding model is `all-MiniLM-L6-v2` when selecting `language="english"` + and `paraphrase-multilingual-MiniLM-L12-v2` when selecting `language="multilingual"`. + + Attributes: + topics_ (List[int]) : The topics that are generated for each document after training or updating + the topic model. The most recent topics are tracked. + probabilities_ (List[float]): The probability of the assigned topic per document. These are + only calculated if a HDBSCAN model is used for the clustering step. + When `calculate_probabilities=True`, then it is the probabilities + of all topics per document. + topic_sizes_ (Mapping[int, int]) : The size of each topic. + topic_mapper_ (TopicMapper) : A class for tracking topics and their mappings anytime they are + merged, reduced, added, or removed. + topic_representations_ (Mapping[int, Tuple[int, float]]) : The top n terms per topic and their respective + c-TF-IDF values. + c_tf_idf_ (csr_matrix) : The topic-term matrix as calculated through c-TF-IDF. 
To access its respective + words, run `.vectorizer_model.get_feature_names()` or + `.vectorizer_model.get_feature_names_out()` + topic_labels_ (Mapping[int, str]) : The default labels for each topic. + custom_labels_ (List[str]) : Custom labels for each topic. + topic_embeddings_ (np.ndarray) : The embeddings for each topic. They are calculated by taking the + centroid embedding of each cluster. + representative_docs_ (Mapping[int, str]) : The representative documents for each topic. + + Examples: + + ```python + from bertopic import BERTopic + from sklearn.datasets import fetch_20newsgroups + + docs = fetch_20newsgroups(subset='all')['data'] + topic_model = BERTopic() + topics, probabilities = topic_model.fit_transform(docs) + ``` + + If you want to use your own embedding model, use it as follows: + + ```python + from bertopic import BERTopic + from sklearn.datasets import fetch_20newsgroups + from sentence_transformers import SentenceTransformer + + docs = fetch_20newsgroups(subset='all')['data'] + sentence_model = SentenceTransformer("all-MiniLM-L6-v2") + topic_model = BERTopic(embedding_model=sentence_model) + ``` + + Due to the stochastic nature of UMAP, the results from BERTopic might differ + and the quality can degrade. Using your own embeddings allows you to + try out BERTopic several times until you find the topics that suit + you best. + """ + def __init__(self, + language: str = "english", + top_n_words: int = 10, + n_gram_range: Tuple[int, int] = (1, 1), + min_topic_size: int = 10, + nr_topics: Union[int, str] = None, + low_memory: bool = False, + calculate_probabilities: bool = False, + seed_topic_list: List[List[str]] = None, + zeroshot_topic_list: List[str] = None, + zeroshot_min_similarity: float = .7, + embedding_model=None, + umap_model: UMAP = None, + hdbscan_model: hdbscan.HDBSCAN = None, + vectorizer_model: CountVectorizer = None, + ctfidf_model: TfidfTransformer = None, + representation_model: BaseRepresentation = None, + verbose: bool = False, + ): + """BERTopic initialization + + Arguments: + language: The main language used in your documents. The default sentence-transformers + model for "english" is `all-MiniLM-L6-v2`. For a full overview of + supported languages see bertopic.backend.languages. Select + "multilingual" to load in the `paraphrase-multilingual-MiniLM-L12-v2` + sentence-transformers model that supports 50+ languages. + NOTE: This is not used if `embedding_model` is used. + top_n_words: The number of words per topic to extract. Setting this + too high can negatively impact topic embeddings as topics + are typically best represented by at most 10 words. + n_gram_range: The n-gram range for the CountVectorizer. + Advised to keep high values between 1 and 3. + More would likely lead to memory issues. + NOTE: This param will not be used if you pass in your own + CountVectorizer. + min_topic_size: The minimum size of the topic. Increasing this value will lead + to a lower number of clusters/topics and vice versa. + It is the same parameter as `min_cluster_size` in HDBSCAN. + NOTE: This param will not be used if you are using `hdbscan_model`. + nr_topics: Specifying the number of topics will reduce the initial + number of topics to the value specified. This reduction can take + a while as each reduction in topics (-1) activates a c-TF-IDF + calculation. If this is set to None, no reduction is applied. Use + "auto" to automatically reduce topics using HDBSCAN. 
+ NOTE: Controlling the number of topics is best done by adjusting + `min_topic_size` first before adjusting this parameter. + low_memory: Sets UMAP low memory to True to make sure less memory is used. + NOTE: This is only used in UMAP. For example, if you use PCA instead of UMAP + this parameter will not be used. + calculate_probabilities: Calculate the probabilities of all topics + per document instead of the probability of the assigned + topic per document. This could slow down the extraction + of topics if you have many documents (> 100_000). + NOTE: If false you cannot use the corresponding + visualization method `visualize_probabilities`. + NOTE: This is an approximation of topic probabilities + as used in HDBSCAN and not an exact representation. + seed_topic_list: A list of seed words per topic to converge around + zeroshot_topic_list: A list of topic names to use for zero-shot classification + zeroshot_min_similarity: The minimum similarity between a zero-shot topic and + a document for assignment. The higher this value, the more + confident the model needs to be to assign a zero-shot topic to a document. + verbose: Changes the verbosity of the model, Set to True if you want + to track the stages of the model. + embedding_model: Use a custom embedding model. + The following backends are currently supported + * SentenceTransformers + * Flair + * Spacy + * Gensim + * USE (TF-Hub) + You can also pass in a string that points to one of the following + sentence-transformers models: + * https://www.sbert.net/docs/pretrained_models.html + umap_model: Pass in a UMAP model to be used instead of the default. + NOTE: You can also pass in any dimensionality reduction algorithm as long + as it has `.fit` and `.transform` functions. + hdbscan_model: Pass in a hdbscan.HDBSCAN model to be used instead of the default + NOTE: You can also pass in any clustering algorithm as long as it has + `.fit` and `.predict` functions along with the `.labels_` variable. + vectorizer_model: Pass in a custom `CountVectorizer` instead of the default model. + ctfidf_model: Pass in a custom ClassTfidfTransformer instead of the default model. + representation_model: Pass in a model that fine-tunes the topic representations + calculated through c-TF-IDF. Models from `bertopic.representation` + are supported. 
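+
+        Examples:
+
+        A minimal sketch of composing the sub-models described above. The
+        parameter values are illustrative only (they mirror the defaults
+        described in this docstring), not tuned recommendations:
+
+        ```python
+        from umap import UMAP
+        from hdbscan import HDBSCAN
+        from sklearn.feature_extraction.text import CountVectorizer
+        from bertopic import BERTopic
+
+        umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
+        hdbscan_model = HDBSCAN(min_cluster_size=10, prediction_data=True)
+        vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
+
+        topic_model = BERTopic(umap_model=umap_model,
+                               hdbscan_model=hdbscan_model,
+                               vectorizer_model=vectorizer_model)
+        ```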
+ """ + # Topic-based parameters + if top_n_words > 100: + logger.warning("Note that extracting more than 100 words from a sparse " + "can slow down computation quite a bit.") + + self.top_n_words = top_n_words + self.min_topic_size = min_topic_size + self.nr_topics = nr_topics + self.low_memory = low_memory + self.calculate_probabilities = calculate_probabilities + self.verbose = verbose + self.seed_topic_list = seed_topic_list + self.zeroshot_topic_list = zeroshot_topic_list + self.zeroshot_min_similarity = zeroshot_min_similarity + + # Embedding model + self.language = language if not embedding_model else None + self.embedding_model = embedding_model + + # Vectorizer + self.n_gram_range = n_gram_range + self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=self.n_gram_range) + self.ctfidf_model = ctfidf_model or ClassTfidfTransformer() + + # Representation model + self.representation_model = representation_model + + # UMAP or another algorithm that has .fit and .transform functions + self.umap_model = umap_model or UMAP(n_neighbors=15, + n_components=5, + min_dist=0.0, + metric='cosine', + low_memory=self.low_memory) + + # HDBSCAN or another clustering algorithm that has .fit and .predict functions and + # the .labels_ variable to extract the labels + self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(min_cluster_size=self.min_topic_size, + metric='euclidean', + cluster_selection_method='eom', + prediction_data=True) + + # Public attributes + self.topics_ = None + self.probabilities_ = None + self.topic_sizes_ = None + self.topic_mapper_ = None + self.topic_representations_ = None + self.topic_embeddings_ = None + self.topic_labels_ = None + self.custom_labels_ = None + self.c_tf_idf_ = None + self.representative_images_ = None + self.representative_docs_ = {} + self.topic_aspects_ = {} + + # Private attributes for internal tracking purposes + self._outliers = 1 + self._merged_topics = None + + if verbose: + logger.set_level("DEBUG") + else: + logger.set_level("WARNING") + + def fit(self, + documents: List[str], + embeddings: np.ndarray = None, + images: List[str] = None, + y: Union[List[int], np.ndarray] = None): + """ Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics + + Arguments: + documents: A list of documents to fit on + embeddings: Pre-trained document embeddings. These can be used + instead of the sentence-transformer model + images: A list of paths to the images to fit on or the images themselves + y: The target class for (semi)-supervised modeling. Use -1 if no class for a + specific instance is specified. 
+
+        Examples:
+
+        ```python
+        from bertopic import BERTopic
+        from sklearn.datasets import fetch_20newsgroups
+
+        docs = fetch_20newsgroups(subset='all')['data']
+        topic_model = BERTopic().fit(docs)
+        ```
+
+        If you want to use your own embeddings, use them as follows:
+
+        ```python
+        from bertopic import BERTopic
+        from sklearn.datasets import fetch_20newsgroups
+        from sentence_transformers import SentenceTransformer
+
+        # Create embeddings
+        docs = fetch_20newsgroups(subset='all')['data']
+        sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
+        embeddings = sentence_model.encode(docs, show_progress_bar=True)
+
+        # Create topic model
+        topic_model = BERTopic().fit(docs, embeddings)
+        ```
+        """
+        self.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images)
+        return self
+
+    def fit_transform(self,
+                      documents: List[str],
+                      embeddings: np.ndarray = None,
+                      images: List[str] = None,
+                      y: Union[List[int], np.ndarray] = None) -> Tuple[List[int],
+                                                                       Union[np.ndarray, None]]:
+        """ Fit the models on a collection of documents, generate topics,
+        and return the probabilities and topic per document.
+
+        Arguments:
+            documents: A list of documents to fit on
+            embeddings: Pre-trained document embeddings. These can be used
+                        instead of the sentence-transformer model
+            images: A list of paths to the images to fit on or the images themselves
+            y: The target class for (semi)-supervised modeling. Use -1 if no class for a
+               specific instance is specified.
+
+        Returns:
+            predictions: Topic predictions for each document
+            probabilities: The probability of the assigned topic per document.
+                           If `calculate_probabilities` in BERTopic is set to True, then
+                           it calculates the probabilities of all topics across all documents
+                           instead of only the assigned topic. This, however, slows down
+                           computation and may increase memory usage.
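+
+        As a rough sketch of the shapes implied above (assuming the default
+        HDBSCAN pipeline; `n_topics` stands for the number of topics found):
+
+        ```python
+        topics, probs = topic_model.fit_transform(docs)
+        # len(topics) == len(docs): one topic id per document
+        # probs: shape (len(docs), n_topics) if calculate_probabilities=True,
+        # otherwise one assigned-topic probability per document
+        ```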
+ + Examples: + + ```python + from bertopic import BERTopic + from sklearn.datasets import fetch_20newsgroups + + docs = fetch_20newsgroups(subset='all')['data'] + topic_model = BERTopic() + topics, probs = topic_model.fit_transform(docs) + ``` + + If you want to use your own embeddings, use it as follows: + + ```python + from bertopic import BERTopic + from sklearn.datasets import fetch_20newsgroups + from sentence_transformers import SentenceTransformer + + # Create embeddings + docs = fetch_20newsgroups(subset='all')['data'] + sentence_model = SentenceTransformer("all-MiniLM-L6-v2") + embeddings = sentence_model.encode(docs, show_progress_bar=True) + + # Create topic model + topic_model = BERTopic() + topics, probs = topic_model.fit_transform(docs, embeddings) + ``` + """ + if documents is not None: + check_documents_type(documents) + check_embeddings_shape(embeddings, documents) + + doc_ids = range(len(documents)) if documents is not None else range(len(images)) + documents = pd.DataFrame({"Document": documents, + "ID": doc_ids, + "Topic": None, + "Image": images}) + + # Extract embeddings + if embeddings is None: + logger.info("Embedding - Transforming documents to embeddings.") + self.embedding_model = select_backend(self.embedding_model, + language=self.language) + embeddings = self._extract_embeddings(documents.Document.values.tolist(), + images=images, + method="document", + verbose=self.verbose) + logger.info("Embedding - Completed \u2713") + else: + if self.embedding_model is not None: + self.embedding_model = select_backend(self.embedding_model, + language=self.language) + + # Guided Topic Modeling + if self.seed_topic_list is not None and self.embedding_model is not None: + y, embeddings = self._guided_topic_modeling(embeddings) + + # Zero-shot Topic Modeling + if self._is_zeroshot(): + documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(documents, embeddings) + if documents is None: + return self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings) + + # Reduce dimensionality + umap_embeddings = self._reduce_dimensionality(embeddings, y) + + # Cluster reduced embeddings + documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y) + + # Sort and Map Topic IDs by their frequency + if not self.nr_topics: + documents = self._sort_mappings_by_frequency(documents) + + # Create documents from images if we have images only + if documents.Document.values[0] is None: + custom_documents = self._images_to_text(documents, embeddings) + + # Extract topics by calculating c-TF-IDF + self._extract_topics(custom_documents, embeddings=embeddings) + self._create_topic_vectors(documents=documents, embeddings=embeddings) + + # Reduce topics + if self.nr_topics: + custom_documents = self._reduce_topics(custom_documents) + + # Save the top 3 most representative documents per topic + self._save_representative_docs(custom_documents) + else: + # Extract topics by calculating c-TF-IDF + self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose) + + # Reduce topics + if self.nr_topics: + documents = self._reduce_topics(documents) + + # Save the top 3 most representative documents per topic + self._save_representative_docs(documents) + + # Resulting output + self.probabilities_ = self._map_probabilities(probabilities, original_topics=True) + predictions = documents.Topic.to_list() + + # Combine Zero-shot with outliers + if self._is_zeroshot() and len(documents) != len(doc_ids): + predictions = 
self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings) + + return predictions, self.probabilities_ + + def transform(self, + documents: Union[str, List[str]], + embeddings: np.ndarray = None, + images: List[str] = None) -> Tuple[List[int], np.ndarray]: + """ After having fit a model, use transform to predict new instances + + Arguments: + documents: A single document or a list of documents to predict on + embeddings: Pre-trained document embeddings. These can be used + instead of the sentence-transformer model. + images: A list of paths to the images to predict on or the images themselves + + Returns: + predictions: Topic predictions for each documents + probabilities: The topic probability distribution which is returned by default. + If `calculate_probabilities` in BERTopic is set to False, then the + probabilities are not calculated to speed up computation and + decrease memory usage. + + Examples: + + ```python + from bertopic import BERTopic + from sklearn.datasets import fetch_20newsgroups + + docs = fetch_20newsgroups(subset='all')['data'] + topic_model = BERTopic().fit(docs) + topics, probs = topic_model.transform(docs) + ``` + + If you want to use your own embeddings: + + ```python + from bertopic import BERTopic + from sklearn.datasets import fetch_20newsgroups + from sentence_transformers import SentenceTransformer + + # Create embeddings + docs = fetch_20newsgroups(subset='all')['data'] + sentence_model = SentenceTransformer("all-MiniLM-L6-v2") + embeddings = sentence_model.encode(docs, show_progress_bar=True) + + # Create topic model + topic_model = BERTopic().fit(docs, embeddings) + topics, probs = topic_model.transform(docs, embeddings) + ``` + """ + check_is_fitted(self) + check_embeddings_shape(embeddings, documents) + + if isinstance(documents, str) or documents is None: + documents = [documents] + + if embeddings is None: + embeddings = self._extract_embeddings(documents, + images=images, + method="document", + verbose=self.verbose) + + # Check if an embedding model was found + if embeddings is None: + raise ValueError("No embedding model was found to embed the documents." 
+ "Make sure when loading in the model using BERTopic.load()" + "to also specify the embedding model.") + + # Transform without hdbscan_model and umap_model using only cosine similarity + elif type(self.hdbscan_model) == BaseCluster: + logger.info("Predicting topic assignments through cosine similarity of topic and document embeddings.") + sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_)) + predictions = np.argmax(sim_matrix, axis=1) - self._outliers + + if self.calculate_probabilities: + probabilities = sim_matrix + else: + probabilities = np.max(sim_matrix, axis=1) + + # Transform with full pipeline + else: + logger.info("Dimensionality - Reducing dimensionality of input embeddings.") + umap_embeddings = self.umap_model.transform(embeddings) + logger.info("Dimensionality - Completed \u2713") + + # Extract predictions and probabilities if it is a HDBSCAN-like model + logger.info("Clustering - Approximating new points with `hdbscan_model`") + if is_supported_hdbscan(self.hdbscan_model): + predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings) + + # Calculate probabilities + if self.calculate_probabilities: + logger.info("Probabilities - Start calculation of probabilities with HDBSCAN") + probabilities = hdbscan_delegator(self.hdbscan_model, "membership_vector", umap_embeddings) + logger.info("Probabilities - Completed \u2713") + else: + predictions = self.hdbscan_model.predict(umap_embeddings) + probabilities = None + logger.info("Cluster - Completed \u2713") + + # Map probabilities and predictions + probabilities = self._map_probabilities(probabilities, original_topics=True) + predictions = self._map_predictions(predictions) + return predictions, probabilities + + def partial_fit(self, + documents: List[str], + embeddings: np.ndarray = None, + y: Union[List[int], np.ndarray] = None): + """ Fit BERTopic on a subset of the data and perform online learning + with batch-like data. + + Online topic modeling in BERTopic is performed by using dimensionality + reduction and cluster algorithms that support a `partial_fit` method + in order to incrementally train the topic model. + + Likewise, the `bertopic.vectorizers.OnlineCountVectorizer` is used + to dynamically update its vocabulary when presented with new data. + It has several parameters for modeling decay and updating the + representations. + + In other words, although the main algorithm stays the same, the training + procedure now works as follows: + + For each subset of the data: + + 1. Generate embeddings with a pre-traing language model + 2. Incrementally update the dimensionality reduction algorithm with `partial_fit` + 3. Incrementally update the cluster algorithm with `partial_fit` + 4. Incrementally update the OnlineCountVectorizer and apply some form of decay + + Note that it is advised to use `partial_fit` with batches and + not single documents for the best performance. + + Arguments: + documents: A list of documents to fit on + embeddings: Pre-trained document embeddings. These can be used + instead of the sentence-transformer model + y: The target class for (semi)-supervised modeling. Use -1 if no class for a + specific instance is specified. 
+ + Examples: + + ```python + from sklearn.datasets import fetch_20newsgroups + from sklearn.cluster import MiniBatchKMeans + from sklearn.decomposition import IncrementalPCA + from bertopic.vectorizers import OnlineCountVectorizer + from bertopic import BERTopic + + # Prepare documents + docs = fetch_20newsgroups(subset=subset, remove=('headers', 'footers', 'quotes'))["data"] + + # Prepare sub-models that support online learning + umap_model = IncrementalPCA(n_components=5) + cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0) + vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01) + + topic_model = BERTopic(umap_model=umap_model, + hdbscan_model=cluster_model, + vectorizer_model=vectorizer_model) + + # Incrementally fit the topic model by training on 1000 documents at a time + for index in range(0, len(docs), 1000): + topic_model.partial_fit(docs[index: index+1000]) + ``` + """ + # Checks + check_embeddings_shape(embeddings, documents) + if not hasattr(self.hdbscan_model, "partial_fit"): + raise ValueError("In order to use `.partial_fit`, the cluster model should have " + "a `.partial_fit` function.") + + # Prepare documents + if isinstance(documents, str): + documents = [documents] + documents = pd.DataFrame({"Document": documents, + "ID": range(len(documents)), + "Topic": None}) + + # Extract embeddings + if embeddings is None: + if self.topic_representations_ is None: + self.embedding_model = select_backend(self.embedding_model, + language=self.language) + embeddings = self._extract_embeddings(documents.Document.values.tolist(), + method="document", + verbose=self.verbose) + else: + if self.embedding_model is not None and self.topic_representations_ is None: + self.embedding_model = select_backend(self.embedding_model, + language=self.language) + + # Reduce dimensionality + if self.seed_topic_list is not None and self.embedding_model is not None: + y, embeddings = self._guided_topic_modeling(embeddings) + umap_embeddings = self._reduce_dimensionality(embeddings, y, partial_fit=True) + + # Cluster reduced embeddings + documents, self.probabilities_ = self._cluster_embeddings(umap_embeddings, documents, partial_fit=True) + topics = documents.Topic.to_list() + + # Map and find new topics + if not self.topic_mapper_: + self.topic_mapper_ = TopicMapper(topics) + mappings = self.topic_mapper_.get_mappings() + new_topics = set(topics).difference(set(mappings.keys())) + new_topic_ids = {topic: max(mappings.values()) + index + 1 for index, topic in enumerate(new_topics)} + self.topic_mapper_.add_new_topics(new_topic_ids) + updated_mappings = self.topic_mapper_.get_mappings() + updated_topics = [updated_mappings[topic] for topic in topics] + documents["Topic"] = updated_topics + + # Add missing topics (topics that were originally created but are now missing) + if self.topic_representations_: + missing_topics = set(self.topic_representations_.keys()).difference(set(updated_topics)) + for missing_topic in missing_topics: + documents.loc[len(documents), :] = [" ", len(documents), missing_topic] + else: + missing_topics = {} + + # Prepare documents + documents_per_topic = documents.sort_values("Topic").groupby(['Topic'], as_index=False) + updated_topics = documents_per_topic.first().Topic.astype(int) + documents_per_topic = documents_per_topic.agg({'Document': ' '.join}) + + # Update topic representations + self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True) + self.topic_representations_ = self._extract_words_per_topic(updated_words, 
documents, self.c_tf_idf_, calculate_aspects=False) + self._create_topic_vectors() + self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]]) + for key, values in self.topic_representations_.items()} + + # Update topic sizes + if len(missing_topics) > 0: + documents = documents.iloc[:-len(missing_topics)] + + if self.topic_sizes_ is None: + self._update_topic_size(documents) + else: + sizes = documents.groupby(['Topic'], as_index=False).count() + for _, row in sizes.iterrows(): + topic = int(row.Topic) + if self.topic_sizes_.get(topic) is not None and topic not in missing_topics: + self.topic_sizes_[topic] += int(row.Document) + elif self.topic_sizes_.get(topic) is None: + self.topic_sizes_[topic] = int(row.Document) + self.topics_ = documents.Topic.astype(int).tolist() + + return self + + def topics_over_time(self, + docs: List[str], + timestamps: Union[List[str], + List[int]], + topics: List[int] = None, + nr_bins: int = None, + datetime_format: str = None, + evolution_tuning: bool = True, + global_tuning: bool = True) -> pd.DataFrame: + """ Create topics over time + + To create the topics over time, BERTopic needs to be already fitted once. + From the fitted models, the c-TF-IDF representations are calculate at + each timestamp t. Then, the c-TF-IDF representations at timestamp t are + averaged with the global c-TF-IDF representations in order to fine-tune the + local representations. + + NOTE: + Make sure to use a limited number of unique timestamps (<100) as the + c-TF-IDF representation will be calculated at each single unique timestamp. + Having a large number of unique timestamps can take some time to be calculated. + Moreover, there aren't many use-cases where you would like to see the difference + in topic representations over more than 100 different timestamps. + + Arguments: + docs: The documents you used when calling either `fit` or `fit_transform` + timestamps: The timestamp of each document. This can be either a list of strings or ints. + If it is a list of strings, then the datetime format will be automatically + inferred. If it is a list of ints, then the documents will be ordered in + ascending order. + topics: A list of topics where each topic is related to a document in `docs` and + a timestamp in `timestamps`. You can use this to apply topics_over_time on + a subset of the data. Make sure that `docs`, `timestamps`, and `topics` + all correspond to one another and have the same size. + nr_bins: The number of bins you want to create for the timestamps. The left interval will + be chosen as the timestamp. An additional column will be created with the + entire interval. + datetime_format: The datetime format of the timestamps if they are strings, eg “%d/%m/%Y”. + Set this to None if you want to have it automatically detect the format. + See strftime documentation for more information on choices: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + evolution_tuning: Fine-tune each topic representation at timestamp *t* by averaging its + c-TF-IDF matrix with the c-TF-IDF matrix at timestamp *t-1*. This creates + evolutionary topic representations. + global_tuning: Fine-tune each topic representation at timestamp *t* by averaging its c-TF-IDF matrix + with the global c-TF-IDF matrix. Turn this off if you want to prevent words in + topic representations that could not be found in the documents at timestamp *t*. 
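+
+            As a sketch of the two tuning steps applied to the l1-normalized
+            c-TF-IDF matrices (variable names here are illustrative):
+
+            ```python
+            c_tf_idf_t = (c_tf_idf_t + c_tf_idf_previous) / 2    # evolution_tuning
+            c_tf_idf_t = (c_tf_idf_t + global_c_tf_idf) / 2      # global_tuning
+            ```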
+ + Returns: + topics_over_time: A dataframe that contains the topic, words, and frequency of topic + at timestamp *t*. + + Examples: + + The timestamps variable represents the timestamp of each document. If you have over + 100 unique timestamps, it is advised to bin the timestamps as shown below: + + ```python + from bertopic import BERTopic + topic_model = BERTopic() + topics, probs = topic_model.fit_transform(docs) + topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20) + ``` + """ + check_is_fitted(self) + check_documents_type(docs) + selected_topics = topics if topics else self.topics_ + documents = pd.DataFrame({"Document": docs, "Topic": selected_topics, "Timestamps": timestamps}) + global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False) + + all_topics = sorted(list(documents.Topic.unique())) + all_topics_indices = {topic: index for index, topic in enumerate(all_topics)} + + if isinstance(timestamps[0], str): + infer_datetime_format = True if not datetime_format else False + documents["Timestamps"] = pd.to_datetime(documents["Timestamps"], + infer_datetime_format=infer_datetime_format, + format=datetime_format) + + if nr_bins: + documents["Bins"] = pd.cut(documents.Timestamps, bins=nr_bins) + documents["Timestamps"] = documents.apply(lambda row: row.Bins.left, 1) + + # Sort documents in chronological order + documents = documents.sort_values("Timestamps") + timestamps = documents.Timestamps.unique() + if len(timestamps) > 100: + logger.warning(f"There are more than 100 unique timestamps (i.e., {len(timestamps)}) " + "which significantly slows down the application. Consider setting `nr_bins` " + "to a value lower than 100 to speed up calculation. ") + + # For each unique timestamp, create topic representations + topics_over_time = [] + for index, timestamp in tqdm(enumerate(timestamps), disable=not self.verbose): + + # Calculate c-TF-IDF representation for a specific timestamp + selection = documents.loc[documents.Timestamps == timestamp, :] + documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join, + "Timestamps": "count"}) + c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False) + + if global_tuning or evolution_tuning: + c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False) + + # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF + # matrix at timestamp t-1 + if evolution_tuning and index != 0: + current_topics = sorted(list(documents_per_topic.Topic.values)) + overlapping_topics = sorted(list(set(previous_topics).intersection(set(current_topics)))) + + current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics] + previous_overlap_idx = [previous_topics.index(topic) for topic in overlapping_topics] + + c_tf_idf.tolil()[current_overlap_idx] = ((c_tf_idf[current_overlap_idx] + + previous_c_tf_idf[previous_overlap_idx]) / 2.0).tolil() + + # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation + # by simply taking the average of the two + if global_tuning: + selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values] + c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0 + + # Extract the words per topic + words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) + topic_frequency = pd.Series(documents_per_topic.Timestamps.values, + index=documents_per_topic.Topic).to_dict() + + # Fill dataframe with results + 
topics_at_timestamp = [(topic, + ", ".join([words[0] for words in values][:5]), + topic_frequency[topic], + timestamp) for topic, values in words_per_topic.items()] + topics_over_time.extend(topics_at_timestamp) + + if evolution_tuning: + previous_topics = sorted(list(documents_per_topic.Topic.values)) + previous_c_tf_idf = c_tf_idf.copy() + + return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"]) + + def topics_per_class(self, + docs: List[str], + classes: Union[List[int], List[str]], + global_tuning: bool = True) -> pd.DataFrame: + """ Create topics per class + + To create the topics per class, BERTopic needs to be already fitted once. + From the fitted models, the c-TF-IDF representations are calculated at + each class c. Then, the c-TF-IDF representations at class c are + averaged with the global c-TF-IDF representations in order to fine-tune the + local representations. This can be turned off if the pure representation is + needed. + + NOTE: + Make sure to use a limited number of unique classes (<100) as the + c-TF-IDF representation will be calculated at each single unique class. + Having a large number of unique classes can take some time to be calculated. + + Arguments: + docs: The documents you used when calling either `fit` or `fit_transform` + classes: The class of each document. This can be either a list of strings or ints. + global_tuning: Fine-tune each topic representation for class c by averaging its c-TF-IDF matrix + with the global c-TF-IDF matrix. Turn this off if you want to prevent words in + topic representations that could not be found in the documents for class c. + + Returns: + topics_per_class: A dataframe that contains the topic, words, and frequency of topics + for each class. + + Examples: + + ```python + from bertopic import BERTopic + topic_model = BERTopic() + topics, probs = topic_model.fit_transform(docs) + topics_per_class = topic_model.topics_per_class(docs, classes) + ``` + """ + check_documents_type(docs) + documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Class": classes}) + global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False) + + # For each unique timestamp, create topic representations + topics_per_class = [] + for _, class_ in tqdm(enumerate(set(classes)), disable=not self.verbose): + + # Calculate c-TF-IDF representation for a specific timestamp + selection = documents.loc[documents.Class == class_, :] + documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join, + "Class": "count"}) + c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False) + + # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation + # by simply taking the average of the two + if global_tuning: + c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False) + c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0 + + # Extract the words per topic + words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) + topic_frequency = pd.Series(documents_per_topic.Class.values, + index=documents_per_topic.Topic).to_dict() + + # Fill dataframe with results + topics_at_class = [(topic, + ", ".join([words[0] for words in values][:5]), + topic_frequency[topic], + class_) for topic, values in words_per_topic.items()] + topics_per_class.extend(topics_at_class) + + topics_per_class = pd.DataFrame(topics_per_class, columns=["Topic", "Words", "Frequency", 
"Class"]) + + return topics_per_class + + def hierarchical_topics(self, + docs: List[str], + linkage_function: Callable[[csr_matrix], np.ndarray] = None, + distance_function: Callable[[csr_matrix], csr_matrix] = None) -> pd.DataFrame: + """ Create a hierarchy of topics + + To create this hierarchy, BERTopic needs to be already fitted once. + Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF + representation using `scipy.cluster.hierarchy.linkage`. + + Based on that hierarchy, we calculate the topic representation at each + merged step. This is a local representation, as we only assume that the + chosen step is merged and not all others which typically improves the + topic representation. + + Arguments: + docs: The documents you used when calling either `fit` or `fit_transform` + linkage_function: The linkage function to use. Default is: + `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)` + distance_function: The distance function to use on the c-TF-IDF matrix. Default is: + `lambda x: 1 - cosine_similarity(x)`. + You can pass any function that returns either a square matrix of + shape (n_samples, n_samples) with zeros on the diagonal and + non-negative values or condensed distance matrix of shape + (n_samples * (n_samples - 1) / 2,) containing the upper + triangular of the distance matrix. + + Returns: + hierarchical_topics: A dataframe that contains a hierarchy of topics + represented by their parents and their children + + Examples: + + ```python + from bertopic import BERTopic + topic_model = BERTopic() + topics, probs = topic_model.fit_transform(docs) + hierarchical_topics = topic_model.hierarchical_topics(docs) + ``` + + A custom linkage function can be used as follows: + + ```python + from scipy.cluster import hierarchy as sch + from bertopic import BERTopic + topic_model = BERTopic() + topics, probs = topic_model.fit_transform(docs) + + # Hierarchical topics + linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True) + hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function) + ``` + """ + check_documents_type(docs) + if distance_function is None: + distance_function = lambda x: 1 - cosine_similarity(x) + + if linkage_function is None: + linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True) + + # Calculate distance + embeddings = self.c_tf_idf_[self._outliers:] + X = distance_function(embeddings) + X = validate_distance_matrix(X, embeddings.shape[0]) + + # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix + Z = linkage_function(X) + + # Calculate basic bag-of-words to be iteratively merged later + documents = pd.DataFrame({"Document": docs, + "ID": range(len(docs)), + "Topic": self.topics_}) + documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) + documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :] + clean_documents = self._preprocess_text(documents_per_topic.Document.values) + + # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0 + # and will be removed in 1.2. Please use get_feature_names_out instead. 
+ if version.parse(sklearn_version) >= version.parse("1.0.0"): + words = self.vectorizer_model.get_feature_names_out() + else: + words = self.vectorizer_model.get_feature_names() + + bow = self.vectorizer_model.transform(clean_documents) + + # Extract clusters + hier_topics = pd.DataFrame(columns=["Parent_ID", "Parent_Name", "Topics", + "Child_Left_ID", "Child_Left_Name", + "Child_Right_ID", "Child_Right_Name"]) + for index in tqdm(range(len(Z))): + + # Find clustered documents + clusters = sch.fcluster(Z, t=Z[index][2], criterion='distance') - self._outliers + nr_clusters = len(clusters) + + # Extract first topic we find to get the set of topics in a merged topic + topic = None + val = Z[index][0] + while topic is None: + if val - len(clusters) < 0: + topic = int(val) + else: + val = Z[int(val - len(clusters))][0] + clustered_topics = [i for i, x in enumerate(clusters) if x == clusters[topic]] + + # Group bow per cluster, calculate c-TF-IDF and extract words + grouped = csr_matrix(bow[clustered_topics].sum(axis=0)) + c_tf_idf = self.ctfidf_model.transform(grouped) + selection = documents.loc[documents.Topic.isin(clustered_topics), :] + selection.Topic = 0 + words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) + + # Extract parent's name and ID + parent_id = index + len(clusters) + parent_name = "_".join([x[0] for x in words_per_topic[0]][:5]) + + # Extract child's name and ID + Z_id = Z[index][0] + child_left_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters + + if Z_id - nr_clusters < 0: + child_left_name = "_".join([x[0] for x in self.get_topic(Z_id)][:5]) + else: + child_left_name = hier_topics.iloc[int(child_left_id)].Parent_Name + + # Extract child's name and ID + Z_id = Z[index][1] + child_right_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters + + if Z_id - nr_clusters < 0: + child_right_name = "_".join([x[0] for x in self.get_topic(Z_id)][:5]) + else: + child_right_name = hier_topics.iloc[int(child_right_id)].Parent_Name + + # Save results + hier_topics.loc[len(hier_topics), :] = [parent_id, parent_name, + clustered_topics, + int(Z[index][0]), child_left_name, + int(Z[index][1]), child_right_name] + + hier_topics["Distance"] = Z[:, 2] + hier_topics = hier_topics.sort_values("Parent_ID", ascending=False) + hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]] = hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]].astype(str) + + return hier_topics + + def approximate_distribution(self, + documents: Union[str, List[str]], + window: int = 4, + stride: int = 1, + min_similarity: float = 0.1, + batch_size: int = 1000, + padding: bool = False, + use_embedding_model: bool = False, + calculate_tokens: bool = False, + separator: str = " ") -> Tuple[np.ndarray, + Union[List[np.ndarray], None]]: + """ A post-hoc approximation of topic distributions across documents. + + In order to perform this approximation, each document is split into tokens + according to the provided tokenizer in the `CountVectorizer`. Then, a + sliding window is applied on each document creating subsets of the document. + For example, with a window size of 3 and stride of 1, the sentence: + + `Solving the right problem is difficult.` + + can be split up into `solving the right`, `the right problem`, `right problem is`, + and `problem is difficult`. These are called tokensets. For each of these + tokensets, we calculate their c-TF-IDF representation and find out + how similar they are to the previously generated topics. 
Then, the + similarities to the topics for each tokenset are summed up in order to + create a topic distribution for the entire document. + + We can also dive into this a bit deeper by then splitting these tokensets + up into individual tokens and calculate how much a word, in a specific sentence, + contributes to the topics found in that document. This can be enabled by + setting `calculate_tokens=True` which can be used for visualization purposes + in `topic_model.visualize_approximate_distribution`. + + The main output, `topic_distributions`, can also be used directly in + `.visualize_distribution(topic_distributions[index])` by simply selecting + a single distribution. + + Arguments: + documents: A single document or a list of documents for which we + approximate their topic distributions + window: Size of the moving window which indicates the number of + tokens being considered. + stride: How far the window should move at each step. + min_similarity: The minimum similarity of a document's tokenset + with respect to the topics. + batch_size: The number of documents to process at a time. If None, + then all documents are processed at once. + NOTE: With a large number of documents, it is not + advised to process all documents at once. + padding: Whether to pad the beginning and ending of a document with + empty tokens. + use_embedding_model: Whether to use the topic model's embedding + model to calculate the similarity between + tokensets and topics instead of using c-TF-IDF. + calculate_tokens: Calculate the similarity of tokens with all topics. + NOTE: This is computation-wise more expensive and + can require more memory. Using this over batches of + documents might be preferred. + separator: The separator used to merge tokens into tokensets. + + Returns: + topic_distributions: A `n` x `m` matrix containing the topic distributions + for all input documents with `n` being the documents + and `m` the topics. + topic_token_distributions: A list of `t` x `m` arrays with `t` being the + number of tokens for the respective document + and `m` the topics. + + Examples: + + After fitting the model, the topic distributions can be calculated regardless + of the clustering model and regardless of whether the documents were previously + seen or not: + + ```python + topic_distr, _ = topic_model.approximate_distribution(docs) + ``` + + As a result, the topic distributions are calculated in `topic_distr` for the + entire document based on a token set with a specific window size and stride. + + If you want to calculate the topic distributions on a token-level: + + ```python + topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True) + ``` + + The `topic_token_distr` then contains, for each token, the best fitting topics. + As with `topic_distr`, it can contain multiple topics for a single token. 
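+
+        The token-level output can then be passed to the visualization method
+        mentioned above, one document at a time (a sketch of the expected call):
+
+        ```python
+        # Inspect how each token of the first document relates to the topics
+        topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])
+        ```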
+ """ + if isinstance(documents, str): + documents = [documents] + + if batch_size is None: + batch_size = len(documents) + batches = 1 + else: + batches = math.ceil(len(documents)/batch_size) + + topic_distributions = [] + topic_token_distributions = [] + + for i in tqdm(range(batches), disable=not self.verbose): + doc_set = documents[i*batch_size: (i+1) * batch_size] + + # Extract tokens + analyzer = self.vectorizer_model.build_tokenizer() + tokens = [analyzer(document) for document in doc_set] + + # Extract token sets + all_sentences = [] + all_indices = [0] + all_token_sets_ids = [] + + for tokenset in tokens: + if len(tokenset) < window: + token_sets = [tokenset] + token_sets_ids = [list(range(len(tokenset)))] + else: + + # Extract tokensets using window and stride parameters + stride_indices = list(range(len(tokenset)))[::stride] + token_sets = [] + token_sets_ids = [] + for stride_index in stride_indices: + selected_tokens = tokenset[stride_index: stride_index+window] + + if padding or len(selected_tokens) == window: + token_sets.append(selected_tokens) + token_sets_ids.append(list(range(stride_index, stride_index+len(selected_tokens)))) + + # Add empty tokens at the beginning and end of a document + if padding: + padded = [] + padded_ids = [] + t = math.ceil(window / stride) - 1 + for i in range(math.ceil(window / stride) - 1): + padded.append(tokenset[:window - ((t-i) * stride)]) + padded_ids.append(list(range(0, window - ((t-i) * stride)))) + + token_sets = padded + token_sets + token_sets_ids = padded_ids + token_sets_ids + + # Join the tokens + sentences = [separator.join(token) for token in token_sets] + all_sentences.extend(sentences) + all_token_sets_ids.extend(token_sets_ids) + all_indices.append(all_indices[-1] + len(sentences)) + + # Calculate similarity between embeddings of token sets and the topics + if use_embedding_model: + embeddings = self._extract_embeddings(all_sentences, method="document", verbose=True) + similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers:]) + + # Calculate similarity between c-TF-IDF of token sets and the topics + else: + bow_doc = self.vectorizer_model.transform(all_sentences) + c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:]) + + # Only keep similarities that exceed the minimum + similarity[similarity < min_similarity] = 0 + + # Aggregate results on an individual token level + if calculate_tokens: + topic_distribution = [] + topic_token_distribution = [] + for index, token in enumerate(tokens): + start = all_indices[index] + end = all_indices[index+1] + + if start == end: + end = end + 1 + + # Assign topics to individual tokens + token_id = [i for i in range(len(token))] + token_val = {index: [] for index in token_id} + for sim, token_set in zip(similarity[start:end], all_token_sets_ids[start:end]): + for token in token_set: + if token in token_val: + token_val[token].append(sim) + + matrix = [] + for _, value in token_val.items(): + matrix.append(np.add.reduce(value)) + + # Take empty documents into account + matrix = np.array(matrix) + if len(matrix.shape) == 1: + matrix = np.zeros((1, len(self.topic_labels_) - self._outliers)) + + topic_token_distribution.append(np.array(matrix)) + topic_distribution.append(np.add.reduce(matrix)) + + topic_distribution = normalize(topic_distribution, norm='l1', axis=1) + + # Aggregate on a tokenset level indicated by the window and stride + else: + topic_distribution = [] + for index in 
range(len(all_indices)-1): + start = all_indices[index] + end = all_indices[index+1] + + if start == end: + end = end + 1 + group = similarity[start:end].sum(axis=0) + topic_distribution.append(group) + topic_distribution = normalize(np.array(topic_distribution), norm='l1', axis=1) + topic_token_distribution = None + + # Combine results + topic_distributions.append(topic_distribution) + if topic_token_distribution is None: + topic_token_distributions = None + else: + topic_token_distributions.extend(topic_token_distribution) + + topic_distributions = np.vstack(topic_distributions) + + return topic_distributions, topic_token_distributions + + def find_topics(self, + search_term: str = None, + image: str = None, + top_n: int = 5) -> Tuple[List[int], List[float]]: + """ Find topics most similar to a search_term + + Creates an embedding for search_term and compares that with + the topic embeddings. The most similar topics are returned + along with their similarity values. + + The search_term can be of any size but since it is compared + with the topic representation it is advised to keep it + below 5 words. + + Arguments: + search_term: the term you want to use to search for topics. + top_n: the number of topics to return + + Returns: + similar_topics: the most similar topics from high to low + similarity: the similarity scores from high to low + + Examples: + + You can use the underlying embedding model to find topics that + best represent the search term: + + ```python + topics, similarity = topic_model.find_topics("sports", top_n=5) + ``` + + Note that the search query is typically more accurate if the + search_term consists of a phrase or multiple words. + """ + if self.embedding_model is None: + raise Exception("This method can only be used if you did not use custom embeddings.") + + topic_list = list(self.topic_representations_.keys()) + topic_list.sort() + + # Extract search_term embeddings and compare with topic embeddings + if search_term is not None: + search_embedding = self._extract_embeddings([search_term], + method="word", + verbose=False).flatten() + elif image is not None: + search_embedding = self._extract_embeddings([None], + images=[image], + method="document", + verbose=False).flatten() + sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten() + + # Extract topics most similar to search_term + ids = np.argsort(sims)[-top_n:] + similarity = [sims[i] for i in ids][::-1] + similar_topics = [topic_list[index] for index in ids][::-1] + + return similar_topics, similarity + + def update_topics(self, + docs: List[str], + images: List[str] = None, + topics: List[int] = None, + top_n_words: int = 10, + n_gram_range: Tuple[int, int] = None, + vectorizer_model: CountVectorizer = None, + ctfidf_model: ClassTfidfTransformer = None, + representation_model: BaseRepresentation = None): + """ Updates the topic representation by recalculating c-TF-IDF with the new + parameters as defined in this function. + + When you have trained a model and viewed the topics and the words that represent them, + you might not be satisfied with the representation. Perhaps you forgot to remove + stop_words or you want to try out a different n_gram_range. This function allows you + to update the topic representation after they have been formed. 
+ + Arguments: + docs: The documents you used when calling either `fit` or `fit_transform` + images: The images you used when calling either `fit` or `fit_transform` + topics: A list of topics where each topic is related to a document in `docs`. + Use this variable to change or map the topics. + NOTE: Using a custom list of topic assignments may lead to errors if + topic reduction techniques are used afterwards. Make sure that + manually assigning topics is the last step in the pipeline + top_n_words: The number of words per topic to extract. Setting this + too high can negatively impact topic embeddings as topics + are typically best represented by at most 10 words. + n_gram_range: The n-gram range for the CountVectorizer. + vectorizer_model: Pass in your own CountVectorizer from scikit-learn + ctfidf_model: Pass in your own c-TF-IDF model to update the representations + representation_model: Pass in a model that fine-tunes the topic representations + calculated through c-TF-IDF. Models from `bertopic.representation` + are supported. + + Examples: + + In order to update the topic representation, you will need to first fit the topic + model and extract topics from them. Based on these, you can update the representation: + + ```python + topic_model.update_topics(docs, n_gram_range=(2, 3)) + ``` + + You can also use a custom vectorizer to update the representation: + + ```python + from sklearn.feature_extraction.text import CountVectorizer + vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english") + topic_model.update_topics(docs, vectorizer_model=vectorizer_model) + ``` + + You can also use this function to change or map the topics to something else. + You can update them as follows: + + ```python + topic_model.update_topics(docs, my_updated_topics) + ``` + """ + check_documents_type(docs) + check_is_fitted(self) + if not n_gram_range: + n_gram_range = self.n_gram_range + + if top_n_words > 100: + logger.warning("Note that extracting more than 100 words from a sparse " + "can slow down computation quite a bit.") + self.top_n_words = top_n_words + self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range) + self.ctfidf_model = ctfidf_model or ClassTfidfTransformer() + self.representation_model = representation_model + + if topics is None: + topics = self.topics_ + else: + logger.warning("Using a custom list of topic assignments may lead to errors if " + "topic reduction techniques are used afterwards. Make sure that " + "manually assigning topics is the last step in the pipeline." 
+ "Note that topic embeddings will also be created through weighted" + "c-TF-IDF embeddings instead of centroid embeddings.") + + self._outliers = 1 if -1 in set(topics) else 0 + # Extract words + documents = pd.DataFrame({"Document": docs, "Topic": topics, "ID": range(len(docs)), "Image": images}) + documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) + self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic) + self.topic_representations_ = self._extract_words_per_topic(words, documents) + if set(topics) != self.topics_: + self._create_topic_vectors() + self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]]) + for key, values in + self.topic_representations_.items()} + self._update_topic_size(documents) + + def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]: + """ Return topics with top n words and their c-TF-IDF score + + Arguments: + full: If True, returns all different forms of topic representations + for each topic, including aspects + + Returns: + self.topic_representations_: The top n words per topic and the corresponding c-TF-IDF score + + Examples: + + ```python + all_topics = topic_model.get_topics() + ``` + """ + check_is_fitted(self) + + if full: + topic_representations = {"Main": self.topic_representations_} + topic_representations.update(self.topic_aspects_) + return topic_representations + else: + return self.topic_representations_ + + def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]: + """ Return top n words for a specific topic and their c-TF-IDF scores + + Arguments: + topic: A specific topic for which you want its representation + full: If True, returns all different forms of topic representations + for a topic, including aspects + + Returns: + The top n words for a specific word and its respective c-TF-IDF scores + + Examples: + + ```python + topic = topic_model.get_topic(12) + ``` + """ + check_is_fitted(self) + if topic in self.topic_representations_: + if full: + representations = {"Main": self.topic_representations_[topic]} + aspects = {aspect: representations[topic] for aspect, representations in self.topic_aspects_.items()} + representations.update(aspects) + return representations + else: + return self.topic_representations_[topic] + else: + return False + + def get_topic_info(self, topic: int = None) -> pd.DataFrame: + """ Get information about each topic including its ID, frequency, and name. 
+ + Arguments: + topic: A specific topic for which you want the frequency + + Returns: + info: The information relating to either a single topic or all topics + + Examples: + + ```python + info_df = topic_model.get_topic_info() + ``` + """ + check_is_fitted(self) + + info = pd.DataFrame(self.topic_sizes_.items(), columns=["Topic", "Count"]).sort_values("Topic") + info["Name"] = info.Topic.map(self.topic_labels_) + + # Custom label + if self.custom_labels_ is not None: + if len(self.custom_labels_) == len(info): + labels = {topic - self._outliers: label for topic, label in enumerate(self.custom_labels_)} + info["CustomName"] = info["Topic"].map(labels) + + # Main Keywords + values = {topic: list(list(zip(*values))[0]) for topic, values in self.topic_representations_.items()} + info["Representation"] = info["Topic"].map(values) + + # Extract all topic aspects + if self.topic_aspects_: + for aspect, values in self.topic_aspects_.items(): + if isinstance(list(values.values())[-1], list): + if isinstance(list(values.values())[-1][0], tuple) or isinstance(list(values.values())[-1][0], list): + values = {topic: list(list(zip(*value))[0]) for topic, value in values.items()} + elif isinstance(list(values.values())[-1][0], str): + values = {topic: " ".join(value).strip() for topic, value in values.items()} + info[aspect] = info["Topic"].map(values) + + # Representative Docs / Images + if self.representative_docs_ is not None: + info["Representative_Docs"] = info["Topic"].map(self.representative_docs_) + if self.representative_images_ is not None: + info["Representative_Images"] = info["Topic"].map(self.representative_images_) + + # Select specific topic to return + if topic is not None: + info = info.loc[info.Topic == topic, :] + + return info.reset_index(drop=True) + + def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]: + """ Return the size of topics (descending order) + + Arguments: + topic: A specific topic for which you want the frequency + + Returns: + Either the frequency of a single topic or dataframe with + the frequencies of all topics + + Examples: + + To extract the frequency of all topics: + + ```python + frequency = topic_model.get_topic_freq() + ``` + + To get the frequency of a single topic: + + ```python + frequency = topic_model.get_topic_freq(12) + ``` + """ + check_is_fitted(self) + if isinstance(topic, int): + return self.topic_sizes_[topic] + else: + return pd.DataFrame(self.topic_sizes_.items(), columns=['Topic', 'Count']).sort_values("Count", + ascending=False) + + def get_document_info(self, + docs: List[str], + df: pd.DataFrame = None, + metadata: Mapping[str, Any] = None) -> pd.DataFrame: + """ Get information about the documents on which the topic was trained + including the documents themselves, their respective topics, the name + of each topic, the top n words of each topic, whether it is a + representative document, and probability of the clustering if the cluster + model supports it. + + There are also options to include other meta data, such as the topic + distributions or the x and y coordinates of the reduced embeddings. + + Arguments: + docs: The documents on which the topic model was trained. + df: A dataframe containing the metadata and the documents on which + the topic model was originally trained on. + metadata: A dictionary with meta data for each document in the form + of column name (key) and the respective values (value). 
+ + Returns: + document_info: A dataframe with several statistics regarding + the documents on which the topic model was trained. + + Usage: + + To get the document info, you will only need to pass the documents on which + the topic model was trained: + + ```python + document_info = topic_model.get_document_info(docs) + ``` + + There are additionally options to include meta data, such as the topic + distributions. Moreover, we can pass the original dataframe that contains + the documents and extend it with the information retrieved from BERTopic: + + ```python + from sklearn.datasets import fetch_20newsgroups + + # The original data in a dataframe format to include the target variable + data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')) + df = pd.DataFrame({"Document": data['data'], "Class": data['target']}) + + # Add information about the percentage of the document that relates to the topic + topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000) + distributions = [distr[topic] if topic != -1 else 0 for topic, distr in zip(topics, topic_distr)] + + # Create our documents dataframe using the original dataframe and meta data about + # the topic distributions + document_info = topic_model.get_document_info(docs, df=df, + metadata={"Topic_distribution": distributions}) + ``` + """ + check_documents_type(docs) + if df is not None: + document_info = df.copy() + document_info["Document"] = docs + document_info["Topic"] = self.topics_ + else: + document_info = pd.DataFrame({"Document": docs, "Topic": self.topics_}) + + # Add topic info through `.get_topic_info()` + topic_info = self.get_topic_info().drop("Count", axis=1) + document_info = pd.merge(document_info, topic_info, on="Topic", how="left") + + # Add top n words + top_n_words = {topic: " - ".join(list(zip(*self.get_topic(topic)))[0]) for topic in set(self.topics_)} + document_info["Top_n_words"] = document_info.Topic.map(top_n_words) + + # Add flat probabilities + if self.probabilities_ is not None: + if len(self.probabilities_.shape) == 1: + document_info["Probability"] = self.probabilities_ + else: + document_info["Probability"] = [max(probs) if topic != -1 else 1-sum(probs) + for topic, probs in zip(self.topics_, self.probabilities_)] + + # Add representative document labels + repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs] + document_info["Representative_document"] = False + document_info.loc[document_info.Document.isin(repr_docs), "Representative_document"] = True + + # Add custom meta data provided by the user + if metadata is not None: + for column, values in metadata.items(): + document_info[column] = values + return document_info + + def get_representative_docs(self, topic: int = None) -> List[str]: + """ Extract the best representing documents per topic. + + NOTE: + This does not extract all documents per topic as all documents + are not saved within BERTopic. 
To get all documents, please
+ run the following:
+
+ ```python
+ # When you used `.fit_transform`:
+ df = pd.DataFrame({"Document": docs, "Topic": topics})
+
+ # When you used `.fit`:
+ df = pd.DataFrame({"Document": docs, "Topic": topic_model.topics_})
+ ```
+
+ Arguments:
+ topic: A specific topic for which you want
+ the representative documents
+
+ Returns:
+ Representative documents of the chosen topic
+
+ Examples:
+
+ To extract the representative docs of all topics:
+
+ ```python
+ representative_docs = topic_model.get_representative_docs()
+ ```
+
+ To get the representative docs of a single topic:
+
+ ```python
+ representative_docs = topic_model.get_representative_docs(12)
+ ```
+ """
+ check_is_fitted(self)
+ if isinstance(topic, int):
+ if self.representative_docs_.get(topic):
+ return self.representative_docs_[topic]
+ else:
+ return None
+ else:
+ return self.representative_docs_
+
+ @staticmethod
+ def get_topic_tree(hier_topics: pd.DataFrame,
+ max_distance: float = None,
+ tight_layout: bool = False) -> str:
+ """ Extract the topic tree such that it can be printed
+
+ Arguments:
+ hier_topics: A dataframe containing the structure of the topic tree.
+ This is the output of `topic_model.hierarchical_topics()`
+ max_distance: The maximum distance between two topics. This value is
+ based on the Distance column in `hier_topics`.
+ tight_layout: Whether to use a tight layout (narrow width) for
+ easier readability if you have hundreds of topics.
+
+ Returns:
+ A tree that has the following structure when printed:
+ .
+ .
+ └─health_medical_disease_patients_hiv
+ ├─patients_medical_disease_candida_health
+ │ ├─■──candida_yeast_infection_gonorrhea_infections ── Topic: 48
+ │ └─patients_disease_cancer_medical_doctor
+ │ ├─■──hiv_medical_cancer_patients_doctor ── Topic: 34
+ │ └─■──pain_drug_patients_disease_diet ── Topic: 26
+ └─■──health_newsgroup_tobacco_vote_votes ── Topic: 9
+
+ The blocks (■) indicate that the topic is one you can directly access
+ from `topic_model.get_topic`. In other words, they are the original un-grouped topics.
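+
+ Topics whose merge distance is at or above `max_distance` are printed as unnamed
+ placeholders, so a lower threshold yields a more compact tree. A minimal sketch,
+ assuming `hierarchical_topics` has already been computed (the threshold of 1.0 is
+ arbitrary):
+
+ ```python
+ # Only label original topics that were merged below the chosen distance
+ tree = topic_model.get_topic_tree(hierarchical_topics, max_distance=1.0)
+ print(tree)
+ ```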
+ + Examples: + + ```python + # Train model + from bertopic import BERTopic + topic_model = BERTopic() + topics, probs = topic_model.fit_transform(docs) + hierarchical_topics = topic_model.hierarchical_topics(docs) + + # Print topic tree + tree = topic_model.get_topic_tree(hierarchical_topics) + print(tree) + ``` + """ + width = 1 if tight_layout else 4 + if max_distance is None: + max_distance = hier_topics.Distance.max() + 1 + + max_original_topic = hier_topics.Parent_ID.astype(int).min() - 1 + + # Extract mapping from ID to name + topic_to_name = dict(zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name)) + topic_to_name.update(dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name))) + topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()} + + # Create tree + tree = {str(row[1].Parent_ID): [str(row[1].Child_Left_ID), str(row[1].Child_Right_ID)] + for row in hier_topics.iterrows()} + + def get_tree(start, tree): + """ Based on: https://stackoverflow.com/a/51920869/10532563 """ + + def _tree(to_print, start, parent, tree, grandpa=None, indent=""): + + # Get distance between merged topics + distance = hier_topics.loc[(hier_topics.Child_Left_ID == parent) | + (hier_topics.Child_Right_ID == parent), "Distance"] + distance = distance.values[0] if len(distance) > 0 else 10 + + if parent != start: + if grandpa is None: + to_print += topic_to_name[parent] + else: + if int(parent) <= max_original_topic: + + # Do not append topic ID if they are not merged + if distance < max_distance: + to_print += "■──" + topic_to_name[parent] + f" ── Topic: {parent}" + "\n" + else: + to_print += "O \n" + else: + to_print += topic_to_name[parent] + "\n" + + if parent not in tree: + return to_print + + for child in tree[parent][:-1]: + to_print += indent + "├" + "─" + to_print = _tree(to_print, start, child, tree, parent, indent + "│" + " " * width) + + child = tree[parent][-1] + to_print += indent + "└" + "─" + to_print = _tree(to_print, start, child, tree, parent, indent + " " * (width+1)) + + return to_print + + to_print = "." + "\n" + to_print = _tree(to_print, start, start, tree) + return to_print + + start = str(hier_topics.Parent_ID.astype(int).max()) + return get_tree(start, tree) + + def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None: + """ Set custom topic labels in your fitted BERTopic model + + Arguments: + topic_labels: If a list of topic labels, it should contain the same number + of labels as there are topics. This must be ordered + from the topic with the lowest ID to the highest ID, + including topic -1 if it exists. + If a dictionary of `topic ID`: `topic_label`, it can have + any number of topics as it will only map the topics found + in the dictionary. + + Examples: + + First, we define our topic labels with `.generate_topic_labels` in which + we can customize our topic labels: + + ```python + topic_labels = topic_model.generate_topic_labels(nr_words=2, + topic_prefix=True, + word_length=10, + separator=", ") + ``` + + Then, we pass these `topic_labels` to our topic model which + can be accessed at any time with `.custom_labels_`: + + ```python + topic_model.set_topic_labels(topic_labels) + topic_model.custom_labels_ + ``` + + You might want to change only a few topic labels instead of all of them. 
+ To do so, you can pass a dictionary where the keys are the topic IDs and + its keys the topic labels: + + ```python + topic_model.set_topic_labels({0: "Space", 1: "Sports", 2: "Medicine"}) + topic_model.custom_labels_ + ``` + """ + unique_topics = sorted(set(self.topics_)) + + if isinstance(topic_labels, dict): + if self.custom_labels_ is not None: + original_labels = {topic: label for topic, label in zip(unique_topics, self.custom_labels_)} + else: + info = self.get_topic_info() + original_labels = dict(zip(info.Topic, info.Name)) + custom_labels = [topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic] for topic in unique_topics] + + elif isinstance(topic_labels, list): + if len(topic_labels) == len(unique_topics): + custom_labels = topic_labels + else: + raise ValueError("Make sure that `topic_labels` contains the same number " + "of labels as there are topics.") + + self.custom_labels_ = custom_labels + + def generate_topic_labels(self, + nr_words: int = 3, + topic_prefix: bool = True, + word_length: int = None, + separator: str = "_", + aspect: str = None) -> List[str]: + """ Get labels for each topic in a user-defined format + + Arguments: + nr_words: Top `n` words per topic to use + topic_prefix: Whether to use the topic ID as a prefix. + If set to True, the topic ID will be separated + using the `separator` + word_length: The maximum length of each word in the topic label. + Some words might be relatively long and setting this + value helps to make sure that all labels have relatively + similar lengths. + separator: The string with which the words and topic prefix will be + separated. Underscores are the default but a nice alternative + is `", "`. + aspect: The aspect from which to generate topic labels + + Returns: + topic_labels: A list of topic labels sorted from the lowest topic ID to the highest. + If the topic model was trained using HDBSCAN, the lowest topic ID is -1, + otherwise it is 0. + + Examples: + + To create our custom topic labels, usage is rather straightforward: + + ```python + topic_labels = topic_model.generate_topic_labels(nr_words=2, separator=", ") + ``` + """ + unique_topics = sorted(set(self.topics_)) + + topic_labels = [] + for topic in unique_topics: + if aspect: + words, _ = zip(*self.topic_aspects_[aspect][topic]) + else: + words, _ = zip(*self.get_topic(topic)) + + if word_length: + words = [word[:word_length] for word in words][:nr_words] + else: + words = list(words)[:nr_words] + + if topic_prefix: + topic_label = f"{topic}{separator}" + separator.join(words) + else: + topic_label = separator.join(words) + + topic_labels.append(topic_label) + + return topic_labels + + def merge_topics(self, + docs: List[str], + topics_to_merge: List[Union[Iterable[int], int]], + images: List[str] = None) -> None: + """ + Arguments: + docs: The documents you used when calling either `fit` or `fit_transform` + topics_to_merge: Either a list of topics or a list of list of topics + to merge. For example: + [1, 2, 3] will merge topics 1, 2 and 3 + [[1, 2], [3, 4]] will merge topics 1 and 2, and + separately merge topics 3 and 4. 
+ images: A list of paths to the images used when calling either + `fit` or `fit_transform` + + Examples: + + If you want to merge topics 1, 2, and 3: + + ```python + topics_to_merge = [1, 2, 3] + topic_model.merge_topics(docs, topics_to_merge) + ``` + + or if you want to merge topics 1 and 2, and separately + merge topics 3 and 4: + + ```python + topics_to_merge = [[1, 2], + [3, 4]] + topic_model.merge_topics(docs, topics_to_merge) + ``` + """ + check_is_fitted(self) + check_documents_type(docs) + documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))}) + + mapping = {topic: topic for topic in set(self.topics_)} + if isinstance(topics_to_merge[0], int): + for topic in sorted(topics_to_merge): + mapping[topic] = topics_to_merge[0] + elif isinstance(topics_to_merge[0], Iterable): + for topic_group in sorted(topics_to_merge): + for topic in topic_group: + mapping[topic] = topic_group[0] + else: + raise ValueError("Make sure that `topics_to_merge` is either" + "a list of topics or a list of list of topics.") + + # Track mappings and sizes of topics for merging topic embeddings + mappings = defaultdict(list) + for key, val in sorted(mapping.items()): + mappings[val].append(key) + mappings = {topic_from: + {"topics_to": topics_to, + "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]} + for topic_from, topics_to in mappings.items()} + + # Update topics + documents.Topic = documents.Topic.map(mapping) + self.topic_mapper_.add_mappings(mapping) + documents = self._sort_mappings_by_frequency(documents) + self._extract_topics(documents, mappings=mappings) + self._update_topic_size(documents) + self._save_representative_docs(documents) + self.probabilities_ = self._map_probabilities(self.probabilities_) + + def reduce_topics(self, + docs: List[str], + nr_topics: Union[int, str] = 20, + images: List[str] = None) -> None: + """ Reduce the number of topics to a fixed number of topics + or automatically. + + If nr_topics is an integer, then the number of topics is reduced + to nr_topics using `AgglomerativeClustering` on the cosine distance matrix + of the topic embeddings. + + If nr_topics is `"auto"`, then HDBSCAN is used to automatically + reduce the number of topics by running it on the topic embeddings. + + The topics, their sizes, and representations are updated. + + Arguments: + docs: The docs you used when calling either `fit` or `fit_transform` + nr_topics: The number of topics you want reduced to + images: A list of paths to the images used when calling either + `fit` or `fit_transform` + + Updates: + topics_ : Assigns topics to their merged representations. + probabilities_ : Assigns probabilities to their merged representations. 
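+
+ As described above, `nr_topics` may also be set to `"auto"`, in which case HDBSCAN
+ decides how many topics remain after merging. A minimal sketch of that mode:
+
+ ```python
+ # Let HDBSCAN determine the number of merged topics automatically
+ topic_model.reduce_topics(docs, nr_topics="auto")
+ ```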
+ + Examples: + + You can further reduce the topics by passing the documents with their + topics and probabilities (if they were calculated): + + ```python + topic_model.reduce_topics(docs, nr_topics=30) + ``` + + You can then access the updated topics and probabilities with: + + ```python + topics = topic_model.topics_ + probabilities = topic_model.probabilities_ + ``` + """ + check_is_fitted(self) + check_documents_type(docs) + + self.nr_topics = nr_topics + documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))}) + + # Reduce number of topics + documents = self._reduce_topics(documents) + self._merged_topics = None + self._save_representative_docs(documents) + self.probabilities_ = self._map_probabilities(self.probabilities_) + + return self + + def reduce_outliers(self, + documents: List[str], + topics: List[int], + images: List[str] = None, + strategy: str = "distributions", + probabilities: np.ndarray = None, + threshold: float = 0, + embeddings: np.ndarray = None, + distributions_params: Mapping[str, Any] = {}) -> List[int]: + """ Reduce outliers by merging them with their nearest topic according + to one of several strategies. + + When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created + that do not fall within any of the created topics. These are labeled as -1. + This function allows the user to match outlier documents with their nearest topic + using one of the following strategies using the `strategy` parameter: + * "probabilities" + This uses the soft-clustering as performed by HDBSCAN to find the + best matching topic for each outlier document. To use this, make + sure to calculate the `probabilities` beforehand by instantiating + BERTopic with `calculate_probabilities=True`. + * "distributions" + Use the topic distributions, as calculated with `.approximate_distribution` + to find the most frequent topic in each outlier document. You can use the + `distributions_params` variable to tweak the parameters of + `.approximate_distribution`. + * "c-tf-idf" + Calculate the c-TF-IDF representation for each outlier document and + find the best matching c-TF-IDF topic representation using + cosine similarity. + * "embeddings" + Using the embeddings of each outlier documents, find the best + matching topic embedding using cosine similarity. + + Arguments: + documents: A list of documents for which we reduce or remove the outliers. + topics: The topics that correspond to the documents + images: A list of paths to the images used when calling either + `fit` or `fit_transform` + strategy: The strategy used for reducing outliers. + Options: + * "probabilities" + This uses the soft-clustering as performed by HDBSCAN + to find the best matching topic for each outlier document. + + * "distributions" + Use the topic distributions, as calculated with `.approximate_distribution` + to find the most frequent topic in each outlier document. + + * "c-tf-idf" + Calculate the c-TF-IDF representation for outlier documents and + find the best matching c-TF-IDF topic representation. + + * "embeddings" + Calculate the embeddings for outlier documents and + find the best matching topic embedding. + threshold: The threshold for assigning topics to outlier documents. This value + represents the minimum probability when `strategy="probabilities"`. + For all other strategies, it represents the minimum similarity. + embeddings: The pre-computed embeddings to be used when `strategy="embeddings"`. 
+ If this is None, then it will compute the embeddings for the outlier documents. + distributions_params: The parameters used in `.approximate_distribution` when using + the strategy `"distributions"`. + + Returns: + new_topics: The updated topics + + Usage: + + The default settings uses the `"distributions"` strategy: + + ```python + new_topics = topic_model.reduce_outliers(docs, topics) + ``` + + When you use the `"probabilities"` strategy, make sure to also pass the probabilities + as generated through HDBSCAN: + + ```python + from bertopic import BERTopic + topic_model = BERTopic(calculate_probabilities=True) + topics, probs = topic_model.fit_transform(docs) + + new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities") + ``` + """ + if images is not None: + strategy = "embeddings" + + # Check correct use of parameters + if strategy.lower() == "probabilities" and probabilities is None: + raise ValueError("Make sure to pass in `probabilities` in order to use the probabilities strategy") + + # Reduce outliers by extracting most likely topics through the topic-term probability matrix + if strategy.lower() == "probabilities": + new_topics = [np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic + for topic, prob in zip(topics, probabilities)] + + # Reduce outliers by extracting most frequent topics through calculating of Topic Distributions + elif strategy.lower() == "distributions": + outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] + outlier_docs = [documents[index] for index in outlier_ids] + topic_distr, _ = self.approximate_distribution(outlier_docs, min_similarity=threshold, **distributions_params) + outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr]) + new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] + + # Reduce outliers by finding the most similar c-TF-IDF representations + elif strategy.lower() == "c-tf-idf": + outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] + outlier_docs = [documents[index] for index in outlier_ids] + + # Calculate c-TF-IDF of outlier documents with all topics + bow_doc = self.vectorizer_model.transform(outlier_docs) + c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:]) + + # Update topics + similarity[similarity < threshold] = 0 + outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]) + new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] + + # Reduce outliers by finding the most similar topic embeddings + elif strategy.lower() == "embeddings": + if self.embedding_model is None and embeddings is None: + raise ValueError("To use this strategy, you will need to pass a model to `embedding_model`" + "when instantiating BERTopic.") + outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] + if images is not None: + outlier_docs = [images[index] for index in outlier_ids] + else: + outlier_docs = [documents[index] for index in outlier_ids] + + # Extract or calculate embeddings for outlier documents + if embeddings is not None: + outlier_embeddings = np.array([embeddings[index] for index in outlier_ids]) + elif images is not None: + outlier_images = [images[index] for index in outlier_ids] + outlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose) + else: + outlier_embeddings = 
self.embedding_model.embed_documents(outlier_docs) + similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers:]) + + # Update topics + similarity[similarity < threshold] = 0 + outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity]) + new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] + + return new_topics + + def visualize_topics(self, + topics: List[int] = None, + top_n_topics: int = None, + custom_labels: bool = False, + title: str = "Intertopic Distance Map", + width: int = 650, + height: int = 650) -> go.Figure: + """ Visualize topics, their sizes, and their corresponding words + + This visualization is highly inspired by LDAvis, a great visualization + technique typically reserved for LDA. + + Arguments: + topics: A selection of topics to visualize + Not to be confused with the topics that you get from `.fit_transform`. + For example, if you want to visualize only topics 1 through 5: + `topics = [1, 2, 3, 4, 5]`. + top_n_topics: Only select the top n most frequent topics + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + title: Title of the plot. + width: The width of the figure. + height: The height of the figure. + + Examples: + + To visualize the topics simply run: + + ```python + topic_model.visualize_topics() + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_topics() + fig.write_html("path/to/file.html") + ``` + """ + check_is_fitted(self) + return plotting.visualize_topics(self, + topics=topics, + top_n_topics=top_n_topics, + custom_labels=custom_labels, + title=title, + width=width, + height=height) + + def visualize_documents(self, + docs: List[str], + topics: List[int] = None, + embeddings: np.ndarray = None, + reduced_embeddings: np.ndarray = None, + sample: float = None, + hide_annotations: bool = False, + hide_document_hover: bool = False, + custom_labels: bool = False, + title: str = "Documents and Topics", + width: int = 1200, + height: int = 750) -> go.Figure: + """ Visualize documents and their topics in 2D + + Arguments: + topic_model: A fitted BERTopic instance. + docs: The documents you used when calling either `fit` or `fit_transform` + topics: A selection of topics to visualize. + Not to be confused with the topics that you get from `.fit_transform`. + For example, if you want to visualize only topics 1 through 5: + `topics = [1, 2, 3, 4, 5]`. + embeddings: The embeddings of all documents in `docs`. + reduced_embeddings: The 2D reduced embeddings of all documents in `docs`. + sample: The percentage of documents in each topic that you would like to keep. + Value can be between 0 and 1. Setting this value to, for example, + 0.1 (10% of documents in each topic) makes it easier to visualize + millions of documents as a subset is chosen. + hide_annotations: Hide the names of the traces on top of each cluster. + hide_document_hover: Hide the content of the documents when hovering over + specific points. Helps to speed up generation of visualization. + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + title: Title of the plot. + width: The width of the figure. + height: The height of the figure. + + Examples: + + To visualize the topics simply run: + + ```python + topic_model.visualize_documents(docs) + ``` + + Do note that this re-calculates the embeddings and reduces them to 2D. 
+ The advised and preferred pipeline for using this function is as follows: + + ```python + from sklearn.datasets import fetch_20newsgroups + from sentence_transformers import SentenceTransformer + from bertopic import BERTopic + from umap import UMAP + + # Prepare embeddings + docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] + sentence_model = SentenceTransformer("all-MiniLM-L6-v2") + embeddings = sentence_model.encode(docs, show_progress_bar=False) + + # Train BERTopic + topic_model = BERTopic().fit(docs, embeddings) + + # Reduce dimensionality of embeddings, this step is optional + # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) + + # Run the visualization with the original embeddings + topic_model.visualize_documents(docs, embeddings=embeddings) + + # Or, if you have reduced the original embeddings already: + topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings) + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings) + fig.write_html("path/to/file.html") + ``` + + + """ + check_is_fitted(self) + check_documents_type(docs) + return plotting.visualize_documents(self, + docs=docs, + topics=topics, + embeddings=embeddings, + reduced_embeddings=reduced_embeddings, + sample=sample, + hide_annotations=hide_annotations, + hide_document_hover=hide_document_hover, + custom_labels=custom_labels, + title=title, + width=width, + height=height) + + def visualize_document_datamap(self, + docs: List[str], + topics: List[int] = None, + embeddings: np.ndarray = None, + reduced_embeddings: np.ndarray = None, + custom_labels: Union[bool, str] = False, + title: str = "Documents and Topics", + sub_title: Union[str, None] = None, + width: int = 1200, + height: int = 1200, + **datamap_kwds): + """ Visualize documents and their topics in 2D as a static plot for publication using + DataMapPlot. This works best if there are between 5 and 60 topics. It is therefore best + to use a sufficiently large `min_topic_size` or set `nr_topics` when building the model. + + Arguments: + topic_model: A fitted BERTopic instance. + docs: The documents you used when calling either `fit` or `fit_transform` + embeddings: The embeddings of all documents in `docs`. + reduced_embeddings: The 2D reduced embeddings of all documents in `docs`. + custom_labels: If bool, whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + If `str`, it uses labels from other aspects, e.g., "Aspect1". + title: Title of the plot. + sub_title: Sub-title of the plot. + width: The width of the figure. + height: The height of the figure. + **datamap_kwds: All further keyword args will be passed on to DataMapPlot's + `create_plot` function. See the DataMapPlot documentation + for more details. + + Returns: + figure: A Matplotlib Figure object. + + Examples: + + To visualize the topics simply run: + + ```python + topic_model.visualize_document_datamap(docs) + ``` + + Do note that this re-calculates the embeddings and reduces them to 2D. 
+ The advised and preferred pipeline for using this function is as follows: + + ```python + from sklearn.datasets import fetch_20newsgroups + from sentence_transformers import SentenceTransformer + from bertopic import BERTopic + from umap import UMAP + + # Prepare embeddings + docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] + sentence_model = SentenceTransformer("all-MiniLM-L6-v2") + embeddings = sentence_model.encode(docs, show_progress_bar=False) + + # Train BERTopic + topic_model = BERTopic(min_topic_size=36).fit(docs, embeddings) + + # Reduce dimensionality of embeddings, this step is optional + # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) + + # Run the visualization with the original embeddings + topic_model.visualize_document_datamap(docs, embeddings=embeddings) + + # Or, if you have reduced the original embeddings already: + topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings) + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings) + fig.savefig("path/to/file.png", bbox_inches="tight") + ``` + """ + check_is_fitted(self) + check_documents_type(docs) + return plotting.visualize_document_datamap(self, + docs, + topics, + embeddings, + reduced_embeddings, + custom_labels, + title, + sub_title, + width, + height, + **datamap_kwds) + def visualize_hierarchical_documents(self, + docs: List[str], + hierarchical_topics: pd.DataFrame, + topics: List[int] = None, + embeddings: np.ndarray = None, + reduced_embeddings: np.ndarray = None, + sample: Union[float, int] = None, + hide_annotations: bool = False, + hide_document_hover: bool = True, + nr_levels: int = 10, + level_scale: str = 'linear', + custom_labels: bool = False, + title: str = "Hierarchical Documents and Topics", + width: int = 1200, + height: int = 750) -> go.Figure: + """ Visualize documents and their topics in 2D at different levels of hierarchy + + Arguments: + docs: The documents you used when calling either `fit` or `fit_transform` + hierarchical_topics: A dataframe that contains a hierarchy of topics + represented by their parents and their children + topics: A selection of topics to visualize. + Not to be confused with the topics that you get from `.fit_transform`. + For example, if you want to visualize only topics 1 through 5: + `topics = [1, 2, 3, 4, 5]`. + embeddings: The embeddings of all documents in `docs`. + reduced_embeddings: The 2D reduced embeddings of all documents in `docs`. + sample: The percentage of documents in each topic that you would like to keep. + Value can be between 0 and 1. Setting this value to, for example, + 0.1 (10% of documents in each topic) makes it easier to visualize + millions of documents as a subset is chosen. + hide_annotations: Hide the names of the traces on top of each cluster. + hide_document_hover: Hide the content of the documents when hovering over + specific points. Helps to speed up generation of visualizations. + nr_levels: The number of levels to be visualized in the hierarchy. First, the distances + in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with + equal length. Then, for each list of distances, the merged topics, that have + a distance less or equal to the maximum distance of the selected list of distances, are selected. 
+ NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to + the length of `hierarchical_topics`. + level_scale: Whether to apply a linear or logarithmic ('log') scale levels of the distance + vector. Linear scaling will perform an equal number of merges at each level + while logarithmic scaling will perform more mergers in earlier levels to + provide more resolution at higher levels (this can be used for when the number + of topics is large). + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + NOTE: Custom labels are only generated for the original + un-merged topics. + title: Title of the plot. + width: The width of the figure. + height: The height of the figure. + + Examples: + + To visualize the topics simply run: + + ```python + topic_model.visualize_hierarchical_documents(docs, hierarchical_topics) + ``` + + Do note that this re-calculates the embeddings and reduces them to 2D. + The advised and preferred pipeline for using this function is as follows: + + ```python + from sklearn.datasets import fetch_20newsgroups + from sentence_transformers import SentenceTransformer + from bertopic import BERTopic + from umap import UMAP + + # Prepare embeddings + docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] + sentence_model = SentenceTransformer("all-MiniLM-L6-v2") + embeddings = sentence_model.encode(docs, show_progress_bar=False) + + # Train BERTopic and extract hierarchical topics + topic_model = BERTopic().fit(docs, embeddings) + hierarchical_topics = topic_model.hierarchical_topics(docs) + + # Reduce dimensionality of embeddings, this step is optional + # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings) + + # Run the visualization with the original embeddings + topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings) + + # Or, if you have reduced the original embeddings already: + topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings) + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings) + fig.write_html("path/to/file.html") + ``` + + + """ + check_is_fitted(self) + check_documents_type(docs) + return plotting.visualize_hierarchical_documents(self, + docs=docs, + hierarchical_topics=hierarchical_topics, + topics=topics, + embeddings=embeddings, + reduced_embeddings=reduced_embeddings, + sample=sample, + hide_annotations=hide_annotations, + hide_document_hover=hide_document_hover, + nr_levels=nr_levels, + level_scale=level_scale, + custom_labels=custom_labels, + title=title, + width=width, + height=height) + + def visualize_term_rank(self, + topics: List[int] = None, + log_scale: bool = False, + custom_labels: bool = False, + title: str = "Term score decline per Topic", + width: int = 800, + height: int = 500) -> go.Figure: + """ Visualize the ranks of all terms across all topics + + Each topic is represented by a set of words. These words, however, + do not all equally represent the topic. This visualization shows + how many words are needed to represent a topic and at which point + the beneficial effect of adding words starts to decline. + + Arguments: + topics: A selection of topics to visualize. These will be colored + red where all others will be colored black. 
+ log_scale: Whether to represent the ranking on a log scale + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + title: Title of the plot. + width: The width of the figure. + height: The height of the figure. + + Returns: + fig: A plotly figure + + Examples: + + To visualize the ranks of all words across + all topics simply run: + + ```python + topic_model.visualize_term_rank() + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_term_rank() + fig.write_html("path/to/file.html") + ``` + + Reference: + + This visualization was heavily inspired by the + "Term Probability Decline" visualization found in an + analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/). + Reference to that specific analysis can be found + [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html). + """ + check_is_fitted(self) + return plotting.visualize_term_rank(self, + topics=topics, + log_scale=log_scale, + custom_labels=custom_labels, + title=title, + width=width, + height=height) + + def visualize_topics_over_time(self, + topics_over_time: pd.DataFrame, + top_n_topics: int = None, + topics: List[int] = None, + normalize_frequency: bool = False, + custom_labels: bool = False, + title: str = "Topics over Time", + width: int = 1250, + height: int = 450) -> go.Figure: + """ Visualize topics over time + + Arguments: + topics_over_time: The topics you would like to be visualized with the + corresponding topic representation + top_n_topics: To visualize the most frequent topics instead of all + topics: Select which topics you would like to be visualized + normalize_frequency: Whether to normalize each topic's frequency individually + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + title: Title of the plot. + width: The width of the figure. + height: The height of the figure. + + Returns: + A plotly.graph_objects.Figure including all traces + + Examples: + + To visualize the topics over time, simply run: + + ```python + topics_over_time = topic_model.topics_over_time(docs, timestamps) + topic_model.visualize_topics_over_time(topics_over_time) + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_topics_over_time(topics_over_time) + fig.write_html("path/to/file.html") + ``` + """ + check_is_fitted(self) + return plotting.visualize_topics_over_time(self, + topics_over_time=topics_over_time, + top_n_topics=top_n_topics, + topics=topics, + normalize_frequency=normalize_frequency, + custom_labels=custom_labels, + title=title, + width=width, + height=height) + + def visualize_topics_per_class(self, + topics_per_class: pd.DataFrame, + top_n_topics: int = 10, + topics: List[int] = None, + normalize_frequency: bool = False, + custom_labels: bool = False, + title: str = "Topics per Class", + width: int = 1250, + height: int = 900) -> go.Figure: + """ Visualize topics per class + + Arguments: + topics_per_class: The topics you would like to be visualized with the + corresponding topic representation + top_n_topics: To visualize the most frequent topics instead of all + topics: Select which topics you would like to be visualized + normalize_frequency: Whether to normalize each topic's frequency individually + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + title: Title of the plot. + width: The width of the figure. 
+ height: The height of the figure. + + Returns: + A plotly.graph_objects.Figure including all traces + + Examples: + + To visualize the topics per class, simply run: + + ```python + topics_per_class = topic_model.topics_per_class(docs, classes) + topic_model.visualize_topics_per_class(topics_per_class) + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_topics_per_class(topics_per_class) + fig.write_html("path/to/file.html") + ``` + """ + check_is_fitted(self) + return plotting.visualize_topics_per_class(self, + topics_per_class=topics_per_class, + top_n_topics=top_n_topics, + topics=topics, + normalize_frequency=normalize_frequency, + custom_labels=custom_labels, + title=title, + width=width, + height=height) + + def visualize_distribution(self, + probabilities: np.ndarray, + min_probability: float = 0.015, + custom_labels: bool = False, + title: str = "Topic Probability Distribution", + width: int = 800, + height: int = 600) -> go.Figure: + """ Visualize the distribution of topic probabilities + + Arguments: + probabilities: An array of probability scores + min_probability: The minimum probability score to visualize. + All others are ignored. + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + title: Title of the plot. + width: The width of the figure. + height: The height of the figure. + + Examples: + + Make sure to fit the model before and only input the + probabilities of a single document: + + ```python + topic_model.visualize_distribution(topic_model.probabilities_[0]) + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_distribution(topic_model.probabilities_[0]) + fig.write_html("path/to/file.html") + ``` + """ + check_is_fitted(self) + return plotting.visualize_distribution(self, + probabilities=probabilities, + min_probability=min_probability, + custom_labels=custom_labels, + title=title, + width=width, + height=height) + + def visualize_approximate_distribution(self, + document: str, + topic_token_distribution: np.ndarray, + normalize: bool = False): + """ Visualize the topic distribution calculated by `.approximate_topic_distribution` + on a token level. Thereby indicating the extent to which a certain word or phrase belongs + to a specific topic. The assumption here is that a single word can belong to multiple + similar topics and as such can give information about the broader set of topics within + a single document. + + Arguments: + topic_model: A fitted BERTopic instance. + document: The document for which you want to visualize + the approximated topic distribution. + topic_token_distribution: The topic-token distribution of the document as + extracted by `.approximate_topic_distribution` + normalize: Whether to normalize, between 0 and 1 (summing up to 1), the + topic distribution values. + + Returns: + df: A stylized dataframe indicating the best fitting topics + for each token. 
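+
+ Setting `normalize=True` rescales the distribution values so that they lie between
+ 0 and 1 and sum to 1, which can make the scores easier to compare across tokens.
+ Once `topic_token_distr` has been computed as in the example below, that would look like:
+
+ ```python
+ df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0], normalize=True)
+ ```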
+ + Examples: + + ```python + # Calculate the topic distributions on a token level + # Note that we need to have `calculate_token_level=True` + topic_distr, topic_token_distr = topic_model.approximate_distribution( + docs, calculate_token_level=True + ) + + # Visualize the approximated topic distributions + df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0]) + df + ``` + + To revert this stylized dataframe back to a regular dataframe, + you can run the following: + + ```python + df.data.columns = [column.strip() for column in df.data.columns] + df = df.data + ``` + """ + check_is_fitted(self) + return plotting.visualize_approximate_distribution(self, + document=document, + topic_token_distribution=topic_token_distribution, + normalize=normalize) + + def visualize_hierarchy(self, + orientation: str = "left", + topics: List[int] = None, + top_n_topics: int = None, + custom_labels: bool = False, + title: str = "Hierarchical Clustering", + width: int = 1000, + height: int = 600, + hierarchical_topics: pd.DataFrame = None, + linkage_function: Callable[[csr_matrix], np.ndarray] = None, + distance_function: Callable[[csr_matrix], csr_matrix] = None, + color_threshold: int = 1) -> go.Figure: + """ Visualize a hierarchical structure of the topics + + A ward linkage function is used to perform the + hierarchical clustering based on the cosine distance + matrix between topic embeddings. + + Arguments: + topic_model: A fitted BERTopic instance. + orientation: The orientation of the figure. + Either 'left' or 'bottom' + topics: A selection of topics to visualize + top_n_topics: Only select the top n most frequent topics + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + NOTE: Custom labels are only generated for the original + un-merged topics. + title: Title of the plot. + width: The width of the figure. Only works if orientation is set to 'left' + height: The height of the figure. Only works if orientation is set to 'bottom' + hierarchical_topics: A dataframe that contains a hierarchy of topics + represented by their parents and their children. + NOTE: The hierarchical topic names are only visualized + if both `topics` and `top_n_topics` are not set. + linkage_function: The linkage function to use. Default is: + `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)` + NOTE: Make sure to use the same `linkage_function` as used + in `topic_model.hierarchical_topics`. + distance_function: The distance function to use on the c-TF-IDF matrix. Default is: + `lambda x: 1 - cosine_similarity(x)` + NOTE: Make sure to use the same `distance_function` as used + in `topic_model.hierarchical_topics`. + color_threshold: Value at which the separation of clusters will be made which + will result in different colors for different clusters. + A higher value will typically lead to less colored clusters. 
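+
+ If `hierarchical_topics` was computed with a custom linkage or distance function,
+ the same callables should be passed here so that both results line up. A sketch,
+ assuming `.hierarchical_topics` accepts the same two arguments (the `'single'`
+ linkage is an arbitrary choice):
+
+ ```python
+ from scipy.cluster import hierarchy as sch
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
+ distance_function = lambda x: 1 - cosine_similarity(x)
+
+ hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function, distance_function=distance_function)
+ topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, linkage_function=linkage_function, distance_function=distance_function)
+ ```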
+ + Returns: + fig: A plotly figure + + Examples: + + To visualize the hierarchical structure of + topics simply run: + + ```python + topic_model.visualize_hierarchy() + ``` + + If you also want the labels of hierarchical topics visualized, + run the following: + + ```python + # Extract hierarchical topics and their representations + hierarchical_topics = topic_model.hierarchical_topics(docs) + + # Visualize these representations + topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics) + ``` + + If you want to save the resulting figure: + + ```python + fig = topic_model.visualize_hierarchy() + fig.write_html("path/to/file.html") + ``` + + """ + check_is_fitted(self) + return plotting.visualize_hierarchy(self, + orientation=orientation, + topics=topics, + top_n_topics=top_n_topics, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + hierarchical_topics=hierarchical_topics, + linkage_function=linkage_function, + distance_function=distance_function, + color_threshold=color_threshold + ) + + def visualize_heatmap(self, + topics: List[int] = None, + top_n_topics: int = None, + n_clusters: int = None, + custom_labels: bool = False, + title: str = "Similarity Matrix", + width: int = 800, + height: int = 800) -> go.Figure: + """ Visualize a heatmap of the topic's similarity matrix + + Based on the cosine similarity matrix between topic embeddings, + a heatmap is created showing the similarity between topics. + + Arguments: + topics: A selection of topics to visualize. + top_n_topics: Only select the top n most frequent topics. + n_clusters: Create n clusters and order the similarity + matrix by those clusters. + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + title: Title of the plot. + width: The width of the figure. + height: The height of the figure. + + Returns: + fig: A plotly figure + + Examples: + + To visualize the similarity matrix of + topics simply run: + + ```python + topic_model.visualize_heatmap() + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_heatmap() + fig.write_html("path/to/file.html") + ``` + """ + check_is_fitted(self) + return plotting.visualize_heatmap(self, + topics=topics, + top_n_topics=top_n_topics, + n_clusters=n_clusters, + custom_labels=custom_labels, + title=title, + width=width, + height=height) + + def visualize_barchart(self, + topics: List[int] = None, + top_n_topics: int = 8, + n_words: int = 5, + custom_labels: bool = False, + title: str = "Topic Word Scores", + width: int = 250, + height: int = 250) -> go.Figure: + """ Visualize a barchart of selected topics + + Arguments: + topics: A selection of topics to visualize. + top_n_topics: Only select the top n most frequent topics. + n_words: Number of words to show in a topic + custom_labels: Whether to use custom topic labels that were defined using + `topic_model.set_topic_labels`. + title: Title of the plot. + width: The width of each figure. + height: The height of each figure. 
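+
+ For example, more topics and words per chart can be shown (the values below are
+ arbitrary):
+
+ ```python
+ fig = topic_model.visualize_barchart(top_n_topics=12, n_words=8, custom_labels=True)
+ ```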
+ + Returns: + fig: A plotly figure + + Examples: + + To visualize the barchart of selected topics + simply run: + + ```python + topic_model.visualize_barchart() + ``` + + Or if you want to save the resulting figure: + + ```python + fig = topic_model.visualize_barchart() + fig.write_html("path/to/file.html") + ``` + """ + check_is_fitted(self) + return plotting.visualize_barchart(self, + topics=topics, + top_n_topics=top_n_topics, + n_words=n_words, + custom_labels=custom_labels, + title=title, + width=width, + height=height) + + def save(self, + path, + serialization: Literal["safetensors", "pickle", "pytorch"] = "pickle", + save_embedding_model: Union[bool, str] = True, + save_ctfidf: bool = False): + """ Saves the model to the specified path or folder + + When saving the model, make sure to also keep track of the versions + of dependencies and Python used. Loading and saving the model should + be done using the same dependencies and Python. Moreover, models + saved in one version of BERTopic should not be loaded in other versions. + + Arguments: + path: If `serialization` is 'safetensors' or `pytorch`, this is a directory. + If `serialization` is `pickle`, then this is a file. + serialization: If `pickle`, the entire model will be pickled. If `safetensors` + or `pytorch` the model will be saved without the embedding, + dimensionality reduction, and clustering algorithms. + This is a very efficient format and typically advised. + save_embedding_model: If serialization is `pickle`, then you can choose to skip + saving the embedding model. If serialization is `safetensors` + or `pytorch`, this variable can be used as a string pointing + towards a huggingface model. + save_ctfidf: Whether to save c-TF-IDF information if serialization is `safetensors` + or `pytorch` + + Examples: + + To save the model in an efficient and safe format (safetensors) with c-TF-IDF information: + + ```python + topic_model.save("model_dir", serialization="safetensors", save_ctfidf=True) + ``` + + If you wish to also add a pointer to the embedding model, which will be downloaded from + HuggingFace upon loading: + + ```python + embedding_model = "sentence-transformers/all-MiniLM-L6-v2" + topic_model.save("model_dir", serialization="safetensors", save_embedding_model=embedding_model) + ``` + + or if you want save the full model with pickle: + + ```python + topic_model.save("my_model") + ``` + + NOTE: Pickle can run arbitrary code and is generally considered to be less safe than + safetensors. + """ + if serialization == "pickle": + logger.warning("When you use `pickle` to save/load a BERTopic model," + "please make sure that the environments in which you save" + "and load the model are **exactly** the same. 
The version of BERTopic," + "its dependencies, and python need to remain the same.") + + with open(path, 'wb') as file: + + # This prevents the vectorizer from being too large in size if `min_df` was + # set to a value higher than 1 + self.vectorizer_model.stop_words_ = None + + if not save_embedding_model: + embedding_model = self.embedding_model + self.embedding_model = None + joblib.dump(self, file) + self.embedding_model = embedding_model + else: + joblib.dump(self, file) + elif serialization == "safetensors" or serialization == "pytorch": + + # Directory + save_directory = Path(path) + save_directory.mkdir(exist_ok=True, parents=True) + + # Check embedding model + if save_embedding_model and hasattr(self.embedding_model, '_hf_model') and not isinstance(save_embedding_model, str): + save_embedding_model = self.embedding_model._hf_model + elif not save_embedding_model: + logger.warning("You are saving a BERTopic model without explicitly defining an embedding model." + "If you are using a sentence-transformers model or a HuggingFace model supported" + "by sentence-transformers, please save the model by using a pointer towards that model." + "For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`") + + # Minimal + save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization) + save_utils.save_topics(model=self, path=save_directory / "topics.json") + save_utils.save_images(model=self, path=save_directory / "images") + save_utils.save_config(model=self, path=save_directory / 'config.json', embedding_model=save_embedding_model) + + # Additional + if save_ctfidf: + save_utils.save_ctfidf(model=self, save_directory=save_directory, serialization=serialization) + save_utils.save_ctfidf_config(model=self, path=save_directory / 'ctfidf_config.json') + + @classmethod + def load(cls, + path: str, + embedding_model=None): + """ Loads the model from the specified path or directory + + Arguments: + path: Either load a BERTopic model from a file (`.pickle`) or a folder containing + `.safetensors` or `.bin` files. + embedding_model: Additionally load in an embedding model if it was not saved + in the BERTopic model file or directory. 
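+
+ Besides a local pickle file or directory, a HuggingFace Hub repository id can be
+ passed, in which case the files are downloaded from the Hub (the repository name
+ below is a placeholder):
+
+ ```python
+ topic_model = BERTopic.load("my-username/my-bertopic-model")
+ ```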
+ + Examples: + + ```python + BERTopic.load("model_dir") + ``` + + or if you did not save the embedding model: + + ```python + BERTopic.load("model_dir", embedding_model="all-MiniLM-L6-v2") + ``` + """ + file_or_dir = Path(path) + + # Load from Pickle + if file_or_dir.is_file(): + with open(file_or_dir, 'rb') as file: + if embedding_model: + topic_model = joblib.load(file) + topic_model.embedding_model = select_backend(embedding_model) + else: + topic_model = joblib.load(file) + return topic_model + + # Load from directory or HF + if file_or_dir.is_dir(): + topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_local_files(file_or_dir) + elif "/" in str(path): + topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path) + else: + raise ValueError("Make sure to either pass a valid directory or HF model.") + topic_model = _create_model_from_files(topics, params, tensors, ctfidf_tensors, ctfidf_config, images, + warn_no_backend=(embedding_model is None)) + + # Replace embedding model if one is specifically chosen + if embedding_model is not None: + topic_model.embedding_model = select_backend(embedding_model) + + return topic_model + + @classmethod + def merge_models(cls, models, min_similarity: float = .7, embedding_model=None): + """ Merge multiple pre-trained BERTopic models into a single model. + + The models are merged as if they were all saved using pytorch or + safetensors, so a minimal version without c-TF-IDF. + + To do this, we choose the first model in the list of + models as a baseline. Then, we check each model whether + they contain topics that are not in the baseline. + This check is based on the cosine similarity between + topics embeddings. If topic embeddings between two models + are similar, then the topic of the second model is re-assigned + to the first. If they are dissimilar, the topic of the second + model is assigned to the first. + + In essence, we simply check whether sufficiently "new" + topics emerge and add them. + + Arguments: + models: A list of fitted BERTopic models + min_similarity: The minimum similarity for when topics are merged. + embedding_model: Additionally load in an embedding model if necessary. + + Returns: + A new BERTopic model that was created as if you were + loading a model from the HuggingFace Hub without c-TF-IDF + + Examples: + + ```python + from bertopic import BERTopic + from sklearn.datasets import fetch_20newsgroups + + docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data'] + + # Create three separate models + topic_model_1 = BERTopic(min_topic_size=5).fit(docs[:4000]) + topic_model_2 = BERTopic(min_topic_size=5).fit(docs[4000:8000]) + topic_model_3 = BERTopic(min_topic_size=5).fit(docs[8000:]) + + # Combine all models into one + merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3]) + ``` + """ + import torch + + # Temporarily save model and push to HF + with TemporaryDirectory() as tmpdir: + + # Save model weights and config. 
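+ # Each model is written to the temporary directory and read back as plain
+ # topic dictionaries, parameters, and topic-embedding tensors, so the models
+ # can be compared without needing their original sub-models.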
+ all_topics, all_params, all_tensors = [], [], [] + for index, model in enumerate(models): + model.save(tmpdir, serialization="pytorch") + topics, params, tensors, _, _, _ = save_utils.load_local_files(Path(tmpdir)) + all_topics.append(topics) + all_params.append(params) + all_tensors.append(np.array(tensors["topic_embeddings"])) + + # Create a base set of parameters + if index == 0: + merged_topics = topics + merged_params = params + merged_tensors = np.array(tensors["topic_embeddings"]) + merged_topics["custom_labels"] = None + + for tensors, selected_topics in zip(all_tensors[1:], all_topics[1:]): + # Calculate similarity matrix + sim_matrix = cosine_similarity(tensors, merged_tensors) + sims = np.max(sim_matrix, axis=1) + + # Extract new topics + new_topics = sorted([index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity]) + max_topic = max(set(merged_topics["topics"])) + + # Merge Topic Representations + new_topics_dict = {} + new_topic_val = max_topic + 1 + for index, new_topic in enumerate(new_topics): + new_topic_val = max_topic + index + 1 + new_topics_dict[new_topic] = new_topic_val + merged_topics["topic_representations"][str(new_topic_val)] = selected_topics["topic_representations"][str(new_topic)] + merged_topics["topic_labels"][str(new_topic_val)] = selected_topics["topic_labels"][str(new_topic)] + + # Add new aspects + if selected_topics["topic_aspects"]: + aspects_1 = set(merged_topics["topic_aspects"].keys()) + aspects_2 = set(selected_topics["topic_aspects"].keys()) + aspects_diff = aspects_2.difference(aspects_1) + if aspects_diff: + for aspect in aspects_diff: + merged_topics["topic_aspects"][aspect] = {} + + # If the original model does not have topic aspects but the to be added model does + if not merged_topics.get("topic_aspects"): + merged_topics["topic_aspects"] = selected_topics["topic_aspects"] + + # If they both contain topic aspects, add to the existing set of aspects + else: + for aspect, values in selected_topics["topic_aspects"].items(): + merged_topics["topic_aspects"][aspect][str(new_topic_val)] = values[str(new_topic)] + + # Add new embeddings + new_tensors = tensors[new_topic + selected_topics["_outliers"]] + merged_tensors = np.vstack([merged_tensors, new_tensors]) + + # Topic Mapper + merged_topics["topic_mapper"] = TopicMapper(list(range(-1, new_topic_val+1, 1))).mappings_ + + # Find similar topics and re-assign those from the new models + sims_idx = np.argmax(sim_matrix, axis=1) + sims = np.max(sim_matrix, axis=1) + to_merge = { + a - selected_topics["_outliers"]: + b - merged_topics["_outliers"] for a, (b, val) in enumerate(zip(sims_idx, sims)) + if val >= min_similarity + } + to_merge.update(new_topics_dict) + to_merge[-1] = -1 + topics = [to_merge[topic] for topic in selected_topics["topics"]] + merged_topics["topics"].extend(topics) + merged_topics["topic_sizes"] = dict(Counter(merged_topics["topics"])) + + # Create a new model from the merged parameters + merged_tensors = {"topic_embeddings": torch.from_numpy(merged_tensors)} + merged_model = _create_model_from_files(merged_topics, merged_params, merged_tensors, None, None, None, warn_no_backend=False) + merged_model.embedding_model = models[0].embedding_model + + # Replace embedding model if one is specifically chosen + if embedding_model is not None and type(merged_model.embedding_model) == BaseEmbedder: + merged_model.embedding_model = select_backend(embedding_model) + return merged_model + + def push_to_hf_hub( + self, + repo_id: str, + 
commit_message: str = 'Add BERTopic model',
+            token: str = None,
+            revision: str = None,
+            private: bool = False,
+            create_pr: bool = False,
+            model_card: bool = True,
+            serialization: str = "safetensors",
+            save_embedding_model: Union[str, bool] = True,
+            save_ctfidf: bool = False,
+            ):
+        """ Push your BERTopic model to the HuggingFace Hub
+
+        Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:
+
+        * Log in to your HuggingFace account with the following command:
+            ```bash
+            huggingface-cli login
+
+            # or using an environment variable
+            huggingface-cli login --token $HUGGINGFACE_TOKEN
+            ```
+        * Alternatively, you can programmatically log in using `login()` in a notebook or a script:
+            ```python
+            from huggingface_hub import login
+            login()
+            ```
+        * Or you can give a token with the `token` variable
+
+        Arguments:
+            repo_id: The name of your HuggingFace repository
+            commit_message: A commit message
+            token: Token to add if not already logged in
+            revision: Repository revision
+            private: Whether to create a private repository
+            create_pr: Whether to upload the model as a Pull Request
+            model_card: Whether to automatically create a modelcard
+            serialization: The type of serialization.
+                           Either `safetensors` or `pytorch`
+            save_embedding_model: A pointer towards a HuggingFace model to be loaded in with
+                                  SentenceTransformers. E.g.,
+                                  `sentence-transformers/all-MiniLM-L6-v2`
+            save_ctfidf: Whether to save c-TF-IDF information
+
+        Examples:
+
+        ```python
+        topic_model.push_to_hf_hub(
+            repo_id="ArXiv",
+            save_ctfidf=True,
+            save_embedding_model="sentence-transformers/all-MiniLM-L6-v2"
+        )
+        ```
+        """
+        return save_utils.push_to_hf_hub(model=self, repo_id=repo_id, commit_message=commit_message,
+                                         token=token, revision=revision, private=private, create_pr=create_pr,
+                                         model_card=model_card, serialization=serialization,
+                                         save_embedding_model=save_embedding_model, save_ctfidf=save_ctfidf)
+
+    def get_params(self, deep: bool = False) -> Mapping[str, Any]:
+        """ Get parameters for this estimator.
+
+        Adapted from:
+        https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178
+
+        Arguments:
+            deep: bool, default=False
+                  If True, will return the parameters for this estimator and
+                  contained subobjects that are estimators.
+
+        Returns:
+            out: Parameter names mapped to their values.
+        """
+        out = dict()
+        for key in self._get_param_names():
+            value = getattr(self, key)
+            if deep and hasattr(value, 'get_params'):
+                deep_items = value.get_params().items()
+                out.update((key + '__' + k, val) for k, val in deep_items)
+            out[key] = value
+        return out
+
+    def _extract_embeddings(self,
+                            documents: Union[List[str], str],
+                            images: List[str] = None,
+                            method: str = "document",
+                            verbose: bool = None) -> np.ndarray:
+        """ Extract sentence/document embeddings through pre-trained embeddings
+        For an overview of pre-trained models: https://www.sbert.net/docs/pretrained_models.html
+
+        Arguments:
+            documents: A single document or a list of documents to convert to embeddings
+            images: A list of paths to the images to fit on or the images themselves
+            method: Whether to extract document or word embeddings, options are "document" and "word"
+            verbose: Whether to show a progressbar demonstrating the time to extract embeddings
+
+        Returns:
+            embeddings: The extracted embeddings.
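+
+        Examples:
+
+        A minimal sketch of how this helper is used internally, assuming a
+        fitted `topic_model` with a sentence-transformers backend:
+
+        ```python
+        embeddings = topic_model._extract_embeddings(
+            ["A document about space travel."],
+            method="document",
+            verbose=False
+        )
+        ```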
+ """ + if isinstance(documents, str): + documents = [documents] + + if images is not None and hasattr(self.embedding_model, "embed_images"): + embeddings = self.embedding_model.embed(documents=documents, images=images, verbose=verbose) + elif method == "word": + embeddings = self.embedding_model.embed_words(words=documents, verbose=verbose) + elif method == "document": + embeddings = self.embedding_model.embed_documents(documents, verbose=verbose) + elif documents[0] is None and images is None: + raise ValueError("Make sure to use an embedding model that can either embed documents" + "or images depending on which you want to embed.") + else: + raise ValueError("Wrong method for extracting document/word embeddings. " + "Either choose 'word' or 'document' as the method. ") + return embeddings + + def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame: + """ Convert images to text """ + logger.info("Images - Converting images to text. This might take a while.") + if isinstance(self.representation_model, dict): + for tuner in self.representation_model.values(): + if getattr(tuner, 'image_to_text_model', False): + documents = tuner.image_to_text(documents, embeddings) + elif isinstance(self.representation_model, list): + for tuner in self.representation_model: + if getattr(tuner, 'image_to_text_model', False): + documents = tuner.image_to_text(documents, embeddings) + elif isinstance(self.representation_model, BaseRepresentation): + if getattr(self.representation_model, 'image_to_text_model', False): + documents = self.representation_model.image_to_text(documents, embeddings) + logger.info("Images - Completed \u2713") + return documents + + def _map_predictions(self, predictions: List[int]) -> List[int]: + """ Map predictions to the correct topics if topics were reduced """ + mappings = self.topic_mapper_.get_mappings(original_topics=True) + mapped_predictions = [mappings[prediction] + if prediction in mappings + else -1 + for prediction in predictions] + return mapped_predictions + + def _reduce_dimensionality(self, + embeddings: Union[np.ndarray, csr_matrix], + y: Union[List[int], np.ndarray] = None, + partial_fit: bool = False) -> np.ndarray: + """ Reduce dimensionality of embeddings using UMAP and train a UMAP model + + Arguments: + embeddings: The extracted embeddings using the sentence transformer module. 
+            y: The target class for (semi)-supervised dimensionality reduction
+            partial_fit: Whether to run `partial_fit` for online learning
+
+        Returns:
+            umap_embeddings: The reduced embeddings
+        """
+        logger.info("Dimensionality - Fitting the dimensionality reduction algorithm")
+        # Partial fit
+        if partial_fit:
+            if hasattr(self.umap_model, "partial_fit"):
+                self.umap_model = self.umap_model.partial_fit(embeddings)
+            elif self.topic_representations_ is None:
+                self.umap_model.fit(embeddings)
+
+        # Regular fit
+        else:
+            try:
+                # cuml umap needs y to be a numpy array
+                y = np.array(y) if y is not None else None
+                self.umap_model.fit(embeddings, y=y)
+            except TypeError:
+                self.umap_model.fit(embeddings)
+
+        umap_embeddings = self.umap_model.transform(embeddings)
+        logger.info("Dimensionality - Completed \u2713")
+        return np.nan_to_num(umap_embeddings)
+
+    def _cluster_embeddings(self,
+                            umap_embeddings: np.ndarray,
+                            documents: pd.DataFrame,
+                            partial_fit: bool = False,
+                            y: np.ndarray = None) -> Tuple[pd.DataFrame, np.ndarray]:
+        """ Cluster UMAP embeddings with HDBSCAN
+
+        Arguments:
+            umap_embeddings: The reduced sentence embeddings with UMAP
+            documents: Dataframe with documents and their corresponding IDs
+            partial_fit: Whether to run `partial_fit` for online learning
+            y: The target class for (semi)-supervised clustering
+
+        Returns:
+            documents: Updated dataframe with documents and their corresponding IDs
+                       and newly added Topics
+            probabilities: The distribution of probabilities
+        """
+        logger.info("Cluster - Start clustering the reduced embeddings")
+        if partial_fit:
+            self.hdbscan_model = self.hdbscan_model.partial_fit(umap_embeddings)
+            labels = self.hdbscan_model.labels_
+            documents['Topic'] = labels
+            self.topics_ = labels
+        else:
+            try:
+                self.hdbscan_model.fit(umap_embeddings, y=y)
+            except TypeError:
+                self.hdbscan_model.fit(umap_embeddings)
+
+            try:
+                labels = self.hdbscan_model.labels_
+            except AttributeError:
+                labels = y
+            documents['Topic'] = labels
+            self._update_topic_size(documents)
+
+        # Some algorithms have outlier labels (-1) that can be tricky to work
+        # with if you are slicing data based on those labels. Therefore, we
+        # track whether there are outlier labels and act accordingly when slicing.
+        self._outliers = 1 if -1 in set(labels) else 0
+
+        # Extract probabilities
+        probabilities = None
+        if hasattr(self.hdbscan_model, "probabilities_"):
+            probabilities = self.hdbscan_model.probabilities_
+
+            if self.calculate_probabilities and is_supported_hdbscan(self.hdbscan_model):
+                probabilities = hdbscan_delegator(self.hdbscan_model, "all_points_membership_vectors")
+
+        if not partial_fit:
+            self.topic_mapper_ = TopicMapper(self.topics_)
+        logger.info("Cluster - Completed \u2713")
+        return documents, probabilities
+
+    def _zeroshot_topic_modeling(self, documents: pd.DataFrame, embeddings: np.ndarray) -> Tuple[pd.DataFrame, np.array, pd.DataFrame, np.array]:
+        """ Find documents that could be assigned to one of the topics in self.zeroshot_topic_list
+
+        We transform the topics in `self.zeroshot_topic_list` to embeddings and
+        compare them through cosine similarity with the document embeddings.
+        If they pass the `self.zeroshot_min_similarity` threshold, they are assigned.
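+
+        For example, with the default `zeroshot_min_similarity=.7`, a document
+        whose highest cosine similarity to any of the zero-shot topic embeddings
+        is .65 is not assigned and is instead passed on to the clustering step.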
+ + Arguments: + documents: Dataframe with documents and their corresponding IDs + embeddings: The document embeddings + + Returns: + documents: The leftover documents that were not assigned to any topic + embeddings: The leftover embeddings that were not assigned to any topic + """ + logger.info("Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics") + # Similarity between document and zero-shot topic embeddings + zeroshot_embeddings = self._extract_embeddings(self.zeroshot_topic_list) + cosine_similarities = cosine_similarity(embeddings, zeroshot_embeddings) + assignment = np.argmax(cosine_similarities, 1) + assignment_vals = np.max(cosine_similarities, 1) + assigned_ids = [index for index, value in enumerate(assignment_vals) if value >= self.zeroshot_min_similarity] + non_assigned_ids = [index for index, value in enumerate(assignment_vals) if value < self.zeroshot_min_similarity] + + # Assign topics + assigned_documents = documents.iloc[assigned_ids] + assigned_documents["Topic"] = [topic for topic in assignment[assigned_ids]] + assigned_documents["Old_ID"] = assigned_documents["ID"].copy() + assigned_documents["ID"] = range(len(assigned_documents)) + assigned_embeddings = embeddings[assigned_ids] + + # Select non-assigned topics to be clustered + documents = documents.iloc[non_assigned_ids] + documents["Old_ID"] = documents["ID"].copy() + documents["ID"] = range(len(documents)) + embeddings = embeddings[non_assigned_ids] + + # If only matches were found + if len(non_assigned_ids) == 0: + return None, None, assigned_documents, assigned_embeddings + logger.info("Zeroshot Step 1 - Completed \u2713") + return documents, embeddings, assigned_documents, assigned_embeddings + + def _is_zeroshot(self): + """ Check whether zero-shot topic modeling is possible + + * There should be a cluster model used + * Embedding model is necessary to convert zero-shot topics to embeddings + * Zero-shot topics should be defined + """ + if self.zeroshot_topic_list is not None and self.embedding_model is not None and type(self.hdbscan_model) != BaseCluster: + return True + return False + + def _combine_zeroshot_topics(self, + documents: pd.DataFrame, + assigned_documents: pd.DataFrame, + embeddings: np.ndarray) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]: + """ Combine the zero-shot topics with the clustered topics + + There are three cases considered: + * Only zero-shot topics were found which will only return the zero-shot topic model + * Only clustered topics were found which will only return the clustered topic model + * Both zero-shot and clustered topics were found which will return a merged model + * This merged model is created using the `merge_models` function which will ignore + the underlying UMAP and HDBSCAN models + + Arguments: + documents: Dataframe with documents and their corresponding IDs + assigned_documents: Dataframe with documents and their corresponding IDs + that were assigned to a zero-shot topic + embeddings: The document embeddings + + Returns: + topics: The topics for each document + probabilities: The probabilities for each document + """ + logger.info("Zeroshot Step 2 - Clustering documents that were not found in the zero-shot model...") + + # Fit BERTopic without actually performing any clustering + docs = assigned_documents.Document.tolist() + y = assigned_documents.Topic.tolist() + empty_dimensionality_model = BaseDimensionalityReduction() + empty_cluster_model = BaseCluster() + zeroshot_model = BERTopic( + 
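+            # Reuse this model's settings and sub-models so the zero-shot documents
+            # are fitted with identical preprocessing and representation models,
+            # while dimensionality reduction and clustering are no-op base models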
n_gram_range=self.n_gram_range, + low_memory=self.low_memory, + calculate_probabilities=self.calculate_probabilities, + embedding_model=self.embedding_model, + umap_model=empty_dimensionality_model, + hdbscan_model=empty_cluster_model, + vectorizer_model=self.vectorizer_model, + ctfidf_model=self.ctfidf_model, + representation_model=self.representation_model, + verbose=self.verbose + ).fit(docs, embeddings=embeddings, y=y) + logger.info("Zeroshot Step 2 - Completed \u2713") + logger.info("Zeroshot Step 3 - Combining clustered topics with the zeroshot model") + + # Update model + self.umap_model = BaseDimensionalityReduction() + self.hdbscan_model = BaseCluster() + + # Update topic label + assigned_topics = assigned_documents.groupby("Topic").first().reset_index() + indices, topics = assigned_topics.ID.values, assigned_topics.Topic.values + labels = [zeroshot_model.topic_labels_[zeroshot_model.topics_[index]] for index in indices] + labels = {label: self.zeroshot_topic_list[topic] for label, topic in zip(labels, topics)} + + # If only zero-shot matches were found and clustering was not performed + if documents is None: + for topic in range(len(set(y))): + if zeroshot_model.topic_labels_.get(topic): + if labels.get(zeroshot_model.topic_labels_[topic]): + zeroshot_model.topic_labels_[topic] = labels[zeroshot_model.topic_labels_[topic]] + self.__dict__.clear() + self.__dict__.update(zeroshot_model.__dict__) + return self.topics_, self.probabilities_ + + # Merge the two topic models + merged_model = BERTopic.merge_models([zeroshot_model, self], min_similarity=1) + + # Update topic labels and representative docs of the zero-shot model + for topic in range(len(set(y))): + if merged_model.topic_labels_.get(topic): + if labels.get(merged_model.topic_labels_[topic]): + label = labels[merged_model.topic_labels_[topic]] + merged_model.topic_labels_[topic] = label + merged_model.representative_docs_[topic] = zeroshot_model.representative_docs_[topic] + + # Add representative docs of the clustered model + for topic in set(self.topics_): + merged_model.representative_docs_[topic + self._outliers + len(set(y))] = self.representative_docs_[topic] + + if self._outliers and merged_model.topic_sizes_.get(-1): + merged_model.topic_sizes_[len(set(y))] = merged_model.topic_sizes_[-1] + del merged_model.topic_sizes_[-1] + + # Update topic assignment by finding the documents with the + # correct updated topics + zeroshot_indices = list(assigned_documents.Old_ID.values) + zeroshot_topics = [self.zeroshot_topic_list[topic] for topic in assigned_documents.Topic.values] + + cluster_indices = list(documents.Old_ID.values) + cluster_names = list(merged_model.topic_labels_.values())[len(set(y)):] + cluster_topics = [cluster_names[topic + self._outliers] for topic in documents.Topic.values] + + df = pd.DataFrame({ + "Indices": zeroshot_indices + cluster_indices, + "Label": zeroshot_topics + cluster_topics} + ).sort_values("Indices") + reverse_topic_labels = dict((v, k) for k, v in merged_model.topic_labels_.items()) + df.Label = df.Label.map(reverse_topic_labels) + merged_model.topics_ = df.Label.values + + # Update the class internally + has_outliers = bool(self._outliers) + self.__dict__.clear() + self.__dict__.update(merged_model.__dict__) + logger.info("Zeroshot Step 3 - Completed \u2713") + + # Move -1 topic back to position 0 if it exists + if has_outliers: + nr_zeroshot_topics = len(set(y)) + + # Re-map the topics such that the -1 topic is at position 0 + new_mappings = {} + for topic in self.topics_: + if topic < 
nr_zeroshot_topics: + new_mappings[topic] = topic + elif topic == nr_zeroshot_topics: + new_mappings[topic] = -1 + else: + new_mappings[topic] = topic - 1 + + # Re-map the topics including all representations (labels, sizes, embeddings, etc.) + self.topics_ = [new_mappings[topic] for topic in self.topics_] + self.topic_representations_ = {new_mappings[topic]: repr for topic, repr in self.topic_representations_.items()} + self.topic_labels_ = {new_mappings[topic]: label for topic, label in self.topic_labels_.items()} + self.topic_sizes_ = collections.Counter(self.topics_) + self.topic_embeddings_ = np.vstack([ + self.topic_embeddings_[nr_zeroshot_topics], + self.topic_embeddings_[:nr_zeroshot_topics], + self.topic_embeddings_[nr_zeroshot_topics+1:] + ]) + self._outliers = 1 + + return self.topics_ + + def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.array]: + """ Apply Guided Topic Modeling + + We transform the seeded topics to embeddings using the + same embedder as used for generating document embeddings. + + Then, we apply cosine similarity between the embeddings + and set labels for documents that are more similar to + one of the topics than the average document. + + If a document is more similar to the average document + than any of the topics, it gets the -1 label and is + thereby not included in UMAP. + + Arguments: + embeddings: The document embeddings + + Returns + y: The labels for each seeded topic + embeddings: Updated embeddings + """ + logger.info("Guided - Find embeddings highly related to seeded topics.") + # Create embeddings from the seeded topics + seed_topic_list = [" ".join(seed_topic) for seed_topic in self.seed_topic_list] + seed_topic_embeddings = self._extract_embeddings(seed_topic_list, verbose=self.verbose) + seed_topic_embeddings = np.vstack([seed_topic_embeddings, embeddings.mean(axis=0)]) + + # Label documents that are most similar to one of the seeded topics + sim_matrix = cosine_similarity(embeddings, seed_topic_embeddings) + y = [np.argmax(sim_matrix[index]) for index in range(sim_matrix.shape[0])] + y = [val if val != len(seed_topic_list) else -1 for val in y] + + # Average the document embeddings related to the seeded topics with the + # embedding of the seeded topic to force the documents in a cluster + for seed_topic in range(len(seed_topic_list)): + indices = [index for index, topic in enumerate(y) if topic == seed_topic] + embeddings[indices] = np.average([embeddings[indices], seed_topic_embeddings[seed_topic]], weights=[3, 1]) + logger.info("Guided - Completed \u2713") + return y, embeddings + + def _extract_topics(self, documents: pd.DataFrame, embeddings: np.ndarray = None, mappings=None, verbose: bool = False): + """ Extract topics from the clusters using a class-based TF-IDF + + Arguments: + documents: Dataframe with documents and their corresponding IDs + embeddings: The document embeddings + mappings: The mappings from topic to word + verbose: Whether to log the process of extracting topics + + Returns: + c_tf_idf: The resulting matrix giving a value (importance score) for each word per topic + """ + if verbose: + logger.info("Representation - Extracting topics from clusters using representation models.") + documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) + self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic) + self.topic_representations_ = self._extract_words_per_topic(words, documents) + self._create_topic_vectors(documents=documents, embeddings=embeddings, 
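+                                   # when topics were merged, `mappings` makes the new topic
+                                   # embeddings a size-weighted average of the merged topics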
+                                   mappings=mappings)
+        self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]])
+                              for key, values in
+                              self.topic_representations_.items()}
+        if verbose:
+            logger.info("Representation - Completed \u2713")
+
+    def _save_representative_docs(self, documents: pd.DataFrame):
+        """ Save the 3 most representative docs per topic
+
+        Arguments:
+            documents: Dataframe with documents and their corresponding IDs
+
+        Updates:
+            self.representative_docs_: Populate each topic with 3 representative docs
+        """
+        repr_docs, _, _, _ = self._extract_representative_docs(
+            self.c_tf_idf_,
+            documents,
+            self.topic_representations_,
+            nr_samples=500,
+            nr_repr_docs=3
+        )
+        self.representative_docs_ = repr_docs
+
+    def _extract_representative_docs(self,
+                                     c_tf_idf: csr_matrix,
+                                     documents: pd.DataFrame,
+                                     topics: Mapping[str, List[Tuple[str, float]]],
+                                     nr_samples: int = 500,
+                                     nr_repr_docs: int = 5,
+                                     diversity: float = None
+                                     ) -> Tuple[Mapping[str, List[str]], List[str], List[List[int]], List[List[int]]]:
+        """ Approximate most representative documents per topic by sampling
+        a subset of the documents in each topic and calculating which are
+        most representative of their topic based on the cosine similarity between
+        c-TF-IDF representations.
+
+        Arguments:
+            c_tf_idf: The topic c-TF-IDF representation
+            documents: All input documents
+            topics: The candidate topics as calculated with c-TF-IDF
+            nr_samples: The number of candidate documents to extract per topic
+            nr_repr_docs: The number of representative documents to extract per topic
+            diversity: The diversity between the most representative documents.
+                       If None, no MMR is used. Otherwise, accepts values between 0 and 1.
+
+        Returns:
+            repr_docs_mappings: A dictionary from topic to representative documents
+            repr_docs: A flat list of representative documents
+            repr_docs_indices: Ordered indices of representative documents
+                               that belong to each topic
+            repr_docs_ids: The indices of representative documents
+                           that belong to each topic
+        """
+        # Sample documents per topic
+        documents_per_topic = (
+            documents.drop("Image", axis=1, errors="ignore")
+                     .groupby('Topic')
+                     .sample(n=nr_samples, replace=True, random_state=42)
+                     .drop_duplicates()
+        )
+
+        # Find and extract documents that are most similar to the topic
+        repr_docs = []
+        repr_docs_indices = []
+        repr_docs_mappings = {}
+        repr_docs_ids = []
+        labels = sorted(list(topics.keys()))
+        for index, topic in enumerate(labels):
+
+            # Slice data
+            selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :]
+            selected_docs = selection["Document"].values
+            selected_docs_ids = selection.index.tolist()
+
+            # Calculate similarity
+            nr_docs = nr_repr_docs if len(selected_docs) > nr_repr_docs else len(selected_docs)
+            bow = self.vectorizer_model.transform(selected_docs)
+            ctfidf = self.ctfidf_model.transform(bow)
+            sim_matrix = cosine_similarity(ctfidf, c_tf_idf[index])
+
+            # Use MMR to find representative but diverse documents
+            if diversity:
+                docs = mmr(c_tf_idf[index], ctfidf, selected_docs, top_n=nr_docs, diversity=diversity)
+
+            # Extract top n most representative documents
+            else:
+                indices = np.argpartition(sim_matrix.reshape(1, -1)[0], -nr_docs)[-nr_docs:]
+                docs = [selected_docs[index] for index in indices]
+
+            doc_ids = [selected_docs_ids[index] for index, doc in enumerate(selected_docs) if doc in docs]
+            repr_docs_ids.append(doc_ids)
+            repr_docs.extend(docs)
+            repr_docs_indices.append([repr_docs_indices[-1][-1] + i + 1 if index != 0 else i for i in range(nr_docs)])
+        repr_docs_mappings = {topic:
repr_docs[i[0]:i[-1]+1] for topic, i in zip(topics.keys(), repr_docs_indices)} + + return repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids + + def _create_topic_vectors(self, documents: pd.DataFrame = None, embeddings: np.ndarray = None, mappings=None): + """ Creates embeddings per topics based on their topic representation + + As a default, topic vectors (topic embeddings) are created by taking + the average of all document embeddings within a topic. If topics are + merged, then a weighted average of topic embeddings is taken based on + the initial topic sizes. + + For the `.partial_fit` and `.update_topics` method, the average + of all document embeddings is not taken since those are not known. + Instead, the weighted average of the embeddings of the top n words + is taken for each topic. The weighting is done based on the c-TF-IDF + score. This will put more emphasis to words that represent a topic best. + """ + # Topic embeddings based on input embeddings + if embeddings is not None and documents is not None: + topic_embeddings = [] + topics = documents.sort_values("Topic").Topic.unique() + for topic in topics: + indices = documents.loc[documents.Topic == topic, "ID"].values + indices = [int(index) for index in indices] + topic_embedding = np.mean(embeddings[indices], axis=0) + topic_embeddings.append(topic_embedding) + self.topic_embeddings_ = np.array(topic_embeddings) + + # Topic embeddings when merging topics + elif self.topic_embeddings_ is not None and mappings is not None: + topic_embeddings_dict = {} + for topic_from, topics_to in mappings.items(): + topic_ids = topics_to["topics_to"] + topic_sizes = topics_to["topic_sizes"] + if topic_ids: + embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers] + topic_embedding = np.average(embds, axis=0, weights=topic_sizes) + topic_embeddings_dict[topic_from] = topic_embedding + + # Re-order topic embeddings + topics_to_map = {topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]} + topic_embeddings = {} + for topic, embds in topic_embeddings_dict.items(): + topic_embeddings[topics_to_map[topic]] = embds + unique_topics = sorted(list(topic_embeddings.keys())) + self.topic_embeddings_ = np.array([topic_embeddings[topic] for topic in unique_topics]) + + # Topic embeddings based on keyword representations + elif self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder: + topic_list = list(self.topic_representations_.keys()) + topic_list.sort() + + # Only extract top n words + n = len(self.topic_representations_[topic_list[0]]) + if self.top_n_words < n: + n = self.top_n_words + + # Extract embeddings for all words in all topics + topic_words = [self.get_topic(topic) for topic in topic_list] + topic_words = [word[0] for topic in topic_words for word in topic] + word_embeddings = self._extract_embeddings( + topic_words, + method="word", + verbose=False + ) + + # Take the weighted average of word embeddings in a topic based on their c-TF-IDF value + # The embeddings var is a single numpy matrix and therefore slicing is necessary to + # access the words per topic + topic_embeddings = [] + for i, topic in enumerate(topic_list): + word_importance = [val[1] for val in self.get_topic(topic)] + if sum(word_importance) == 0: + word_importance = [1 for _ in range(len(self.get_topic(topic)))] + topic_embedding = np.average(word_embeddings[i * n: n + (i * n)], weights=word_importance, axis=0) + topic_embeddings.append(topic_embedding) + + 
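+            # Stack the per-topic weighted averages into a single
+            # (n_topics, embedding_dim) matrix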
+            self.topic_embeddings_ = np.array(topic_embeddings)
+
+    def _c_tf_idf(self,
+                  documents_per_topic: pd.DataFrame,
+                  fit: bool = True,
+                  partial_fit: bool = False) -> Tuple[csr_matrix, List[str]]:
+        """ Calculate a class-based TF-IDF
+
+        Arguments:
+            documents_per_topic: The joined documents per topic such that each topic has a single
+                                 string made out of multiple documents
+            fit: Whether to fit a new vectorizer or use the fitted self.vectorizer_model
+            partial_fit: Whether to run `partial_fit` for online learning
+
+        Returns:
+            tf_idf: The resulting matrix giving a value (importance score) for each word per topic
+            words: The names of the words to which values were given
+        """
+        documents = self._preprocess_text(documents_per_topic.Document.values)
+
+        if partial_fit:
+            X = self.vectorizer_model.partial_fit(documents).update_bow(documents)
+        elif fit:
+            self.vectorizer_model.fit(documents)
+            X = self.vectorizer_model.transform(documents)
+        else:
+            X = self.vectorizer_model.transform(documents)
+
+        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
+        # and will be removed in 1.2. Please use get_feature_names_out instead.
+        if version.parse(sklearn_version) >= version.parse("1.0.0"):
+            words = self.vectorizer_model.get_feature_names_out()
+        else:
+            words = self.vectorizer_model.get_feature_names()
+
+        multiplier = None
+        if self.ctfidf_model.seed_words and self.seed_topic_list:
+            seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
+            multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])
+            multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)])
+        elif self.ctfidf_model.seed_words:
+            multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])
+        elif self.seed_topic_list:
+            seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]
+            multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words])
+
+        if fit:
+            self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)
+
+        c_tf_idf = self.ctfidf_model.transform(X)
+
+        return c_tf_idf, words
+
+    def _update_topic_size(self, documents: pd.DataFrame):
+        """ Calculate the topic sizes
+
+        Arguments:
+            documents: Updated dataframe with documents and their corresponding IDs and newly added Topics
+        """
+        self.topic_sizes_ = collections.Counter(documents.Topic.values.tolist())
+        self.topics_ = documents.Topic.astype(int).tolist()
+
+    def _extract_words_per_topic(self,
+                                 words: List[str],
+                                 documents: pd.DataFrame,
+                                 c_tf_idf: csr_matrix = None,
+                                 calculate_aspects: bool = True) -> Mapping[str, List[Tuple[str, float]]]:
+        """ Based on tf_idf scores per topic, extract the top n words per topic
+
+        If the top words per topic need to be extracted, then only the `words` parameter
+        needs to be passed. If the top words per topic need to be extracted for a specific
+        timestamp, it is important to pass the timestamp-based c-TF-IDF matrix and its
+        corresponding labels.
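+
+        For example, the returned mapping has the form
+        `{topic_id: [(word, c-TF-IDF score), ...]}`, e.g.
+        `{0: [("space", .021), ("nasa", .018)], 1: [("drive", .019), ("scsi", .015)]}`
+        (values purely illustrative).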
+ + Arguments: + words: List of all words (sorted according to tf_idf matrix position) + documents: DataFrame with documents and their topic IDs + c_tf_idf: A c-TF-IDF matrix from which to calculate the top words + + Returns: + topics: The top words per topic + """ + if c_tf_idf is None: + c_tf_idf = self.c_tf_idf_ + + labels = sorted(list(documents.Topic.unique())) + labels = [int(label) for label in labels] + + # Get at least the top 30 indices and values per row in a sparse c-TF-IDF matrix + top_n_words = max(self.top_n_words, 30) + indices = self._top_n_idx_sparse(c_tf_idf, top_n_words) + scores = self._top_n_values_sparse(c_tf_idf, indices) + sorted_indices = np.argsort(scores, 1) + indices = np.take_along_axis(indices, sorted_indices, axis=1) + scores = np.take_along_axis(scores, sorted_indices, axis=1) + + # Get top 30 words per topic based on c-TF-IDF score + topics = {label: [(words[word_index], score) + if word_index is not None and score > 0 + else ("", 0.00001) + for word_index, score in zip(indices[index][::-1], scores[index][::-1]) + ] + for index, label in enumerate(labels)} + + # Fine-tune the topic representations + if isinstance(self.representation_model, list): + for tuner in self.representation_model: + topics = tuner.extract_topics(self, documents, c_tf_idf, topics) + elif isinstance(self.representation_model, BaseRepresentation): + topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics) + elif isinstance(self.representation_model, dict): + if self.representation_model.get("Main"): + topics = self.representation_model["Main"].extract_topics(self, documents, c_tf_idf, topics) + topics = {label: values[:self.top_n_words] for label, values in topics.items()} + + # Extract additional topic aspects + if calculate_aspects and isinstance(self.representation_model, dict): + for aspect, aspect_model in self.representation_model.items(): + aspects = topics.copy() + if aspect != "Main": + if isinstance(aspect_model, list): + for tuner in aspect_model: + aspects = tuner.extract_topics(self, documents, c_tf_idf, aspects) + self.topic_aspects_[aspect] = aspects + elif isinstance(aspect_model, BaseRepresentation): + self.topic_aspects_[aspect] = aspect_model.extract_topics(self, documents, c_tf_idf, aspects) + + return topics + + def _reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame: + """ Reduce topics to self.nr_topics + + Arguments: + documents: Dataframe with documents and their corresponding IDs and Topics + + Returns: + documents: Updated dataframe with documents and the reduced number of Topics + """ + logger.info("Topic reduction - Reducing number of topics") + initial_nr_topics = len(self.get_topics()) + + if isinstance(self.nr_topics, int): + if self.nr_topics < initial_nr_topics: + documents = self._reduce_to_n_topics(documents) + elif isinstance(self.nr_topics, str): + documents = self._auto_reduce_topics(documents) + else: + raise ValueError("nr_topics needs to be an int or 'auto'! 
") + + logger.info(f"Topic reduction - Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}") + return documents + + def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame: + """ Reduce topics to self.nr_topics + + Arguments: + documents: Dataframe with documents and their corresponding IDs and Topics + + Returns: + documents: Updated dataframe with documents and the reduced number of Topics + """ + topics = documents.Topic.tolist().copy() + + # Create topic distance matrix + if self.topic_embeddings_ is not None: + topic_embeddings = self.topic_embeddings_[self._outliers:, ] + else: + topic_embeddings = self.c_tf_idf_[self._outliers:, ].toarray() + distance_matrix = 1-cosine_similarity(topic_embeddings) + np.fill_diagonal(distance_matrix, 0) + + # Cluster the topic embeddings using AgglomerativeClustering + if version.parse(sklearn_version) >= version.parse("1.4.0"): + cluster = AgglomerativeClustering(self.nr_topics - self._outliers, metric="precomputed", linkage="average") + else: + cluster = AgglomerativeClustering(self.nr_topics - self._outliers, affinity="precomputed", linkage="average") + cluster.fit(distance_matrix) + new_topics = [cluster.labels_[topic] if topic != -1 else -1 for topic in topics] + + # Track mappings and sizes of topics for merging topic embeddings + mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, new_topics)} + mappings = defaultdict(list) + for key, val in sorted(mapped_topics.items()): + mappings[val].append(key) + mappings = {topic_from: + {"topics_to": topics_to, + "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]} + for topic_from, topics_to in mappings.items()} + + # Map topics + documents.Topic = new_topics + self._update_topic_size(documents) + self.topic_mapper_.add_mappings(mapped_topics) + + # Update representations + documents = self._sort_mappings_by_frequency(documents) + self._extract_topics(documents, mappings=mappings) + self._update_topic_size(documents) + return documents + + def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame: + """ Reduce the number of topics automatically using HDBSCAN + + Arguments: + documents: Dataframe with documents and their corresponding IDs and Topics + + Returns: + documents: Updated dataframe with documents and the reduced number of Topics + """ + topics = documents.Topic.tolist().copy() + unique_topics = sorted(list(documents.Topic.unique()))[self._outliers:] + max_topic = unique_topics[-1] + + # Find similar topics + if self.topic_embeddings_ is not None: + embeddings = np.array(self.topic_embeddings_) + else: + embeddings = self.c_tf_idf_.toarray() + norm_data = normalize(embeddings, norm='l2') + predictions = hdbscan.HDBSCAN(min_cluster_size=2, + metric='euclidean', + cluster_selection_method='eom', + prediction_data=True).fit_predict(norm_data[self._outliers:]) + + # Map similar topics + mapped_topics = {unique_topics[index]: prediction + max_topic + for index, prediction in enumerate(predictions) + if prediction != -1} + documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int) + mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())} + + # Track mappings and sizes of topics for merging topic embeddings + mappings = defaultdict(list) + for key, val in sorted(mapped_topics.items()): + mappings[val].append(key) + mappings = {topic_from: + {"topics_to": topics_to, + "topic_sizes": [self.topic_sizes_[topic] for topic in 
topics_to]} + for topic_from, topics_to in mappings.items()} + + # Update documents and topics + self.topic_mapper_.add_mappings(mapped_topics) + documents = self._sort_mappings_by_frequency(documents) + self._extract_topics(documents, mappings=mappings) + self._update_topic_size(documents) + return documents + + def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame: + """ Reorder mappings by their frequency. + + For example, if topic 88 was mapped to topic + 5 and topic 5 turns out to be the largest topic, + then topic 5 will be topic 0. The second largest + will be topic 1, etc. + + If there are no mappings since no reduction of topics + took place, then the topics will simply be ordered + by their frequency and will get the topic ids based + on that order. + + This means that -1 will remain the outlier class, and + that the rest of the topics will be in descending order + of ids and frequency. + + Arguments: + documents: Dataframe with documents and their corresponding IDs and Topics + + Returns: + documents: Updated dataframe with documents and the mapped + and re-ordered topic ids + """ + self._update_topic_size(documents) + + # Map topics based on frequency + df = pd.DataFrame(self.topic_sizes_.items(), columns=["Old_Topic", "Size"]).sort_values("Size", ascending=False) + df = df[df.Old_Topic != -1] + sorted_topics = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))} + self.topic_mapper_.add_mappings(sorted_topics) + + # Map documents + documents.Topic = documents.Topic.map(sorted_topics).fillna(documents.Topic).astype(int) + self._update_topic_size(documents) + return documents + + def _map_probabilities(self, + probabilities: Union[np.ndarray, None], + original_topics: bool = False) -> Union[np.ndarray, None]: + """ Map the probabilities to the reduced topics. + This is achieved by adding together the probabilities + of all topics that are mapped to the same topic. Then, + the topics that were mapped from are set to 0 as they + were reduced. + + Arguments: + probabilities: An array containing probabilities + original_topics: Whether we want to map from the + original topics to the most recent topics + or from the second-most recent topics. 
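+
+        For example, if original topics 2 and 3 were both mapped to topic 1,
+        their probability columns are added together into the single column
+        of topic 1 in the mapped probability matrix.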
+ + Returns: + mapped_probabilities: Updated probabilities + """ + mappings = self.topic_mapper_.get_mappings(original_topics) + + # Map array of probabilities (probability for assigned topic per document) + if probabilities is not None: + if len(probabilities.shape) == 2: + mapped_probabilities = np.zeros((probabilities.shape[0], + len(set(mappings.values())) - self._outliers)) + for from_topic, to_topic in mappings.items(): + if to_topic != -1 and from_topic != -1: + mapped_probabilities[:, to_topic] += probabilities[:, from_topic] + + return mapped_probabilities + + return probabilities + + def _preprocess_text(self, documents: np.ndarray) -> List[str]: + """ Basic preprocessing of text + + Steps: + * Replace \n and \t with whitespace + * Only keep alpha-numerical characters + """ + cleaned_documents = [doc.replace("\n", " ") for doc in documents] + cleaned_documents = [doc.replace("\t", " ") for doc in cleaned_documents] + if self.language == "english": + cleaned_documents = [re.sub(r'[^A-Za-z0-9 ]+', '', doc) for doc in cleaned_documents] + cleaned_documents = [doc if doc != "" else "emptydoc" for doc in cleaned_documents] + return cleaned_documents + + @staticmethod + def _top_n_idx_sparse(matrix: csr_matrix, n: int) -> np.ndarray: + """ Return indices of top n values in each row of a sparse matrix + + Retrieved from: + https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix + + Arguments: + matrix: The sparse matrix from which to get the top n indices per row + n: The number of highest values to extract from each row + + Returns: + indices: The top n indices per row + """ + indices = [] + for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]): + n_row_pick = min(n, ri - le) + values = matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]] + values = [values[index] if len(values) >= index + 1 else None for index in range(n)] + indices.append(values) + return np.array(indices) + + @staticmethod + def _top_n_values_sparse(matrix: csr_matrix, indices: np.ndarray) -> np.ndarray: + """ Return the top n values for each row in a sparse matrix + + Arguments: + matrix: The sparse matrix from which to get the top n indices per row + indices: The top n indices per row + + Returns: + top_values: The top n scores per row + """ + top_values = [] + for row, values in enumerate(indices): + scores = np.array([matrix[row, value] if value is not None else 0 for value in values]) + top_values.append(scores) + return np.array(top_values) + + @classmethod + def _get_param_names(cls): + """Get parameter names for the estimator + + Adapted from: + https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178 + """ + init_signature = inspect.signature(cls.__init__) + parameters = sorted([p.name for p in init_signature.parameters.values() + if p.name != 'self' and p.kind != p.VAR_KEYWORD]) + return parameters + + def __str__(self): + """Get a string representation of the current object. + + Returns: + str: Human readable representation of the most important model parameters. + The parameters that represent models are ignored due to their length. 
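+
+        Examples:
+
+        An abbreviated sketch of the resulting string (the exact parameters
+        depend on how the model was configured):
+
+        ```python
+        topic_model = BERTopic()
+        print(topic_model)
+        # BERTopic(calculate_probabilities=False, language='english', ...)
+        ```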
+ """ + parameters = "" + for parameter, value in self.get_params().items(): + value = str(value) + if "(" in value and value[0] != "(": + value = value.split("(")[0] + "(...)" + parameters += f"{parameter}={value}, " + + return f"BERTopic({parameters[:-2]})" + + +class TopicMapper: + """ Keep track of Topic Mappings + + The number of topics can be reduced + by merging them together. This mapping + needs to be tracked in BERTopic as new + predictions need to be mapped to the new + topics. + + These mappings are tracked in the `self.mappings_` + attribute where each set of topic is stacked horizontally. + For example, the most recent topics can be found in the + last column. To get a mapping, simply take two columns + of topics. + + In other words, it is represented as graph: + Topic 1 --> Topic 11 --> Topic 4 --> etc. + + Attributes: + self.mappings_ (np.ndarray) : A matrix indicating the mappings from one topic + to another. The columns represent a collection of topics + at any time. The last column represents the current state + of topics and the first column represents the initial state + of topics. + """ + def __init__(self, topics: List[int]): + """ Initialization of Topic Mapper + + Arguments: + topics: A list of topics per document + """ + base_topics = np.array(sorted(set(topics))) + topics = base_topics.copy().reshape(-1, 1) + self.mappings_ = np.hstack([topics.copy(), topics.copy()]).tolist() + + def get_mappings(self, original_topics: bool = True) -> Mapping[int, int]: + """ Get mappings from either the original topics or + the second-most recent topics to the current topics + + Arguments: + original_topics: Whether we want to map from the + original topics to the most recent topics + or from the second-most recent topics. + + Returns: + mappings: The mappings from old topics to new topics + + Examples: + + To get mappings, simply call: + ```python + mapper = TopicMapper(topics) + mappings = mapper.get_mappings(original_topics=False) + ``` + """ + if original_topics: + mappings = np.array(self.mappings_)[:, [0, -1]] + mappings = dict(zip(mappings[:, 0], mappings[:, 1])) + else: + mappings = np.array(self.mappings_)[:, [-3, -1]] + mappings = dict(zip(mappings[:, 0], mappings[:, 1])) + return mappings + + def add_mappings(self, mappings: Mapping[int, int]): + """ Add new column(s) of topic mappings + + Arguments: + mappings: The mappings to add + """ + for topics in self.mappings_: + topic = topics[-1] + if topic in mappings: + topics.append(mappings[topic]) + else: + topics.append(-1) + + def add_new_topics(self, mappings: Mapping[int, int]): + """ Add new row(s) of topic mappings + + Arguments: + mappings: The mappings to add + """ + length = len(self.mappings_[0]) + for key, value in mappings.items(): + to_append = [key] + ([None] * (length-2)) + [value] + self.mappings_.append(to_append) + + +def _create_model_from_files( + topics: Mapping[str, Any], + params: Mapping[str, Any], + tensors: Mapping[str, np.array], + ctfidf_tensors: Mapping[str, Any] = None, + ctfidf_config: Mapping[str, Any] = None, + images: Mapping[int, Any] = None, + warn_no_backend: bool = True): + """ Create a BERTopic model from a variety of inputs + + Arguments: + topics: A dictionary containing topic metadata, including: + - Topic representations, labels, sizes, custom labels, etc. + params: BERTopic-specific hyperparams, including HF embedding_model ID + if given. 
+        tensors: The topic embeddings
+        ctfidf_tensors: The c-TF-IDF representations
+        ctfidf_config: The config for CountVectorizer and c-TF-IDF
+        images: The images per topic
+        warn_no_backend: Whether to warn the user if no backend is given
+    """
+    from sentence_transformers import SentenceTransformer
+    params["n_gram_range"] = tuple(params["n_gram_range"])
+
+    if ctfidf_config is not None:
+        ngram_range = ctfidf_config["vectorizer_model"]["params"]["ngram_range"]
+        ctfidf_config["vectorizer_model"]["params"]["ngram_range"] = tuple(ngram_range)
+
+    # Select HF model through SentenceTransformers
+    try:
+        embedding_model = select_backend(SentenceTransformer(params['embedding_model']))
+    except Exception:
+        embedding_model = BaseEmbedder()
+
+        if warn_no_backend:
+            logger.warning("You are loading a BERTopic model without explicitly defining an embedding model."
+                           " If you want to also load in an embedding model, make sure to use"
+                           " `BERTopic.load(my_model, embedding_model=my_embedding_model)`.")
+
+    if params.get("embedding_model") is not None:
+        del params['embedding_model']
+
+    # Prepare our empty sub-models
+    empty_dimensionality_model = BaseDimensionalityReduction()
+    empty_cluster_model = BaseCluster()
+
+    # Fit BERTopic without actually performing any clustering
+    topic_model = BERTopic(
+        embedding_model=embedding_model,
+        umap_model=empty_dimensionality_model,
+        hdbscan_model=empty_cluster_model,
+        **params
+    )
+    topic_model.topic_embeddings_ = tensors["topic_embeddings"].numpy()
+    topic_model.topic_representations_ = {int(key): val for key, val in topics["topic_representations"].items()}
+    topic_model.topics_ = topics["topics"]
+    topic_model.topic_sizes_ = {int(key): val for key, val in topics["topic_sizes"].items()}
+    topic_model.topic_labels_ = {int(key): val for key, val in topics["topic_labels"].items()}
+    topic_model.custom_labels_ = topics["custom_labels"]
+    topic_model._outliers = topics["_outliers"]
+
+    if topics.get("topic_aspects"):
+        topic_aspects = {}
+        for aspect, values in topics["topic_aspects"].items():
+            if aspect != "Visual_Aspect":
+                topic_aspects[aspect] = {int(topic): value for topic, value in values.items()}
+        topic_model.topic_aspects_ = topic_aspects
+
+        if images is not None:
+            topic_model.topic_aspects_["Visual_Aspect"] = images
+
+    # Topic Mapper
+    topic_model.topic_mapper_ = TopicMapper([0])
+    topic_model.topic_mapper_.mappings_ = topics["topic_mapper"]
+
+    if ctfidf_tensors is not None:
+        topic_model.c_tf_idf_ = csr_matrix((ctfidf_tensors["data"], ctfidf_tensors["indices"], ctfidf_tensors["indptr"]), shape=ctfidf_tensors["shape"])
+
+        # CountVectorizer
+        topic_model.vectorizer_model = CountVectorizer(**ctfidf_config["vectorizer_model"]["params"])
+        topic_model.vectorizer_model.vocabulary_ = ctfidf_config["vectorizer_model"]["vocab"]
+
+        # ClassTfidfTransformer
+        topic_model.ctfidf_model.reduce_frequent_words = ctfidf_config["ctfidf_model"]["reduce_frequent_words"]
+        topic_model.ctfidf_model.bm25_weighting = ctfidf_config["ctfidf_model"]["bm25_weighting"]
+        idf = ctfidf_tensors["diag"].numpy()
+        topic_model.ctfidf_model._idf_diag = sp.diags(idf, offsets=0, shape=(len(idf), len(idf)), format='csr', dtype=np.float64)
+    return topic_model