| import numpy as np |
| import pandas as pd |
|
|
| from packaging import version |
| from scipy.sparse import csr_matrix |
| from typing import Mapping, List, Tuple, Union |
| from sklearn.metrics.pairwise import cosine_similarity |
| from bertopic.representation._base import BaseRepresentation |
| from sklearn import __version__ as sklearn_version |
|
|
|
|
class KeyBERTInspired(BaseRepresentation):
    def __init__(self,
                 top_n_words: int = 10,
                 nr_repr_docs: int = 5,
                 nr_samples: int = 500,
                 nr_candidate_words: int = 100,
                 random_state: int = 42):
        """ Use a KeyBERT-like model to fine-tune the topic representations

        The algorithm follows KeyBERT but does some optimization in
        order to speed up inference.

        The steps are as follows. First, we extract the top n representative
        documents per topic. To extract the representative documents, we
        randomly sample a number of candidate documents per cluster
        which is controlled by the `nr_samples` parameter. Then,
        the top n representative documents are extracted by calculating
        the c-TF-IDF representation for the candidate documents and finding,
        through cosine similarity, which are closest to the topic c-TF-IDF
        representation. The number of representative documents is controlled
        by the `nr_repr_docs` parameter. Next, the candidate words per topic
        are extracted based on their c-TF-IDF representation, which is
        controlled by the `nr_candidate_words` parameter.

        Then, we extract the embeddings for words and representative documents
        and create topic embeddings by averaging the representative documents.
        Finally, the most similar words to each topic are extracted by
        calculating the cosine similarity between word and topic embeddings.

        Arguments:
            top_n_words: The top n words to extract per topic.
            nr_repr_docs: The number of representative documents to extract per cluster.
            nr_samples: The number of candidate documents to extract per cluster.
            nr_candidate_words: The number of candidate words per cluster.
            random_state: The random state for randomly sampling candidate documents.

        Usage:

        ```python
        from bertopic.representation import KeyBERTInspired
        from bertopic import BERTopic

        # Create your representation model
        representation_model = KeyBERTInspired()

        # Use the representation model in BERTopic on top of the default pipeline
        topic_model = BERTopic(representation_model=representation_model)
        ```
        """
        self.top_n_words = top_n_words
        self.nr_repr_docs = nr_repr_docs
        self.nr_samples = nr_samples
        self.nr_candidate_words = nr_candidate_words
        # NOTE(review): random_state is stored but not referenced within this
        # class — presumably consumed by the sampling done inside the topic
        # model; confirm against `_extract_representative_docs`.
        self.random_state = random_state

    def extract_topics(self,
                       topic_model,
                       documents: pd.DataFrame,
                       c_tf_idf: csr_matrix,
                       topics: Mapping[str, List[Tuple[str, float]]]
                       ) -> Mapping[str, List[Tuple[str, float]]]:
        """ Extract topics

        Arguments:
            topic_model: A BERTopic model
            documents: All input documents
            c_tf_idf: The topic c-TF-IDF representation
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations
        """
        # We extract the top n representative documents per topic
        _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs)

        # We extract the top n candidate words per topic
        topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)

        # We calculate the similarity between word and document embeddings and create
        # topic embeddings from the representative document embeddings
        sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices)

        # Find the best matching words based on the similarity matrix for each topic
        updated_topics = self._extract_top_words(words, topics, sim_matrix)

        return updated_topics

    def _extract_candidate_words(self,
                                 topic_model,
                                 c_tf_idf: csr_matrix,
                                 topics: Mapping[str, List[Tuple[str, float]]]
                                 ) -> Mapping[str, List[Tuple[str, float]]]:
        """ For each topic, extract candidate words based on the c-TF-IDF
        representation.

        Arguments:
            topic_model: A BERTopic model
            c_tf_idf: The topic c-TF-IDF representation
            topics: The top words per topic

        Returns:
            topics: The `self.nr_candidate_words` per topic
        """
        labels = [int(label) for label in sorted(list(topics.keys()))]

        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
        # and removed in 1.2; use get_feature_names_out when available
        if version.parse(sklearn_version) >= version.parse("1.0.0"):
            words = topic_model.vectorizer_model.get_feature_names_out()
        else:
            words = topic_model.vectorizer_model.get_feature_names()

        # Scores of the top `nr_candidate_words` words per topic,
        # sorted ascending per row so that [::-1] below yields descending order
        indices = topic_model._top_n_idx_sparse(c_tf_idf, self.nr_candidate_words)
        scores = topic_model._top_n_values_sparse(c_tf_idf, indices)
        sorted_indices = np.argsort(scores, 1)
        indices = np.take_along_axis(indices, sorted_indices, axis=1)
        scores = np.take_along_axis(scores, sorted_indices, axis=1)

        # Get top `nr_candidate_words` words per topic based on c-TF-IDF score;
        # padded/zero-score entries become a "" placeholder with a tiny score
        topics = {label: [(words[word_index], score)
                          if word_index is not None and score > 0
                          else ("", 0.00001)
                          for word_index, score in zip(indices[index][::-1], scores[index][::-1])
                          ]
                  for index, label in enumerate(labels)}
        topics = {label: list(zip(*values[:self.nr_candidate_words]))[0] for label, values in topics.items()}

        return topics

    def _extract_embeddings(self,
                            topic_model,
                            topics: Mapping[str, List[Tuple[str, float]]],
                            representative_docs: List[str],
                            repr_doc_indices: List[List[int]]
                            ) -> Tuple[np.ndarray, List[str]]:
        """ Extract the representative document embeddings and create topic embeddings.
        Then extract word embeddings and calculate the cosine similarity between topic
        embeddings and the word embeddings. Topic embeddings are the average of
        representative document embeddings.

        Arguments:
            topic_model: A BERTopic model
            topics: The top words per topic
            representative_docs: A flat list of representative documents
            repr_doc_indices: The indices of representative documents
                              that belong to each topic

        Returns:
            sim: The similarity matrix between word and topic embeddings
            vocab: The complete vocabulary of input documents
        """
        # Calculate representative document embeddings and create topic
        # embeddings by averaging the representative documents of each topic
        repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False)
        topic_embeddings = [np.mean(repr_embeddings[i[0]:i[-1]+1], axis=0) for i in repr_doc_indices]

        # Calculate word embeddings and extract the best matching words
        # per topic based on the similarity matrix between topics and words
        vocab = list(set([word for words in topics.values() for word in words]))
        word_embeddings = topic_model._extract_embeddings(vocab, method="document", verbose=False)
        sim = cosine_similarity(topic_embeddings, word_embeddings)

        return sim, vocab

    def _extract_top_words(self,
                           vocab: List[str],
                           topics: Mapping[str, List[Tuple[str, float]]],
                           sim: np.ndarray
                           ) -> Mapping[str, List[Tuple[str, float]]]:
        """ Extract the top n words per topic based on the
        similarity matrix between topics and words.

        Arguments:
            vocab: The complete vocabulary of input documents
            topics: The top words per topic
            sim: The similarity matrix between word and topic embeddings

        Returns:
            updated_topics: The updated topic representations
        """
        labels = [int(label) for label in sorted(list(topics.keys()))]
        # vocab is built from a set, so words are unique; a dict gives O(1)
        # lookups instead of repeated O(|vocab|) list.index calls
        word_to_index = {word: index for index, word in enumerate(vocab)}
        updated_topics = {}
        for i, topic in enumerate(labels):
            indices = [word_to_index[word] for word in topics[topic]]
            values = sim[:, indices][i]
            # Keep the `top_n_words` most similar words, highest similarity first
            word_indices = [indices[index] for index in np.argsort(values)[-self.top_n_words:]]
            updated_topics[topic] = [(vocab[index], val) for val, index in zip(np.sort(values)[-self.top_n_words:], word_indices)][::-1]

        return updated_topics
|
|