| import pandas as pd | |
| from scipy.sparse import csr_matrix | |
| from sklearn.base import BaseEstimator | |
| from typing import Mapping, List, Tuple | |
class BaseRepresentation(BaseEstimator):
    """Base class for models that fine-tune topic representations."""

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """Extract topics.

        Every representation model inheriting from this class is handed
        the same four arguments (topic_model, documents, c_tf_idf, topics)
        automatically. A representation model can therefore only work
        with the topic information reachable through those arguments.

        This base implementation does not use `documents`, `c_tf_idf`
        or `topics`; it simply hands back the representations stored on
        `topic_model`.

        Arguments:
            topic_model: The BERTopic model, fitted up to the point where
                         topic representations are calculated.
            documents: A dataframe with the columns "Document" and "Topic",
                       holding every document together with its topic.
            c_tf_idf: A c-TF-IDF representation. It is typically identical
                      to `topic_model.c_tf_idf_`, except for dynamic,
                      class-based, and hierarchical topic modeling, where
                      it is calculated on a subset of the documents.
            topics: A dictionary mapping each topic (key) to tuples of word
                    and weight (value) as calculated by c-TF-IDF. These are
                    the default topics that are returned if no
                    representation model is used.

        Returns:
            The value of `topic_model.topic_representations_`.
        """
        representations = topic_model.topic_representations_
        return representations