| import warnings |
| import numpy as np |
| import pandas as pd |
| from typing import List, Mapping, Tuple |
| from scipy.sparse import csr_matrix |
| from sklearn.metrics.pairwise import cosine_similarity |
| from bertopic.representation._base import BaseRepresentation |
|
|
|
|
class MaximalMarginalRelevance(BaseRepresentation):
    """ Calculate Maximal Marginal Relevance (MMR)
    between candidate keywords and the document.

    MMR considers the similarity of keywords/keyphrases with the
    document, along with the similarity of already selected
    keywords and keyphrases. This results in a selection of keywords
    that maximize their within diversity with respect to the document.

    Arguments:
        diversity: How diverse the selected keywords/keyphrases are.
                   Values range between 0 and 1 with 0 being not diverse at all
                   and 1 being most diverse.
        top_n_words: The number of keywords/keyphrases to return

    Usage:

    ```python
    from bertopic.representation import MaximalMarginalRelevance
    from bertopic import BERTopic

    # Create your representation model
    representation_model = MaximalMarginalRelevance(diversity=0.3)

    # Use the representation model in BERTopic on top of the default pipeline
    topic_model = BERTopic(representation_model=representation_model)
    ```
    """
    def __init__(self, diversity: float = 0.1, top_n_words: int = 10):
        self.diversity = diversity
        self.top_n_words = top_n_words

    def extract_topics(self,
                       topic_model,
                       documents: pd.DataFrame,
                       c_tf_idf: csr_matrix,
                       topics: Mapping[str, List[Tuple[str, float]]]
                       ) -> Mapping[str, List[Tuple[str, float]]]:
        """ Extract topic representations

        Arguments:
            topic_model: The BERTopic model
            documents: Not used
            c_tf_idf: Not used
            topics: The candidate topics as calculated with c-TF-IDF

        Returns:
            updated_topics: Updated topic representations
        """
        # MMR needs word/topic embeddings, so an embedding model is required.
        # NOTE: fixed message — original read "...can only be used BERTopic was
        # instantiated" and the implicit string concatenation dropped the space
        # before "with", producing "instantiatedwith".
        if topic_model.embedding_model is None:
            warnings.warn("MaximalMarginalRelevance can only be used when BERTopic was instantiated "
                          "with the `embedding_model` parameter.")
            return topics

        updated_topics = {}
        for topic, topic_words in topics.items():
            words = [word[0] for word in topic_words]
            # Embed each candidate word individually, and the joined words as a
            # proxy for the topic embedding (reshaped to a single "document").
            word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False)
            topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape(1, -1)
            # Select a diverse subset of words via MMR, then keep the original
            # (word, c-TF-IDF value) pairs for the selected words only, in the
            # original c-TF-IDF order.
            topic_words = mmr(topic_embedding, word_embeddings, words, self.diversity, self.top_n_words)
            updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words]
        return updated_topics
|
|
|
|
def mmr(doc_embedding: np.ndarray,
        word_embeddings: np.ndarray,
        words: List[str],
        diversity: float = 0.1,
        top_n: int = 10) -> List[str]:
    """ Maximal Marginal Relevance

    Iteratively selects keywords that are similar to the document while
    penalizing similarity to the keywords already selected.

    Arguments:
        doc_embedding: The document embeddings
        word_embeddings: The embeddings of the selected candidate keywords/phrases
        words: The selected candidate keywords/keyphrases
        diversity: The diversity of the selected embeddings.
                   Values between 0 and 1.
        top_n: The top n items to return

    Returns:
        List[str]: The selected keywords/keyphrases
    """
    # No candidates -> nothing to select (also avoids argmax on an empty array).
    if len(words) == 0:
        return []

    # Cap the number of selections at the number of candidates; otherwise the
    # loop below would exhaust candidates_idx and np.argmax would raise a
    # ValueError on an empty array.
    top_n = min(top_n, len(words))

    # Similarity of each word to the document, and pairwise word similarities
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Seed the selection with the word most similar to the document
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Relevance of each remaining candidate to the document
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        # Redundancy: a candidate's max similarity to any already-chosen keyword
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score: trade off relevance against redundancy.
        # (Renamed from `mmr` to avoid shadowing this function's own name.)
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[int(np.argmax(mmr_scores))]

        # Move the best candidate into the selected set
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]
|
|