Spaces:
Configuration error
Configuration error
from typing import Dict, List, Optional, Tuple

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.linalg import norm

mpl.use('Agg')
class WordToPlot:
    """Plain record bundling a word with the attributes used to draw it."""

    def __init__(
        self,
        word: str,
        color: str,
        bias_space: int,
        alpha: float
    ) -> None:
        """Store the word and its drawing attributes unchanged.

        Args:
            word: The text of the word.
            color: Color assigned to the word.
            bias_space: Integer id of the group (word list) the word
                belongs to.
            alpha: Opacity assigned to the word.
        """
        self.word, self.color = word, color
        self.bias_space, self.alpha = bias_space, alpha
class WordExplorer:
    """Vocabulary checks, neighbour lookup and 2-D plotting over an embedding.

    The wrapped ``embedding`` object must support ``word in embedding`` and
    provide ``getNearestNeighbors``, ``getPCA`` and ``getEmbedding``; these
    are the only operations this class performs on it.
    """

    def __init__(
        self,
        embedding  # Embedding Class instance
    ) -> None:
        """Keep a reference to the embedding used by every other method."""
        self.embedding = embedding

    def __errorChecking(
        self,
        word: str
    ) -> str:
        """Validate a single word.

        Returns:
            An error message when ``word`` is empty or is not contained in
            the embedding, otherwise the empty string.
        """
        if not word:
            return "Error: Primero debe ingresar una palabra!"
        if word not in self.embedding:
            return f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
        return ""

    def check_oov(
        self,
        wordlists: List[List[str]]
    ) -> Optional[str]:
        """Look for the first invalid word across several word lists.

        Args:
            wordlists: A list of word lists; every word of every list is
                checked, in order.

        Returns:
            The error message of the first empty or out-of-vocabulary word,
            or ``None`` when every word is valid.
        """
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
                if msg:
                    return msg
        return None

    def get_neighbors(
        self,
        word: str,
        n_neighbors: int,
        nn_method: str
    ) -> List[str]:
        """Delegate to ``embedding.getNearestNeighbors`` with the same arguments."""
        return self.embedding.getNearestNeighbors(word, n_neighbors, nn_method)

    def get_df(
        self,
        words_embedded: np.ndarray,
        processed_word_list: List["WordToPlot"]
    ) -> pd.DataFrame:
        """Build the plotting table: coordinates plus per-word attributes.

        Args:
            words_embedded: Coordinates, one row per word, in the same order
                as ``processed_word_list``.
            processed_word_list: Objects exposing ``word``, ``color``,
                ``alpha`` and ``bias_space``.

        Returns:
            A DataFrame holding the coordinate columns followed by the
            ``word``, ``color``, ``alpha`` and ``word_bias_space`` columns.
        """
        df = pd.DataFrame(words_embedded)
        df['word'] = [wtp.word for wtp in processed_word_list]
        df['color'] = [wtp.color for wtp in processed_word_list]
        df['alpha'] = [wtp.alpha for wtp in processed_word_list]
        df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
        return df

    def get_plot(
        self,
        data: pd.DataFrame,
        processed_word_list: List["WordToPlot"],
        words_embedded: np.ndarray,
        color_dict: Dict,
        n_neighbors: int,
        n_alpha: float,
        fontsize: int = 18,
        figsize: Tuple[int, int] = (20, 15)
    ):
        """Draw the scatter plot and word labels; return the figure.

        Args:
            data: Table as produced by ``get_df`` (columns ``0``, ``1``,
                ``alpha``, ``color`` and ``word_bias_space`` are read).
            processed_word_list: Words to label, aligned row by row with
                ``words_embedded``.
            words_embedded: Two coordinates per word.
            color_dict: Palette mapping each ``word_bias_space`` value to a
                color.
            n_neighbors: Neighbour points are drawn only when this is > 0.
            n_alpha: Opacity used for the neighbour points.
            fontsize: Size of the word labels.
            figsize: Figure size passed to ``plt.subplots``.

        Returns:
            The matplotlib figure that was drawn on.
        """
        fig, ax = plt.subplots(figsize=figsize)

        # Rows with alpha == 1 are the words the caller asked for explicitly.
        sns.scatterplot(
            data=data[data['alpha'] == 1],
            x=0,
            y=1,
            style='word_bias_space',
            hue='word_bias_space',
            ax=ax,
            palette=color_dict
        )

        if n_neighbors > 0:
            # Every other row is a neighbour, drawn with reduced opacity.
            # NOTE(review): the marker style is keyed on 'color' here but on
            # 'word_bias_space' above -- confirm this difference is intended.
            sns.scatterplot(
                data=data[data['alpha'] != 1],
                x=0,
                y=1,
                style='color',
                hue='word_bias_space',
                ax=ax,
                alpha=n_alpha,
                legend=False,
                palette=color_dict
            )

        for wtp, (x, y) in zip(processed_word_list, words_embedded):
            ax.annotate(
                wtp.word,
                xy=(x, y),
                xytext=(5, 2),
                color=wtp.color,
                textcoords='offset points',
                ha='right',
                va='bottom',
                size=fontsize,
                alpha=wtp.alpha
            )

        # Hide ticks and axis labels.
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlabel('')
        ax.set_ylabel('')
        fig.tight_layout()
        return fig

    def plot_projections_2d(
        self,
        wordlist_0: List[str],
        wordlist_1: Optional[List[str]] = None,
        wordlist_2: Optional[List[str]] = None,
        wordlist_3: Optional[List[str]] = None,
        wordlist_4: Optional[List[str]] = None,
        **kwargs
    ):
        """Plot up to five word lists (and optionally their neighbours) in 2-D.

        Args:
            wordlist_0 .. wordlist_4: Word lists; list ``i`` is drawn with
                color ``i`` of the palette. ``None`` is treated as empty.
            **kwargs: Optional settings:
                ``color_wordlist_0`` .. ``color_wordlist_4`` (palette colors),
                ``n_neighbors`` (default 0), ``n_alpha`` (default 0.3),
                ``nn_method`` (default ``'sklearn'``), ``fontsize``
                (default 18) and ``figsize`` (default ``(20, 15)``).

        Returns:
            The figure produced by ``get_plot``.

        Raises:
            Exception: If any word is empty or out of vocabulary, or if all
                lists are empty.
        """
        wordlist_choice = [
            [] if wordlist is None else wordlist
            for wordlist in (
                wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4
            )
        ]

        err = self.check_oov(wordlist_choice)
        if err:
            raise Exception(err)

        color_dict = {
            0: kwargs.get('color_wordlist_0', '#000000'),
            1: kwargs.get('color_wordlist_1', '#1f78b4'),
            2: kwargs.get('color_wordlist_2', '#33a02c'),
            3: kwargs.get('color_wordlist_3', '#e31a1c'),
            4: kwargs.get('color_wordlist_4', '#6a3d9a')
        }

        n_neighbors = kwargs.get('n_neighbors', 0)
        n_alpha = kwargs.get('n_alpha', 0.3)
        nn_method = kwargs.get('nn_method', 'sklearn')

        processed_word_list = []
        # Every word collected so far, so that a neighbour already present is
        # not added a second time (O(1) lookup instead of rebuilding a list).
        seen_words = set()

        for bias_space, words in enumerate(wordlist_choice):
            color = color_dict[bias_space]
            for word in words:
                processed_word_list.append(
                    WordToPlot(word, color, bias_space, 1)
                )
                seen_words.add(word)

                if n_neighbors <= 0:
                    continue

                neighbors = self.get_neighbors(
                    word,
                    n_neighbors=n_neighbors,
                    nn_method=nn_method
                )
                for neighbor in neighbors:
                    if neighbor in seen_words:
                        continue
                    processed_word_list.append(
                        WordToPlot(neighbor, color, bias_space, n_alpha)
                    )
                    seen_words.add(neighbor)

        if not processed_word_list:
            raise Exception('Only empty lists were passed')

        words_embedded = np.array(
            [self.embedding.getPCA(wtp.word) for wtp in processed_word_list]
        )

        data = self.get_df(
            words_embedded,
            processed_word_list
        )

        fig = self.get_plot(
            data,
            processed_word_list,
            words_embedded,
            color_dict,
            n_neighbors,
            n_alpha,
            kwargs.get('fontsize', 18),
            kwargs.get('figsize', (20, 15))
        )
        # Kept from the original implementation; the figure is also returned.
        plt.show()
        return fig

    # TODO: This method has no callers. Remove it?
    def doesnt_match(
        self,
        wordlist: List[str]
    ) -> str:
        """Return the word least similar to the mean vector of ``wordlist``.

        Similarity is the cosine between each word's embedding and the mean
        of all embeddings in the list. On ties the later word wins.

        Args:
            wordlist: Words to compare; all must be in the vocabulary.

        Returns:
            The selected word, or ``""`` when ``wordlist`` is empty.

        Raises:
            Exception: If any word is empty or out of vocabulary.
        """
        err = self.check_oov([wordlist])
        if err:
            raise Exception(err)

        # Nothing to compare; also avoids taking the mean of an empty array.
        if not wordlist:
            return ""

        words_emb = np.array([self.embedding.getEmbedding(word)
                              for word in wordlist])
        mean_vec = np.mean(words_emb, axis=0)
        mean_norm = norm(mean_vec)

        doesnt_match = ""
        # Start from +inf rather than 1.0: rounding can push a cosine
        # similarity slightly above 1.0, which would otherwise never be
        # selected and leave the result empty.
        lowest_sim = np.inf

        for word, word_emb in zip(wordlist, words_emb):
            cos_sim = np.dot(mean_vec, word_emb) / (mean_norm * norm(word_emb))
            if cos_sim <= lowest_sim:
                lowest_sim = cos_sim
                doesnt_match = word

        return doesnt_match