Lucia Gonzalez committed
Commit · 7dda7b9
Parent(s): 50bd57f
Files
- README.md +3 -3
- examples.py +95 -0
- explorar_relaciones_entre_palabras.py +105 -0
- modules_sesgo_en_palabras.py +714 -0
- requirements.txt +13 -0
- tool_info.py +23 -0
- utils_sesgo_en_palabras.py +272 -0
README.md
CHANGED
@@ -1,11 +1,11 @@
 ---
-title: Explorar
+title: Explorar relaciones entre palabras
 emoji: ⚡
 colorFrom: gray
 colorTo: red
 sdk: gradio
-sdk_version: 3.
-app_file:
+sdk_version: 3.2
+app_file: explorar_relaciones_entre_palabras.py
 pinned: false
 license: mit
 ---
examples.py
ADDED
@@ -0,0 +1,95 @@
example_fem = {
    "mujer": "la mente de una mujer que durante los últimos",
    "chica": "enamorado de la misma chica desde la infancia mary",
    "ella": "ella llego a la final",
    "madre": "su padre y su madre margarita de parma",
    "hija": "hija de inmigrantes españoles en",
    "femenino": "campeonato mundial de voleibol femenino fue la duodécima edición",
}
example_joven = {
    "joven": "",
    "inmaduro": "",
    "niño": "",
    "crio": ""
}
example_viejo = {
    "viejo": "",
    "maduro": "",
    "anciano": "",
    "adulto": ""
}


example_masc = {
    "hombre": "deseo innato que todo hombre tiene de comunicar su",
    "chico": "fue un chico interesado en artes",
    "el": "el parque nacional liwonde",
    "padre": "la muerte de su padre en 1832 se formó",
    "hijo": "le dice a su hijo aún no nacido como",
    "masculino": "el mito es esencialmente masculino y entre las causas",
}

example_diagnose = {
    "ario": "establecer que el pueblo ario vivió en inmemoriales tiempos",
    "educación": "sentido de vida religión educación y cultura para cada mujer",
    "pagado": "un rescate muy grande pagado por sus seguidores a",
    "cocinar": "empezó a cocinar una sopa usando",
    "lavar": "era directamente usado para lavar ropa por eso la",
    "deporte": "se convirtió en el deporte más popular del país",
    "ropa": "usan el kimono una ropa tradicional japonesa",
    "pelea": "mal por la violenta pelea entre ambos hermanos",
    "enfermero": "en enfermería el diagnóstico enfermero o diagnóstico de enfermería es",
    "ganar": "una necesidad un modo de ganar",
    "líder": "del estado en manos del líder opositor henrique capriles para el",
    "coser": "realizar tareas domésticas básicas como coser y poner la mesa",
    "cuidar": "de la fpf encargada de cuidar los intereses de los clubes",
    "cirujano": "afrancesado ocupando el puesto de cirujano militar en el ejército josefino",
    "rey": "la princesa jeongsung esposa del rey danjong que ascendió al trono",
    "reina": "año ganó el título de reina de la bahía en el"
}


fem_words = ','.join([word for word, context in example_fem.items()])
fem_contexts = ','.join([context for word, context in example_fem.items()])
masc_words = ','.join([word for word, context in example_masc.items()])
masc_contexts = ','.join([context for word, context in example_masc.items()])
young_words = ','.join([word for word, context in example_joven.items()])
old_words = ','.join([word for word, context in example_viejo.items()])
diagnose_words = ','.join([word for word, context in example_diagnose.items()])
diagnose_contexts = ','.join([context for word, context in example_diagnose.items()])

positive_money_words = 'ahorrar,economizar,administrar,manejar,negocio,beneficios'
negative_money_words = 'malgastar,derrochar'
diagnose_money = 'alemán,australiano,argentino,millonario,rico,pobre'

lazy_words = 'vago, perezoso, gandul'
active_words = 'trabajar, esfuerzo, trabajador'

examples1_explorar_sesgo_en_palabras = [
    [fem_words, masc_words, diagnose_words],
    [old_words, young_words, diagnose_words],
    [positive_money_words, negative_money_words, diagnose_money],
    [lazy_words, active_words, diagnose_money]
]

examples2_explorar_sesgo_en_palabras = [
    [fem_words, masc_words, young_words, old_words, diagnose_words],
    [lazy_words, active_words, positive_money_words, negative_money_words, diagnose_money],
]


examples_explorar_relaciones_entre_palabras = [
    [diagnose_words, fem_words, masc_words, young_words, old_words],
    [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
]


# Examples Sesgos en frases
examples_sesgos_frases = [
    ["los * manejan bien la plata", "argentinos,pobres,millonarios,australianos"],
    ["los cordobeses bailan *", ""],
    ["en la década de los ochenta, el * fue lo más escuchada en Argentina", "rock,tango,jazz,blues"],
    ["", ""]
]

examples_datos = ["ronronear", "ella", "el", "ellos"]
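
The word lists above are flattened into single comma-joined strings because the Gradio textboxes that consume them take one string per list. A minimal sketch of the round trip, assuming the split-and-strip behavior of `BiasExplorer.parse_words` in `modules_sesgo_en_palabras.py` (the standalone `parse_words` below is illustrative, not part of the commit):

```python
# Illustrative sketch: how a comma-joined example string is parsed back
# into a word list; mirrors BiasExplorer.parse_words.
def parse_words(string):
    # Split on commas, strip whitespace, and drop empty entries.
    return [w.strip() for w in string.split(',') if w.strip()]

fem_words = 'mujer,chica,ella,madre,hija,femenino'
assert parse_words(fem_words) == ['mujer', 'chica', 'ella', 'madre', 'hija', 'femenino']
```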
explorar_relaciones_entre_palabras.py
ADDED
@@ -0,0 +1,105 @@
import matplotlib as mpl
mpl.use('Agg')

from audioop import bias  # unused import
import gradio as gr

from modules_sesgo_en_palabras import WEBiasExplorer2d, Embedding
from examples import examples_explorar_relaciones_entre_palabras
import matplotlib.pyplot as plt

from tool_info import TOOL_INFO

plt.rcParams.update({'font.size': 14})

LABEL_WORD_LIST_1 = 'Lista de palabras 1'
LABEL_WORD_LIST_2 = 'Lista de palabras 2'
LABEL_WORD_LIST_3 = 'Lista de palabras 3'
LABEL_WORD_LIST_4 = 'Lista de palabras 4'
LABEL_WORD_LIST_DIAGNOSE = 'Lista de palabras a diagnosticar'

word_vectors_path = 'fasttext-sbwc.100k.vec'
we = Embedding(word_vectors_path)
we.load_we_as_keyed_vectors(word_vectors_path)


we_bias_2d = WEBiasExplorer2d(we.wv)

explorar_relaciones_entre_palabras_interface = gr.Blocks()
with explorar_relaciones_entre_palabras_interface:
    gr.Markdown("Escribi algunas palabras para visualizar sus palabras relacionadas")
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row(equal_height=True):
                with gr.Column(scale=5):
                    diagnose_list = gr.Textbox(lines=2, label=LABEL_WORD_LIST_DIAGNOSE)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist = gr.ColorPicker(label="", value='#000000',)
            with gr.Row():
                with gr.Column(scale=5):
                    wordlist_1 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_1)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist_1 = gr.ColorPicker(label="", value='#1f78b4')
            with gr.Row():
                with gr.Column(scale=5):
                    wordlist_2 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_2)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist_2 = gr.ColorPicker(label="", value='#33a02c')
            with gr.Row():
                with gr.Column(scale=5):
                    wordlist_3 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_3)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist_3 = gr.ColorPicker(label="", value='#e31a1c')
            with gr.Row():
                with gr.Column(scale=5):
                    wordlist_4 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_4)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist_4 = gr.ColorPicker(label="", value='#6a3d9a')
        with gr.Column(scale=4):
            with gr.Row():
                with gr.Row():
                    plot_neighbors = gr.Checkbox(label='Graficar palabras relacionadas')
                with gr.Row():
                    alpha = gr.Slider(minimum=0.1, maximum=0.9, value=0.3, step=0.1, label='Transparencia')
                with gr.Row():
                    fontsize = gr.Number(value=18, label='Tamaño de fuente')
            with gr.Row():
                btn_plot = gr.Button('¡Graficar en el espacio!')
            with gr.Row():
                err_msg = gr.Markdown(label="", visible=True)
            with gr.Row():
                word_proyections = gr.Image(shape=(10, 10))

    with gr.Row():
        examples = gr.Examples(
            fn=we_bias_2d.plot_projections_2d,
            inputs=[diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4],
            outputs=[word_proyections, err_msg],
            examples=examples_explorar_relaciones_entre_palabras
        )

    with gr.Row():
        gr.Markdown(TOOL_INFO)
    btn_plot.click(
        fn=we_bias_2d.plot_projections_2d,
        inputs=[
            diagnose_list,
            wordlist_1,
            wordlist_2,
            wordlist_3,
            wordlist_4,
            color_wordlist,
            color_wordlist_1,
            color_wordlist_2,
            color_wordlist_3,
            color_wordlist_4,
            plot_neighbors,
            alpha,
            fontsize
        ],
        outputs=[word_proyections, err_msg]
    )


explorar_relaciones_entre_palabras_interface.queue(concurrency_count=10)
explorar_relaciones_entre_palabras_interface.launch()
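
For readers unfamiliar with Gradio Blocks, the `btn_plot.click(...)` wiring above passes the current values of the listed input components positionally to `we_bias_2d.plot_projections_2d` and routes its `(image, error_message)` return pair into the two output components. A stripped-down sketch of the same pattern (hypothetical function and components, not from this commit):

```python
import gradio as gr

def run(text):
    # Return one value per declared output component: (result, error message).
    return text.upper(), ''

with gr.Blocks() as demo:
    box = gr.Textbox(label='input')
    out = gr.Textbox(label='output')
    err = gr.Markdown()
    gr.Button('Run').click(fn=run, inputs=[box], outputs=[out, err])

# demo.launch()
```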
modules_sesgo_en_palabras.py
ADDED
@@ -0,0 +1,714 @@
import copy

from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from gensim.models import KeyedVectors
from utils_sesgo_en_palabras import (
    cosine_similarity,
    normalize,
    project_params,
    take_two_sides_extreme_sorted
)


DIRECTION_METHODS = ['single', 'sum', 'pca']
DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
FIRST_PC_THRESHOLD = 0.5
MAX_NON_SPECIFIC_EXAMPLES = 1000

__all__ = ['GenderBiasWE', 'BiasWordEmbedding']


class Loader():
    def __init__(self):
        self.path_to_data = ''

    def load_tokenizer(self, tokenizer_path):
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path, do_lower_case=True, )
        return tokenizer

    def load_data_from_file(self, data):
        return data

    def load_corpus_from_file(self, data):
        return data

    def load_language_model(self, model_path):
        model = AutoModelForMaskedLM.from_pretrained(
            model_path, output_hidden_states=True)
        return model


class Corpus():
    def __init__(self, corpus) -> None:
        self.vocabulary = self.load_vocabulary_from_corpus()
        self.corpus = corpus

    def get_context_from_text(self, word):
        pass

    def get_frequency(self, word):
        pass

    def get_most_frequent_coocurrence(self, word):
        pass


class Embedding():
    def __init__(self, word_vectors_path) -> None:
        self.wv = self.load_we_as_keyed_vectors(word_vectors_path)

    def load_we_as_keyed_vectors(self, word_vectors_path):
        we = KeyedVectors.load_word2vec_format(word_vectors_path)
        we.init_sims(replace=True)
        return we

    def get_word_vector(self, word, context=None):
        return word


class BiasExplorer():
    def __init__(self, model, only_lower=False, verbose=False,
                 identify_direction=False, to_normalize=True):
        # pylint: disable=undefined-variable

        # TODO: this is bad Python, ask someone about it
        # probably should be a better design
        # identify_direction doesn't have any meaning
        # for the class BiasWordEmbedding
        # The goal is to force this interface on sub-classes.
        if self.__class__ == __class__ and identify_direction is not False:
            raise ValueError('identify_direction must be False'
                             ' for an instance of {}'
                             .format(__class__))

        self.model = model

        # TODO: write a unit test for when it is False
        self.only_lower = only_lower

        self._verbose = verbose

        self.direction = None
        self.positive_end = None
        self.negative_end = None

        if to_normalize:
            self.model.init_sims(replace=True)

    def __copy__(self):
        bias_word_embedding = self.__class__(self.model,
                                             self.only_lower,
                                             self._verbose,
                                             identify_direction=False)
        bias_word_embedding.direction = copy.deepcopy(self.direction)
        bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
        bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
        return bias_word_embedding

    def __deepcopy__(self, memo):
        bias_word_embedding = copy.copy(self)
        bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
        return bias_word_embedding

    def __getitem__(self, key):
        return self.model[key]

    def __contains__(self, item):
        return item in self.model

    def _is_direction_identified(self):
        if self.direction is None:
            raise RuntimeError('The direction was not identified'
                               ' for this {} instance'
                               .format(self.__class__.__name__))

    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
        matrix = []

        for word1, word2 in definitional_pairs:
            vector1 = normalize(self[word1])
            vector2 = normalize(self[word2])

            center = (vector1 + vector2) / 2

            matrix.append(vector1 - center)
            matrix.append(vector2 - center)

        pca = PCA(n_components=n_components)
        pca.fit(matrix)

        if self._verbose:
            table = enumerate(pca.explained_variance_ratio_, start=1)
            headers = ['Principal Component',
                       'Explained Variance Ratio']

        return pca

    def __errorChecking(self, word):
        out_msj = ""

        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        else:
            if not word in self.model:
                out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"

        if out_msj:
            out_msj = "<center><h3>" + out_msj + "</h3></center>"

        return out_msj

    # TODO: add the SVD method from section 6 step 1
    # It seems there is a mistake there, I think it is the same as PCA
    # just with replacing it with SVD
    def _identify_direction(self, positive_end, negative_end,
                            definitional, method='pca'):
        if method not in DIRECTION_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DIRECTION_METHODS, method))

        if positive_end == negative_end:
            raise ValueError('positive_end and negative_end'
                             ' should be different, and not the same "{}"'
                             .format(positive_end))
        if self._verbose:
            print('Identify direction using {} method...'.format(method))

        direction = None

        if method == 'single':
            if self._verbose:
                print('Positive definitional end:', definitional[0])
                print('Negative definitional end:', definitional[1])
            direction = normalize(normalize(self[definitional[0]])
                                  - normalize(self[definitional[1]]))

        elif method == 'sum':
            group1_sum_vector = np.sum([self[word]
                                        for word in definitional[0]], axis=0)
            group2_sum_vector = np.sum([self[word]
                                        for word in definitional[1]], axis=0)

            diff_vector = (normalize(group1_sum_vector)
                           - normalize(group2_sum_vector))

            direction = normalize(diff_vector)

        elif method == 'pca':
            pca = self._identify_subspace_by_pca(definitional, 10)
            if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
                raise RuntimeError('The explained variance'
                                   ' of the first principal component should be'
                                   ' at least {}, but it is {}'
                                   .format(FIRST_PC_THRESHOLD,
                                           pca.explained_variance_ratio_[0]))
            direction = pca.components_[0]

            # if direction is opposite (e.g. we cannot control
            # what the PCA will return)
            ends_diff_projection = cosine_similarity((self[positive_end]
                                                      - self[negative_end]),
                                                     direction)
            if ends_diff_projection < 0:
                direction = -direction  # pylint: disable=invalid-unary-operand-type

        self.direction = direction
        self.positive_end = positive_end
        self.negative_end = negative_end

    def project_on_direction(self, word):
        """Project the normalized vector of the word on the direction.
        :param str word: The word to project
        :return float: The projection scalar
        """

        self._is_direction_identified()

        vector = self[word]
        projection_score = self.model.cosine_similarities(self.direction,
                                                          [vector])[0]
        return projection_score

    def _calc_projection_scores(self, words):
        self._is_direction_identified()

        df = pd.DataFrame({'word': words})

        # TODO: maybe using cosine_similarities on all the vectors?
        # it might be faster
        df['projection'] = df['word'].apply(self.project_on_direction)
        df = df.sort_values('projection', ascending=False)

        return df

    def calc_projection_data(self, words):
        """
        Calculate projection, projected and rejected vectors of a words list.
        :param list words: List of words
        :return: :class:`pandas.DataFrame` of the projection,
                 projected and rejected vectors of the words list
        """
        projection_data = []
        for word in words:
            vector = self[word]
            projection = self.project_on_direction(word)
            normalized_vector = normalize(vector)

            (projection,
             projected_vector,
             rejected_vector) = project_params(normalized_vector,
                                               self.direction)

            projection_data.append({'word': word,
                                    'vector': vector,
                                    'projection': projection,
                                    'projected_vector': projected_vector,
                                    'rejected_vector': rejected_vector})

        return pd.DataFrame(projection_data)

    def plot_dist_projections_on_direction(self, word_groups, ax=None):
        """Plot the distribution of projection scalars on the direction.
        :param dict word_groups: The word groups to project
        :return float: The ax object of the plot
        """

        if ax is None:
            _, ax = plt.subplots(1)

        names = sorted(word_groups.keys())

        for name in names:
            words = word_groups[name]
            label = '{} (#{})'.format(name, len(words))
            vectors = [self[word] for word in words]
            projections = self.model.cosine_similarities(self.direction,
                                                         vectors)
            sns.distplot(projections, hist=False, label=label, ax=ax)

        plt.axvline(0, color='k', linestyle='--')

        plt.title('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))
        plt.xlabel('Direction Projection')
        plt.ylabel('Density')
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        return ax

    # NOTE: duplicate of __errorChecking above; this second, identical
    # definition overrides the first one.
    def __errorChecking(self, word):
        out_msj = ""

        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        else:
            if not word in self.model:
                out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"

        if out_msj:
            out_msj = "<center><h3>" + out_msj + "</h3></center>"

        return out_msj

    def parse_words(self, string):
        words = string.strip()
        if words:
            words = [word.strip() for word in words.split(',') if word != ""]
        return words

    def check_oov(self, wordlists):
        for wordlist in wordlists:
            parsed_words = self.parse_words(wordlist)
            for word in parsed_words:
                msg = self.__errorChecking(word)
                if msg:
                    return msg
        return None

    def plot_projections_2d(self,
                            wordlist,
                            wordlist_1,
                            wordlist_2,
                            wordlist_3,
                            wordlist_4,
                            color_wordlist,
                            color_wordlist_1,
                            color_wordlist_2,
                            color_wordlist_3,
                            color_wordlist_4,
                            plot_neighbors,
                            n_alpha,
                            fontsize,
                            figsize=(15, 15),
                            method='pca'
                            ):
        # convert the word lists to vectors
        choices = [0, 1, 2, 3, 4]
        word_list = []
        wordlist_choice = [wordlist, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
        err = self.check_oov(wordlist_choice)
        if err:
            return None, err
        words_colors = {}
        label_dict = {
            0: 'Diagnostico',
            1: 'Lista de palabras 1',
            2: 'Lista de palabras 2',
            3: 'Lista de palabras 3',
            4: 'Lista de palabras 4'
        }
        color_dict = {
            0: color_wordlist,
            1: color_wordlist_1,
            2: color_wordlist_2,
            3: color_wordlist_3,
            4: color_wordlist_4
        }
        word_bias_space = {}
        alpha = {}

        for raw_word_list, color in zip(wordlist_choice, choices):
            parsed_words = self.parse_words(raw_word_list)
            if parsed_words:
                for word in parsed_words:
                    word_bias_space[word] = color
                    words_colors[word] = color_dict[color]
                    alpha[word] = 1
                    if plot_neighbors:
                        neighbors = [w for w, s in self.model.most_similar(word, topn=5)]
                        for n in neighbors:
                            if n not in alpha:
                                word_bias_space[n] = color
                                words_colors[n] = color_dict[color]
                                alpha[n] = n_alpha
                        word_list += neighbors
                word_list += parsed_words
        if not word_list:
            return None, "<center><h3>" + "Ingresa al menos 2 palabras para continuar" + "</h3></center>"
        embeddings = [self.model[word] for word in word_list]
        words_embedded = PCA(
            n_components=2, random_state=1).fit_transform(embeddings)
        data = pd.DataFrame(words_embedded)
        data['word'] = word_list
        data['color'] = [words_colors[word] for word in word_list]
        data['alpha'] = [alpha[word] for word in word_list]
        data['word_bias_space'] = [word_bias_space[word] for word in word_list]
        fig, ax = plt.subplots(figsize=figsize)

        sns.scatterplot(
            data=data[data['alpha'] == 1],
            x=0,
            y=1,
            style='word_bias_space',
            hue='word_bias_space',
            ax=ax,
            palette=color_dict
        )
        if plot_neighbors:
            sns.scatterplot(
                data=data[data['alpha'] != 1],
                x=0,
                y=1,
                style='color',
                hue='word_bias_space',
                ax=ax,
                alpha=n_alpha,
                legend=False,
                palette=color_dict
            )
        for i, label in enumerate(word_list):
            x, y = words_embedded[i, :]
            ax.annotate(label, xy=(x, y), xytext=(5, 2), color=words_colors[label],
                        textcoords='offset points',
                        ha='right', va='bottom', size=fontsize, alpha=alpha[label])

        ax.set_xticks([])
        ax.set_yticks([])

        fig.tight_layout()
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im, ''


class WEBiasExplorer2d(BiasExplorer):
    def __init__(self, word_embedding) -> None:
        super().__init__(word_embedding)

    def calculate_bias(
            self,
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_para_situar
    ):

        wordlists = [
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_para_situar
        ]
        err = self.check_oov(wordlists)
        for wordlist in wordlists:
            if not wordlist:
                err = "<center><h3>" + 'Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2' + "</h3></center>"
        if err:
            return None, err


        err = self.check_oov([palabras_extremo_1, palabras_extremo_2, palabras_para_situar])
        if err:
            return None, err
        palabras_extremo_1 = self.parse_words(palabras_extremo_1)
        palabras_extremo_2 = self.parse_words(palabras_extremo_2)
        palabras_para_situar = self.parse_words(palabras_para_situar)
        im = self.get_bias_plot(
            palabras_para_situar,
            definitional=(
                palabras_extremo_1, palabras_extremo_2),
            method='sum',
            n_extreme=10
        )
        return im, ''

    def get_bias_plot(self,
                      palabras_para_situar,
                      definitional,
                      method='sum',
                      n_extreme=10,
                      figsize=(10, 10)
                      ):

        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(
            definitional,
            palabras_para_situar, n_extreme, ax=ax,)

        fig.tight_layout()
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im

    def plot_projection_scores(self, definitional,
                               words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalar of words on the direction.
        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """
        nombre_del_extremo_1 = ', '.join(definitional[0])
        nombre_del_extremo_2 = ', '.join(definitional[1])

        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
                                 definitional=definitional,
                                 method='sum')

        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection'] = projections_df['projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))

        most_extream_projection = np.round(
            projections_df['projection']
            .abs()
            .max(),
            decimals=1)

        sns.barplot(x='projection', y='word', data=projections_df,
                    palette=projections_df['color'])

        plt.xticks(np.arange(-most_extream_projection,
                             most_extream_projection + axis_projection_step,
                             axis_projection_step))
        xlabel = ('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))

        plt.xlabel(xlabel)
        plt.ylabel('Words')

        return ax


class WEBiasExplorer4d(BiasExplorer):
    def __init__(self, word_embedding) -> None:
        super().__init__(word_embedding)

    def calculate_bias(
            self,
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_extremo_3,
            palabras_extremo_4,
            palabras_para_situar
    ):
        wordlists = [
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_extremo_3,
            palabras_extremo_4,
            palabras_para_situar
        ]
        err = self.check_oov(wordlists)
        for wordlist in wordlists:
            if not wordlist:
                err = "<center><h3>" + '¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!' + "</h3></center>"
        if err:
            return None, err

        palabras_extremo_1 = self.parse_words(palabras_extremo_1)
        palabras_extremo_2 = self.parse_words(palabras_extremo_2)
        palabras_extremo_3 = self.parse_words(palabras_extremo_3)
        palabras_extremo_4 = self.parse_words(palabras_extremo_4)

        palabras_para_situar = self.parse_words(palabras_para_situar)

        im = self.get_bias_plot(
            palabras_para_situar,
            definitional_1=(
                palabras_extremo_1, palabras_extremo_2),
            definitional_2=(
                palabras_extremo_3, palabras_extremo_4),
            method='sum',
            n_extreme=10
        )
        return im, ''

    def get_bias_plot(self,
                      palabras_para_situar,
                      definitional_1,
                      definitional_2,
                      method='sum',
                      n_extreme=10,
                      figsize=(10, 10)
                      ):

        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(
            definitional_1,
            definitional_2,
            palabras_para_situar, n_extreme, ax=ax,)
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im

    def plot_projection_scores(self, definitional_1, definitional_2,
                               words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalar of words on two directions.
        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """

        nombre_del_extremo_1 = ', '.join(definitional_1[1])
        nombre_del_extremo_2 = ', '.join(definitional_1[0])

        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
                                 definitional=definitional_1,
                                 method='sum')

        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection_x'] = projections_df['projection'].round(2)

        nombre_del_extremo_3 = ', '.join(definitional_2[1])
        nombre_del_extremo_4 = ', '.join(definitional_2[0])
        self._identify_direction(nombre_del_extremo_3, nombre_del_extremo_4,
                                 definitional=definitional_2,
                                 method='sum')

        self._is_direction_identified()

        projections_df['projection_y'] = self._calc_projection_scores(words)[
            'projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))
        most_extream_projection = np.round(
            projections_df['projection']
            .abs()
            .max(),
            decimals=1)
        sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
                        palette=projections_df['color'])

        plt.xticks(np.arange(-most_extream_projection,
                             most_extream_projection + axis_projection_step,
                             axis_projection_step))
        for _, row in (projections_df.iterrows()):
            ax.annotate(
                row['word'], (row['projection_x'], row['projection_y']))
        x_label = '← {} {} {} →'.format(nombre_del_extremo_1,
                                        ' ' * 20,
                                        nombre_del_extremo_2)

        y_label = '← {} {} {} →'.format(nombre_del_extremo_3,
                                        ' ' * 20,
                                        nombre_del_extremo_4)

        plt.xlabel(x_label)
        ax.xaxis.set_label_position('bottom')
        ax.xaxis.set_label_coords(.5, 0)

        plt.ylabel(y_label)
        ax.yaxis.set_label_position('left')
        ax.yaxis.set_label_coords(0, .5)

        ax.spines['left'].set_position('center')
        ax.spines['bottom'].set_position('center')

        ax.set_xticks([])
        ax.set_yticks([])
        # plt.yticks([], [])
        # ax.spines['left'].set_position('zero')
        # ax.spines['bottom'].set_position('zero')

        return ax
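
The core of `_identify_direction(method='sum')` and `project_on_direction` above is plain vector arithmetic: take the normalized difference of the two groups' summed vectors, then score each diagnostic word by its cosine similarity with that direction. A self-contained numpy sketch with toy 3-dimensional vectors (toy data, not the fastText embedding the Space actually loads):

```python
import numpy as np

def normalize(v):
    n = np.linalg.norm(v)
    return v if n == 0 else v / n

# Toy "embeddings" standing in for the fastText vectors.
emb = {
    'mujer':   np.array([0.9, 0.1, 0.0]),
    'hombre':  np.array([0.1, 0.9, 0.0]),
    'cocinar': np.array([0.7, 0.2, 0.1]),
    'deporte': np.array([0.2, 0.8, 0.0]),
}

# method='sum': normalized difference of the normalized group sums.
group1 = normalize(sum(emb[w] for w in ['mujer']))
group2 = normalize(sum(emb[w] for w in ['hombre']))
direction = normalize(group1 - group2)

# project_on_direction: cosine similarity of each word with the direction.
for word in ['cocinar', 'deporte']:
    score = normalize(emb[word]) @ direction
    print(word, round(float(score), 2))  # >0 leans 'mujer', <0 leans 'hombre'
```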
requirements.txt
ADDED
@@ -0,0 +1,13 @@
# sesgos_en_frases
regex
# datos
torch
transformers
# resto
sklearn
gensim==3.7.3
transformers
tensorflow
matplotlib
numpy
seaborn
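
The `gensim==3.7.3` pin is load-bearing: the modules above use the pre-4.0 gensim API (`init_sims`, `vectors_norm`, `KeyedVectors.vocab`, `gensim.models.base_any2vec`), which gensim 4.x removed or renamed. A fail-fast guard one could add at import time (a sketch, not part of the commit):

```python
import gensim

# This Space relies on the gensim 3.x API (init_sims, vectors_norm,
# KeyedVectors.vocab); fail fast on gensim >= 4.
major = int(gensim.__version__.split('.')[0])
if major >= 4:
    raise ImportError(
        'gensim {} detected; this code requires gensim==3.7.3 '
        '(see requirements.txt)'.format(gensim.__version__))
```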
tool_info.py
ADDED
@@ -0,0 +1,23 @@
TOOL_INFO = """
> ### A tool to overcome technical barriers for bias assessment in human language technologies

* [Read Full Paper](https://arxiv.org/abs/2207.06591)

> ### Licensing Information
* [MIT Licence](https://huggingface.co/spaces/vialibre/new_bias_tools/resolve/main/LICENSE)

> ### Citation Information
```c
@misc{https://doi.org/10.48550/arxiv.2207.06591,
  doi = {10.48550/ARXIV.2207.06591},
  url = {https://arxiv.org/abs/2207.06591},
  author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
  keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {A tool to overcome technical barriers for bias assessment in human language technologies},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
}
```
"""
utils_sesgo_en_palabras.py
ADDED
@@ -0,0 +1,272 @@
import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
import gradio as gr



WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long


def assert_gensim_keyed_vectors(model):
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        type_names = (model_type.__name__
                      for model_type in WORD_EMBEDDING_MODEL_TYPES)
        raise TypeError('model should be one of the types'
                        ' ({}), not {}.'
                        .format(', '.join(type_names),
                                type(model)))

def generate_words_forms(words):
    return sum([generate_one_word_forms(word) for word in words], [])


def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity


def generate_one_word_forms(word):
    return [word.lower(), word.upper(), word.title()]


def get_seed_vector(seed, bias_word_embedding):

    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end

        else:
            positive_end, negative_end = seed

        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end


def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.model.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether to restrict the most
                              similar words to be not from
                              the positive or negative word list.
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]


def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm


def project_params(u, v):
    """Project and reject the vector v onto direction u, with the scalar."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector


def project_reject_vector(v, u):
    """Project and reject the vector v onto direction u."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector


def round_to_extreme(value, digits=2):
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value


def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))


def project_vector(v, u):
    """Project the vector v onto direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u


def reject_vector(v, u):
    """Reject the vector v from direction u."""
    return v - project_vector(v, u)


def update_word_vector(model, word, new_vector):
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)


# NOTE: duplicate of project_params above; this second, identical
# definition overrides the first one.
def project_params(u, v):
    """Project and reject the vector v onto direction u, with the scalar."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector


def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):

    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = (y_cluster == y_value)
        label = 'Positive' if y_value else 'Negative'
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label=label)

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    return max(acc, 1 - acc)
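
`project_params` decomposes a vector into its component along a direction plus an orthogonal residual, and the two pieces always sum back to the original vector. A quick self-contained check of that invariant (toy vectors; `normalize` and `project_params` are restated here so the snippet runs on its own):

```python
import numpy as np

def normalize(v):
    n = np.linalg.norm(v)
    return v if n == 0 else v / n

def project_params(u, v):
    # Scalar projection of v on u, the projected vector, and the rejection.
    u_hat = normalize(u)
    projection = v @ u_hat
    projected_vector = projection * u_hat
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector

u = np.array([1.0, 0.0, 0.0])
v = np.array([3.0, 4.0, 0.0])
p, proj, rej = project_params(u, v)
assert p == 3.0                     # scalar component along u
assert np.allclose(proj + rej, v)   # decomposition is exact
assert np.isclose(proj @ rej, 0.0)  # the two pieces are orthogonal
```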