Lucia Gonzalez committed
Commit · 7dda7b9
Parent(s): 50bd57f
Files
- README.md +3 -3
- examples.py +95 -0
- explorar_relaciones_entre_palabras.py +105 -0
- modules_sesgo_en_palabras.py +714 -0
- requirements.txt +13 -0
- tool_info.py +23 -0
- utils_sesgo_en_palabras.py +272 -0
README.md
CHANGED
@@ -1,11 +1,11 @@
 ---
-title: Explorar
+title: Explorar relaciones entre palabras
 emoji: ⚡
 colorFrom: gray
 colorTo: red
 sdk: gradio
-sdk_version: 3.
-app_file:
+sdk_version: 3.2
+app_file: explorar_relaciones_entre_palabras.py
 pinned: false
 license: mit
 ---
examples.py
ADDED
@@ -0,0 +1,95 @@
example_fem = {
    "mujer": "la mente de una mujer que durante los últimos",
    "chica": "enamorado de la misma chica desde la infancia mary",
    "ella": "ella llego a la final",
    "madre": "su padre y su madre margarita de parma",
    "hija": "hija de inmigrantes españoles en",
    "femenino": "campeonato mundial de voleibol femenino fue la duodécima edición",
}
example_joven = {
    "joven": "",
    "inmaduro": "",
    "niño": "",
    "crio": ""
}
example_viejo = {
    "viejo": "",
    "maduro": "",
    "anciano": "",
    "adulto": ""
}


example_masc = {
    "hombre": "deseo innato que todo hombre tiene de comunicar su",
    "chico": "fue un chico interesado en artes",
    "el": "el parque nacional liwonde",
    "padre": "la muerte de su padre en 1832 se formó",
    "hijo": "le dice a su hijo aún no nacido como",
    "masculino": "el mito es esencialmente masculino y entre las causas",
}

example_diagnose = {
    "ario": "establecer que el pueblo ario vivió en inmemoriales tiempos",
    "educación": "sentido de vida religión educación y cultura para cada mujer",
    "pagado": "un rescate muy grande pagado por sus seguidores a",
    "cocinar": "empezó a cocinar una sopa usando",
    "lavar": "era directamente usado para lavar ropa por eso la",
    "deporte": "se convirtió en el deporte más popular del país",
    "ropa": "usan el kimono una ropa tradicional japonesa",
    "pelea": "mal por la violenta pelea entre ambos hermanos",
    "enfermero": "en enfermería el diagnóstico enfermero o diagnóstico de enfermería es",
    "ganar": "una necesidad un modo de ganar",
    "líder": "del estado en manos del líder opositor henrique capriles para el",
    "coser": "realizar tareas domésticas básicas como coser y poner la mesa",
    "cuidar": "de la fpf encargada de cuidar los intereses de los clubes",
    "cirujano": "afrancesado ocupando el puesto de cirujano militar en el ejército josefino",
    "rey": "la princesa jeongsung esposa del rey danjong que ascendió al trono",
    "reina": "año ganó el título de reina de la bahía en el"
}


fem_words = ','.join([word for word, context in example_fem.items()])
fem_contexts = ','.join([context for word, context in example_fem.items()])
masc_words = ','.join([word for word, context in example_masc.items()])
masc_contexts = ','.join([context for word, context in example_masc.items()])
young_words = ','.join([word for word, context in example_joven.items()])
old_words = ','.join([word for word, context in example_viejo.items()])
diagnose_words = ','.join([word for word, context in example_diagnose.items()])
diagnose_contexts = ','.join([context for word, context in example_diagnose.items()])

positive_money_words = 'ahorrar,economizar,administrar,manejar,negocio,beneficios'
negative_money_words = 'malgastar,derrochar'
diagnose_money = 'alemán,australiano,argentino,millonario,rico,pobre'

lazy_words = 'vago, perezoso, gandul'
active_words = 'trabajar, esfuerzo, trabajador'

examples1_explorar_sesgo_en_palabras = [
    [fem_words, masc_words, diagnose_words],
    [old_words, young_words, diagnose_words],
    [positive_money_words, negative_money_words, diagnose_money],
    [lazy_words, active_words, diagnose_money]
]

examples2_explorar_sesgo_en_palabras = [
    [fem_words, masc_words, young_words, old_words, diagnose_words],
    [lazy_words, active_words, positive_money_words, negative_money_words, diagnose_money],
]


examples_explorar_relaciones_entre_palabras = [
    [diagnose_words, fem_words, masc_words, young_words, old_words],
    [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
]


# Examples Sesgos en frases
examples_sesgos_frases = [
    ["los * manejan bien la plata", "argentinos,pobres,millonarios,australianos"],
    ["los cordobeses bailan *", ""],
    ["en la década de los ochenta, el * fue lo más escuchada en Argentina", "rock,tango,jazz,blues"],
    ["", ""]
]

examples_datos = ["ronronear", "ella", "el", "ellos"]
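
The word lists above are flattened into single comma-joined strings because the Gradio textboxes that consume them take one string per list. A minimal sketch of the round trip, assuming the split-and-strip behavior of `BiasExplorer.parse_words` in `modules_sesgo_en_palabras.py` (the standalone `parse_words` below is illustrative, not part of the commit):

```python
# Illustrative sketch: how a comma-joined example string is parsed back
# into a word list; mirrors BiasExplorer.parse_words.
def parse_words(string):
    # Split on commas, strip whitespace, and drop empty entries.
    return [w.strip() for w in string.split(',') if w.strip()]

fem_words = 'mujer,chica,ella,madre,hija,femenino'
assert parse_words(fem_words) == ['mujer', 'chica', 'ella', 'madre', 'hija', 'femenino']
```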
explorar_relaciones_entre_palabras.py
ADDED
@@ -0,0 +1,105 @@
import matplotlib as mpl
mpl.use('Agg')

from audioop import bias  # unused import
import gradio as gr

from modules_sesgo_en_palabras import WEBiasExplorer2d, Embedding
from examples import examples_explorar_relaciones_entre_palabras
import matplotlib.pyplot as plt

from tool_info import TOOL_INFO

plt.rcParams.update({'font.size': 14})

LABEL_WORD_LIST_1 = 'Lista de palabras 1'
LABEL_WORD_LIST_2 = 'Lista de palabras 2'
LABEL_WORD_LIST_3 = 'Lista de palabras 3'
LABEL_WORD_LIST_4 = 'Lista de palabras 4'
LABEL_WORD_LIST_DIAGNOSE = 'Lista de palabras a diagnosticar'

word_vectors_path = 'fasttext-sbwc.100k.vec'
we = Embedding(word_vectors_path)
we.load_we_as_keyed_vectors(word_vectors_path)


we_bias_2d = WEBiasExplorer2d(we.wv)

explorar_relaciones_entre_palabras_interface = gr.Blocks()
with explorar_relaciones_entre_palabras_interface:
    gr.Markdown("Escribi algunas palabras para visualizar sus palabras relacionadas")
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row(equal_height=True):
                with gr.Column(scale=5):
                    diagnose_list = gr.Textbox(lines=2, label=LABEL_WORD_LIST_DIAGNOSE)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist = gr.ColorPicker(label="", value='#000000',)
            with gr.Row():
                with gr.Column(scale=5):
                    wordlist_1 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_1)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist_1 = gr.ColorPicker(label="", value='#1f78b4')
            with gr.Row():
                with gr.Column(scale=5):
                    wordlist_2 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_2)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist_2 = gr.ColorPicker(label="", value='#33a02c')
            with gr.Row():
                with gr.Column(scale=5):
                    wordlist_3 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_3)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist_3 = gr.ColorPicker(label="", value='#e31a1c')
            with gr.Row():
                with gr.Column(scale=5):
                    wordlist_4 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_4)
                with gr.Column(scale=1, min_width=10):
                    color_wordlist_4 = gr.ColorPicker(label="", value='#6a3d9a')
        with gr.Column(scale=4):
            with gr.Row():
                with gr.Row():
                    plot_neighbors = gr.Checkbox(label='Graficar palabras relacionadas')
                with gr.Row():
                    alpha = gr.Slider(minimum=0.1, maximum=0.9, value=0.3, step=0.1, label='Transparencia')
                with gr.Row():
                    fontsize = gr.Number(value=18, label='Tamaño de fuente')
            with gr.Row():
                btn_plot = gr.Button('¡Graficar en el espacio!')
            with gr.Row():
                err_msg = gr.Markdown(label="", visible=True)
            with gr.Row():
                word_proyections = gr.Image(shape=(10, 10))

    with gr.Row():
        examples = gr.Examples(
            fn=we_bias_2d.plot_projections_2d,
            inputs=[diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4],
            outputs=[word_proyections, err_msg],
            examples=examples_explorar_relaciones_entre_palabras
        )

    with gr.Row():
        gr.Markdown(TOOL_INFO)
    btn_plot.click(
        fn=we_bias_2d.plot_projections_2d,
        inputs=[
            diagnose_list,
            wordlist_1,
            wordlist_2,
            wordlist_3,
            wordlist_4,
            color_wordlist,
            color_wordlist_1,
            color_wordlist_2,
            color_wordlist_3,
            color_wordlist_4,
            plot_neighbors,
            alpha,
            fontsize
        ],
        outputs=[word_proyections, err_msg]
    )


explorar_relaciones_entre_palabras_interface.queue(concurrency_count=10)
explorar_relaciones_entre_palabras_interface.launch()
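
For readers unfamiliar with Gradio Blocks, the `btn_plot.click(...)` wiring above passes the current values of the listed input components positionally to `we_bias_2d.plot_projections_2d` and routes its `(image, error_message)` return pair into the two output components. A stripped-down sketch of the same pattern (hypothetical function and components, not from this commit):

```python
import gradio as gr

def run(text):
    # Return one value per declared output component: (result, error message).
    return text.upper(), ''

with gr.Blocks() as demo:
    box = gr.Textbox(label='input')
    out = gr.Textbox(label='output')
    err = gr.Markdown()
    gr.Button('Run').click(fn=run, inputs=[box], outputs=[out, err])

# demo.launch()
```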
modules_sesgo_en_palabras.py
ADDED
@@ -0,0 +1,714 @@
import copy

from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from gensim.models import KeyedVectors
from utils_sesgo_en_palabras import (
    cosine_similarity,
    normalize,
    project_params,
    take_two_sides_extreme_sorted
)


DIRECTION_METHODS = ['single', 'sum', 'pca']
DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
FIRST_PC_THRESHOLD = 0.5
MAX_NON_SPECIFIC_EXAMPLES = 1000

__all__ = ['GenderBiasWE', 'BiasWordEmbedding']


class Loader():
    def __init__(self):
        self.path_to_data = ''

    def load_tokenizer(self, tokenizer_path):
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path, do_lower_case=True, )
        return tokenizer

    def load_data_from_file(self, data):
        return data

    def load_corpus_from_file(self, data):
        return data

    def load_language_model(self, model_path):
        model = AutoModelForMaskedLM.from_pretrained(
            model_path, output_hidden_states=True)
        return model


class Corpus():
    def __init__(self, corpus) -> None:
        self.vocabulary = self.load_vocabulary_from_corpus()
        self.corpus = corpus

    def get_context_from_text(self, word):
        pass

    def get_frequency(self, word):
        pass

    def get_most_frequent_coocurrence(self, word):
        pass


class Embedding():
    def __init__(self, word_vectors_path) -> None:
        self.wv = self.load_we_as_keyed_vectors(word_vectors_path)

    def load_we_as_keyed_vectors(self, word_vectors_path):
        we = KeyedVectors.load_word2vec_format(word_vectors_path)
        we.init_sims(replace=True)
        return we

    def get_word_vector(self, word, context=None):
        return word


class BiasExplorer():
    def __init__(self, model, only_lower=False, verbose=False,
                 identify_direction=False, to_normalize=True):
        # pylint: disable=undefined-variable

        # TODO: this is bad Python, ask someone about it
        # probably should be a better design
        # identify_direction doesn't have any meaning
        # for the class BiasWordEmbedding
        # The goal is to force this interface on sub-classes.
        if self.__class__ == __class__ and identify_direction is not False:
            raise ValueError('identify_direction must be False'
                             ' for an instance of {}'
                             .format(__class__))

        self.model = model

        # TODO: write a unit test for when it is False
        self.only_lower = only_lower

        self._verbose = verbose

        self.direction = None
        self.positive_end = None
        self.negative_end = None

        if to_normalize:
            self.model.init_sims(replace=True)

    def __copy__(self):
        bias_word_embedding = self.__class__(self.model,
                                             self.only_lower,
                                             self._verbose,
                                             identify_direction=False)
        bias_word_embedding.direction = copy.deepcopy(self.direction)
        bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
        bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
        return bias_word_embedding

    def __deepcopy__(self, memo):
        bias_word_embedding = copy.copy(self)
        bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
        return bias_word_embedding

    def __getitem__(self, key):
        return self.model[key]

    def __contains__(self, item):
        return item in self.model

    def _is_direction_identified(self):
        if self.direction is None:
            raise RuntimeError('The direction was not identified'
                               ' for this {} instance'
                               .format(self.__class__.__name__))

    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
        matrix = []

        for word1, word2 in definitional_pairs:
            vector1 = normalize(self[word1])
            vector2 = normalize(self[word2])

            center = (vector1 + vector2) / 2

            matrix.append(vector1 - center)
            matrix.append(vector2 - center)

        pca = PCA(n_components=n_components)
        pca.fit(matrix)

        if self._verbose:
            table = enumerate(pca.explained_variance_ratio_, start=1)
            headers = ['Principal Component',
                       'Explained Variance Ratio']

        return pca

    def __errorChecking(self, word):
        out_msj = ""

        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        else:
            if not word in self.model:
                out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"

        if out_msj:
            out_msj = "<center><h3>" + out_msj + "</h3></center>"

        return out_msj

    # TODO: add the SVD method from section 6 step 1
    # It seems there is a mistake there, I think it is the same as PCA
    # just with replacing it with SVD
    def _identify_direction(self, positive_end, negative_end,
                            definitional, method='pca'):
        if method not in DIRECTION_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DIRECTION_METHODS, method))

        if positive_end == negative_end:
            raise ValueError('positive_end and negative_end'
                             ' should be different, and not the same "{}"'
                             .format(positive_end))
        if self._verbose:
            print('Identify direction using {} method...'.format(method))

        direction = None

        if method == 'single':
            if self._verbose:
                print('Positive definitional end:', definitional[0])
                print('Negative definitional end:', definitional[1])
            direction = normalize(normalize(self[definitional[0]])
                                  - normalize(self[definitional[1]]))

        elif method == 'sum':
            group1_sum_vector = np.sum([self[word]
                                        for word in definitional[0]], axis=0)
            group2_sum_vector = np.sum([self[word]
                                        for word in definitional[1]], axis=0)

            diff_vector = (normalize(group1_sum_vector)
                           - normalize(group2_sum_vector))

            direction = normalize(diff_vector)

        elif method == 'pca':
            pca = self._identify_subspace_by_pca(definitional, 10)
            if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
                raise RuntimeError('The explained variance'
                                   ' of the first principal component should be'
                                   ' at least {}, but it is {}'
                                   .format(FIRST_PC_THRESHOLD,
                                           pca.explained_variance_ratio_[0]))
            direction = pca.components_[0]

            # if direction is opposite (e.g. we cannot control
            # what the PCA will return)
            ends_diff_projection = cosine_similarity((self[positive_end]
                                                      - self[negative_end]),
                                                     direction)
            if ends_diff_projection < 0:
                direction = -direction  # pylint: disable=invalid-unary-operand-type

        self.direction = direction
        self.positive_end = positive_end
        self.negative_end = negative_end

    def project_on_direction(self, word):
        """Project the normalized vector of the word on the direction.
        :param str word: The word to project
        :return float: The projection scalar
        """

        self._is_direction_identified()

        vector = self[word]
        projection_score = self.model.cosine_similarities(self.direction,
                                                          [vector])[0]
        return projection_score

    def _calc_projection_scores(self, words):
        self._is_direction_identified()

        df = pd.DataFrame({'word': words})

        # TODO: maybe using cosine_similarities on all the vectors?
        # it might be faster
        df['projection'] = df['word'].apply(self.project_on_direction)
        df = df.sort_values('projection', ascending=False)

        return df

    def calc_projection_data(self, words):
        """
        Calculate projection, projected and rejected vectors of a words list.
        :param list words: List of words
        :return: :class:`pandas.DataFrame` of the projection,
                 projected and rejected vectors of the words list
        """
        projection_data = []
        for word in words:
            vector = self[word]
            projection = self.project_on_direction(word)
            normalized_vector = normalize(vector)

            (projection,
             projected_vector,
             rejected_vector) = project_params(normalized_vector,
                                               self.direction)

            projection_data.append({'word': word,
                                    'vector': vector,
                                    'projection': projection,
                                    'projected_vector': projected_vector,
                                    'rejected_vector': rejected_vector})

        return pd.DataFrame(projection_data)

    def plot_dist_projections_on_direction(self, word_groups, ax=None):
        """Plot the distribution of projection scalars on the direction.
        :param dict word_groups: The word groups to project
        :return float: The ax object of the plot
        """

        if ax is None:
            _, ax = plt.subplots(1)

        names = sorted(word_groups.keys())

        for name in names:
            words = word_groups[name]
            label = '{} (#{})'.format(name, len(words))
            vectors = [self[word] for word in words]
            projections = self.model.cosine_similarities(self.direction,
                                                         vectors)
            sns.distplot(projections, hist=False, label=label, ax=ax)

        plt.axvline(0, color='k', linestyle='--')

        plt.title('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))
        plt.xlabel('Direction Projection')
        plt.ylabel('Density')
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        return ax

    # NOTE: duplicate of __errorChecking above; this second, identical
    # definition overrides the first one.
    def __errorChecking(self, word):
        out_msj = ""

        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        else:
            if not word in self.model:
                out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"

        if out_msj:
            out_msj = "<center><h3>" + out_msj + "</h3></center>"

        return out_msj

    def parse_words(self, string):
        words = string.strip()
        if words:
            words = [word.strip() for word in words.split(',') if word != ""]
        return words

    def check_oov(self, wordlists):
        for wordlist in wordlists:
            parsed_words = self.parse_words(wordlist)
            for word in parsed_words:
                msg = self.__errorChecking(word)
                if msg:
                    return msg
        return None

    def plot_projections_2d(self,
                            wordlist,
                            wordlist_1,
                            wordlist_2,
                            wordlist_3,
                            wordlist_4,
                            color_wordlist,
                            color_wordlist_1,
                            color_wordlist_2,
                            color_wordlist_3,
                            color_wordlist_4,
                            plot_neighbors,
                            n_alpha,
                            fontsize,
                            figsize=(15, 15),
                            method='pca'
                            ):
        # convert the word lists to vectors
        choices = [0, 1, 2, 3, 4]
        word_list = []
        wordlist_choice = [wordlist, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
        err = self.check_oov(wordlist_choice)
        if err:
            return None, err
        words_colors = {}
        label_dict = {
            0: 'Diagnostico',
            1: 'Lista de palabras 1',
            2: 'Lista de palabras 2',
            3: 'Lista de palabras 3',
            4: 'Lista de palabras 4'
        }
        color_dict = {
            0: color_wordlist,
            1: color_wordlist_1,
            2: color_wordlist_2,
            3: color_wordlist_3,
            4: color_wordlist_4
        }
        word_bias_space = {}
        alpha = {}

        for raw_word_list, color in zip(wordlist_choice, choices):
            parsed_words = self.parse_words(raw_word_list)
            if parsed_words:
                for word in parsed_words:
                    word_bias_space[word] = color
                    words_colors[word] = color_dict[color]
                    alpha[word] = 1
                    if plot_neighbors:
                        neighbors = [w for w, s in self.model.most_similar(word, topn=5)]
                        for n in neighbors:
                            if n not in alpha:
                                word_bias_space[n] = color
                                words_colors[n] = color_dict[color]
                                alpha[n] = n_alpha
                        word_list += neighbors
                word_list += parsed_words
        if not word_list:
            return None, "<center><h3>" + "Ingresa al menos 2 palabras para continuar" + "</h3></center>"
        embeddings = [self.model[word] for word in word_list]
        words_embedded = PCA(
            n_components=2, random_state=1).fit_transform(embeddings)
        data = pd.DataFrame(words_embedded)
        data['word'] = word_list
        data['color'] = [words_colors[word] for word in word_list]
        data['alpha'] = [alpha[word] for word in word_list]
        data['word_bias_space'] = [word_bias_space[word] for word in word_list]
        fig, ax = plt.subplots(figsize=figsize)

        sns.scatterplot(
            data=data[data['alpha'] == 1],
            x=0,
            y=1,
            style='word_bias_space',
            hue='word_bias_space',
            ax=ax,
            palette=color_dict
        )
        if plot_neighbors:
            sns.scatterplot(
                data=data[data['alpha'] != 1],
                x=0,
                y=1,
                style='color',
                hue='word_bias_space',
                ax=ax,
                alpha=n_alpha,
                legend=False,
                palette=color_dict
            )
        for i, label in enumerate(word_list):
            x, y = words_embedded[i, :]
            ax.annotate(label, xy=(x, y), xytext=(5, 2), color=words_colors[label],
                        textcoords='offset points',
                        ha='right', va='bottom', size=fontsize, alpha=alpha[label])

        ax.set_xticks([])
        ax.set_yticks([])

        fig.tight_layout()
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im, ''


class WEBiasExplorer2d(BiasExplorer):
    def __init__(self, word_embedding) -> None:
        super().__init__(word_embedding)

    def calculate_bias(
            self,
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_para_situar
    ):

        wordlists = [
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_para_situar
        ]
        err = self.check_oov(wordlists)
        for wordlist in wordlists:
            if not wordlist:
                err = "<center><h3>" + 'Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2' + "</h3></center>"
        if err:
            return None, err


        err = self.check_oov([palabras_extremo_1, palabras_extremo_2, palabras_para_situar])
        if err:
            return None, err
        palabras_extremo_1 = self.parse_words(palabras_extremo_1)
        palabras_extremo_2 = self.parse_words(palabras_extremo_2)
        palabras_para_situar = self.parse_words(palabras_para_situar)
        im = self.get_bias_plot(
            palabras_para_situar,
            definitional=(
                palabras_extremo_1, palabras_extremo_2),
            method='sum',
            n_extreme=10
        )
        return im, ''

    def get_bias_plot(self,
                      palabras_para_situar,
                      definitional,
                      method='sum',
                      n_extreme=10,
                      figsize=(10, 10)
                      ):

        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(
            definitional,
            palabras_para_situar, n_extreme, ax=ax,)

        fig.tight_layout()
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im

    def plot_projection_scores(self, definitional,
                               words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalar of words on the direction.
        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """
        nombre_del_extremo_1 = ', '.join(definitional[0])
        nombre_del_extremo_2 = ', '.join(definitional[1])

        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
                                 definitional=definitional,
                                 method='sum')

        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection'] = projections_df['projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))

        most_extream_projection = np.round(
            projections_df['projection']
            .abs()
            .max(),
            decimals=1)

        sns.barplot(x='projection', y='word', data=projections_df,
                    palette=projections_df['color'])

        plt.xticks(np.arange(-most_extream_projection,
                             most_extream_projection + axis_projection_step,
                             axis_projection_step))
        xlabel = ('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))

        plt.xlabel(xlabel)
        plt.ylabel('Words')

        return ax


class WEBiasExplorer4d(BiasExplorer):
    def __init__(self, word_embedding) -> None:
        super().__init__(word_embedding)

    def calculate_bias(
            self,
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_extremo_3,
            palabras_extremo_4,
            palabras_para_situar
    ):
        wordlists = [
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_extremo_3,
            palabras_extremo_4,
            palabras_para_situar
        ]
        err = self.check_oov(wordlists)
        for wordlist in wordlists:
            if not wordlist:
                err = "<center><h3>" + '¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!' + "</h3></center>"
        if err:
            return None, err

        palabras_extremo_1 = self.parse_words(palabras_extremo_1)
        palabras_extremo_2 = self.parse_words(palabras_extremo_2)
        palabras_extremo_3 = self.parse_words(palabras_extremo_3)
        palabras_extremo_4 = self.parse_words(palabras_extremo_4)

        palabras_para_situar = self.parse_words(palabras_para_situar)

        im = self.get_bias_plot(
            palabras_para_situar,
            definitional_1=(
                palabras_extremo_1, palabras_extremo_2),
            definitional_2=(
                palabras_extremo_3, palabras_extremo_4),
            method='sum',
            n_extreme=10
        )
        return im, ''

    def get_bias_plot(self,
                      palabras_para_situar,
                      definitional_1,
                      definitional_2,
                      method='sum',
                      n_extreme=10,
                      figsize=(10, 10)
                      ):

        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(
            definitional_1,
            definitional_2,
            palabras_para_situar, n_extreme, ax=ax,)
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im

    def plot_projection_scores(self, definitional_1, definitional_2,
                               words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalar of words on two directions.
        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """

        nombre_del_extremo_1 = ', '.join(definitional_1[1])
        nombre_del_extremo_2 = ', '.join(definitional_1[0])

        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
                                 definitional=definitional_1,
                                 method='sum')

        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection_x'] = projections_df['projection'].round(2)

        nombre_del_extremo_3 = ', '.join(definitional_2[1])
        nombre_del_extremo_4 = ', '.join(definitional_2[0])
        self._identify_direction(nombre_del_extremo_3, nombre_del_extremo_4,
                                 definitional=definitional_2,
                                 method='sum')

        self._is_direction_identified()

        projections_df['projection_y'] = self._calc_projection_scores(words)[
            'projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))
        most_extream_projection = np.round(
            projections_df['projection']
            .abs()
            .max(),
            decimals=1)
        sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
                        palette=projections_df['color'])

        plt.xticks(np.arange(-most_extream_projection,
                             most_extream_projection + axis_projection_step,
                             axis_projection_step))
        for _, row in (projections_df.iterrows()):
            ax.annotate(
                row['word'], (row['projection_x'], row['projection_y']))
        x_label = '← {} {} {} →'.format(nombre_del_extremo_1,
                                        ' ' * 20,
                                        nombre_del_extremo_2)

        y_label = '← {} {} {} →'.format(nombre_del_extremo_3,
                                        ' ' * 20,
                                        nombre_del_extremo_4)

        plt.xlabel(x_label)
        ax.xaxis.set_label_position('bottom')
        ax.xaxis.set_label_coords(.5, 0)

        plt.ylabel(y_label)
        ax.yaxis.set_label_position('left')
        ax.yaxis.set_label_coords(0, .5)

        ax.spines['left'].set_position('center')
        ax.spines['bottom'].set_position('center')

        ax.set_xticks([])
        ax.set_yticks([])
        # plt.yticks([], [])
        # ax.spines['left'].set_position('zero')
        # ax.spines['bottom'].set_position('zero')

        return ax
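
The core of `_identify_direction(method='sum')` and `project_on_direction` above is plain vector arithmetic: take the normalized difference of the two groups' summed vectors, then score each diagnostic word by its cosine similarity with that direction. A self-contained numpy sketch with toy 3-dimensional vectors (toy data, not the fastText embedding the Space actually loads):

```python
import numpy as np

def normalize(v):
    n = np.linalg.norm(v)
    return v if n == 0 else v / n

# Toy "embeddings" standing in for the fastText vectors.
emb = {
    'mujer':   np.array([0.9, 0.1, 0.0]),
    'hombre':  np.array([0.1, 0.9, 0.0]),
    'cocinar': np.array([0.7, 0.2, 0.1]),
    'deporte': np.array([0.2, 0.8, 0.0]),
}

# method='sum': normalized difference of the normalized group sums.
group1 = normalize(sum(emb[w] for w in ['mujer']))
group2 = normalize(sum(emb[w] for w in ['hombre']))
direction = normalize(group1 - group2)

# project_on_direction: cosine similarity of each word with the direction.
for word in ['cocinar', 'deporte']:
    score = normalize(emb[word]) @ direction
    print(word, round(float(score), 2))  # >0 leans 'mujer', <0 leans 'hombre'
```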
requirements.txt
ADDED
@@ -0,0 +1,13 @@
# sesgos_en_frases
regex
# datos
torch
transformers
# resto
sklearn
gensim==3.7.3
transformers
tensorflow
matplotlib
numpy
seaborn
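
The `gensim==3.7.3` pin is load-bearing: the modules above use the pre-4.0 gensim API (`init_sims`, `vectors_norm`, `KeyedVectors.vocab`, `gensim.models.base_any2vec`), which gensim 4.x removed or renamed. A fail-fast guard one could add at import time (a sketch, not part of the commit):

```python
import gensim

# This Space relies on the gensim 3.x API (init_sims, vectors_norm,
# KeyedVectors.vocab); fail fast on gensim >= 4.
major = int(gensim.__version__.split('.')[0])
if major >= 4:
    raise ImportError(
        'gensim {} detected; this code requires gensim==3.7.3 '
        '(see requirements.txt)'.format(gensim.__version__))
```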
tool_info.py
ADDED
@@ -0,0 +1,23 @@
TOOL_INFO = """
> ### A tool to overcome technical barriers for bias assessment in human language technologies

* [Read Full Paper](https://arxiv.org/abs/2207.06591)

> ### Licensing Information
* [MIT Licence](https://huggingface.co/spaces/vialibre/new_bias_tools/resolve/main/LICENSE)

> ### Citation Information
```c
@misc{https://doi.org/10.48550/arxiv.2207.06591,
  doi = {10.48550/ARXIV.2207.06591},
  url = {https://arxiv.org/abs/2207.06591},
  author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
  keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {A tool to overcome technical barriers for bias assessment in human language technologies},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
}
```
"""
utils_sesgo_en_palabras.py
ADDED
@@ -0,0 +1,272 @@
import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
import gradio as gr



WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long


def assert_gensim_keyed_vectors(model):
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        type_names = (model_type.__name__
                      for model_type in WORD_EMBEDDING_MODEL_TYPES)
        raise TypeError('model should be one of the types'
                        ' ({}), not {}.'
                        .format(', '.join(type_names),
                                type(model)))

def generate_words_forms(words):
    return sum([generate_one_word_forms(word) for word in words], [])


def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity


def generate_one_word_forms(word):
    return [word.lower(), word.upper(), word.title()]


def get_seed_vector(seed, bias_word_embedding):

    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end

        else:
            positive_end, negative_end = seed

        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end


def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.model.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether to restrict the most
                              similar words to be not from
                              the positive or negative word list.
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]


def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm


def project_params(u, v):
    """Project and reject the vector v onto direction u, with the scalar."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector


def project_reject_vector(v, u):
    """Project and reject the vector v onto direction u."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector


def round_to_extreme(value, digits=2):
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value


def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))


def project_vector(v, u):
    """Project the vector v onto direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u


def reject_vector(v, u):
    """Reject the vector v from direction u."""
    return v - project_vector(v, u)


def update_word_vector(model, word, new_vector):
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)


# NOTE: duplicate of project_params above; this second, identical
# definition overrides the first one.
def project_params(u, v):
    """Project and reject the vector v onto direction u, with the scalar."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector


def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):

    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = (y_cluster == y_value)
        label = 'Positive' if y_value else 'Negative'
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label=label)

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    return max(acc, 1 - acc)
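
`project_params` decomposes a vector into its component along a direction plus an orthogonal residual, and the two pieces always sum back to the original vector. A quick self-contained check of that invariant (toy vectors; `normalize` and `project_params` are restated here so the snippet runs on its own):

```python
import numpy as np

def normalize(v):
    n = np.linalg.norm(v)
    return v if n == 0 else v / n

def project_params(u, v):
    # Scalar projection of v on u, the projected vector, and the rejection.
    u_hat = normalize(u)
    projection = v @ u_hat
    projected_vector = projection * u_hat
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector

u = np.array([1.0, 0.0, 0.0])
v = np.array([3.0, 4.0, 0.0])
p, proj, rej = project_params(u, v)
assert p == 3.0                     # scalar component along u
assert np.allclose(proj + rej, v)   # decomposition is exact
assert np.isclose(proj @ rej, 0.0)  # the two pieces are orthogonal
```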