Spaces:
Configuration error
Configuration error
Typing. Added __init_ann_method and __init_sklearn_method in embedding class. Upgrade getNearestNeighbors method. Fix bug in get method from ann class. Etc
Browse files- .gitignore +1 -1
- app.py +10 -8
- interfaces/interface_BiasWordExplorer.py +96 -40
- interfaces/interface_WordExplorer.py +105 -32
- language/.gitignore +1 -0
- modules/model_embbeding.py +112 -51
- modules/module_BiasExplorer.py +125 -57
- modules/module_WordExplorer.py +128 -56
- modules/module_ann.py +53 -23
- modules/module_connection.py +116 -66
- modules/module_logsManager.py +7 -5
- tool_info.py +1 -1
.gitignore
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
__pycache__/
|
| 2 |
-
bias_tool_logs/
|
| 3 |
*.env
|
|
|
|
|
|
| 1 |
__pycache__/
|
|
|
|
| 2 |
*.env
|
| 3 |
+
logs_edia_we_spanish/
|
app.py
CHANGED
|
@@ -4,30 +4,32 @@ import pandas as pd
|
|
| 4 |
|
| 5 |
|
| 6 |
# --- Imports modules ---
|
| 7 |
-
from modules.model_embbeding import Embedding
|
| 8 |
|
| 9 |
|
| 10 |
# --- Imports interfaces ---
|
| 11 |
-
from interfaces.interface_WordExplorer import interface as wordExplorer_interface
|
| 12 |
from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
|
| 13 |
|
| 14 |
|
| 15 |
# --- Tool config ---
|
| 16 |
-
AVAILABLE_LOGS = True # [True | False]
|
| 17 |
-
LANGUAGE = "spanish" # [spanish | english]
|
| 18 |
EMBEDDINGS_PATH = "data/fasttext-sbwc.100k.vec"
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
# --- Init classes ---
|
| 23 |
embedding = Embedding(
|
| 24 |
path=EMBEDDINGS_PATH,
|
| 25 |
-
binary=EMBEDDINGS_PATH.endswith('.bin'),
|
| 26 |
limit=None,
|
| 27 |
randomizedPCA=False,
|
| 28 |
-
max_neighbors=MAX_NEIGHBORS
|
|
|
|
| 29 |
)
|
| 30 |
|
|
|
|
| 31 |
# --- Init Vars ---
|
| 32 |
labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
|
| 33 |
|
|
@@ -41,7 +43,7 @@ INTERFACE_LIST = [
|
|
| 41 |
wordExplorer_interface(
|
| 42 |
embedding=embedding,
|
| 43 |
available_logs=AVAILABLE_LOGS,
|
| 44 |
-
max_neighbors=MAX_NEIGHBORS,
|
| 45 |
lang=LANGUAGE),
|
| 46 |
]
|
| 47 |
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
# --- Imports modules ---
|
| 7 |
+
from modules.model_embbeding import Embedding
|
| 8 |
|
| 9 |
|
| 10 |
# --- Imports interfaces ---
|
| 11 |
+
from interfaces.interface_WordExplorer import interface as wordExplorer_interface
|
| 12 |
from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
|
| 13 |
|
| 14 |
|
| 15 |
# --- Tool config ---
|
|
|
|
|
|
|
| 16 |
EMBEDDINGS_PATH = "data/fasttext-sbwc.100k.vec"
|
| 17 |
+
LANGUAGE = "spanish" # [spanish | english]
|
| 18 |
+
MAX_NEIGHBORS = 20
|
| 19 |
+
NN_METHOD = 'sklearn' # ['sklearn' | 'ann']
|
| 20 |
+
AVAILABLE_LOGS = True # [True | False]
|
| 21 |
|
| 22 |
|
| 23 |
# --- Init classes ---
|
| 24 |
embedding = Embedding(
|
| 25 |
path=EMBEDDINGS_PATH,
|
|
|
|
| 26 |
limit=None,
|
| 27 |
randomizedPCA=False,
|
| 28 |
+
max_neighbors=MAX_NEIGHBORS,
|
| 29 |
+
nn_method=NN_METHOD
|
| 30 |
)
|
| 31 |
|
| 32 |
+
|
| 33 |
# --- Init Vars ---
|
| 34 |
labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
|
| 35 |
|
|
|
|
| 43 |
wordExplorer_interface(
|
| 44 |
embedding=embedding,
|
| 45 |
available_logs=AVAILABLE_LOGS,
|
| 46 |
+
max_neighbors=MAX_NEIGHBORS,
|
| 47 |
lang=LANGUAGE),
|
| 48 |
]
|
| 49 |
|
interfaces/interface_BiasWordExplorer.py
CHANGED
|
@@ -1,48 +1,96 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
-
from tkinter import image_names
|
| 4 |
|
| 5 |
-
from tool_info import TOOL_INFO
|
| 6 |
from modules.module_logsManager import HuggingFaceDatasetSaver
|
| 7 |
from modules.module_connection import BiasWordExplorerConnector
|
| 8 |
from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# --- Interface ---
|
| 11 |
-
def interface(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
# --- Init logs ---
|
| 13 |
log_callback = HuggingFaceDatasetSaver(
|
| 14 |
-
available_logs=available_logs
|
|
|
|
| 15 |
)
|
|
|
|
| 16 |
# --- Init vars ---
|
| 17 |
-
connector = BiasWordExplorerConnector(
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
|
|
|
| 20 |
interface = gr.Blocks()
|
|
|
|
| 21 |
with interface:
|
| 22 |
-
gr.Markdown(
|
|
|
|
|
|
|
| 23 |
with gr.Row():
|
| 24 |
with gr.Column():
|
| 25 |
with gr.Row():
|
| 26 |
-
diagnose_list = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
| 27 |
with gr.Row():
|
| 28 |
-
gr.Markdown(
|
|
|
|
|
|
|
| 29 |
with gr.Row():
|
| 30 |
-
wordlist_1 = gr.Textbox(
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
with gr.Row():
|
| 33 |
-
gr.Markdown(
|
|
|
|
|
|
|
| 34 |
with gr.Row():
|
| 35 |
-
wordlist_3 = gr.Textbox(
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
with gr.Column():
|
| 38 |
with gr.Row():
|
| 39 |
-
bias2d = gr.Button(
|
|
|
|
|
|
|
| 40 |
with gr.Row():
|
| 41 |
-
bias4d = gr.Button(
|
|
|
|
|
|
|
| 42 |
with gr.Row():
|
| 43 |
-
err_msg = gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 44 |
with gr.Row():
|
| 45 |
-
bias_plot = gr.Plot(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
with gr.Row():
|
| 47 |
examples = gr.Examples(
|
| 48 |
fn=connector.calculate_bias_2d,
|
|
@@ -54,51 +102,59 @@ def interface(embedding, available_logs, lang="spanish"):
|
|
| 54 |
with gr.Row():
|
| 55 |
examples = gr.Examples(
|
| 56 |
fn=connector.calculate_bias_4d,
|
| 57 |
-
inputs=[wordlist_1, wordlist_2,
|
| 58 |
-
|
| 59 |
-
|
|
|
|
| 60 |
examples=examples2_explorar_sesgo_en_palabras,
|
| 61 |
label=labels["examples4Spaces"]
|
| 62 |
)
|
| 63 |
|
| 64 |
with gr.Row():
|
| 65 |
-
gr.Markdown(
|
|
|
|
|
|
|
| 66 |
|
| 67 |
bias2d.click(
|
| 68 |
-
fn=connector.calculate_bias_2d,
|
| 69 |
-
inputs=[wordlist_1,wordlist_2,diagnose_list],
|
| 70 |
-
outputs=[bias_plot,err_msg]
|
| 71 |
)
|
| 72 |
-
|
| 73 |
bias4d.click(
|
| 74 |
fn=connector.calculate_bias_4d,
|
| 75 |
-
inputs=[wordlist_1,wordlist_2,
|
| 76 |
-
|
|
|
|
| 77 |
)
|
| 78 |
|
| 79 |
# --- Logs ---
|
| 80 |
-
save_field = [wordlist_1,wordlist_2,wordlist_3,wordlist_4,diagnose_list]
|
| 81 |
-
log_callback.setup(
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
bias2d.click(
|
| 84 |
fn=lambda *args: log_callback.flag(
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
),
|
| 89 |
inputs=save_field,
|
| 90 |
-
outputs=None,
|
| 91 |
preprocess=False
|
| 92 |
)
|
| 93 |
-
|
| 94 |
bias4d.click(
|
| 95 |
fn=lambda *args: log_callback.flag(
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
),
|
| 100 |
inputs=save_field,
|
| 101 |
-
outputs=None,
|
| 102 |
preprocess=False
|
| 103 |
)
|
| 104 |
-
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
|
|
|
|
| 4 |
from modules.module_logsManager import HuggingFaceDatasetSaver
|
| 5 |
from modules.module_connection import BiasWordExplorerConnector
|
| 6 |
from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
|
| 7 |
+
from tool_info import TOOL_INFO
|
| 8 |
+
|
| 9 |
|
| 10 |
# --- Interface ---
|
| 11 |
+
def interface(
|
| 12 |
+
embedding, # Class Embedding instance
|
| 13 |
+
available_logs: bool,
|
| 14 |
+
lang: str="spanish"
|
| 15 |
+
) -> gr.Blocks:
|
| 16 |
+
|
| 17 |
# --- Init logs ---
|
| 18 |
log_callback = HuggingFaceDatasetSaver(
|
| 19 |
+
available_logs=available_logs,
|
| 20 |
+
dataset_name=f"logs_edia_we_{lang}"
|
| 21 |
)
|
| 22 |
+
|
| 23 |
# --- Init vars ---
|
| 24 |
+
connector = BiasWordExplorerConnector(
|
| 25 |
+
embedding=embedding
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
# --- Load language ---
|
| 29 |
+
labels = pd.read_json(
|
| 30 |
+
f"language/{lang}.json"
|
| 31 |
+
)["BiasWordExplorer_interface"]
|
| 32 |
|
| 33 |
+
# --- Interface ---
|
| 34 |
interface = gr.Blocks()
|
| 35 |
+
|
| 36 |
with interface:
|
| 37 |
+
gr.Markdown(
|
| 38 |
+
value=labels["step1"]
|
| 39 |
+
)
|
| 40 |
with gr.Row():
|
| 41 |
with gr.Column():
|
| 42 |
with gr.Row():
|
| 43 |
+
diagnose_list = gr.Textbox(
|
| 44 |
+
lines=2,
|
| 45 |
+
label=labels["wordListToDiagnose"]
|
| 46 |
+
)
|
| 47 |
with gr.Row():
|
| 48 |
+
gr.Markdown(
|
| 49 |
+
value=labels["step2&2Spaces"]
|
| 50 |
+
)
|
| 51 |
with gr.Row():
|
| 52 |
+
wordlist_1 = gr.Textbox(
|
| 53 |
+
lines=2,
|
| 54 |
+
label=labels["wordList1"]
|
| 55 |
+
)
|
| 56 |
+
wordlist_2 = gr.Textbox(
|
| 57 |
+
lines=2,
|
| 58 |
+
label=labels["wordList2"]
|
| 59 |
+
)
|
| 60 |
with gr.Row():
|
| 61 |
+
gr.Markdown(
|
| 62 |
+
value=labels["step2&4Spaces"]
|
| 63 |
+
)
|
| 64 |
with gr.Row():
|
| 65 |
+
wordlist_3 = gr.Textbox(
|
| 66 |
+
lines=2,
|
| 67 |
+
label=labels["wordList3"]
|
| 68 |
+
)
|
| 69 |
+
wordlist_4 = gr.Textbox(
|
| 70 |
+
lines=2,
|
| 71 |
+
label=labels["wordList4"]
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
with gr.Column():
|
| 75 |
with gr.Row():
|
| 76 |
+
bias2d = gr.Button(
|
| 77 |
+
value=labels["plot2SpacesButton"]
|
| 78 |
+
)
|
| 79 |
with gr.Row():
|
| 80 |
+
bias4d = gr.Button(
|
| 81 |
+
value=labels["plot4SpacesButton"]
|
| 82 |
+
)
|
| 83 |
with gr.Row():
|
| 84 |
+
err_msg = gr.Markdown(
|
| 85 |
+
label="",
|
| 86 |
+
visible=True
|
| 87 |
+
)
|
| 88 |
with gr.Row():
|
| 89 |
+
bias_plot = gr.Plot(
|
| 90 |
+
label="",
|
| 91 |
+
show_label=False
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
with gr.Row():
|
| 95 |
examples = gr.Examples(
|
| 96 |
fn=connector.calculate_bias_2d,
|
|
|
|
| 102 |
with gr.Row():
|
| 103 |
examples = gr.Examples(
|
| 104 |
fn=connector.calculate_bias_4d,
|
| 105 |
+
inputs=[wordlist_1, wordlist_2,wordlist_3, wordlist_4, diagnose_list],
|
| 106 |
+
outputs=[
|
| 107 |
+
bias_plot, err_msg
|
| 108 |
+
],
|
| 109 |
examples=examples2_explorar_sesgo_en_palabras,
|
| 110 |
label=labels["examples4Spaces"]
|
| 111 |
)
|
| 112 |
|
| 113 |
with gr.Row():
|
| 114 |
+
gr.Markdown(
|
| 115 |
+
value=TOOL_INFO
|
| 116 |
+
)
|
| 117 |
|
| 118 |
bias2d.click(
|
| 119 |
+
fn=connector.calculate_bias_2d,
|
| 120 |
+
inputs=[wordlist_1, wordlist_2, diagnose_list],
|
| 121 |
+
outputs=[bias_plot, err_msg]
|
| 122 |
)
|
| 123 |
+
|
| 124 |
bias4d.click(
|
| 125 |
fn=connector.calculate_bias_4d,
|
| 126 |
+
inputs=[wordlist_1, wordlist_2,
|
| 127 |
+
wordlist_3, wordlist_4, diagnose_list],
|
| 128 |
+
outputs=[bias_plot, err_msg]
|
| 129 |
)
|
| 130 |
|
| 131 |
# --- Logs ---
|
| 132 |
+
save_field = [wordlist_1, wordlist_2,wordlist_3, wordlist_4, diagnose_list]
|
| 133 |
+
log_callback.setup(
|
| 134 |
+
components=save_field,
|
| 135 |
+
flagging_dir="logs_word_bias"
|
| 136 |
+
)
|
| 137 |
|
| 138 |
bias2d.click(
|
| 139 |
fn=lambda *args: log_callback.flag(
|
| 140 |
+
flag_data=args,
|
| 141 |
+
flag_option="plot_2d",
|
| 142 |
+
username="vialibre"
|
| 143 |
),
|
| 144 |
inputs=save_field,
|
| 145 |
+
outputs=None,
|
| 146 |
preprocess=False
|
| 147 |
)
|
| 148 |
+
|
| 149 |
bias4d.click(
|
| 150 |
fn=lambda *args: log_callback.flag(
|
| 151 |
+
flag_data=args,
|
| 152 |
+
flag_option="plot_4d",
|
| 153 |
+
username="vialibre"
|
| 154 |
),
|
| 155 |
inputs=save_field,
|
| 156 |
+
outputs=None,
|
| 157 |
preprocess=False
|
| 158 |
)
|
| 159 |
+
|
| 160 |
+
return interface
|
interfaces/interface_WordExplorer.py
CHANGED
|
@@ -2,73 +2,140 @@ import gradio as gr
|
|
| 2 |
import pandas as pd
|
| 3 |
import matplotlib.pyplot as plt
|
| 4 |
|
| 5 |
-
from
|
| 6 |
-
from modules.module_connection import WordExplorerConnector # Updated
|
| 7 |
from modules.module_logsManager import HuggingFaceDatasetSaver
|
| 8 |
from examples.examples import examples_explorar_relaciones_entre_palabras
|
|
|
|
| 9 |
|
| 10 |
plt.rcParams.update({'font.size': 14})
|
| 11 |
|
| 12 |
def interface(
|
| 13 |
-
embedding,
|
| 14 |
available_logs: bool,
|
| 15 |
-
max_neighbors: int,
|
| 16 |
lang: str="spanish",
|
| 17 |
) -> gr.Blocks:
|
| 18 |
|
| 19 |
# --- Init logs ---
|
| 20 |
log_callback = HuggingFaceDatasetSaver(
|
| 21 |
-
available_logs=available_logs
|
|
|
|
| 22 |
)
|
|
|
|
| 23 |
# --- Init vars ---
|
| 24 |
-
connector = WordExplorerConnector(
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# --- Interface ---
|
| 28 |
interface = gr.Blocks()
|
|
|
|
| 29 |
with interface:
|
| 30 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 31 |
with gr.Row():
|
| 32 |
with gr.Column(scale=3):
|
| 33 |
with gr.Row(equal_height=True):
|
| 34 |
with gr.Column(scale=5):
|
| 35 |
-
diagnose_list = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
| 36 |
with gr.Column(scale=1,min_width=10):
|
| 37 |
-
color_wordlist = gr.ColorPicker(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
with gr.Row():
|
| 39 |
with gr.Column(scale=5):
|
| 40 |
-
wordlist_1 = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
| 41 |
with gr.Column(scale=1,min_width=10):
|
| 42 |
-
color_wordlist_1 = gr.ColorPicker(
|
|
|
|
|
|
|
|
|
|
| 43 |
with gr.Row():
|
| 44 |
with gr.Column(scale=5):
|
| 45 |
-
wordlist_2 = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
| 46 |
with gr.Column(scale=1,min_width=10):
|
| 47 |
-
color_wordlist_2 = gr.ColorPicker(
|
|
|
|
|
|
|
|
|
|
| 48 |
with gr.Row():
|
| 49 |
with gr.Column(scale=5):
|
| 50 |
-
wordlist_3 = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
| 51 |
with gr.Column(scale=1,min_width=10):
|
| 52 |
-
color_wordlist_3 = gr.ColorPicker(
|
|
|
|
|
|
|
|
|
|
| 53 |
with gr.Row():
|
| 54 |
with gr.Column(scale=5):
|
| 55 |
-
wordlist_4 = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
| 56 |
with gr.Column(scale=1,min_width=10):
|
| 57 |
-
color_wordlist_4 = gr.ColorPicker(
|
|
|
|
|
|
|
|
|
|
| 58 |
with gr.Column(scale=4):
|
| 59 |
with gr.Row():
|
| 60 |
with gr.Row():
|
| 61 |
-
gr.Markdown(
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
with gr.Row():
|
| 64 |
-
alpha = gr.Slider(
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
with gr.Row():
|
| 67 |
-
btn_plot = gr.Button(
|
|
|
|
|
|
|
| 68 |
with gr.Row():
|
| 69 |
-
err_msg = gr.Markdown(
|
|
|
|
|
|
|
|
|
|
| 70 |
with gr.Row():
|
| 71 |
-
word_proyections = gr.Plot(
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
with gr.Row():
|
| 74 |
gr.Examples(
|
|
@@ -80,7 +147,9 @@ def interface(
|
|
| 80 |
)
|
| 81 |
|
| 82 |
with gr.Row():
|
| 83 |
-
gr.Markdown(
|
|
|
|
|
|
|
| 84 |
|
| 85 |
btn_plot.click(
|
| 86 |
fn=connector.plot_proyection_2d,
|
|
@@ -99,21 +168,25 @@ def interface(
|
|
| 99 |
fontsize,
|
| 100 |
n_neighbors
|
| 101 |
],
|
| 102 |
-
outputs=[word_proyections,err_msg]
|
| 103 |
)
|
| 104 |
|
| 105 |
# --- Logs ---
|
| 106 |
-
save_field = [diagnose_list,wordlist_1,wordlist_2,wordlist_3,wordlist_4]
|
| 107 |
-
log_callback.setup(
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
btn_plot.click(
|
| 110 |
fn=lambda *args: log_callback.flag(
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
),
|
| 115 |
inputs=save_field,
|
| 116 |
outputs=None,
|
| 117 |
preprocess=False
|
| 118 |
)
|
|
|
|
| 119 |
return interface
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import matplotlib.pyplot as plt
|
| 4 |
|
| 5 |
+
from modules.module_connection import WordExplorerConnector
|
|
|
|
| 6 |
from modules.module_logsManager import HuggingFaceDatasetSaver
|
| 7 |
from examples.examples import examples_explorar_relaciones_entre_palabras
|
| 8 |
+
from tool_info import TOOL_INFO
|
| 9 |
|
| 10 |
plt.rcParams.update({'font.size': 14})
|
| 11 |
|
| 12 |
def interface(
|
| 13 |
+
embedding, # Class Embedding instance
|
| 14 |
available_logs: bool,
|
| 15 |
+
max_neighbors: int,
|
| 16 |
lang: str="spanish",
|
| 17 |
) -> gr.Blocks:
|
| 18 |
|
| 19 |
# --- Init logs ---
|
| 20 |
log_callback = HuggingFaceDatasetSaver(
|
| 21 |
+
available_logs=available_logs,
|
| 22 |
+
dataset_name=f"logs_edia_we_{lang}"
|
| 23 |
)
|
| 24 |
+
|
| 25 |
# --- Init vars ---
|
| 26 |
+
connector = WordExplorerConnector(
|
| 27 |
+
embedding=embedding
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# --- Load language ---
|
| 31 |
+
labels = pd.read_json(
|
| 32 |
+
f"language/{lang}.json"
|
| 33 |
+
)["WordExplorer_interface"]
|
| 34 |
|
| 35 |
# --- Interface ---
|
| 36 |
interface = gr.Blocks()
|
| 37 |
+
|
| 38 |
with interface:
|
| 39 |
+
gr.Markdown(
|
| 40 |
+
value=labels["title"]
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
with gr.Row():
|
| 44 |
with gr.Column(scale=3):
|
| 45 |
with gr.Row(equal_height=True):
|
| 46 |
with gr.Column(scale=5):
|
| 47 |
+
diagnose_list = gr.Textbox(
|
| 48 |
+
lines=2,
|
| 49 |
+
label=labels["wordListToDiagnose"]
|
| 50 |
+
)
|
| 51 |
with gr.Column(scale=1,min_width=10):
|
| 52 |
+
color_wordlist = gr.ColorPicker(
|
| 53 |
+
label="",
|
| 54 |
+
value='#000000'
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
with gr.Row():
|
| 58 |
with gr.Column(scale=5):
|
| 59 |
+
wordlist_1 = gr.Textbox(
|
| 60 |
+
lines=2,
|
| 61 |
+
label=labels["wordList1"]
|
| 62 |
+
)
|
| 63 |
with gr.Column(scale=1,min_width=10):
|
| 64 |
+
color_wordlist_1 = gr.ColorPicker(
|
| 65 |
+
label="",
|
| 66 |
+
value='#1f78b4'
|
| 67 |
+
)
|
| 68 |
with gr.Row():
|
| 69 |
with gr.Column(scale=5):
|
| 70 |
+
wordlist_2 = gr.Textbox(
|
| 71 |
+
lines=2,
|
| 72 |
+
label=labels["wordList2"]
|
| 73 |
+
)
|
| 74 |
with gr.Column(scale=1,min_width=10):
|
| 75 |
+
color_wordlist_2 = gr.ColorPicker(
|
| 76 |
+
label="",
|
| 77 |
+
value='#33a02c'
|
| 78 |
+
)
|
| 79 |
with gr.Row():
|
| 80 |
with gr.Column(scale=5):
|
| 81 |
+
wordlist_3 = gr.Textbox(
|
| 82 |
+
lines=2,
|
| 83 |
+
label=labels["wordList3"]
|
| 84 |
+
)
|
| 85 |
with gr.Column(scale=1,min_width=10):
|
| 86 |
+
color_wordlist_3 = gr.ColorPicker(
|
| 87 |
+
label="",
|
| 88 |
+
value='#e31a1c'
|
| 89 |
+
)
|
| 90 |
with gr.Row():
|
| 91 |
with gr.Column(scale=5):
|
| 92 |
+
wordlist_4 = gr.Textbox(
|
| 93 |
+
lines=2,
|
| 94 |
+
label=labels["wordList4"]
|
| 95 |
+
)
|
| 96 |
with gr.Column(scale=1,min_width=10):
|
| 97 |
+
color_wordlist_4 = gr.ColorPicker(
|
| 98 |
+
label="",
|
| 99 |
+
value='#6a3d9a'
|
| 100 |
+
)
|
| 101 |
with gr.Column(scale=4):
|
| 102 |
with gr.Row():
|
| 103 |
with gr.Row():
|
| 104 |
+
gr.Markdown(
|
| 105 |
+
value=labels["plotNeighbours"]["title"]
|
| 106 |
+
)
|
| 107 |
+
n_neighbors = gr.Slider(
|
| 108 |
+
minimum=0,
|
| 109 |
+
maximum=max_neighbors,
|
| 110 |
+
step=1,
|
| 111 |
+
label=labels["plotNeighbours"]["quantity"]
|
| 112 |
+
)
|
| 113 |
with gr.Row():
|
| 114 |
+
alpha = gr.Slider(
|
| 115 |
+
minimum=0.1,
|
| 116 |
+
maximum=0.9,
|
| 117 |
+
value=0.3,
|
| 118 |
+
step=0.1,
|
| 119 |
+
label=labels["options"]["transparency"]
|
| 120 |
+
)
|
| 121 |
+
fontsize=gr.Number(
|
| 122 |
+
value=25,
|
| 123 |
+
label=labels["options"]["font-size"]
|
| 124 |
+
)
|
| 125 |
with gr.Row():
|
| 126 |
+
btn_plot = gr.Button(
|
| 127 |
+
value=labels["plot_button"]
|
| 128 |
+
)
|
| 129 |
with gr.Row():
|
| 130 |
+
err_msg = gr.Markdown(
|
| 131 |
+
label="",
|
| 132 |
+
visible=True
|
| 133 |
+
)
|
| 134 |
with gr.Row():
|
| 135 |
+
word_proyections = gr.Plot(
|
| 136 |
+
label="",
|
| 137 |
+
show_label=False
|
| 138 |
+
)
|
| 139 |
|
| 140 |
with gr.Row():
|
| 141 |
gr.Examples(
|
|
|
|
| 147 |
)
|
| 148 |
|
| 149 |
with gr.Row():
|
| 150 |
+
gr.Markdown(
|
| 151 |
+
value=TOOL_INFO
|
| 152 |
+
)
|
| 153 |
|
| 154 |
btn_plot.click(
|
| 155 |
fn=connector.plot_proyection_2d,
|
|
|
|
| 168 |
fontsize,
|
| 169 |
n_neighbors
|
| 170 |
],
|
| 171 |
+
outputs=[word_proyections, err_msg]
|
| 172 |
)
|
| 173 |
|
| 174 |
# --- Logs ---
|
| 175 |
+
save_field = [diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
|
| 176 |
+
log_callback.setup(
|
| 177 |
+
components=save_field,
|
| 178 |
+
flagging_dir="logs_word_explorer"
|
| 179 |
+
)
|
| 180 |
|
| 181 |
btn_plot.click(
|
| 182 |
fn=lambda *args: log_callback.flag(
|
| 183 |
+
flag_data=args,
|
| 184 |
+
flag_option="word_explorer",
|
| 185 |
+
username="vialibre",
|
| 186 |
),
|
| 187 |
inputs=save_field,
|
| 188 |
outputs=None,
|
| 189 |
preprocess=False
|
| 190 |
)
|
| 191 |
+
|
| 192 |
return interface
|
language/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
english.json
|
modules/model_embbeding.py
CHANGED
|
@@ -3,7 +3,7 @@ from memory_profiler import profile
|
|
| 3 |
from sklearn.neighbors import NearestNeighbors
|
| 4 |
from sklearn.decomposition import PCA
|
| 5 |
from gensim.models import KeyedVectors
|
| 6 |
-
from typing import List
|
| 7 |
import os
|
| 8 |
import pandas as pd
|
| 9 |
|
|
@@ -13,21 +13,22 @@ from gensim import matutils
|
|
| 13 |
|
| 14 |
|
| 15 |
class Embedding:
|
| 16 |
-
@profile
|
| 17 |
def __init__(self,
|
| 18 |
path: str,
|
| 19 |
-
|
| 20 |
-
limit: int=None,
|
| 21 |
randomizedPCA: bool=False,
|
| 22 |
-
max_neighbors: int=20
|
|
|
|
| 23 |
) -> None:
|
| 24 |
|
| 25 |
# Embedding vars
|
| 26 |
self.path = path
|
| 27 |
self.limit = limit
|
| 28 |
self.randomizedPCA = randomizedPCA
|
| 29 |
-
self.binary = binary
|
| 30 |
self.max_neighbors = max_neighbors
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# Full embedding dataset
|
| 33 |
self.ds = None
|
|
@@ -43,36 +44,34 @@ class Embedding:
|
|
| 43 |
self,
|
| 44 |
) -> None:
|
| 45 |
|
|
|
|
|
|
|
| 46 |
print(f"Preparing {os.path.basename(self.path)} embeddings...")
|
| 47 |
|
| 48 |
# --- Prepare dataset ---
|
| 49 |
self.ds = self.__preparate(
|
| 50 |
-
self.path, self.
|
| 51 |
)
|
| 52 |
|
| 53 |
# --- Estimate Nearest Neighbors
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
self.neigh.fit(
|
| 69 |
-
X=self.ds['embedding'].to_list()
|
| 70 |
-
)
|
| 71 |
|
| 72 |
def __preparate(
|
| 73 |
self,
|
| 74 |
-
path: str,
|
| 75 |
-
binary: bool,
|
| 76 |
limit: int,
|
| 77 |
randomizedPCA: bool
|
| 78 |
) -> pd.DataFrame:
|
|
@@ -94,7 +93,7 @@ class Embedding:
|
|
| 94 |
print("--------> PATH:", path)
|
| 95 |
model = KeyedVectors.load_word2vec_format(
|
| 96 |
fname=path,
|
| 97 |
-
binary=
|
| 98 |
limit=limit
|
| 99 |
)
|
| 100 |
|
|
@@ -116,11 +115,48 @@ class Embedding:
|
|
| 116 |
df_uncased = df_cased.drop_duplicates(subset='word')
|
| 117 |
return df_uncased
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
def __getValue(
|
| 120 |
self,
|
| 121 |
word: str,
|
| 122 |
feature: str
|
| 123 |
-
):
|
|
|
|
| 124 |
word_id, value = None, None
|
| 125 |
|
| 126 |
if word in self:
|
|
@@ -134,13 +170,15 @@ class Embedding:
|
|
| 134 |
def getEmbedding(
|
| 135 |
self,
|
| 136 |
word: str
|
| 137 |
-
):
|
|
|
|
| 138 |
return self.__getValue(word, 'embedding')
|
| 139 |
|
| 140 |
def getPCA(
|
| 141 |
self,
|
| 142 |
word: str
|
| 143 |
-
):
|
|
|
|
| 144 |
return self.__getValue(word, 'pca')
|
| 145 |
|
| 146 |
def getNearestNeighbors(
|
|
@@ -152,35 +190,58 @@ class Embedding:
|
|
| 152 |
|
| 153 |
assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must less than or equal to {self.max_neighbors}!."
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
elif nn_method == 'sklearn':
|
| 159 |
-
word_emb = self.getEmbedding(word).reshape(1,-1)
|
| 160 |
-
_, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1) #Fix and Update
|
| 161 |
-
words = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:] #Fix and Update
|
| 162 |
-
else:
|
| 163 |
-
words = []
|
| 164 |
-
return words
|
| 165 |
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
self,
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
return word in self.ds['word'].to_list()
|
| 172 |
-
|
| 173 |
-
# ToDo: Revisar estos dos métodos usados en la pestaña sesgoEnPalabras
|
| 174 |
-
# ya que ahora los embedding vienen normalizados
|
| 175 |
-
def cosineSimilarities(self, vector_1, vectors_all):
|
| 176 |
norm = np.linalg.norm(vector_1)
|
| 177 |
all_norms = np.linalg.norm(vectors_all, axis=1)
|
| 178 |
dot_products = dot(vectors_all, vector_1)
|
| 179 |
similarities = dot_products / (norm * all_norms)
|
| 180 |
return similarities
|
| 181 |
|
| 182 |
-
def getCosineSimilarities(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
return dot(
|
| 184 |
matutils.unitvec(self.getEmbedding(w1)),
|
| 185 |
matutils.unitvec(self.getEmbedding(w2))
|
| 186 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from sklearn.neighbors import NearestNeighbors
|
| 4 |
from sklearn.decomposition import PCA
|
| 5 |
from gensim.models import KeyedVectors
|
| 6 |
+
from typing import List, Any
|
| 7 |
import os
|
| 8 |
import pandas as pd
|
| 9 |
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class Embedding:
|
|
|
|
| 16 |
def __init__(self,
|
| 17 |
path: str,
|
| 18 |
+
limit: int=None,
|
|
|
|
| 19 |
randomizedPCA: bool=False,
|
| 20 |
+
max_neighbors: int=20,
|
| 21 |
+
nn_method: str='sklearn'
|
| 22 |
) -> None:
|
| 23 |
|
| 24 |
# Embedding vars
|
| 25 |
self.path = path
|
| 26 |
self.limit = limit
|
| 27 |
self.randomizedPCA = randomizedPCA
|
|
|
|
| 28 |
self.max_neighbors = max_neighbors
|
| 29 |
+
|
| 30 |
+
self.availables_nn_methods = ['sklearn', 'ann']
|
| 31 |
+
self.nn_method = nn_method
|
| 32 |
|
| 33 |
# Full embedding dataset
|
| 34 |
self.ds = None
|
|
|
|
| 44 |
self,
|
| 45 |
) -> None:
|
| 46 |
|
| 47 |
+
assert(self.nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"
|
| 48 |
+
|
| 49 |
print(f"Preparing {os.path.basename(self.path)} embeddings...")
|
| 50 |
|
| 51 |
# --- Prepare dataset ---
|
| 52 |
self.ds = self.__preparate(
|
| 53 |
+
self.path, self.limit, self.randomizedPCA
|
| 54 |
)
|
| 55 |
|
| 56 |
# --- Estimate Nearest Neighbors
|
| 57 |
+
if self.nn_method == 'sklearn':
|
| 58 |
+
# Method A: Througth Sklearn method
|
| 59 |
+
self.__init_sklearn_method(
|
| 60 |
+
max_neighbors=self.max_neighbors,
|
| 61 |
+
vectors=self.ds['embedding'].to_list()
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
elif self.nn_method == 'ann':
|
| 65 |
+
# Method B: Througth annoy using forest tree
|
| 66 |
+
self.__init_ann_method(
|
| 67 |
+
words=self.ds['word'].to_list(),
|
| 68 |
+
vectors=self.ds['embedding'].to_list(),
|
| 69 |
+
coord=self.ds['pca'].to_list()
|
| 70 |
+
)
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
def __preparate(
|
| 73 |
self,
|
| 74 |
+
path: str,
|
|
|
|
| 75 |
limit: int,
|
| 76 |
randomizedPCA: bool
|
| 77 |
) -> pd.DataFrame:
|
|
|
|
| 93 |
print("--------> PATH:", path)
|
| 94 |
model = KeyedVectors.load_word2vec_format(
|
| 95 |
fname=path,
|
| 96 |
+
binary=path.endswith('.bin'),
|
| 97 |
limit=limit
|
| 98 |
)
|
| 99 |
|
|
|
|
| 115 |
df_uncased = df_cased.drop_duplicates(subset='word')
|
| 116 |
return df_uncased
|
| 117 |
|
| 118 |
+
def __init_ann_method(
|
| 119 |
+
self,
|
| 120 |
+
words: List[str],
|
| 121 |
+
vectors: List[float],
|
| 122 |
+
coord: List[float],
|
| 123 |
+
n_trees: int=20,
|
| 124 |
+
metric: str='dot'
|
| 125 |
+
) -> None:
|
| 126 |
+
|
| 127 |
+
print("Initializing Annoy method to search for nearby neighbors...")
|
| 128 |
+
self.ann = Ann(
|
| 129 |
+
words=words,
|
| 130 |
+
vectors=vectors,
|
| 131 |
+
coord=coord,
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
self.ann.init(
|
| 135 |
+
n_trees=n_trees,
|
| 136 |
+
metric=metric,
|
| 137 |
+
n_jobs=-1
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
def __init_sklearn_method(
|
| 141 |
+
self,
|
| 142 |
+
max_neighbors: int,
|
| 143 |
+
vectors: List[float]
|
| 144 |
+
) -> None:
|
| 145 |
+
|
| 146 |
+
print("Initializing sklearn method to search for nearby neighbors...")
|
| 147 |
+
self.neigh = NearestNeighbors(
|
| 148 |
+
n_neighbors=max_neighbors
|
| 149 |
+
)
|
| 150 |
+
self.neigh.fit(
|
| 151 |
+
X=vectors
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
def __getValue(
|
| 155 |
self,
|
| 156 |
word: str,
|
| 157 |
feature: str
|
| 158 |
+
) -> Any:
|
| 159 |
+
|
| 160 |
word_id, value = None, None
|
| 161 |
|
| 162 |
if word in self:
|
|
|
|
| 170 |
def getEmbedding(
|
| 171 |
self,
|
| 172 |
word: str
|
| 173 |
+
) -> np.ndarray:
|
| 174 |
+
|
| 175 |
return self.__getValue(word, 'embedding')
|
| 176 |
|
| 177 |
def getPCA(
|
| 178 |
self,
|
| 179 |
word: str
|
| 180 |
+
) -> np.ndarray:
|
| 181 |
+
|
| 182 |
return self.__getValue(word, 'pca')
|
| 183 |
|
| 184 |
def getNearestNeighbors(
|
|
|
|
| 190 |
|
| 191 |
assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must less than or equal to {self.max_neighbors}!."
|
| 192 |
|
| 193 |
+
assert(nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"
|
| 194 |
+
|
| 195 |
+
neighbords_list = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
+
if word in self:
|
| 198 |
+
if nn_method == 'ann':
|
| 199 |
+
if self.ann is None:
|
| 200 |
+
self.__init_ann_method(
|
| 201 |
+
words=self.ds['word'].to_list(),
|
| 202 |
+
vectors=self.ds['embedding'].to_list(),
|
| 203 |
+
coord=self.ds['pca'].to_list()
|
| 204 |
+
)
|
| 205 |
+
neighbords_list = self.ann.get(word, n_neighbors)
|
| 206 |
+
|
| 207 |
+
elif nn_method == 'sklearn':
|
| 208 |
+
if self.neigh is None:
|
| 209 |
+
self.__init_sklearn_method(
|
| 210 |
+
max_neighbors=self.max_neighbors,
|
| 211 |
+
vectors=self.ds['embedding'].to_list()
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
word_emb = self.getEmbedding(word).reshape(1,-1)
|
| 215 |
+
_, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1)
|
| 216 |
+
neighbords_list = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
|
| 217 |
+
|
| 218 |
+
return neighbords_list
|
| 219 |
+
|
| 220 |
+
def cosineSimilarities(
|
| 221 |
self,
|
| 222 |
+
vector_1,
|
| 223 |
+
vectors_all
|
| 224 |
+
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
norm = np.linalg.norm(vector_1)
|
| 226 |
all_norms = np.linalg.norm(vectors_all, axis=1)
|
| 227 |
dot_products = dot(vectors_all, vector_1)
|
| 228 |
similarities = dot_products / (norm * all_norms)
|
| 229 |
return similarities
|
| 230 |
|
| 231 |
+
def getCosineSimilarities(
|
| 232 |
+
self,
|
| 233 |
+
w1,
|
| 234 |
+
w2
|
| 235 |
+
):
|
| 236 |
+
|
| 237 |
return dot(
|
| 238 |
matutils.unitvec(self.getEmbedding(w1)),
|
| 239 |
matutils.unitvec(self.getEmbedding(w2))
|
| 240 |
+
)
|
| 241 |
+
|
| 242 |
+
def __contains__(
|
| 243 |
+
self,
|
| 244 |
+
word: str
|
| 245 |
+
) -> bool:
|
| 246 |
+
|
| 247 |
+
return word in self.ds['word'].to_list()
|
modules/module_BiasExplorer.py
CHANGED
|
@@ -5,10 +5,14 @@ import seaborn as sns
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
from sklearn.decomposition import PCA
|
| 7 |
|
| 8 |
-
def take_two_sides_extreme_sorted(
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
head_df = df.head(n_extreme)[:]
|
| 13 |
tail_df = df.tail(n_extreme)[:]
|
| 14 |
|
|
@@ -56,39 +60,63 @@ __all__ = ['GenderBiasWE', 'BiasWordEmbedding']
|
|
| 56 |
|
| 57 |
|
| 58 |
class WordBiasExplorer():
|
| 59 |
-
def __init__(
|
| 60 |
-
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
self.
|
| 63 |
self.direction = None
|
| 64 |
self.positive_end = None
|
| 65 |
self.negative_end = None
|
| 66 |
|
| 67 |
-
def __copy__(
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
| 69 |
bias_word_embedding.direction = copy.deepcopy(self.direction)
|
| 70 |
bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
|
| 71 |
bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
|
| 72 |
return bias_word_embedding
|
| 73 |
|
| 74 |
-
def __deepcopy__(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
bias_word_embedding = copy.copy(self)
|
| 76 |
bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
|
| 77 |
return bias_word_embedding
|
| 78 |
|
| 79 |
-
def __getitem__(
|
| 80 |
-
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
|
| 83 |
-
return item in self.vocabulary
|
| 84 |
|
| 85 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
if self.direction is None:
|
| 87 |
raise RuntimeError('The direction was not identified'
|
| 88 |
' for this {} instance'
|
| 89 |
.format(self.__class__.__name__))
|
| 90 |
|
| 91 |
-
def _identify_subspace_by_pca(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
matrix = []
|
| 93 |
|
| 94 |
for word1, word2 in definitional_pairs:
|
|
@@ -105,8 +133,14 @@ class WordBiasExplorer():
|
|
| 105 |
return pca
|
| 106 |
|
| 107 |
|
| 108 |
-
def _identify_direction(
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
if method not in DIRECTION_METHODS:
|
| 111 |
raise ValueError('method should be one of {}, {} was given'.format(
|
| 112 |
DIRECTION_METHODS, method))
|
|
@@ -154,7 +188,11 @@ class WordBiasExplorer():
|
|
| 154 |
self.positive_end = positive_end
|
| 155 |
self.negative_end = negative_end
|
| 156 |
|
| 157 |
-
def project_on_direction(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
"""Project the normalized vector of the word on the direction.
|
| 159 |
:param str word: The word tor project
|
| 160 |
:return float: The projection scalar
|
|
@@ -163,13 +201,15 @@ class WordBiasExplorer():
|
|
| 163 |
self._is_direction_identified()
|
| 164 |
|
| 165 |
vector = self[word]
|
| 166 |
-
projection_score = self.
|
| 167 |
[vector])[0]
|
| 168 |
return projection_score
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
|
| 172 |
-
def _calc_projection_scores(self, words):
|
| 173 |
self._is_direction_identified()
|
| 174 |
|
| 175 |
df = pd.DataFrame({'word': words})
|
|
@@ -181,7 +221,11 @@ class WordBiasExplorer():
|
|
| 181 |
|
| 182 |
return df
|
| 183 |
|
| 184 |
-
def calc_projection_data(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
"""
|
| 186 |
Calculate projection, projected and rejected vectors of a words list.
|
| 187 |
:param list words: List of words
|
|
@@ -206,7 +250,12 @@ class WordBiasExplorer():
|
|
| 206 |
|
| 207 |
return pd.DataFrame(projection_data)
|
| 208 |
|
| 209 |
-
def plot_dist_projections_on_direction(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
"""Plot the projection scalars distribution on the direction.
|
| 211 |
:param dict word_groups word: The groups to projects
|
| 212 |
:return float: The ax object of the plot
|
|
@@ -221,7 +270,7 @@ class WordBiasExplorer():
|
|
| 221 |
words = word_groups[name]
|
| 222 |
label = '{} (#{})'.format(name, len(words))
|
| 223 |
vectors = [self[word] for word in words]
|
| 224 |
-
projections = self.
|
| 225 |
vectors)
|
| 226 |
sns.distplot(projections, hist=False, label=label, ax=ax)
|
| 227 |
|
|
@@ -236,18 +285,26 @@ class WordBiasExplorer():
|
|
| 236 |
|
| 237 |
return ax
|
| 238 |
|
| 239 |
-
def __errorChecking(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
out_msj = ""
|
| 241 |
|
| 242 |
if not word:
|
| 243 |
out_msj = "Error: Primero debe ingresar una palabra!"
|
| 244 |
else:
|
| 245 |
-
if word not in self.
|
| 246 |
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
| 247 |
|
| 248 |
return out_msj
|
| 249 |
|
| 250 |
-
def check_oov(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
for wordlist in wordlists:
|
| 252 |
for word in wordlist:
|
| 253 |
msg = self.__errorChecking(word)
|
|
@@ -255,13 +312,15 @@ class WordBiasExplorer():
|
|
| 255 |
return msg
|
| 256 |
return None
|
| 257 |
|
| 258 |
-
def plot_biased_words(
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
| 265 |
bias_2D = wordlist_top == [] and wordlist_bottom == []
|
| 266 |
|
| 267 |
if bias_2D and (not wordlist_right or not wordlist_left):
|
|
@@ -273,21 +332,24 @@ class WordBiasExplorer():
|
|
| 273 |
if err:
|
| 274 |
raise Exception(err)
|
| 275 |
|
| 276 |
-
return self.get_bias_plot(
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
| 281 |
|
| 282 |
-
def get_bias_plot(
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
|
|
|
|
|
|
| 291 |
fig, ax = plt.subplots(1, figsize=figsize)
|
| 292 |
self.method = method
|
| 293 |
self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
|
|
@@ -298,14 +360,17 @@ class WordBiasExplorer():
|
|
| 298 |
|
| 299 |
return fig
|
| 300 |
|
| 301 |
-
def plot_projection_scores(
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
| 309 |
name_left = ', '.join(definitional_1[1])
|
| 310 |
name_right = ', '.join(definitional_1[0])
|
| 311 |
|
|
@@ -341,6 +406,9 @@ class WordBiasExplorer():
|
|
| 341 |
sns.barplot(x='projection', y='word', data=projections_df,
|
| 342 |
palette=projections_df['color'])
|
| 343 |
else:
|
|
|
|
|
|
|
|
|
|
| 344 |
sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
| 345 |
palette=projections_df['color'])
|
| 346 |
|
|
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
from sklearn.decomposition import PCA
|
| 7 |
|
| 8 |
+
def take_two_sides_extreme_sorted(
|
| 9 |
+
df,
|
| 10 |
+
n_extreme,
|
| 11 |
+
part_column=None,
|
| 12 |
+
head_value='',
|
| 13 |
+
tail_value=''
|
| 14 |
+
):
|
| 15 |
+
|
| 16 |
head_df = df.head(n_extreme)[:]
|
| 17 |
tail_df = df.tail(n_extreme)[:]
|
| 18 |
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
class WordBiasExplorer():
|
| 63 |
+
def __init__(
|
| 64 |
+
self,
|
| 65 |
+
embedding # Class Embedding instance
|
| 66 |
+
) -> None:
|
| 67 |
|
| 68 |
+
self.embedding = embedding
|
| 69 |
self.direction = None
|
| 70 |
self.positive_end = None
|
| 71 |
self.negative_end = None
|
| 72 |
|
| 73 |
+
def __copy__(
|
| 74 |
+
self
|
| 75 |
+
):
|
| 76 |
+
|
| 77 |
+
bias_word_embedding = self.__class__(self.embedding)
|
| 78 |
bias_word_embedding.direction = copy.deepcopy(self.direction)
|
| 79 |
bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
|
| 80 |
bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
|
| 81 |
return bias_word_embedding
|
| 82 |
|
| 83 |
+
def __deepcopy__(
|
| 84 |
+
self,
|
| 85 |
+
memo
|
| 86 |
+
):
|
| 87 |
+
|
| 88 |
bias_word_embedding = copy.copy(self)
|
| 89 |
bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
|
| 90 |
return bias_word_embedding
|
| 91 |
|
| 92 |
+
def __getitem__(
|
| 93 |
+
self,
|
| 94 |
+
key: str
|
| 95 |
+
) -> np.ndarray:
|
| 96 |
|
| 97 |
+
return self.embedding.getEmbedding(key)
|
|
|
|
| 98 |
|
| 99 |
+
def __contains__(
|
| 100 |
+
self,
|
| 101 |
+
item: str
|
| 102 |
+
) -> bool:
|
| 103 |
+
|
| 104 |
+
return item in self.embedding
|
| 105 |
+
|
| 106 |
+
def _is_direction_identified(
|
| 107 |
+
self
|
| 108 |
+
):
|
| 109 |
if self.direction is None:
|
| 110 |
raise RuntimeError('The direction was not identified'
|
| 111 |
' for this {} instance'
|
| 112 |
.format(self.__class__.__name__))
|
| 113 |
|
| 114 |
+
def _identify_subspace_by_pca(
|
| 115 |
+
self,
|
| 116 |
+
definitional_pairs,
|
| 117 |
+
n_components
|
| 118 |
+
):
|
| 119 |
+
|
| 120 |
matrix = []
|
| 121 |
|
| 122 |
for word1, word2 in definitional_pairs:
|
|
|
|
| 133 |
return pca
|
| 134 |
|
| 135 |
|
| 136 |
+
def _identify_direction(
|
| 137 |
+
self,
|
| 138 |
+
positive_end,
|
| 139 |
+
negative_end,
|
| 140 |
+
definitional,
|
| 141 |
+
method='pca'
|
| 142 |
+
):
|
| 143 |
+
|
| 144 |
if method not in DIRECTION_METHODS:
|
| 145 |
raise ValueError('method should be one of {}, {} was given'.format(
|
| 146 |
DIRECTION_METHODS, method))
|
|
|
|
| 188 |
self.positive_end = positive_end
|
| 189 |
self.negative_end = negative_end
|
| 190 |
|
| 191 |
+
def project_on_direction(
|
| 192 |
+
self,
|
| 193 |
+
word: str
|
| 194 |
+
):
|
| 195 |
+
|
| 196 |
"""Project the normalized vector of the word on the direction.
|
| 197 |
:param str word: The word tor project
|
| 198 |
:return float: The projection scalar
|
|
|
|
| 201 |
self._is_direction_identified()
|
| 202 |
|
| 203 |
vector = self[word]
|
| 204 |
+
projection_score = self.embedding.cosineSimilarities(self.direction,
|
| 205 |
[vector])[0]
|
| 206 |
return projection_score
|
| 207 |
|
| 208 |
+
def _calc_projection_scores(
|
| 209 |
+
self,
|
| 210 |
+
words
|
| 211 |
+
):
|
| 212 |
|
|
|
|
|
|
|
| 213 |
self._is_direction_identified()
|
| 214 |
|
| 215 |
df = pd.DataFrame({'word': words})
|
|
|
|
| 221 |
|
| 222 |
return df
|
| 223 |
|
| 224 |
+
def calc_projection_data(
|
| 225 |
+
self,
|
| 226 |
+
words
|
| 227 |
+
):
|
| 228 |
+
|
| 229 |
"""
|
| 230 |
Calculate projection, projected and rejected vectors of a words list.
|
| 231 |
:param list words: List of words
|
|
|
|
| 250 |
|
| 251 |
return pd.DataFrame(projection_data)
|
| 252 |
|
| 253 |
+
def plot_dist_projections_on_direction(
|
| 254 |
+
self,
|
| 255 |
+
word_groups,
|
| 256 |
+
ax=None
|
| 257 |
+
):
|
| 258 |
+
|
| 259 |
"""Plot the projection scalars distribution on the direction.
|
| 260 |
:param dict word_groups word: The groups to projects
|
| 261 |
:return float: The ax object of the plot
|
|
|
|
| 270 |
words = word_groups[name]
|
| 271 |
label = '{} (#{})'.format(name, len(words))
|
| 272 |
vectors = [self[word] for word in words]
|
| 273 |
+
projections = self.embedding.cosineSimilarities(self.direction,
|
| 274 |
vectors)
|
| 275 |
sns.distplot(projections, hist=False, label=label, ax=ax)
|
| 276 |
|
|
|
|
| 285 |
|
| 286 |
return ax
|
| 287 |
|
| 288 |
+
def __errorChecking(
|
| 289 |
+
self,
|
| 290 |
+
word
|
| 291 |
+
):
|
| 292 |
+
|
| 293 |
out_msj = ""
|
| 294 |
|
| 295 |
if not word:
|
| 296 |
out_msj = "Error: Primero debe ingresar una palabra!"
|
| 297 |
else:
|
| 298 |
+
if word not in self.embedding:
|
| 299 |
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
| 300 |
|
| 301 |
return out_msj
|
| 302 |
|
| 303 |
+
def check_oov(
|
| 304 |
+
self,
|
| 305 |
+
wordlists
|
| 306 |
+
):
|
| 307 |
+
|
| 308 |
for wordlist in wordlists:
|
| 309 |
for word in wordlist:
|
| 310 |
msg = self.__errorChecking(word)
|
|
|
|
| 312 |
return msg
|
| 313 |
return None
|
| 314 |
|
| 315 |
+
def plot_biased_words(
|
| 316 |
+
self,
|
| 317 |
+
words_to_diagnose,
|
| 318 |
+
wordlist_right,
|
| 319 |
+
wordlist_left,
|
| 320 |
+
wordlist_top=[],
|
| 321 |
+
wordlist_bottom=[]
|
| 322 |
+
):
|
| 323 |
+
|
| 324 |
bias_2D = wordlist_top == [] and wordlist_bottom == []
|
| 325 |
|
| 326 |
if bias_2D and (not wordlist_right or not wordlist_left):
|
|
|
|
| 332 |
if err:
|
| 333 |
raise Exception(err)
|
| 334 |
|
| 335 |
+
return self.get_bias_plot(
|
| 336 |
+
bias_2D,
|
| 337 |
+
words_to_diagnose,
|
| 338 |
+
definitional_1=(wordlist_right, wordlist_left),
|
| 339 |
+
definitional_2=(wordlist_top, wordlist_bottom)
|
| 340 |
+
)
|
| 341 |
|
| 342 |
+
def get_bias_plot(
|
| 343 |
+
self,
|
| 344 |
+
plot_2D,
|
| 345 |
+
words_to_diagnose,
|
| 346 |
+
definitional_1,
|
| 347 |
+
definitional_2=([], []),
|
| 348 |
+
method='sum',
|
| 349 |
+
n_extreme=10,
|
| 350 |
+
figsize=(15, 10)
|
| 351 |
+
):
|
| 352 |
+
|
| 353 |
fig, ax = plt.subplots(1, figsize=figsize)
|
| 354 |
self.method = method
|
| 355 |
self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
|
|
|
|
| 360 |
|
| 361 |
return fig
|
| 362 |
|
| 363 |
+
def plot_projection_scores(
|
| 364 |
+
self,
|
| 365 |
+
plot_2D,
|
| 366 |
+
words,
|
| 367 |
+
definitional_1,
|
| 368 |
+
definitional_2=([], []),
|
| 369 |
+
n_extreme=10,
|
| 370 |
+
ax=None,
|
| 371 |
+
axis_projection_step=0.1
|
| 372 |
+
):
|
| 373 |
+
|
| 374 |
name_left = ', '.join(definitional_1[1])
|
| 375 |
name_right = ', '.join(definitional_1[0])
|
| 376 |
|
|
|
|
| 406 |
sns.barplot(x='projection', y='word', data=projections_df,
|
| 407 |
palette=projections_df['color'])
|
| 408 |
else:
|
| 409 |
+
# ToDo: revisar este warning:
|
| 410 |
+
# Ignoring `palette` because no `hue` variable has been assigned. sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
| 411 |
+
|
| 412 |
sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
| 413 |
palette=projections_df['color'])
|
| 414 |
|
modules/module_WordExplorer.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
import pandas as pd
|
| 3 |
import seaborn as sns
|
|
@@ -5,37 +6,63 @@ from numpy.linalg import norm
|
|
| 5 |
|
| 6 |
import matplotlib as mpl
|
| 7 |
mpl.use('Agg')
|
| 8 |
-
import
|
|
|
|
| 9 |
|
| 10 |
class WordToPlot:
|
| 11 |
-
def __init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
self.word = word
|
| 13 |
self.color = color
|
| 14 |
self.bias_space = bias_space
|
| 15 |
self.alpha = alpha
|
| 16 |
|
|
|
|
| 17 |
class WordExplorer:
|
| 18 |
-
def __init__(
|
| 19 |
-
self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
def __errorChecking(self, word):
|
| 22 |
out_msj = ""
|
| 23 |
|
| 24 |
if not word:
|
| 25 |
out_msj = "Error: Primero debe ingresar una palabra!"
|
| 26 |
else:
|
| 27 |
-
if word not in self.
|
| 28 |
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
| 29 |
|
| 30 |
return out_msj
|
| 31 |
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
words = string.strip()
|
| 34 |
if words:
|
| 35 |
words = [word.strip() for word in words.split(',') if word != ""]
|
| 36 |
return words
|
| 37 |
|
| 38 |
-
def check_oov(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
for wordlist in wordlists:
|
| 40 |
for word in wordlist:
|
| 41 |
msg = self.__errorChecking(word)
|
|
@@ -43,10 +70,21 @@ class WordExplorer:
|
|
| 43 |
return msg
|
| 44 |
return None
|
| 45 |
|
| 46 |
-
def get_neighbors(
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
def get_df(self, words_embedded, processed_word_list):
|
| 50 |
df = pd.DataFrame(words_embedded)
|
| 51 |
|
| 52 |
df['word'] = [wtp.word for wtp in processed_word_list]
|
|
@@ -55,16 +93,18 @@ class WordExplorer:
|
|
| 55 |
df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
|
| 56 |
return df
|
| 57 |
|
| 58 |
-
def get_plot(
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
| 68 |
fig, ax = plt.subplots(figsize=figsize)
|
| 69 |
|
| 70 |
sns.scatterplot(
|
|
@@ -89,11 +129,20 @@ class WordExplorer:
|
|
| 89 |
legend=False,
|
| 90 |
palette=color_dict
|
| 91 |
)
|
|
|
|
| 92 |
for i, wtp in enumerate(processed_word_list):
|
| 93 |
x, y = words_embedded[i, :]
|
| 94 |
-
ax.annotate(
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
ax.set_xticks([])
|
| 99 |
ax.set_yticks([])
|
|
@@ -103,25 +152,27 @@ class WordExplorer:
|
|
| 103 |
|
| 104 |
return fig
|
| 105 |
|
| 106 |
-
def plot_projections_2d(
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
| 114 |
# convertirlas a vector
|
| 115 |
choices = [0, 1, 2, 3, 4]
|
| 116 |
wordlist_choice = [
|
| 117 |
-
wordlist_0,
|
| 118 |
wordlist_1,
|
| 119 |
-
wordlist_2,
|
| 120 |
-
wordlist_3,
|
| 121 |
wordlist_4
|
| 122 |
]
|
| 123 |
|
| 124 |
-
err = self.check_oov(wordlist_choice)
|
| 125 |
if err:
|
| 126 |
raise Exception(err)
|
| 127 |
|
|
@@ -139,48 +190,69 @@ class WordExplorer:
|
|
| 139 |
processed_word_list = []
|
| 140 |
for word_list_to_process, color in zip(wordlist_choice, choices):
|
| 141 |
for word in word_list_to_process:
|
| 142 |
-
processed_word_list.append(
|
|
|
|
|
|
|
| 143 |
|
| 144 |
if n_neighbors > 0:
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
# n_neighbors=n_neighbors+1,
|
| 149 |
-
n_neighbors=n_neighbors,
|
| 150 |
nn_method=kwargs.get('nn_method', 'sklearn')
|
| 151 |
)
|
|
|
|
| 152 |
for n in neighbors:
|
| 153 |
if n not in [wtp.word for wtp in processed_word_list]:
|
| 154 |
-
processed_word_list.append(
|
|
|
|
|
|
|
| 155 |
|
| 156 |
if not processed_word_list:
|
| 157 |
raise Exception('Only empty lists were passed')
|
| 158 |
-
|
| 159 |
-
words_embedded = np.array([self.vocabulary.getPCA(wtp.word) for wtp in processed_word_list])
|
| 160 |
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
-
fig = self.get_plot(data, processed_word_list, words_embedded,
|
| 164 |
-
color_dict, n_neighbors, n_alpha,
|
| 165 |
-
kwargs.get('fontsize', 18),
|
| 166 |
-
kwargs.get('figsize', (20, 15))
|
| 167 |
-
)
|
| 168 |
plt.show()
|
| 169 |
return fig
|
| 170 |
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
err = self.check_oov([wordlist])
|
| 173 |
if err:
|
| 174 |
raise Exception(err)
|
| 175 |
-
|
| 176 |
-
words_emb = np.array([self.
|
|
|
|
| 177 |
mean_vec = np.mean(words_emb, axis=0)
|
| 178 |
|
| 179 |
doesnt_match = ""
|
| 180 |
farthest_emb = 1.0
|
| 181 |
for word in wordlist:
|
| 182 |
-
word_emb = self.
|
| 183 |
-
cos_sim = np.dot(mean_vec, word_emb) /
|
|
|
|
| 184 |
if cos_sim <= farthest_emb:
|
| 185 |
farthest_emb = cos_sim
|
| 186 |
doesnt_match = word
|
|
|
|
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
import numpy as np
|
| 3 |
import pandas as pd
|
| 4 |
import seaborn as sns
|
|
|
|
| 6 |
|
| 7 |
import matplotlib as mpl
|
| 8 |
mpl.use('Agg')
|
| 9 |
+
from typing import List, Dict, Tuple
|
| 10 |
+
|
| 11 |
|
| 12 |
class WordToPlot:
|
| 13 |
+
def __init__(
|
| 14 |
+
self,
|
| 15 |
+
word: str,
|
| 16 |
+
color: str,
|
| 17 |
+
bias_space: int,
|
| 18 |
+
alpha: float
|
| 19 |
+
):
|
| 20 |
+
|
| 21 |
self.word = word
|
| 22 |
self.color = color
|
| 23 |
self.bias_space = bias_space
|
| 24 |
self.alpha = alpha
|
| 25 |
|
| 26 |
+
|
| 27 |
class WordExplorer:
|
| 28 |
+
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
embedding # Class Embedding instance
|
| 31 |
+
) -> None:
|
| 32 |
+
|
| 33 |
+
self.embedding = embedding
|
| 34 |
+
|
| 35 |
+
def __errorChecking(
|
| 36 |
+
self,
|
| 37 |
+
word: str
|
| 38 |
+
) -> str:
|
| 39 |
|
|
|
|
| 40 |
out_msj = ""
|
| 41 |
|
| 42 |
if not word:
|
| 43 |
out_msj = "Error: Primero debe ingresar una palabra!"
|
| 44 |
else:
|
| 45 |
+
if word not in self.embedding:
|
| 46 |
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
| 47 |
|
| 48 |
return out_msj
|
| 49 |
|
| 50 |
+
# ToDo: Este método no se usa. Creo que es el implementado en la clase connections base ¿Borrar?
|
| 51 |
+
def parse_words(
|
| 52 |
+
self,
|
| 53 |
+
string: str
|
| 54 |
+
) -> List[str]:
|
| 55 |
+
|
| 56 |
words = string.strip()
|
| 57 |
if words:
|
| 58 |
words = [word.strip() for word in words.split(',') if word != ""]
|
| 59 |
return words
|
| 60 |
|
| 61 |
+
def check_oov(
|
| 62 |
+
self,
|
| 63 |
+
wordlists: List[str]
|
| 64 |
+
) -> str:
|
| 65 |
+
|
| 66 |
for wordlist in wordlists:
|
| 67 |
for word in wordlist:
|
| 68 |
msg = self.__errorChecking(word)
|
|
|
|
| 70 |
return msg
|
| 71 |
return None
|
| 72 |
|
| 73 |
+
def get_neighbors(
|
| 74 |
+
self,
|
| 75 |
+
word: str,
|
| 76 |
+
n_neighbors: int,
|
| 77 |
+
nn_method: str
|
| 78 |
+
) -> List[str]:
|
| 79 |
+
|
| 80 |
+
return self.embedding.getNearestNeighbors(word, n_neighbors, nn_method)
|
| 81 |
+
|
| 82 |
+
def get_df(
|
| 83 |
+
self,
|
| 84 |
+
words_embedded: np.ndarray,
|
| 85 |
+
processed_word_list: List[str]
|
| 86 |
+
) -> pd.DataFrame:
|
| 87 |
|
|
|
|
| 88 |
df = pd.DataFrame(words_embedded)
|
| 89 |
|
| 90 |
df['word'] = [wtp.word for wtp in processed_word_list]
|
|
|
|
| 93 |
df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
|
| 94 |
return df
|
| 95 |
|
| 96 |
+
def get_plot(
|
| 97 |
+
self,
|
| 98 |
+
data: pd.DataFrame,
|
| 99 |
+
processed_word_list: List[str],
|
| 100 |
+
words_embedded: np.ndarray,
|
| 101 |
+
color_dict: Dict,
|
| 102 |
+
n_neighbors: int,
|
| 103 |
+
n_alpha: float,
|
| 104 |
+
fontsize: int=18,
|
| 105 |
+
figsize: Tuple[int, int]=(20, 15)
|
| 106 |
+
):
|
| 107 |
+
|
| 108 |
fig, ax = plt.subplots(figsize=figsize)
|
| 109 |
|
| 110 |
sns.scatterplot(
|
|
|
|
| 129 |
legend=False,
|
| 130 |
palette=color_dict
|
| 131 |
)
|
| 132 |
+
|
| 133 |
for i, wtp in enumerate(processed_word_list):
|
| 134 |
x, y = words_embedded[i, :]
|
| 135 |
+
ax.annotate(
|
| 136 |
+
wtp.word,
|
| 137 |
+
xy=(x, y),
|
| 138 |
+
xytext=(5, 2),
|
| 139 |
+
color=wtp.color,
|
| 140 |
+
textcoords='offset points',
|
| 141 |
+
ha='right',
|
| 142 |
+
va='bottom',
|
| 143 |
+
size=fontsize,
|
| 144 |
+
alpha=wtp.alpha
|
| 145 |
+
)
|
| 146 |
|
| 147 |
ax.set_xticks([])
|
| 148 |
ax.set_yticks([])
|
|
|
|
| 152 |
|
| 153 |
return fig
|
| 154 |
|
| 155 |
+
def plot_projections_2d(
|
| 156 |
+
self,
|
| 157 |
+
wordlist_0: List[str],
|
| 158 |
+
wordlist_1: List[str]=[],
|
| 159 |
+
wordlist_2: List[str]=[],
|
| 160 |
+
wordlist_3: List[str]=[],
|
| 161 |
+
wordlist_4: List[str]=[],
|
| 162 |
+
**kwargs
|
| 163 |
+
):
|
| 164 |
+
|
| 165 |
# convertirlas a vector
|
| 166 |
choices = [0, 1, 2, 3, 4]
|
| 167 |
wordlist_choice = [
|
| 168 |
+
wordlist_0,
|
| 169 |
wordlist_1,
|
| 170 |
+
wordlist_2,
|
| 171 |
+
wordlist_3,
|
| 172 |
wordlist_4
|
| 173 |
]
|
| 174 |
|
| 175 |
+
err = self.check_oov(wordlist_choice)
|
| 176 |
if err:
|
| 177 |
raise Exception(err)
|
| 178 |
|
|
|
|
| 190 |
processed_word_list = []
|
| 191 |
for word_list_to_process, color in zip(wordlist_choice, choices):
|
| 192 |
for word in word_list_to_process:
|
| 193 |
+
processed_word_list.append(
|
| 194 |
+
WordToPlot(word, color_dict[color], color, 1)
|
| 195 |
+
)
|
| 196 |
|
| 197 |
if n_neighbors > 0:
|
| 198 |
+
neighbors = self.get_neighbors(
|
| 199 |
+
word,
|
| 200 |
+
n_neighbors=n_neighbors,
|
|
|
|
|
|
|
| 201 |
nn_method=kwargs.get('nn_method', 'sklearn')
|
| 202 |
)
|
| 203 |
+
|
| 204 |
for n in neighbors:
|
| 205 |
if n not in [wtp.word for wtp in processed_word_list]:
|
| 206 |
+
processed_word_list.append(
|
| 207 |
+
WordToPlot(n, color_dict[color], color, n_alpha)
|
| 208 |
+
)
|
| 209 |
|
| 210 |
if not processed_word_list:
|
| 211 |
raise Exception('Only empty lists were passed')
|
|
|
|
|
|
|
| 212 |
|
| 213 |
+
words_embedded = np.array(
|
| 214 |
+
[self.embedding.getPCA(wtp.word) for wtp in processed_word_list]
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
data = self.get_df(
|
| 218 |
+
words_embedded,
|
| 219 |
+
processed_word_list
|
| 220 |
+
)
|
| 221 |
+
|
| 222 |
+
fig = self.get_plot(
|
| 223 |
+
data,
|
| 224 |
+
processed_word_list,
|
| 225 |
+
words_embedded,
|
| 226 |
+
color_dict,
|
| 227 |
+
n_neighbors,
|
| 228 |
+
n_alpha,
|
| 229 |
+
kwargs.get('fontsize', 18),
|
| 230 |
+
kwargs.get('figsize', (20, 15))
|
| 231 |
+
)
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
plt.show()
|
| 234 |
return fig
|
| 235 |
|
| 236 |
+
# ToDo: No encuentro donde se usa este método. ¿Borrar?
|
| 237 |
+
def doesnt_match(
|
| 238 |
+
self,
|
| 239 |
+
wordlist
|
| 240 |
+
):
|
| 241 |
+
|
| 242 |
err = self.check_oov([wordlist])
|
| 243 |
if err:
|
| 244 |
raise Exception(err)
|
| 245 |
+
|
| 246 |
+
words_emb = np.array([self.embedding.getEmbedding(word)
|
| 247 |
+
for word in wordlist])
|
| 248 |
mean_vec = np.mean(words_emb, axis=0)
|
| 249 |
|
| 250 |
doesnt_match = ""
|
| 251 |
farthest_emb = 1.0
|
| 252 |
for word in wordlist:
|
| 253 |
+
word_emb = self.embedding.getEmbedding(word)
|
| 254 |
+
cos_sim = np.dot(mean_vec, word_emb) / \
|
| 255 |
+
(norm(mean_vec)*norm(word_emb))
|
| 256 |
if cos_sim <= farthest_emb:
|
| 257 |
farthest_emb = cos_sim
|
| 258 |
doesnt_match = word
|
modules/module_ann.py
CHANGED
|
@@ -1,45 +1,71 @@
|
|
| 1 |
import time
|
| 2 |
-
import operator
|
| 3 |
from tqdm import tqdm
|
| 4 |
from annoy import AnnoyIndex
|
| 5 |
from memory_profiler import profile
|
|
|
|
| 6 |
|
| 7 |
class TicToc:
|
| 8 |
-
def __init__(
|
|
|
|
|
|
|
|
|
|
| 9 |
self.i = None
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
self.i = time.time()
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
f = time.time()
|
| 14 |
print(f - self.i, "seg.")
|
| 15 |
|
|
|
|
| 16 |
class Ann:
|
| 17 |
-
def __init__(
|
| 18 |
-
self
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
self.tree = None
|
| 22 |
|
| 23 |
self.tt = TicToc()
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# metrics options = "angular", "euclidean", "manhattan", "hamming", or "dot"
|
| 28 |
# n_jobs=-1 Run over all CPU availables
|
| 29 |
|
| 30 |
-
print("
|
| 31 |
self.tt.start()
|
| 32 |
self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
|
| 33 |
-
for i,v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
|
| 34 |
-
self.tree.add_item(i,v)
|
| 35 |
self.tt.stop()
|
| 36 |
|
| 37 |
-
print("
|
| 38 |
self.tt.start()
|
| 39 |
self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
|
| 40 |
self.tt.stop()
|
| 41 |
|
| 42 |
-
def __getWordId(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
word_id = None
|
| 44 |
try:
|
| 45 |
word_id = self.words.index(word)
|
|
@@ -47,16 +73,20 @@ class Ann:
|
|
| 47 |
pass
|
| 48 |
return word_id
|
| 49 |
|
| 50 |
-
def get(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
word_id = self.__getWordId(word)
|
| 52 |
-
|
| 53 |
|
| 54 |
if word_id != None:
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
word_xy_list = operator.itemgetter(*neighbord_id)(self.words)
|
| 59 |
else:
|
| 60 |
print(f"The word '{word}' does not exist")
|
| 61 |
-
|
| 62 |
-
return
|
|
|
|
| 1 |
import time
|
|
|
|
| 2 |
from tqdm import tqdm
|
| 3 |
from annoy import AnnoyIndex
|
| 4 |
from memory_profiler import profile
|
| 5 |
+
from typing import List, Any
|
| 6 |
|
| 7 |
class TicToc:
|
| 8 |
+
def __init__(
|
| 9 |
+
self
|
| 10 |
+
) -> None:
|
| 11 |
+
|
| 12 |
self.i = None
|
| 13 |
+
|
| 14 |
+
def start(
|
| 15 |
+
self
|
| 16 |
+
) -> None:
|
| 17 |
+
|
| 18 |
self.i = time.time()
|
| 19 |
+
|
| 20 |
+
def stop(
|
| 21 |
+
self
|
| 22 |
+
) -> None:
|
| 23 |
+
|
| 24 |
f = time.time()
|
| 25 |
print(f - self.i, "seg.")
|
| 26 |
|
| 27 |
+
|
| 28 |
class Ann:
|
| 29 |
+
def __init__(
|
| 30 |
+
self,
|
| 31 |
+
words: List[str],
|
| 32 |
+
vectors: List[float],
|
| 33 |
+
coord: List[float],
|
| 34 |
+
) -> None:
|
| 35 |
+
|
| 36 |
+
self.words = words
|
| 37 |
+
self.vectors = vectors
|
| 38 |
+
self.coord = coord
|
| 39 |
self.tree = None
|
| 40 |
|
| 41 |
self.tt = TicToc()
|
| 42 |
|
| 43 |
+
def init(self,
|
| 44 |
+
n_trees: int=10,
|
| 45 |
+
metric: str='angular',
|
| 46 |
+
n_jobs: int=-1
|
| 47 |
+
) -> None:
|
| 48 |
+
|
| 49 |
# metrics options = "angular", "euclidean", "manhattan", "hamming", or "dot"
|
| 50 |
# n_jobs=-1 Run over all CPU availables
|
| 51 |
|
| 52 |
+
print("\tInit tree...")
|
| 53 |
self.tt.start()
|
| 54 |
self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
|
| 55 |
+
for i, v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
|
| 56 |
+
self.tree.add_item(i, v)
|
| 57 |
self.tt.stop()
|
| 58 |
|
| 59 |
+
print("\tBuild tree...")
|
| 60 |
self.tt.start()
|
| 61 |
self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
|
| 62 |
self.tt.stop()
|
| 63 |
|
| 64 |
+
def __getWordId(
|
| 65 |
+
self,
|
| 66 |
+
word: str
|
| 67 |
+
) -> int:
|
| 68 |
+
|
| 69 |
word_id = None
|
| 70 |
try:
|
| 71 |
word_id = self.words.index(word)
|
|
|
|
| 73 |
pass
|
| 74 |
return word_id
|
| 75 |
|
| 76 |
+
def get(
|
| 77 |
+
self,
|
| 78 |
+
word: str,
|
| 79 |
+
n_neighbors: int=10
|
| 80 |
+
) -> List[str]:
|
| 81 |
+
|
| 82 |
word_id = self.__getWordId(word)
|
| 83 |
+
neighbords_list = None
|
| 84 |
|
| 85 |
if word_id != None:
|
| 86 |
+
neighbords_id = self.tree.get_nns_by_item(word_id, n_neighbors + 1)
|
| 87 |
+
neighbords_list = [self.words[idx] for idx in neighbords_id][1:]
|
| 88 |
+
|
|
|
|
| 89 |
else:
|
| 90 |
print(f"The word '{word}' does not exist")
|
| 91 |
+
|
| 92 |
+
return neighbords_list
|
modules/module_connection.py
CHANGED
|
@@ -1,52 +1,75 @@
|
|
| 1 |
-
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import gradio as gr
|
| 4 |
-
from abc import ABC, abstractmethod
|
| 5 |
|
| 6 |
-
from modules.module_WordExplorer import WordExplorer
|
| 7 |
from modules.module_BiasExplorer import WordBiasExplorer
|
|
|
|
|
|
|
| 8 |
|
| 9 |
class Connector(ABC):
|
| 10 |
-
def parse_word(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
return word.lower().strip()
|
| 12 |
|
| 13 |
-
def parse_words(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
words = array_in_string.strip()
|
| 15 |
if not words:
|
| 16 |
return []
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
return words
|
| 19 |
|
| 20 |
-
def process_error(
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
class WordExplorerConnector(Connector):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
def __init__(self, **kwargs):
|
| 29 |
if 'embedding' in kwargs:
|
| 30 |
embedding = kwargs.get('embedding')
|
| 31 |
else:
|
| 32 |
raise KeyError
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
err = ""
|
| 51 |
neighbors_method = 'sklearn'
|
| 52 |
wordlist_0 = self.parse_words(wordlist_0)
|
|
@@ -59,49 +82,63 @@ class WordExplorerConnector(Connector):
|
|
| 59 |
err = self.process_error("Ingresa al menos 1 palabras para continuar")
|
| 60 |
return None, err
|
| 61 |
|
| 62 |
-
err = self.word_explorer.check_oov(
|
|
|
|
|
|
|
|
|
|
| 63 |
if err:
|
| 64 |
return None, self.process_error(err)
|
| 65 |
|
| 66 |
-
fig = self.word_explorer.plot_projections_2d(
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
| 81 |
return fig, self.process_error(err)
|
| 82 |
|
| 83 |
class BiasWordExplorerConnector(Connector):
|
| 84 |
|
| 85 |
-
def __init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
if 'embedding' in kwargs:
|
| 87 |
embedding = kwargs.get('embedding')
|
| 88 |
else:
|
| 89 |
raise KeyError
|
| 90 |
-
self.bias_word_explorer = WordBiasExplorer(embedding)
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
err = ""
|
| 98 |
wordlist_1 = self.parse_words(wordlist_1)
|
| 99 |
wordlist_2 = self.parse_words(wordlist_2)
|
| 100 |
to_diagnose_list = self.parse_words(to_diagnose_list)
|
| 101 |
|
| 102 |
word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
|
| 103 |
-
for
|
| 104 |
-
if not
|
| 105 |
err = "Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2"
|
| 106 |
if err:
|
| 107 |
return None, self.process_error(err)
|
|
@@ -110,17 +147,23 @@ class BiasWordExplorerConnector(Connector):
|
|
| 110 |
if err:
|
| 111 |
return None, self.process_error(err)
|
| 112 |
|
| 113 |
-
fig = self.bias_word_explorer.plot_biased_words(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
return fig, self.process_error(err)
|
| 116 |
|
| 117 |
-
def calculate_bias_4d(
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
| 124 |
err = ""
|
| 125 |
wordlist_1 = self.parse_words(wordlist_1)
|
| 126 |
wordlist_2 = self.parse_words(wordlist_2)
|
|
@@ -129,8 +172,8 @@ class BiasWordExplorerConnector(Connector):
|
|
| 129 |
to_diagnose_list = self.parse_words(to_diagnose_list)
|
| 130 |
|
| 131 |
wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
|
| 132 |
-
for
|
| 133 |
-
if not
|
| 134 |
err = "¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!"
|
| 135 |
if err:
|
| 136 |
return None, self.process_error(err)
|
|
@@ -139,5 +182,12 @@ class BiasWordExplorerConnector(Connector):
|
|
| 139 |
if err:
|
| 140 |
return None, self.process_error(err)
|
| 141 |
|
| 142 |
-
fig = self.bias_word_explorer.plot_biased_words(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
return fig, self.process_error(err)
|
|
|
|
| 1 |
+
from abc import ABC
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from modules.module_WordExplorer import WordExplorer
|
| 4 |
from modules.module_BiasExplorer import WordBiasExplorer
|
| 5 |
+
from typing import List, Tuple
|
| 6 |
+
|
| 7 |
|
| 8 |
class Connector(ABC):
|
| 9 |
+
def parse_word(
|
| 10 |
+
self,
|
| 11 |
+
word: str
|
| 12 |
+
) -> str:
|
| 13 |
+
|
| 14 |
return word.lower().strip()
|
| 15 |
|
| 16 |
+
def parse_words(
|
| 17 |
+
self,
|
| 18 |
+
array_in_string: str
|
| 19 |
+
) -> List[str]:
|
| 20 |
+
|
| 21 |
words = array_in_string.strip()
|
| 22 |
if not words:
|
| 23 |
return []
|
| 24 |
+
|
| 25 |
+
words = [
|
| 26 |
+
self.parse_word(word)
|
| 27 |
+
for word in words.split(',') if word.strip() != ''
|
| 28 |
+
]
|
| 29 |
return words
|
| 30 |
|
| 31 |
+
def process_error(
|
| 32 |
+
self,
|
| 33 |
+
err: str
|
| 34 |
+
) -> str:
|
| 35 |
+
|
| 36 |
+
if err:
|
| 37 |
+
err = "<center><h3>" + err + "</h3></center>"
|
| 38 |
+
return err
|
| 39 |
|
| 40 |
|
| 41 |
class WordExplorerConnector(Connector):
|
| 42 |
+
def __init__(
    self,
    **kwargs
) -> None:
    """Build the connector around a WordExplorer.

    Keyword Args:
        embedding: Required. The Embedding instance the explorer will use.

    Raises:
        KeyError: If 'embedding' is not provided.
    """
    if 'embedding' not in kwargs:
        # Fail fast with an explanatory message instead of a bare KeyError.
        raise KeyError("WordExplorerConnector requires an 'embedding' keyword argument")

    self.word_explorer = WordExplorer(
        embedding=kwargs['embedding']
    )
|
| 55 |
+
|
| 56 |
+
def plot_proyection_2d(
|
| 57 |
+
self,
|
| 58 |
+
wordlist_0: str,
|
| 59 |
+
wordlist_1: str,
|
| 60 |
+
wordlist_2: str,
|
| 61 |
+
wordlist_3: str,
|
| 62 |
+
wordlist_4: str,
|
| 63 |
+
color_wordlist_0: str,
|
| 64 |
+
color_wordlist_1: str,
|
| 65 |
+
color_wordlist_2: str,
|
| 66 |
+
color_wordlist_3: str,
|
| 67 |
+
color_wordlist_4: str,
|
| 68 |
+
n_alpha: float,
|
| 69 |
+
fontsize: int,
|
| 70 |
+
n_neighbors: int
|
| 71 |
+
) -> Tuple:
|
| 72 |
+
|
| 73 |
err = ""
|
| 74 |
neighbors_method = 'sklearn'
|
| 75 |
wordlist_0 = self.parse_words(wordlist_0)
|
|
|
|
| 82 |
err = self.process_error("Ingresa al menos 1 palabras para continuar")
|
| 83 |
return None, err
|
| 84 |
|
| 85 |
+
err = self.word_explorer.check_oov(
|
| 86 |
+
[wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
if err:
|
| 90 |
return None, self.process_error(err)
|
| 91 |
|
| 92 |
+
fig = self.word_explorer.plot_projections_2d(
|
| 93 |
+
wordlist_0,
|
| 94 |
+
wordlist_1,
|
| 95 |
+
wordlist_2,
|
| 96 |
+
wordlist_3,
|
| 97 |
+
wordlist_4,
|
| 98 |
+
color_wordlist_0=color_wordlist_0,
|
| 99 |
+
color_wordlist_1=color_wordlist_1,
|
| 100 |
+
color_wordlist_2=color_wordlist_2,
|
| 101 |
+
color_wordlist_3=color_wordlist_3,
|
| 102 |
+
color_wordlist_4=color_wordlist_4,
|
| 103 |
+
n_alpha=n_alpha,
|
| 104 |
+
fontsize=fontsize,
|
| 105 |
+
n_neighbors=n_neighbors,
|
| 106 |
+
nn_method = neighbors_method
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
return fig, self.process_error(err)
|
| 110 |
|
| 111 |
class BiasWordExplorerConnector(Connector):
|
| 112 |
|
| 113 |
+
def __init__(
    self,
    **kwargs
) -> None:
    """Build the connector around a WordBiasExplorer.

    Keyword Args:
        embedding: Required. The Embedding instance the explorer will use.

    Raises:
        KeyError: If 'embedding' is not provided.
    """
    if 'embedding' not in kwargs:
        # Fail fast with an explanatory message instead of a bare KeyError.
        raise KeyError("BiasWordExplorerConnector requires an 'embedding' keyword argument")

    self.bias_word_explorer = WordBiasExplorer(
        embedding=kwargs['embedding']
    )
|
| 126 |
+
|
| 127 |
+
def calculate_bias_2d(
|
| 128 |
+
self,
|
| 129 |
+
wordlist_1: str,
|
| 130 |
+
wordlist_2: str,
|
| 131 |
+
to_diagnose_list: str
|
| 132 |
+
) -> Tuple:
|
| 133 |
+
|
| 134 |
err = ""
|
| 135 |
wordlist_1 = self.parse_words(wordlist_1)
|
| 136 |
wordlist_2 = self.parse_words(wordlist_2)
|
| 137 |
to_diagnose_list = self.parse_words(to_diagnose_list)
|
| 138 |
|
| 139 |
word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
|
| 140 |
+
for _list in word_lists:
|
| 141 |
+
if not _list:
|
| 142 |
err = "Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2"
|
| 143 |
if err:
|
| 144 |
return None, self.process_error(err)
|
|
|
|
| 147 |
if err:
|
| 148 |
return None, self.process_error(err)
|
| 149 |
|
| 150 |
+
fig = self.bias_word_explorer.plot_biased_words(
|
| 151 |
+
to_diagnose_list,
|
| 152 |
+
wordlist_2,
|
| 153 |
+
wordlist_1
|
| 154 |
+
)
|
| 155 |
|
| 156 |
return fig, self.process_error(err)
|
| 157 |
|
| 158 |
+
def calculate_bias_4d(
|
| 159 |
+
self,
|
| 160 |
+
wordlist_1: str,
|
| 161 |
+
wordlist_2: str,
|
| 162 |
+
wordlist_3: str,
|
| 163 |
+
wordlist_4: str,
|
| 164 |
+
to_diagnose_list: str
|
| 165 |
+
) -> Tuple:
|
| 166 |
+
|
| 167 |
err = ""
|
| 168 |
wordlist_1 = self.parse_words(wordlist_1)
|
| 169 |
wordlist_2 = self.parse_words(wordlist_2)
|
|
|
|
| 172 |
to_diagnose_list = self.parse_words(to_diagnose_list)
|
| 173 |
|
| 174 |
wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
|
| 175 |
+
for _list in wordlists:
|
| 176 |
+
if not _list:
|
| 177 |
err = "¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!"
|
| 178 |
if err:
|
| 179 |
return None, self.process_error(err)
|
|
|
|
| 182 |
if err:
|
| 183 |
return None, self.process_error(err)
|
| 184 |
|
| 185 |
+
fig = self.bias_word_explorer.plot_biased_words(
|
| 186 |
+
to_diagnose_list,
|
| 187 |
+
wordlist_1,
|
| 188 |
+
wordlist_2,
|
| 189 |
+
wordlist_3,
|
| 190 |
+
wordlist_4
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
return fig, self.process_error(err)
|
modules/module_logsManager.py
CHANGED
|
@@ -40,11 +40,11 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
|
|
| 40 |
|
| 41 |
def __init__(
|
| 42 |
self,
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
organization: Optional[str]
|
| 46 |
-
private: bool
|
| 47 |
-
available_logs: bool
|
| 48 |
):
|
| 49 |
"""
|
| 50 |
Parameters:
|
|
@@ -53,6 +53,8 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
|
|
| 53 |
organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
|
| 54 |
private: Whether the dataset should be private (defaults to False).
|
| 55 |
"""
|
|
|
|
|
|
|
| 56 |
self.hf_token = hf_token
|
| 57 |
self.dataset_name = dataset_name
|
| 58 |
self.organization_name = organization
|
|
|
|
| 40 |
|
| 41 |
def __init__(
|
| 42 |
self,
|
| 43 |
+
dataset_name: str=None,
|
| 44 |
+
hf_token: str=os.getenv('HF_TOKEN'),
|
| 45 |
+
organization: Optional[str]=os.getenv('ORG_NAME'),
|
| 46 |
+
private: bool=True,
|
| 47 |
+
available_logs: bool=False
|
| 48 |
):
|
| 49 |
"""
|
| 50 |
Parameters:
|
|
|
|
| 53 |
organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
|
| 54 |
private: Whether the dataset should be private (defaults to False).
|
| 55 |
"""
|
| 56 |
+
assert(dataset_name is not None), "Error: Parameter 'dataset_name' cannot be empty!."
|
| 57 |
+
|
| 58 |
self.hf_token = hf_token
|
| 59 |
self.dataset_name = dataset_name
|
| 60 |
self.organization_name = organization
|
tool_info.py
CHANGED
|
@@ -4,7 +4,7 @@ TOOL_INFO = """
|
|
| 4 |
* [Read Full Paper](https://arxiv.org/abs/2207.06591)
|
| 5 |
|
| 6 |
> ### Licensing Information
|
| 7 |
-
* [MIT Licence](https://huggingface.co/spaces/vialibre/
|
| 8 |
|
| 9 |
> ### Citation Information
|
| 10 |
```c
|
|
|
|
| 4 |
* [Read Full Paper](https://arxiv.org/abs/2207.06591)
|
| 5 |
|
| 6 |
> ### Licensing Information
|
| 7 |
+
* [MIT Licence](https://huggingface.co/spaces/vialibre/edia_we_es/resolve/main/LICENSE)
|
| 8 |
|
| 9 |
> ### Citation Information
|
| 10 |
```c
|