Initial clone with modifications
- .ipynb_checkpoints/app-checkpoint.py +23 -18
- app.py +18 -18
- src/__pycache__/about.cpython-310.pyc +0 -0
- src/about.py +37 -37
.ipynb_checkpoints/app-checkpoint.py
CHANGED

@@ -52,7 +52,7 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
     - lang in {EN, IT, SL, SK, GR, PL} or None/"All"
     - shot in {"0","10"} or None/"All" (mapped to IS_FS False/True)
     """
-    tasks = ["NER", "REL", "RML", "HIS", "DIA"]
+    tasks = ["NER-E3C", "REL-E3C", "CRF-RML", "CRF-HIS", "CRF-DIA", "NER-PHA"]
     df = dataframe.copy()
 
     if lang and lang != "All" and "LANG" in df.columns:
@@ -141,7 +141,7 @@ def create_best_model_comparison_table_without_lang(dataframe):
     Table with the best overall model per task (NER, REL,) and the model that
     achieves the best score with its own best prompt.
     """
-    tasks = ["NER", "REL", "RML", "HIS", "DIA"]
+    tasks = ["NER-E3C", "REL-E3C", "CRF-RML", "CRF-HIS", "CRF-DIA", "NER-PHA"]
     table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []}
 
     for task in tasks:
@@ -216,7 +216,7 @@ def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None =
     - lang: None or one of EN/IT/SL/SK/GR/PL (None means All)
     - shot: None or "0"/"10" (None means All) mapped to IS_FS False/True
     """
-    tasks = ["NER", "REL", "RML", "HIS", "DIA"]
+    tasks = ["NER-E3C", "REL-E3C", "RML-CRF", "HIS-CRF", "DIA-CRF", "NER-PHA"]
 
     df = dataframe.copy()
     # Language filter
@@ -236,6 +236,8 @@ def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None =
     for task in tasks:
         col = f"{task} Best Prompt Id"
         if col in df.columns:
+            #print (col)
+            #print(df[col])
             all_ids.update(df[col].dropna().unique())
     prompt_ids_raw = sorted(list(all_ids), key=lambda x: int(re.sub(r'[^0-9]', '', str(x)) or 0))
     prompt_ids_raw = [pid for pid in prompt_ids_raw if label_for(pid) in {"p1", "p2", "p3"}] or [1, 2, 3]
@@ -294,7 +296,7 @@ def create_prompt_heatmap_without_lang(dataframe):
     for tasks NER and REL, with exactly 3 prompts (p1, p2, p3). It supports columns storing
     ids as integers (1/2/3) or strings ('p1'/'p2'/'p3').
     """
-    tasks = ["NER", "REL", "RML", "HIS", "DIA"]
+    tasks = ["NER-E3C", "REL-E3C", "CRF-RML", "CRF-HIS", "CRF-DIA", "NER-PHA"]
 
     # Collect unique prompt ids as they appear (int or 'pX'); restrict to 3 prompts
     all_ids = set()
@@ -395,10 +397,11 @@ def mean_of_max_per_field(df):
         float: media dei valori massimi dei campi
     """
     #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
-    fields = ["NER", "REL", "RML", "DIA", "HIS"]
+    fields = ["NER-E3C", "REL-E3C", "CRF-RML", "CRF-DIA", "CRF-HIS", "NER-PHA"]
     #print(df.columns)
 
     # Controlla che tutte le colonne esistano nel DataFrame
+    print(df.columns)
     missing = [f for f in fields if f not in df.columns]
     if missing:
         raise ValueError(f"Le seguenti colonne mancano nel DataFrame: {missing}")
@@ -414,7 +417,7 @@ def mean_of_max_per_field(df):
 
 def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
     if tasks is None:
-        tasks = [ "NER", "REL", "RML", "DIA", "HIS"]
+        tasks = [ "NER-E3C", "REL-E3C", "RML-CRF", "DIA-CRF", "HIS-CRF","NER-PHA"]
 
     task_means = {}
 
@@ -489,7 +492,7 @@ def boxplot_per_task(dataframe=None, baselines=None, references=None):
     #print(dataframe.columns)
 
     #tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
-    tasks =["NER", "REL", "RML", "HIS", "DIA"]
+    tasks =["NER-E3C", "REL-E3C", "CRF-RML", "CRF-HIS", "CRF-DIA" , "NER-PHA"]
     if dataframe is None:
         np.random.seed(42)
         dataframe = pd.DataFrame({
@@ -598,7 +601,7 @@ REFERENCES = {
 
 def boxplot_prompts_per_task(dataframe, tasks=None):
     if tasks is None:
-        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER-E3C", "REL-E3C"]
 
     # Lista delle colonne da aggiornare
     cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
@@ -808,11 +811,13 @@ TASK_METADATA_MULTIPLECHOICE = {
 # Define task metadata (icons, names, descriptions)
 TASK_METADATA_GENERATIVE = {
 
-    "NER": {"icon": "π·οΈ", "name": "Named Entity Recognition", "tooltip": ""},
-    "REL": {"icon": "π", "name": "Relation Extraction", "tooltip": ""},
-    "RML": {"icon": "
-    "DIA": {"icon": "π₯", "name": "CRF Diagnosis", "tooltip": "CRF Diagnosis"},
-    "HIS": {"icon": "π", "name": "CRF History", "tooltip": "CRF History"},
+    "NER-E3C": {"icon": "π·οΈ", "name": "Named Entity Recognition", "tooltip": ""},
+    "REL-E3C": {"icon": "π", "name": "Relation Extraction", "tooltip": ""},
+    "CRF-RML": {"icon": "π", "name": "CRF RML", "tooltip": "CRF RML"},
+    "CRF-DIA": {"icon": "π₯", "name": "CRF Diagnosis", "tooltip": "CRF Diagnosis"},
+    "CRF-HIS": {"icon": "π", "name": "CRF History", "tooltip": "CRF History"},
+    "NER-PHA": {"icon": "π·οΈ", "name": "Named Entity Recognition over PharmaER.It Datasets", "tooltip": ""},
+
 }
 
 def restart_space():
@@ -891,7 +896,7 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS): "),
-            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="
+            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages: "),
             ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max = 100, default = [0,100], label="Select the number of parameters (B)"),
         ],
         bool_checkboxgroup_label="Evaluation Mode",
@@ -975,7 +980,7 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=No
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS): "),
-           ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="
+           ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages: "),
 
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
                         label="Select the number of parameters (B)"),
@@ -1075,8 +1080,8 @@ with demo:
 
             leaderboard = init_leaderboard(
                 LEADERBOARD_DF,
-                default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. β¬οΈ", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"],
-                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. β¬οΈ", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"]]
+                default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. β¬οΈ", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER-E3C", "REL-E3C", "CRF-RML", "CRF-DIA", "CRF-HIS", "NER-PHA"],
+                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. β¬οΈ", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER-E3C", "REL-E3C", "CRF-RML", "CRF-DIA", "CRF-HIS", "NER-PHA"]]
             )
 
 
@@ -1107,7 +1112,7 @@ with demo:
             with gr.TabItem(f"{metadata['icon']}{task}"):
                 task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
                 gr.Markdown(task_description, elem_classes="markdown-text1")
-
+                print (task)
                 leaderboard = update_task_leaderboard(
                     LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
                                                    f"{task} Prompt Std": "Prompt Std",
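Both app files key the new filters on the same two DataFrame columns. A minimal sketch of the lang/shot filtering that the docstrings above describe, assuming only the column names visible in the diff (LANG, IS_FS); the helper name is illustrative and not part of the app:

import pandas as pd

def filter_lang_shot(df: pd.DataFrame, lang: str | None = None, shot: str | None = None) -> pd.DataFrame:
    # Hypothetical helper mirroring the filters described in the docstrings above.
    out = df.copy()
    # lang is one of EN/IT/SL/SK/GR/PL; None or "All" keeps every row
    if lang and lang != "All" and "LANG" in out.columns:
        out = out[out["LANG"] == lang]
    # shot "0"/"10" maps to the boolean IS_FS column: "0" -> zero-shot (False), "10" -> few-shot (True)
    if shot and shot != "All" and "IS_FS" in out.columns:
        out = out[out["IS_FS"] == (shot == "10")]
    return out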
app.py
CHANGED

@@ -52,7 +52,7 @@ def create_best_model_comparison_table(dataframe, lang: str | None = None, shot:
     - lang in {EN, IT, SL, SK, GR, PL} or None/"All"
     - shot in {"0","10"} or None/"All" (mapped to IS_FS False/True)
     """
-    tasks = ["NER", "REL", "RML
+    tasks = ["NER-E3C", "REL-E3C", "CRF-RML", "CRF-HIS", "CRF-DIA", "NER-PHA"]
     df = dataframe.copy()
 
     if lang and lang != "All" and "LANG" in df.columns:
@@ -141,7 +141,7 @@ def create_best_model_comparison_table_without_lang(dataframe):
     Table with the best overall model per task (NER, REL,) and the model that
     achieves the best score with its own best prompt.
     """
-    tasks = ["NER", "REL", "RML
+    tasks = ["NER-E3C", "REL-E3C", "CRF-RML", "CRF-HIS", "CRF-DIA", "NER-PHA"]
     table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []}
 
     for task in tasks:
@@ -216,7 +216,7 @@ def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None =
     - lang: None or one of EN/IT/SL/SK/GR/PL (None means All)
     - shot: None or "0"/"10" (None means All) mapped to IS_FS False/True
     """
-    tasks = ["NER", "REL", "RML-CRF", "HIS-CRF", "DIA-CRF", "NER-
+    tasks = ["NER-E3C", "REL-E3C", "RML-CRF", "HIS-CRF", "DIA-CRF", "NER-PHA"]
 
     df = dataframe.copy()
     # Language filter
@@ -296,7 +296,7 @@ def create_prompt_heatmap_without_lang(dataframe):
     for tasks NER and REL, with exactly 3 prompts (p1, p2, p3). It supports columns storing
     ids as integers (1/2/3) or strings ('p1'/'p2'/'p3').
     """
-    tasks = ["NER", "REL", "RML
+    tasks = ["NER-E3C", "REL-E3C", "CRF-RML", "CRF-HIS", "CRF-DIA", "NER-PHA"]
 
     # Collect unique prompt ids as they appear (int or 'pX'); restrict to 3 prompts
     all_ids = set()
@@ -397,7 +397,7 @@ def mean_of_max_per_field(df):
         float: media dei valori massimi dei campi
     """
     #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
-    fields = ["NER", "REL", "RML
+    fields = ["NER-E3C", "REL-E3C", "CRF-RML", "CRF-DIA", "CRF-HIS", "NER-PHA"]
     #print(df.columns)
 
     # Controlla che tutte le colonne esistano nel DataFrame
@@ -417,7 +417,7 @@ def mean_of_max_per_field(df):
 
 def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
     if tasks is None:
-        tasks = [ "NER", "REL", "RML-CRF", "DIA-CRF", "HIS-CRF","NER-
+        tasks = [ "NER-E3C", "REL-E3C", "RML-CRF", "DIA-CRF", "HIS-CRF","NER-PHA"]
 
     task_means = {}
 
@@ -492,7 +492,7 @@ def boxplot_per_task(dataframe=None, baselines=None, references=None):
     #print(dataframe.columns)
 
     #tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
-    tasks =["NER", "REL", "RML
+    tasks =["NER-E3C", "REL-E3C", "CRF-RML", "CRF-HIS", "CRF-DIA" , "NER-PHA"]
     if dataframe is None:
         np.random.seed(42)
         dataframe = pd.DataFrame({
@@ -601,7 +601,7 @@ REFERENCES = {
 
 def boxplot_prompts_per_task(dataframe, tasks=None):
     if tasks is None:
-        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER-E3C", "REL-E3C"]
 
     # Lista delle colonne da aggiornare
     cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
@@ -811,12 +811,12 @@ TASK_METADATA_MULTIPLECHOICE = {
 # Define task metadata (icons, names, descriptions)
 TASK_METADATA_GENERATIVE = {
 
-    "NER": {"icon": "π·οΈ", "name": "Named Entity Recognition", "tooltip": ""},
-    "REL": {"icon": "π", "name": "Relation Extraction", "tooltip": ""},
-    "RML
-    "DIA
-    "HIS
-    "NER-
+    "NER-E3C": {"icon": "π·οΈ", "name": "Named Entity Recognition", "tooltip": ""},
+    "REL-E3C": {"icon": "π", "name": "Relation Extraction", "tooltip": ""},
+    "CRF-RML": {"icon": "π", "name": "CRF RML", "tooltip": "CRF RML"},
+    "CRF-DIA": {"icon": "π₯", "name": "CRF Diagnosis", "tooltip": "CRF Diagnosis"},
+    "CRF-HIS": {"icon": "π", "name": "CRF History", "tooltip": "CRF History"},
+    "NER-PHA": {"icon": "π·οΈ", "name": "Named Entity Recognition over PharmaER.It Datasets", "tooltip": ""},
 
 }
 
@@ -896,7 +896,7 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS): "),
-            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="
+            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages: "),
             ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max = 100, default = [0,100], label="Select the number of parameters (B)"),
         ],
         bool_checkboxgroup_label="Evaluation Mode",
@@ -980,7 +980,7 @@ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=No
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS): "),
-           ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="
+           ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages: "),
 
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
                         label="Select the number of parameters (B)"),
@@ -1080,8 +1080,8 @@ with demo:
 
             leaderboard = init_leaderboard(
                 LEADERBOARD_DF,
-                default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. β¬οΈ", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML
-                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. β¬οΈ", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML
+                default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. β¬οΈ", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER-E3C", "REL-E3C", "CRF-RML", "CRF-DIA", "CRF-HIS", "NER-PHA"],
+                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. β¬οΈ", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER-E3C", "REL-E3C", "CRF-RML", "CRF-DIA", "CRF-HIS", "NER-PHA"]]
             )
 
 
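The default_selection / hidden_columns pair edited above follows one invariant: every renamed task column must appear in the visible list, and everything else in LEADERBOARD_DF is hidden. A sketch of that relationship using the column names from the diff; the "Avg. Comb. Perf." arrow emoji renders garbled in the diff and is written here as ⬆️ on assumption, and the refactor into two shared lists is illustrative, not the app's actual code:

# Columns shown by default on the main leaderboard tab
visible_cols = ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️",
                "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU",
                "NER-E3C", "REL-E3C", "CRF-RML", "CRF-DIA", "CRF-HIS", "NER-PHA"]

# Everything else stays hidden, so the two arguments cannot drift apart
hidden_cols = [col for col in LEADERBOARD_DF.columns if col not in visible_cols]

leaderboard = init_leaderboard(
    LEADERBOARD_DF,
    default_selection=visible_cols,
    hidden_columns=hidden_cols,
)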
src/__pycache__/about.cpython-310.pyc
CHANGED

Binary files a/src/__pycache__/about.cpython-310.pyc and b/src/__pycache__/about.cpython-310.pyc differ
src/about.py
CHANGED

@@ -72,43 +72,43 @@ class Tasks(Enum):
     #task48 = Task("relation-extraction_5", "acc", "std_accuracy", "REL Prompt Std")
     #task49 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
     #task50 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")
-    task1 = Task("RE_1", "acc", "CPS", "REL")
-    task2 = Task("RE_2", "acc", "average_accuracy", "REL Prompt Average")
-    task3 = Task("RE_5", "acc", "std_accuracy", "REL Prompt Std")
-    task4 = Task("RE_3", "acc", "best_prompt", "REL Best Prompt")
-    task5 = Task("RE_4", "acc", "prompt_id", "REL Best Prompt Id")
-
-    task6 = Task("NER_1", "acc", "CPS", "NER")
-    task7 = Task("NER_2", "acc", "average_accuracy", "NER Prompt Average")
-    task8 = Task("NER_3", "acc", "std_accuracy", "NER Prompt Std")
-    task9 = Task("NER_4", "acc", "best_prompt", "NER Best Prompt")
-    task10 = Task("NER_5", "acc", "prompt_id", "NER Best Prompt Id")
-
-    task11 = Task("RML-CRF_1", "acc", "CPS", "RML
-    task12 = Task("RML-CRF_2", "acc", "average_accuracy", "RML
-    task13 = Task("RML-CRF_3", "acc", "std_accuracy", "RML
-    task14 = Task("RML-CRF_4", "acc", "best_prompt", "RML
-    task15 = Task("RML-CRF_5", "acc", "prompt_id", "RML
-
-
-
-    task16 = Task("DIA-CRF_1", "acc", "CPS", "DIA
-    task17 = Task("DIA-CRF_2", "acc", "average_accuracy", "DIA
-    task18 = Task("DIA-CRF_3", "acc", "std_accuracy", "DIA
-    task19 = Task("DIA-CRF_4", "acc", "best_prompt", "DIA
-    task20 = Task("DIA-CRF_5", "acc", "prompt_id", "DIA
-
-    task21 = Task("HIS-CRF_1", "acc", "CPS", "HIS
-    task22 = Task("HIS-CRF_2", "acc", "average_accuracy", "HIS
-    task23 = Task("HIS-CRF_3", "acc", "std_accuracy", "HIS
-    task24 = Task("HIS-CRF_4", "acc", "best_prompt", "HIS
-    task25 = Task("HIS-CRF_5", "acc", "prompt_id", "HIS
-
-    task26 = Task("NER-PHARMAER_1", "acc", "CPS", "NER-
-    task27 = Task("NER-PHARMAER_2", "acc", "average_accuracy", "NER-
-    task28 = Task("NER-PHARMAER_3", "acc", "std_accuracy", "NER-
-    task29 = Task("NER-PHARMAER_4", "acc", "best_prompt", "NER-
-    task30 = Task("NER-PHARMAER_5", "acc", "prompt_id", "NER-
+    task1 = Task("RE_1", "acc", "CPS", "REL-E3C")
+    task2 = Task("RE_2", "acc", "average_accuracy", "REL-E3C Prompt Average")
+    task3 = Task("RE_5", "acc", "std_accuracy", "REL-E3C Prompt Std")
+    task4 = Task("RE_3", "acc", "best_prompt", "REL-E3C Best Prompt")
+    task5 = Task("RE_4", "acc", "prompt_id", "REL-E3C Best Prompt Id")
+
+    task6 = Task("NER_1", "acc", "CPS", "NER-E3C")
+    task7 = Task("NER_2", "acc", "average_accuracy", "NER-E3C Prompt Average")
+    task8 = Task("NER_3", "acc", "std_accuracy", "NER-E3C Prompt Std")
+    task9 = Task("NER_4", "acc", "best_prompt", "NER-E3C Best Prompt")
+    task10 = Task("NER_5", "acc", "prompt_id", "NER-E3C Best Prompt Id")
+
+    task11 = Task("RML-CRF_1", "acc", "CPS", "CRF-RML")
+    task12 = Task("RML-CRF_2", "acc", "average_accuracy", "CRF-RML Prompt Average")
+    task13 = Task("RML-CRF_3", "acc", "std_accuracy", "CRF-RML Prompt Std")
+    task14 = Task("RML-CRF_4", "acc", "best_prompt", "CRF-RML Best Prompt")
+    task15 = Task("RML-CRF_5", "acc", "prompt_id", "CRF-RML Best Prompt Id")
+
+
+
+    task16 = Task("DIA-CRF_1", "acc", "CPS", "CRF-DIA")
+    task17 = Task("DIA-CRF_2", "acc", "average_accuracy", "CRF-DIA Prompt Average")
+    task18 = Task("DIA-CRF_3", "acc", "std_accuracy", "CRF-DIA Prompt Std")
+    task19 = Task("DIA-CRF_4", "acc", "best_prompt", "CRF-DIA Best Prompt")
+    task20 = Task("DIA-CRF_5", "acc", "prompt_id", "CRF-DIA Best Prompt Id")
+
+    task21 = Task("HIS-CRF_1", "acc", "CPS", "CRF-HIS")
+    task22 = Task("HIS-CRF_2", "acc", "average_accuracy", "CRF-HIS Prompt Average")
+    task23 = Task("HIS-CRF_3", "acc", "std_accuracy", "CRF-HIS Prompt Std")
+    task24 = Task("HIS-CRF_4", "acc", "best_prompt", "CRF-HIS Best Prompt")
+    task25 = Task("HIS-CRF_5", "acc", "prompt_id", "CRF-HIS Best Prompt Id")
+
+    task26 = Task("NER-PHARMAER_1", "acc", "CPS", "NER-PHA")
+    task27 = Task("NER-PHARMAER_2", "acc", "average_accuracy", "NER-PHA Prompt Average")
+    task28 = Task("NER-PHARMAER_3", "acc", "std_accuracy", "NER-PHA Prompt Std")
+    task29 = Task("NER-PHARMAER_4", "acc", "best_prompt", "NER-PHA Best Prompt")
+    task30 = Task("NER-PHARMAER_5", "acc", "prompt_id", "NER-PHA Best Prompt Id")
 
     '''
     task0 = Task("TextualEntailment", "acc", "Textual Entailment")
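The four positional values appear to be (benchmark key, metric, results field, display column), with the last value matching the column names now used in app.py (e.g. "NER-E3C", "CRF-RML Best Prompt Id"), which is why the two files are renamed together. A minimal sketch of the container these entries assume; the field names are illustrative, since only the positional values are visible in the diff:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # evaluation run key, e.g. "RE_1"
    metric: str     # metric read from the results, e.g. "acc"
    field: str      # aggregate to surface, e.g. "CPS", "best_prompt", "prompt_id"
    col_name: str   # column displayed on the leaderboard, e.g. "REL-E3C"

class Tasks(Enum):
    # two representative entries copied from the diff above
    task1 = Task("RE_1", "acc", "CPS", "REL-E3C")
    task6 = Task("NER_1", "acc", "CPS", "NER-E3C")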