Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Plot Results
Browse files
- app.py +6 -0
- src/results.py +43 -0
app.py
CHANGED
|
@@ -19,6 +19,7 @@ from src.results import (
|
|
| 19 |
display_results,
|
| 20 |
fetch_result_paths,
|
| 21 |
load_results_dataframes,
|
|
|
|
| 22 |
sort_result_paths_per_model,
|
| 23 |
update_load_results_component,
|
| 24 |
update_tasks_component,
|
|
@@ -62,6 +63,7 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
|
|
| 62 |
visible=False,
|
| 63 |
)
|
| 64 |
hide_std_errors = gr.Checkbox(label="Hide Standard Errors", value=True, info="Options")
|
|
|
|
| 65 |
results = gr.HTML()
|
| 66 |
results_dataframe_1 = gr.Dataframe(visible=False)
|
| 67 |
results_dataframe_2 = gr.Dataframe(visible=False)
|
|
@@ -153,6 +155,10 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
|
|
| 153 |
fn=display_results,
|
| 154 |
inputs=[results_task, hide_std_errors, show_only_differences, results_dataframe_1, results_dataframe_2],
|
| 155 |
outputs=[results, configs],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
)
|
| 157 |
gr.on(
|
| 158 |
triggers=[clear_results_btn.click, clear_configs_btn.click],
|
|
|
|
| 19 |
display_results,
|
| 20 |
fetch_result_paths,
|
| 21 |
load_results_dataframes,
|
| 22 |
+
plot_results,
|
| 23 |
sort_result_paths_per_model,
|
| 24 |
update_load_results_component,
|
| 25 |
update_tasks_component,
|
|
|
|
| 63 |
visible=False,
|
| 64 |
)
|
| 65 |
hide_std_errors = gr.Checkbox(label="Hide Standard Errors", value=True, info="Options")
|
| 66 |
+
results_plot = gr.Plot()
|
| 67 |
results = gr.HTML()
|
| 68 |
results_dataframe_1 = gr.Dataframe(visible=False)
|
| 69 |
results_dataframe_2 = gr.Dataframe(visible=False)
|
|
|
|
| 155 |
fn=display_results,
|
| 156 |
inputs=[results_task, hide_std_errors, show_only_differences, results_dataframe_1, results_dataframe_2],
|
| 157 |
outputs=[results, configs],
|
| 158 |
+
).then(
|
| 159 |
+
fn=plot_results,
|
| 160 |
+
inputs=[results_task, results_dataframe_1, results_dataframe_2], # results,
|
| 161 |
+
outputs=results_plot,
|
| 162 |
)
|
| 163 |
gr.on(
|
| 164 |
triggers=[clear_results_btn.click, clear_configs_btn.click],
|
src/results.py
CHANGED
|
@@ -4,6 +4,7 @@ import gradio as gr
|
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
import src.constants as constants
|
|
|
|
| 7 |
from src.hub import glob, load_json_file
|
| 8 |
|
| 9 |
|
|
@@ -143,3 +144,45 @@ def clear_results():
|
|
| 143 |
|
| 144 |
def display_loading_message_for_results():
    """Return the placeholder HTML shown while results load, once per output panel (results, configs)."""
    return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
import src.constants as constants
|
| 7 |
+
from src.constants import TASKS
|
| 8 |
from src.hub import glob, load_json_file
|
| 9 |
|
| 10 |
|
|
|
|
| 144 |
|
| 145 |
def display_loading_message_for_results():
    """Return the placeholder HTML shown while results load, once per output panel (results, configs)."""
    loading_html = "<h3 style='text-align: center;'>Loading...</h3>"
    return (loading_html, loading_html)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def plot_results(task, *dfs):
    """Render a grouped bar chart of model scores for the selected task.

    Args:
        task: Selected task name — "All" for the aggregated leaderboard view,
            or a single task key such as "leaderboard_ifeval".
        *dfs: Per-model result dataframes to concatenate (via concat_results)
            and plot side by side.

    Returns:
        A matplotlib Figure with one bar group per task/metric column and one
        bar per model row, or None (implicitly) when no results are loaded.
    """
    df = concat_results(dfs)
    if df is not None:
        # Keep only accuracy-style metric columns, which are named
        # "results.<task>.<metric>,none".
        df = df[
            [
                col
                for col in df.columns
                if col.startswith("results.")
                and (col.endswith("acc,none") or col.endswith("acc_norm,none") or col.endswith("exact_match,none"))
            ]
        ]
        if task == "All":
            # Restrict to the top-level leaderboard tasks (drop subtask columns).
            df = df[[col for col in df.columns if col.split(".")[1] in TASKS]]
            # - IFEval: Calculate average of both strict accuracies
            ifeval_mean = df[
                [
                    "results.leaderboard_ifeval.inst_level_strict_acc,none",
                    "results.leaderboard_ifeval.prompt_level_strict_acc,none",
                ]
            ].mean(axis=1)
            # Replace the two IFEval columns with their single averaged column.
            df = df.drop(columns=[col for col in df.columns if col.split(".")[1] == "leaderboard_ifeval"])
            # NOTE(review): inserts the averaged IFEval column one position
            # BEFORE the math_hard column (`loc - 1`) — confirm this is the
            # intended display order rather than `loc`.
            loc = df.columns.get_loc("results.leaderboard_math_hard.exact_match,none")
            df.insert(loc - 1, "results.leaderboard_ifeval", ifeval_mean)
            # Rename
            df = df.rename(columns=lambda col: TASKS[col.split(".")[1]][0])
        else:
            # Single-task view: keep that task's columns only.
            df = df[[col for col in df.columns if col.startswith(f"results.{task}")]]
            # Merge task and subtask keys into one key -> display-name mapping.
            tasks = {key: tupl[0] for key, tupl in TASKS.items()}
            subtasks = {tupl[1]: tupl[0] for value in constants.SUBTASKS.values() for tupl in value}
            subtasks = {**tasks, **subtasks}
            # - IFEval: Return 4 accuracies
            if task == "leaderboard_ifeval":
                # Label bars by raw metric name (e.g. "inst_level_strict_acc").
                df = df.rename(columns=lambda col: col.split(".")[2].removesuffix(",none"))
            else:
                df = df.rename(columns=lambda col: subtasks[col.split(".")[1]])
        # Transpose so columns (metrics) become the x-axis groups and each
        # model row becomes a legend entry labeled "Models".
        ax = df.T.rename_axis(columns="Models").plot(kind="bar", ylabel="Scores", rot=45, figsize=(18, 6))
        fig = ax.get_figure()
        fig.autofmt_xdate(rotation=45)
        fig.tight_layout()
        return fig