Commit 1d1f5e9 by xeon27
Parent(s): 0796d85

Add separate tab for agentic benchmark
Files changed:
- app.py (+9 -5)
- src/display/utils.py (+2 -2)
- src/populate.py (+2 -3)
app.py CHANGED

@@ -14,8 +14,8 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    …
-    …
+    ST_BENCHMARK_COLS,
+    AGENTIC_BENCHMARK_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     AutoEvalColumn,

@@ -49,7 +49,8 @@ except Exception:
     restart_space()


-…
+ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ST_BENCHMARK_COLS)
+AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, AGENTIC_BENCHMARK_COLS)

 (
     finished_eval_queue_df,

@@ -96,8 +97,11 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("…
-            leaderboard = init_leaderboard(…
+        with gr.TabItem("Single-turn Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(ST_LEADERBOARD_DF)
+
+        with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
+            leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF)

     with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
         gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
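Both tabs bind their component to the same `leaderboard` name; since Gradio registers components at creation time inside the `Blocks` context, the rebinding is harmless, though distinct names would read more clearly. For reference, a minimal self-contained sketch of the two-tab pattern the new code relies on (the dataframes and `init_leaderboard` stand-ins below are illustrative, not the Space's actual implementations):

    import gradio as gr
    import pandas as pd

    # Stand-in frames; the real app builds these with get_leaderboard_df().
    ST_DF = pd.DataFrame({"Model": ["m1"], "MMLU": [71.2]})
    AGENTIC_DF = pd.DataFrame({"Model": ["m1"], "GAIA": [23.5]})

    def init_leaderboard(df):
        # Stand-in for the Space's init_leaderboard(); a plain Dataframe
        # component is enough to show the tab wiring.
        return gr.Dataframe(value=df, interactive=False)

    with gr.Blocks() as demo:
        with gr.Tabs(elem_classes="tab-buttons"):
            with gr.TabItem("Single-turn Benchmark", id=0):
                st_board = init_leaderboard(ST_DF)
            with gr.TabItem("Agentic Benchmark", id=1):
                agentic_board = init_leaderboard(AGENTIC_DF)

    demo.launch()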
src/display/utils.py CHANGED

@@ -106,5 +106,5 @@ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

-…
-…
+ST_BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.type=="single-turn"]
+AGENTIC_BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.type=="agentic"]
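The two comprehensions assume each `Tasks` member wraps a task record exposing `col_name` and a `type` discriminator. The actual definitions live elsewhere in the repo (src/about.py in this leaderboard template); the sketch below shows the assumed shape with made-up entries:

    from dataclasses import dataclass
    from enum import Enum

    @dataclass
    class Task:
        benchmark: str
        metric: str
        col_name: str
        type: str  # assumed discriminator: "single-turn" or "agentic"

    class Tasks(Enum):
        # Illustrative entries, not the Space's real task list.
        task0 = Task("mmlu", "acc", "MMLU", "single-turn")
        task1 = Task("gaia", "acc", "GAIA", "agentic")

    ST_BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.type == "single-turn"]
    AGENTIC_BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.type == "agentic"]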
src/populate.py CHANGED

@@ -34,7 +34,7 @@ def get_inspect_log_url(model_name: str, benchmark_name: str) -> str:
     return f"https://storage.googleapis.com/inspect-evals/{model_name}/index.html?log_file=logs/logs/{log_file_name}"


-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]

@@ -42,12 +42,11 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     df = pd.DataFrame.from_records(all_data_json)

     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[…
+    df = df[benchmark_cols].round(decimals=2)

     # # filter out if any of the benchmarks have not been produced
     # df = df[has_no_nan_values(df, benchmark_cols)]
     df = df.fillna(EMPTY_SYMBOL)
-    print(df["GAIA"].head())

     # make values clickable and link to log files
     for col in benchmark_cols:
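With the `cols` parameter gone, each caller passes exactly the benchmark columns its tab should show, and column selection, rounding, and placeholder-filling happen in one pass; the leftover debug print is dropped along the way. A toy run of that pipeline, assuming `EMPTY_SYMBOL` is a string placeholder defined elsewhere in the Space and using made-up records:

    import pandas as pd

    EMPTY_SYMBOL = "--"  # assumed placeholder; the real value is defined elsewhere

    # Made-up records shaped like raw_data after .to_dict()
    records = [
        {"Model": "m1", "MMLU": 0.7123, "GAIA": None},
        {"Model": "m2", "MMLU": 0.6548, "GAIA": 0.2310},
    ]
    benchmark_cols = ["MMLU", "GAIA"]

    df = pd.DataFrame.from_records(records)
    df = df[benchmark_cols].round(decimals=2)  # keep only this tab's columns
    df = df.fillna(EMPTY_SYMBOL)               # missing scores render as "--"
    print(df)
    #    MMLU  GAIA
    # 0  0.71    --
    # 1  0.65  0.23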