Spaces:
Running
Running
benediktstroebl committed on
Commit ·
c03c7bc
1
Parent(s): 56a86ce
added trace download links
Browse files
- .gitignore +3 -0
- app.py +10 -10
- utils/db.py +11 -5
- utils/processing.py +2 -17
.gitignore
CHANGED
|
@@ -5,3 +5,6 @@ evals_live/*
|
|
| 5 |
evals_processed/*
|
| 6 |
*.db
|
| 7 |
.env
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
evals_processed/*
|
| 6 |
*.db
|
| 7 |
.env
|
| 8 |
+
encrypted_files/*
|
| 9 |
+
evals_live_old/*
|
| 10 |
+
evals_upload_old/*
|
app.py
CHANGED
|
@@ -527,7 +527,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 527 |
ci_metrics=["Accuracy", "Total Cost"]
|
| 528 |
),
|
| 529 |
select_columns=SelectColumns(
|
| 530 |
-
default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"],
|
| 531 |
cant_deselect=["Agent Name"],
|
| 532 |
label="Select Columns to Display:",
|
| 533 |
),
|
|
@@ -567,7 +567,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 567 |
Leaderboard(
|
| 568 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 569 |
select_columns=SelectColumns(
|
| 570 |
-
default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"],
|
| 571 |
cant_deselect=["Agent Name"],
|
| 572 |
label="Select Columns to Display:",
|
| 573 |
),
|
|
@@ -603,7 +603,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 603 |
Leaderboard(
|
| 604 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 605 |
select_columns=SelectColumns(
|
| 606 |
-
default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
| 607 |
cant_deselect=["Agent Name"],
|
| 608 |
label="Select Columns to Display:",
|
| 609 |
),
|
|
@@ -647,7 +647,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 647 |
Leaderboard(
|
| 648 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 649 |
select_columns=SelectColumns(
|
| 650 |
-
default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
| 651 |
cant_deselect=["Agent Name"],
|
| 652 |
label="Select Columns to Display:",
|
| 653 |
),
|
|
@@ -685,7 +685,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 685 |
Leaderboard(
|
| 686 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 687 |
select_columns=SelectColumns(
|
| 688 |
-
default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
| 689 |
cant_deselect=["Agent Name"],
|
| 690 |
label="Select Columns to Display:",
|
| 691 |
),
|
|
@@ -745,7 +745,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 745 |
ci_metrics=["Accuracy", "Total Cost"]
|
| 746 |
),
|
| 747 |
select_columns=SelectColumns(
|
| 748 |
-
default_selection=config.CYBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
| 749 |
cant_deselect=["Agent Name"],
|
| 750 |
label="Select Columns to Display:",
|
| 751 |
),
|
|
@@ -839,7 +839,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 839 |
ci_metrics=["Accuracy", "Total Cost"]
|
| 840 |
),
|
| 841 |
select_columns=SelectColumns(
|
| 842 |
-
default_selection=config.GAIA_ON_LOAD_COLUMNS + ["Verified"],
|
| 843 |
cant_deselect=["Agent Name"],
|
| 844 |
label="Select Columns to Display:",
|
| 845 |
),
|
|
@@ -932,7 +932,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 932 |
ci_metrics=["Accuracy", "Total Cost"]
|
| 933 |
),
|
| 934 |
select_columns=SelectColumns(
|
| 935 |
-
default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
| 936 |
cant_deselect=["Agent Name"],
|
| 937 |
label="Select Columns to Display:",
|
| 938 |
),
|
|
@@ -1000,7 +1000,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 1000 |
Leaderboard(
|
| 1001 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 1002 |
select_columns=SelectColumns(
|
| 1003 |
-
default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
| 1004 |
cant_deselect=["Agent Name"],
|
| 1005 |
label="Select Columns to Display:",
|
| 1006 |
),
|
|
@@ -1033,7 +1033,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 1033 |
Leaderboard(
|
| 1034 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 1035 |
select_columns=SelectColumns(
|
| 1036 |
-
default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
|
| 1037 |
cant_deselect=["Agent Name"],
|
| 1038 |
label="Select Columns to Display:",
|
| 1039 |
),
|
|
|
|
| 527 |
ci_metrics=["Accuracy", "Total Cost"]
|
| 528 |
),
|
| 529 |
select_columns=SelectColumns(
|
| 530 |
+
default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 531 |
cant_deselect=["Agent Name"],
|
| 532 |
label="Select Columns to Display:",
|
| 533 |
),
|
|
|
|
| 567 |
Leaderboard(
|
| 568 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 569 |
select_columns=SelectColumns(
|
| 570 |
+
default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 571 |
cant_deselect=["Agent Name"],
|
| 572 |
label="Select Columns to Display:",
|
| 573 |
),
|
|
|
|
| 603 |
Leaderboard(
|
| 604 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 605 |
select_columns=SelectColumns(
|
| 606 |
+
default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 607 |
cant_deselect=["Agent Name"],
|
| 608 |
label="Select Columns to Display:",
|
| 609 |
),
|
|
|
|
| 647 |
Leaderboard(
|
| 648 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 649 |
select_columns=SelectColumns(
|
| 650 |
+
default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 651 |
cant_deselect=["Agent Name"],
|
| 652 |
label="Select Columns to Display:",
|
| 653 |
),
|
|
|
|
| 685 |
Leaderboard(
|
| 686 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 687 |
select_columns=SelectColumns(
|
| 688 |
+
default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 689 |
cant_deselect=["Agent Name"],
|
| 690 |
label="Select Columns to Display:",
|
| 691 |
),
|
|
|
|
| 745 |
ci_metrics=["Accuracy", "Total Cost"]
|
| 746 |
),
|
| 747 |
select_columns=SelectColumns(
|
| 748 |
+
default_selection=config.CYBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 749 |
cant_deselect=["Agent Name"],
|
| 750 |
label="Select Columns to Display:",
|
| 751 |
),
|
|
|
|
| 839 |
ci_metrics=["Accuracy", "Total Cost"]
|
| 840 |
),
|
| 841 |
select_columns=SelectColumns(
|
| 842 |
+
default_selection=config.GAIA_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 843 |
cant_deselect=["Agent Name"],
|
| 844 |
label="Select Columns to Display:",
|
| 845 |
),
|
|
|
|
| 932 |
ci_metrics=["Accuracy", "Total Cost"]
|
| 933 |
),
|
| 934 |
select_columns=SelectColumns(
|
| 935 |
+
default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 936 |
cant_deselect=["Agent Name"],
|
| 937 |
label="Select Columns to Display:",
|
| 938 |
),
|
|
|
|
| 1000 |
Leaderboard(
|
| 1001 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 1002 |
select_columns=SelectColumns(
|
| 1003 |
+
default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 1004 |
cant_deselect=["Agent Name"],
|
| 1005 |
label="Select Columns to Display:",
|
| 1006 |
),
|
|
|
|
| 1033 |
Leaderboard(
|
| 1034 |
value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]),
|
| 1035 |
select_columns=SelectColumns(
|
| 1036 |
+
default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified", "Traces"],
|
| 1037 |
cant_deselect=["Agent Name"],
|
| 1038 |
label="Select Columns to Display:",
|
| 1039 |
),
|
utils/db.py
CHANGED
|
@@ -67,8 +67,10 @@ AGGREGATION_RULES = {
|
|
| 67 |
'scenario_goal_completion': 'mean',
|
| 68 |
'Verified': 'first',
|
| 69 |
'Runs': 'first',
|
|
|
|
| 70 |
'accuracy_ci': 'first',
|
| 71 |
'cost_ci': 'first',
|
|
|
|
| 72 |
}
|
| 73 |
|
| 74 |
# Define column display names
|
|
@@ -409,7 +411,13 @@ class TracePreprocessor:
|
|
| 409 |
df.loc[df['agent_name'] == agent_name, 'accuracy_ci'] = accuracy_ci
|
| 410 |
df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
|
| 411 |
|
| 412 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
|
| 414 |
if aggregate:
|
| 415 |
df = df.groupby('agent_name').agg(AGGREGATION_RULES).reset_index()
|
|
@@ -540,10 +548,7 @@ class TracePreprocessor:
|
|
| 540 |
|
| 541 |
# Drop temp column
|
| 542 |
results_df = results_df.drop('agent_name_temp', axis=1)
|
| 543 |
-
|
| 544 |
-
# Fill any missing costs with 0
|
| 545 |
-
# results_df['Total Cost'] = results_df['Total Cost'].fillna(0)
|
| 546 |
-
|
| 547 |
if aggregate:
|
| 548 |
# Aggregate results
|
| 549 |
results_df = results_df.groupby('Agent Name').agg({
|
|
@@ -567,6 +572,7 @@ class TracePreprocessor:
|
|
| 567 |
'Level 2 Accuracy': 'mean',
|
| 568 |
'Level 3 Accuracy': 'mean',
|
| 569 |
'Verified': 'first',
|
|
|
|
| 570 |
'Runs': 'first',
|
| 571 |
'Accuracy CI': 'first',
|
| 572 |
'Total Cost CI': 'first'
|
|
|
|
| 67 |
'scenario_goal_completion': 'mean',
|
| 68 |
'Verified': 'first',
|
| 69 |
'Runs': 'first',
|
| 70 |
+
'Traces': 'first',
|
| 71 |
'accuracy_ci': 'first',
|
| 72 |
'cost_ci': 'first',
|
| 73 |
+
|
| 74 |
}
|
| 75 |
|
| 76 |
# Define column display names
|
|
|
|
| 411 |
df.loc[df['agent_name'] == agent_name, 'accuracy_ci'] = accuracy_ci
|
| 412 |
df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
|
| 413 |
|
| 414 |
+
# Before dropping run_id, create new column from it with download link
|
| 415 |
+
df['Traces'] = df['run_id'].apply(
|
| 416 |
+
lambda x: f'[load](https://huggingface.co/datasets/agent-evals/agent_traces/resolve/main/{x}.zip?download=true)'
|
| 417 |
+
if x else ''
|
| 418 |
+
)
|
| 419 |
+
|
| 420 |
+
df = df.drop(columns=['successful_tasks', 'failed_tasks'], axis=1)
|
| 421 |
|
| 422 |
if aggregate:
|
| 423 |
df = df.groupby('agent_name').agg(AGGREGATION_RULES).reset_index()
|
|
|
|
| 548 |
|
| 549 |
# Drop temp column
|
| 550 |
results_df = results_df.drop('agent_name_temp', axis=1)
|
| 551 |
+
|
|
|
|
|
|
|
|
|
|
| 552 |
if aggregate:
|
| 553 |
# Aggregate results
|
| 554 |
results_df = results_df.groupby('Agent Name').agg({
|
|
|
|
| 572 |
'Level 2 Accuracy': 'mean',
|
| 573 |
'Level 3 Accuracy': 'mean',
|
| 574 |
'Verified': 'first',
|
| 575 |
+
'Traces': 'first',
|
| 576 |
'Runs': 'first',
|
| 577 |
'Accuracy CI': 'first',
|
| 578 |
'Total Cost CI': 'first'
|
utils/processing.py
CHANGED
|
@@ -29,25 +29,10 @@ async def check_and_process_uploads():
|
|
| 29 |
if not os.path.exists(live_path) and not os.path.exists(processed_path):
|
| 30 |
unprocessed_uploads.append(upload)
|
| 31 |
elif os.path.exists(processed_path):
|
| 32 |
-
|
| 33 |
-
# new_data = json.load(f)
|
| 34 |
-
|
| 35 |
-
# with open(processed_path, 'r') as f:
|
| 36 |
-
# processed_data = json.load(f)
|
| 37 |
-
|
| 38 |
-
# TODO we can use a better comparison method with exact comparison
|
| 39 |
-
# if new_data != processed_data:
|
| 40 |
-
# unprocessed_uploads.append(upload)
|
| 41 |
print(f"Upload {upload} is already in processed directory.")
|
| 42 |
-
elif os.path.exists(live_path):
|
| 43 |
-
with open(upload_path, 'r') as f:
|
| 44 |
-
new_data = json.load(f)
|
| 45 |
-
|
| 46 |
-
with open(live_path, 'r') as f:
|
| 47 |
-
live_data = json.load(f)
|
| 48 |
|
| 49 |
-
|
| 50 |
-
# unprocessed_uploads.append(upload)
|
| 51 |
print(f"Upload {upload} is already in live directory.")
|
| 52 |
else:
|
| 53 |
unprocessed_uploads.append(upload)
|
|
|
|
| 29 |
if not os.path.exists(live_path) and not os.path.exists(processed_path):
|
| 30 |
unprocessed_uploads.append(upload)
|
| 31 |
elif os.path.exists(processed_path):
|
| 32 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
print(f"Upload {upload} is already in processed directory.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
elif os.path.exists(live_path):
|
|
|
|
| 36 |
print(f"Upload {upload} is already in live directory.")
|
| 37 |
else:
|
| 38 |
unprocessed_uploads.append(upload)
|