benediktstroebl committed on
Commit
c03c7bc
·
1 Parent(s): 56a86ce

added trace download links

Browse files
Files changed (4) hide show
  1. .gitignore +3 -0
  2. app.py +10 -10
  3. utils/db.py +11 -5
  4. utils/processing.py +2 -17
.gitignore CHANGED
@@ -5,3 +5,6 @@ evals_live/*
5
  evals_processed/*
6
  *.db
7
  .env
 
 
 
 
5
  evals_processed/*
6
  *.db
7
  .env
8
+ encrypted_files/*
9
+ evals_live_old/*
10
+ evals_upload_old/*
app.py CHANGED
@@ -527,7 +527,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
527
  ci_metrics=["Accuracy", "Total Cost"]
528
  ),
529
  select_columns=SelectColumns(
530
- default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"],
531
  cant_deselect=["Agent Name"],
532
  label="Select Columns to Display:",
533
  ),
@@ -567,7 +567,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
567
  Leaderboard(
568
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal'), ci_metrics=["Accuracy", "Total Cost"]),
569
  select_columns=SelectColumns(
570
- default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified"],
571
  cant_deselect=["Agent Name"],
572
  label="Select Columns to Display:",
573
  ),
@@ -603,7 +603,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
603
  Leaderboard(
604
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
605
  select_columns=SelectColumns(
606
- default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
607
  cant_deselect=["Agent Name"],
608
  label="Select Columns to Display:",
609
  ),
@@ -647,7 +647,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
647
  Leaderboard(
648
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
649
  select_columns=SelectColumns(
650
- default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
651
  cant_deselect=["Agent Name"],
652
  label="Select Columns to Display:",
653
  ),
@@ -685,7 +685,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
685
  Leaderboard(
686
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
687
  select_columns=SelectColumns(
688
- default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified"],
689
  cant_deselect=["Agent Name"],
690
  label="Select Columns to Display:",
691
  ),
@@ -745,7 +745,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
745
  ci_metrics=["Accuracy", "Total Cost"]
746
  ),
747
  select_columns=SelectColumns(
748
- default_selection=config.CYBENCH_ON_LOAD_COLUMNS + ["Verified"],
749
  cant_deselect=["Agent Name"],
750
  label="Select Columns to Display:",
751
  ),
@@ -839,7 +839,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
839
  ci_metrics=["Accuracy", "Total Cost"]
840
  ),
841
  select_columns=SelectColumns(
842
- default_selection=config.GAIA_ON_LOAD_COLUMNS + ["Verified"],
843
  cant_deselect=["Agent Name"],
844
  label="Select Columns to Display:",
845
  ),
@@ -932,7 +932,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
932
  ci_metrics=["Accuracy", "Total Cost"]
933
  ),
934
  select_columns=SelectColumns(
935
- default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
936
  cant_deselect=["Agent Name"],
937
  label="Select Columns to Display:",
938
  ),
@@ -1000,7 +1000,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1000
  Leaderboard(
1001
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini'), ci_metrics=["Accuracy", "Total Cost"]),
1002
  select_columns=SelectColumns(
1003
- default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
1004
  cant_deselect=["Agent Name"],
1005
  label="Select Columns to Display:",
1006
  ),
@@ -1033,7 +1033,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1033
  Leaderboard(
1034
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]),
1035
  select_columns=SelectColumns(
1036
- default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
1037
  cant_deselect=["Agent Name"],
1038
  label="Select Columns to Display:",
1039
  ),
 
527
  ci_metrics=["Accuracy", "Total Cost"]
528
  ),
529
  select_columns=SelectColumns(
530
+ default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified", "Traces"],
531
  cant_deselect=["Agent Name"],
532
  label="Select Columns to Display:",
533
  ),
 
567
  Leaderboard(
568
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'appworld_test_normal'), ci_metrics=["Accuracy", "Total Cost"]),
569
  select_columns=SelectColumns(
570
+ default_selection=config.APPWORLD_ON_LOAD_COLUMNS + ["Verified", "Traces"],
571
  cant_deselect=["Agent Name"],
572
  label="Select Columns to Display:",
573
  ),
 
603
  Leaderboard(
604
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_easy'), ci_metrics=["Accuracy", "Total Cost"]),
605
  select_columns=SelectColumns(
606
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
607
  cant_deselect=["Agent Name"],
608
  label="Select Columns to Display:",
609
  ),
 
647
  Leaderboard(
648
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_hard'), ci_metrics=["Accuracy", "Total Cost"]),
649
  select_columns=SelectColumns(
650
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
651
  cant_deselect=["Agent Name"],
652
  label="Select Columns to Display:",
653
  ),
 
685
  Leaderboard(
686
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench_medium'), ci_metrics=["Accuracy", "Total Cost"]),
687
  select_columns=SelectColumns(
688
+ default_selection=config.COREBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
689
  cant_deselect=["Agent Name"],
690
  label="Select Columns to Display:",
691
  ),
 
745
  ci_metrics=["Accuracy", "Total Cost"]
746
  ),
747
  select_columns=SelectColumns(
748
+ default_selection=config.CYBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
749
  cant_deselect=["Agent Name"],
750
  label="Select Columns to Display:",
751
  ),
 
839
  ci_metrics=["Accuracy", "Total Cost"]
840
  ),
841
  select_columns=SelectColumns(
842
+ default_selection=config.GAIA_ON_LOAD_COLUMNS + ["Verified", "Traces"],
843
  cant_deselect=["Agent Name"],
844
  label="Select Columns to Display:",
845
  ),
 
932
  ci_metrics=["Accuracy", "Total Cost"]
933
  ),
934
  select_columns=SelectColumns(
935
+ default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
936
  cant_deselect=["Agent Name"],
937
  label="Select Columns to Display:",
938
  ),
 
1000
  Leaderboard(
1001
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified_mini'), ci_metrics=["Accuracy", "Total Cost"]),
1002
  select_columns=SelectColumns(
1003
+ default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified", "Traces"],
1004
  cant_deselect=["Agent Name"],
1005
  label="Select Columns to Display:",
1006
  ),
 
1033
  Leaderboard(
1034
  value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]),
1035
  select_columns=SelectColumns(
1036
+ default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified", "Traces"],
1037
  cant_deselect=["Agent Name"],
1038
  label="Select Columns to Display:",
1039
  ),
utils/db.py CHANGED
@@ -67,8 +67,10 @@ AGGREGATION_RULES = {
67
  'scenario_goal_completion': 'mean',
68
  'Verified': 'first',
69
  'Runs': 'first',
 
70
  'accuracy_ci': 'first',
71
  'cost_ci': 'first',
 
72
  }
73
 
74
  # Define column display names
@@ -409,7 +411,13 @@ class TracePreprocessor:
409
  df.loc[df['agent_name'] == agent_name, 'accuracy_ci'] = accuracy_ci
410
  df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
411
 
412
- df = df.drop(columns=['successful_tasks', 'failed_tasks', 'run_id'], axis=1)
 
 
 
 
 
 
413
 
414
  if aggregate:
415
  df = df.groupby('agent_name').agg(AGGREGATION_RULES).reset_index()
@@ -540,10 +548,7 @@ class TracePreprocessor:
540
 
541
  # Drop temp column
542
  results_df = results_df.drop('agent_name_temp', axis=1)
543
-
544
- # Fill any missing costs with 0
545
- # results_df['Total Cost'] = results_df['Total Cost'].fillna(0)
546
-
547
  if aggregate:
548
  # Aggregate results
549
  results_df = results_df.groupby('Agent Name').agg({
@@ -567,6 +572,7 @@ class TracePreprocessor:
567
  'Level 2 Accuracy': 'mean',
568
  'Level 3 Accuracy': 'mean',
569
  'Verified': 'first',
 
570
  'Runs': 'first',
571
  'Accuracy CI': 'first',
572
  'Total Cost CI': 'first'
 
67
  'scenario_goal_completion': 'mean',
68
  'Verified': 'first',
69
  'Runs': 'first',
70
+ 'Traces': 'first',
71
  'accuracy_ci': 'first',
72
  'cost_ci': 'first',
73
+
74
  }
75
 
76
  # Define column display names
 
411
  df.loc[df['agent_name'] == agent_name, 'accuracy_ci'] = accuracy_ci
412
  df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
413
 
414
+ # Create a 'Traces' column holding a markdown download link built from each run_id
415
+ df['Traces'] = df['run_id'].apply(
416
+ lambda x: f'[load](https://huggingface.co/datasets/agent-evals/agent_traces/resolve/main/{x}.zip?download=true)'
417
+ if x else ''
418
+ )
419
+
420
+ df = df.drop(columns=['successful_tasks', 'failed_tasks'], axis=1)
421
 
422
  if aggregate:
423
  df = df.groupby('agent_name').agg(AGGREGATION_RULES).reset_index()
 
548
 
549
  # Drop temp column
550
  results_df = results_df.drop('agent_name_temp', axis=1)
551
+
 
 
 
552
  if aggregate:
553
  # Aggregate results
554
  results_df = results_df.groupby('Agent Name').agg({
 
572
  'Level 2 Accuracy': 'mean',
573
  'Level 3 Accuracy': 'mean',
574
  'Verified': 'first',
575
+ 'Traces': 'first',
576
  'Runs': 'first',
577
  'Accuracy CI': 'first',
578
  'Total Cost CI': 'first'
utils/processing.py CHANGED
@@ -29,25 +29,10 @@ async def check_and_process_uploads():
29
  if not os.path.exists(live_path) and not os.path.exists(processed_path):
30
  unprocessed_uploads.append(upload)
31
  elif os.path.exists(processed_path):
32
- # with open(upload_path, 'r') as f:
33
- # new_data = json.load(f)
34
-
35
- # with open(processed_path, 'r') as f:
36
- # processed_data = json.load(f)
37
-
38
- # TODO we can use a better comparison method with exact comparison
39
- # if new_data != processed_data:
40
- # unprocessed_uploads.append(upload)
41
  print(f"Upload {upload} is already in processed directory.")
42
- elif os.path.exists(live_path):
43
- with open(upload_path, 'r') as f:
44
- new_data = json.load(f)
45
-
46
- with open(live_path, 'r') as f:
47
- live_data = json.load(f)
48
 
49
- # if new_data != live_data:
50
- # unprocessed_uploads.append(upload)
51
  print(f"Upload {upload} is already in live directory.")
52
  else:
53
  unprocessed_uploads.append(upload)
 
29
  if not os.path.exists(live_path) and not os.path.exists(processed_path):
30
  unprocessed_uploads.append(upload)
31
  elif os.path.exists(processed_path):
32
+
 
 
 
 
 
 
 
 
33
  print(f"Upload {upload} is already in processed directory.")
 
 
 
 
 
 
34
 
35
+ elif os.path.exists(live_path):
 
36
  print(f"Upload {upload} is already in live directory.")
37
  else:
38
  unprocessed_uploads.append(upload)