Spaces:
Running
Running
benediktstroebl committed on
Commit ·
178673f
1
Parent(s): bd0b3ec
minor tweaks
Browse files
- agent_monitor/monitor.py +5 -0
- app.py +3 -3
- utils/viz.py +2 -1
- verified_agents.yaml +16 -4
agent_monitor/monitor.py
CHANGED
|
@@ -23,6 +23,11 @@ async def analyze_agent_steps(processed_calls, llm_client, llm_eval=False):
|
|
| 23 |
task_calls[call['weave_task_id']].append(call)
|
| 24 |
|
| 25 |
for task_id in task_calls:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
task_calls[task_id].sort(key=lambda x: x['created_timestamp'])
|
| 27 |
|
| 28 |
tasks = [analyze_task(calls, llm_client, llm_eval) for task_id, calls in task_calls.items()]
|
|
|
|
| 23 |
task_calls[call['weave_task_id']].append(call)
|
| 24 |
|
| 25 |
for task_id in task_calls:
|
| 26 |
+
|
| 27 |
+
# sort calls by timestamp and handle null timestamps
|
| 28 |
+
for call in task_calls[task_id]:
|
| 29 |
+
if call['created_timestamp'] is None:
|
| 30 |
+
call['created_timestamp'] = 0
|
| 31 |
task_calls[task_id].sort(key=lambda x: x['created_timestamp'])
|
| 32 |
|
| 33 |
tasks = [analyze_task(calls, llm_client, llm_eval) for task_id, calls in task_calls.items()]
|
app.py
CHANGED
|
@@ -16,7 +16,7 @@ import re
|
|
| 16 |
import markdown
|
| 17 |
import asyncio
|
| 18 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
| 19 |
-
import weave
|
| 20 |
from utils.db import TracePreprocessor
|
| 21 |
from gradio.themes.soft import Soft
|
| 22 |
|
|
@@ -771,7 +771,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
|
|
| 771 |
demo.load(
|
| 772 |
lambda: create_task_success_heatmap(
|
| 773 |
preprocessor.get_task_success_data('swebench_verified_mini'),
|
| 774 |
-
'SWE-bench Verified'
|
| 775 |
),
|
| 776 |
outputs=[task_success_heatmap]
|
| 777 |
)
|
|
@@ -1454,5 +1454,5 @@ async def main():
|
|
| 1454 |
await demo.launch(favicon_path="hal.png")
|
| 1455 |
|
| 1456 |
if __name__ == "__main__":
|
| 1457 |
-
weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
|
| 1458 |
asyncio.run(main())
|
|
|
|
| 16 |
import markdown
|
| 17 |
import asyncio
|
| 18 |
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
| 19 |
+
# import weave
|
| 20 |
from utils.db import TracePreprocessor
|
| 21 |
from gradio.themes.soft import Soft
|
| 22 |
|
|
|
|
| 771 |
demo.load(
|
| 772 |
lambda: create_task_success_heatmap(
|
| 773 |
preprocessor.get_task_success_data('swebench_verified_mini'),
|
| 774 |
+
'SWE-bench Verified (Mini)'
|
| 775 |
),
|
| 776 |
outputs=[task_success_heatmap]
|
| 777 |
)
|
|
|
|
| 1454 |
await demo.launch(favicon_path="hal.png")
|
| 1455 |
|
| 1456 |
if __name__ == "__main__":
|
| 1457 |
+
# weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
|
| 1458 |
asyncio.run(main())
|
utils/viz.py
CHANGED
|
@@ -41,7 +41,8 @@ def create_task_success_heatmap(df, benchmark_name):
|
|
| 41 |
tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
|
| 42 |
# Total number of tasks (columns)
|
| 43 |
total_tasks = len(pivot_df.columns)
|
| 44 |
-
|
|
|
|
| 45 |
total_tasks = 50 # TODO - remove hardcoding
|
| 46 |
|
| 47 |
# Add the new row to the pivot table
|
|
|
|
| 41 |
tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
|
| 42 |
# Total number of tasks (columns)
|
| 43 |
total_tasks = len(pivot_df.columns)
|
| 44 |
+
print(benchmark_name)
|
| 45 |
+
if benchmark_name == "SWE-bench Verified (Mini)":
|
| 46 |
total_tasks = 50 # TODO - remove hardcoding
|
| 47 |
|
| 48 |
# Add the new row to the pivot table
|
verified_agents.yaml
CHANGED
|
@@ -48,12 +48,24 @@ usaco:
|
|
| 48 |
- agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
|
| 49 |
verification_date: 2024-08-24
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
- agent_name: "Agentless (gpt-4o-mini-2024-07-18) (50 Instances)"
|
| 54 |
verification_date: 2024-08-17
|
| 55 |
-
- agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)
|
| 56 |
verification_date: 2024-08-19
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
mlagentbench:
|
| 59 |
- agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
|
|
|
|
| 48 |
- agent_name: USACO Zero-shot (gpt-4o-2024-05-13)
|
| 49 |
verification_date: 2024-08-24
|
| 50 |
|
| 51 |
+
swebench_verified_mini:
|
| 52 |
+
- agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
|
|
|
|
| 53 |
verification_date: 2024-08-17
|
| 54 |
+
- agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
|
| 55 |
verification_date: 2024-08-19
|
| 56 |
+
- agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
|
| 57 |
+
verification_date: 2024-10-30
|
| 58 |
+
- agent_name: "Moatless (gpt-4o-2024-08-06)"
|
| 59 |
+
verification_date: 2024-10-30
|
| 60 |
+
- agent_name: "Moatless (claude-3-5-sonnet-20241022)"
|
| 61 |
+
verification_date: 2024-10-30
|
| 62 |
+
- agent_name: "Agentless (o1-mini-2024-09-12)"
|
| 63 |
+
verification_date: 2024-10-30
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
swebench_verified:
|
| 67 |
+
- agent_name: "Moatless (gpt-4o-2024-08-06)"
|
| 68 |
+
verification_date: 2024-10-30
|
| 69 |
|
| 70 |
mlagentbench:
|
| 71 |
- agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
|