Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
f7e6b58
1
Parent(s):
68b25a7
Update mock data and docs to match new openhands-index-results format
Browse files
- Update scores.json: rename total_cost -> cost_per_instance, total_runtime -> average_runtime
- Add new required fields to mock data: full_archive, agent_version, submission_time
- Add agent_name and country fields to metadata.json
- Update DATA_STRUCTURE.md documentation to reflect new schema
This ensures mock data works as a proper fallback when GitHub is unavailable
and matches the production format from openhands-index-results.
Co-authored-by: openhands <openhands@all-hands.dev>
- DATA_STRUCTURE.md +31 -12
- mock_results/1.0.0-dev1/results/20250723_qwen3_coder/metadata.json +4 -2
- mock_results/1.0.0-dev1/results/20250723_qwen3_coder/scores.json +9 -3
- mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/metadata.json +4 -2
- mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/scores.json +10 -4
- mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json +36 -18
- mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json +36 -18
- mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json +36 -18
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json +36 -18
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json +4 -2
- mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json +36 -18
- mock_results/1.0.0-dev1/results/20251201_deepseek_v3/metadata.json +4 -2
- mock_results/1.0.0-dev1/results/20251201_deepseek_v3/scores.json +9 -3
DATA_STRUCTURE.md
CHANGED
|
@@ -21,25 +21,44 @@ openhands-index-results/
|
|
| 21 |
|
| 22 |
## File Formats
|
| 23 |
|
| 24 |
-
###
|
| 25 |
|
| 26 |
-
Each
|
| 27 |
|
|
|
|
| 28 |
```json
|
| 29 |
{
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
"
|
|
|
|
| 40 |
}
|
| 41 |
```
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
### Configuration File (agenteval.json)
|
| 44 |
|
| 45 |
The configuration file defines the benchmark structure:
|
|
|
|
| 21 |
|
| 22 |
## File Formats
|
| 23 |
|
| 24 |
+
### Agent Directory Structure
|
| 25 |
|
| 26 |
+
Each agent has its own directory containing two files:
|
| 27 |
|
| 28 |
+
**metadata.json** - Agent and model information:
|
| 29 |
```json
|
| 30 |
{
|
| 31 |
+
"agent_name": "OpenHands CodeAct",
|
| 32 |
+
"agent_version": "v1.8.3",
|
| 33 |
+
"model": "claude-4.5-opus",
|
| 34 |
+
"openness": "closed_api_available",
|
| 35 |
+
"country": "us",
|
| 36 |
+
"tool_usage": "standard",
|
| 37 |
+
"submission_time": "2026-01-27T01:24:15.735789+00:00",
|
| 38 |
+
"directory_name": "claude-4.5-opus",
|
| 39 |
+
"release_date": "2025-11-24",
|
| 40 |
+
"parameter_count_b": null,
|
| 41 |
+
"active_parameter_count_b": null
|
| 42 |
}
|
| 43 |
```
|
| 44 |
|
| 45 |
+
**scores.json** - Array of benchmark results:
|
| 46 |
+
```json
|
| 47 |
+
[
|
| 48 |
+
{
|
| 49 |
+
"benchmark": "swe-bench",
|
| 50 |
+
"score": 76.6,
|
| 51 |
+
"metric": "accuracy",
|
| 52 |
+
"cost_per_instance": 1.82,
|
| 53 |
+
"average_runtime": 325.0,
|
| 54 |
+
"full_archive": "https://results.eval.all-hands.dev/eval-21370451733-...",
|
| 55 |
+
"tags": ["swe-bench"],
|
| 56 |
+
"agent_version": "v1.8.3",
|
| 57 |
+
"submission_time": "2026-01-27T01:24:15.735789+00:00"
|
| 58 |
+
}
|
| 59 |
+
]
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
### Configuration File (agenteval.json)
|
| 63 |
|
| 64 |
The configuration file defines the benchmark structure:
|
mock_results/1.0.0-dev1/results/20250723_qwen3_coder/metadata.json
CHANGED
|
@@ -7,5 +7,7 @@
|
|
| 7 |
"directory_name": "20250723_qwen3_coder",
|
| 8 |
"release_date": "2025-07-23",
|
| 9 |
"parameter_count_b": 480,
|
| 10 |
-
"active_parameter_count_b": 35
|
| 11 |
-
|
|
|
|
|
|
|
|
|
| 7 |
"directory_name": "20250723_qwen3_coder",
|
| 8 |
"release_date": "2025-07-23",
|
| 9 |
"parameter_count_b": 480,
|
| 10 |
+
"active_parameter_count_b": 35,
|
| 11 |
+
"agent_name": "OpenHands CodeAct",
|
| 12 |
+
"country": "cn"
|
| 13 |
+
}
|
mock_results/1.0.0-dev1/results/20250723_qwen3_coder/scores.json
CHANGED
|
@@ -4,13 +4,19 @@
|
|
| 4 |
"score": 38.0,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
"cost_per_instance": 0.12,
|
| 7 |
-
"average_runtime": 150
|
|
|
|
|
|
|
|
|
|
| 8 |
},
|
| 9 |
{
|
| 10 |
"benchmark": "gaia",
|
| 11 |
"score": 48.0,
|
| 12 |
"metric": "accuracy",
|
| 13 |
"cost_per_instance": 0.06,
|
| 14 |
-
"average_runtime": 45
|
|
|
|
|
|
|
|
|
|
| 15 |
}
|
| 16 |
-
]
|
|
|
|
| 4 |
"score": 38.0,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
"cost_per_instance": 0.12,
|
| 7 |
+
"average_runtime": 150,
|
| 8 |
+
"full_archive": "",
|
| 9 |
+
"agent_version": "v1.0.0",
|
| 10 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 11 |
},
|
| 12 |
{
|
| 13 |
"benchmark": "gaia",
|
| 14 |
"score": 48.0,
|
| 15 |
"metric": "accuracy",
|
| 16 |
"cost_per_instance": 0.06,
|
| 17 |
+
"average_runtime": 45,
|
| 18 |
+
"full_archive": "",
|
| 19 |
+
"agent_version": "v1.0.0",
|
| 20 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 21 |
}
|
| 22 |
+
]
|
mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/metadata.json
CHANGED
|
@@ -7,5 +7,7 @@
|
|
| 7 |
"directory_name": "20251106_kimi_k2_thinking",
|
| 8 |
"release_date": "2025-11-06",
|
| 9 |
"parameter_count_b": 1000,
|
| 10 |
-
"active_parameter_count_b": 32
|
| 11 |
-
|
|
|
|
|
|
|
|
|
| 7 |
"directory_name": "20251106_kimi_k2_thinking",
|
| 8 |
"release_date": "2025-11-06",
|
| 9 |
"parameter_count_b": 1000,
|
| 10 |
+
"active_parameter_count_b": 32,
|
| 11 |
+
"agent_name": "OpenHands CodeAct",
|
| 12 |
+
"country": "cn"
|
| 13 |
+
}
|
mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/scores.json
CHANGED
|
@@ -4,13 +4,19 @@
|
|
| 4 |
"score": 45.0,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
"cost_per_instance": 0.18,
|
| 7 |
-
"average_runtime": 200
|
|
|
|
|
|
|
|
|
|
| 8 |
},
|
| 9 |
{
|
| 10 |
"benchmark": "gaia",
|
| 11 |
"score": 52.0,
|
| 12 |
"metric": "accuracy",
|
| 13 |
-
"cost_per_instance": 0.
|
| 14 |
-
"average_runtime": 70
|
|
|
|
|
|
|
|
|
|
| 15 |
}
|
| 16 |
-
]
|
|
|
|
| 4 |
"score": 45.0,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
"cost_per_instance": 0.18,
|
| 7 |
+
"average_runtime": 200,
|
| 8 |
+
"full_archive": "",
|
| 9 |
+
"agent_version": "v1.0.0",
|
| 10 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 11 |
},
|
| 12 |
{
|
| 13 |
"benchmark": "gaia",
|
| 14 |
"score": 52.0,
|
| 15 |
"metric": "accuracy",
|
| 16 |
+
"cost_per_instance": 0.1,
|
| 17 |
+
"average_runtime": 70,
|
| 18 |
+
"full_archive": "",
|
| 19 |
+
"agent_version": "v1.0.0",
|
| 20 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 21 |
}
|
| 22 |
+
]
|
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json
CHANGED
|
@@ -5,5 +5,7 @@
|
|
| 5 |
"tool_usage": "standard",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092865",
|
| 7 |
"directory_name": "20251124_claude_3_5_sonnet_20241022",
|
| 8 |
-
"release_date": "2024-10-22"
|
| 9 |
-
|
|
|
|
|
|
|
|
|
| 5 |
"tool_usage": "standard",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092865",
|
| 7 |
"directory_name": "20251124_claude_3_5_sonnet_20241022",
|
| 8 |
+
"release_date": "2024-10-22",
|
| 9 |
+
"agent_name": "OpenHands CodeAct",
|
| 10 |
+
"country": "us"
|
| 11 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json
CHANGED
|
@@ -3,62 +3,80 @@
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 48.3,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 34.15,
|
| 7 |
-
"total_runtime": 541.5,
|
| 8 |
"tags": [
|
| 9 |
"swe-bench",
|
| 10 |
"Bug Fixing"
|
| 11 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
},
|
| 13 |
{
|
| 14 |
"benchmark": "swe-bench-multimodal",
|
| 15 |
"score": 42.1,
|
| 16 |
"metric": "resolve_rate",
|
| 17 |
-
"total_cost": 31.05,
|
| 18 |
-
"total_runtime": 510.5,
|
| 19 |
"tags": [
|
| 20 |
"swe-bench-multimodal",
|
| 21 |
"Bug Fixing"
|
| 22 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"benchmark": "commit0",
|
| 26 |
"score": 71.2,
|
| 27 |
"metric": "test_pass_rate",
|
| 28 |
-
"total_cost": 45.6,
|
| 29 |
-
"total_runtime": 656.0,
|
| 30 |
"tags": [
|
| 31 |
"commit0"
|
| 32 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"benchmark": "multi-swe-bench",
|
| 36 |
"score": 35.2,
|
| 37 |
"metric": "resolve_rate",
|
| 38 |
-
"total_cost": 27.6,
|
| 39 |
-
"total_runtime": 476.0,
|
| 40 |
"tags": [
|
| 41 |
"multi-swe-bench"
|
| 42 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
},
|
| 44 |
{
|
| 45 |
"benchmark": "swt-bench",
|
| 46 |
"score": 65.4,
|
| 47 |
"metric": "success_rate",
|
| 48 |
-
"total_cost": 42.7,
|
| 49 |
-
"total_runtime": 627.0,
|
| 50 |
"tags": [
|
| 51 |
"swt-bench"
|
| 52 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
},
|
| 54 |
{
|
| 55 |
"benchmark": "gaia",
|
| 56 |
"score": 58.7,
|
| 57 |
"metric": "accuracy",
|
| 58 |
-
"total_cost": 39.35,
|
| 59 |
-
"total_runtime": 593.5,
|
| 60 |
"tags": [
|
| 61 |
"gaia"
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
]
|
|
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 48.3,
|
| 5 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 6 |
"tags": [
|
| 7 |
"swe-bench",
|
| 8 |
"Bug Fixing"
|
| 9 |
+
],
|
| 10 |
+
"cost_per_instance": 34.15,
|
| 11 |
+
"average_runtime": 541.5,
|
| 12 |
+
"full_archive": "",
|
| 13 |
+
"agent_version": "v1.0.0",
|
| 14 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 15 |
},
|
| 16 |
{
|
| 17 |
"benchmark": "swe-bench-multimodal",
|
| 18 |
"score": 42.1,
|
| 19 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 20 |
"tags": [
|
| 21 |
"swe-bench-multimodal",
|
| 22 |
"Bug Fixing"
|
| 23 |
+
],
|
| 24 |
+
"cost_per_instance": 31.05,
|
| 25 |
+
"average_runtime": 510.5,
|
| 26 |
+
"full_archive": "",
|
| 27 |
+
"agent_version": "v1.0.0",
|
| 28 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"benchmark": "commit0",
|
| 32 |
"score": 71.2,
|
| 33 |
"metric": "test_pass_rate",
|
|
|
|
|
|
|
| 34 |
"tags": [
|
| 35 |
"commit0"
|
| 36 |
+
],
|
| 37 |
+
"cost_per_instance": 45.6,
|
| 38 |
+
"average_runtime": 656.0,
|
| 39 |
+
"full_archive": "",
|
| 40 |
+
"agent_version": "v1.0.0",
|
| 41 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 42 |
},
|
| 43 |
{
|
| 44 |
"benchmark": "multi-swe-bench",
|
| 45 |
"score": 35.2,
|
| 46 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 47 |
"tags": [
|
| 48 |
"multi-swe-bench"
|
| 49 |
+
],
|
| 50 |
+
"cost_per_instance": 27.6,
|
| 51 |
+
"average_runtime": 476.0,
|
| 52 |
+
"full_archive": "",
|
| 53 |
+
"agent_version": "v1.0.0",
|
| 54 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 55 |
},
|
| 56 |
{
|
| 57 |
"benchmark": "swt-bench",
|
| 58 |
"score": 65.4,
|
| 59 |
"metric": "success_rate",
|
|
|
|
|
|
|
| 60 |
"tags": [
|
| 61 |
"swt-bench"
|
| 62 |
+
],
|
| 63 |
+
"cost_per_instance": 42.7,
|
| 64 |
+
"average_runtime": 627.0,
|
| 65 |
+
"full_archive": "",
|
| 66 |
+
"agent_version": "v1.0.0",
|
| 67 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"benchmark": "gaia",
|
| 71 |
"score": 58.7,
|
| 72 |
"metric": "accuracy",
|
|
|
|
|
|
|
| 73 |
"tags": [
|
| 74 |
"gaia"
|
| 75 |
+
],
|
| 76 |
+
"cost_per_instance": 39.35,
|
| 77 |
+
"average_runtime": 593.5,
|
| 78 |
+
"full_archive": "",
|
| 79 |
+
"agent_version": "v1.0.0",
|
| 80 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 81 |
}
|
| 82 |
]
|
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json
CHANGED
|
@@ -5,5 +5,7 @@
|
|
| 5 |
"tool_usage": "custom_interface",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092922",
|
| 7 |
"directory_name": "20251124_claude_3_opus_20240229",
|
| 8 |
-
"release_date": "2024-02-29"
|
| 9 |
-
|
|
|
|
|
|
|
|
|
| 5 |
"tool_usage": "custom_interface",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092922",
|
| 7 |
"directory_name": "20251124_claude_3_opus_20240229",
|
| 8 |
+
"release_date": "2024-02-29",
|
| 9 |
+
"agent_name": "OpenHands CodeAct",
|
| 10 |
+
"country": "us"
|
| 11 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json
CHANGED
|
@@ -3,62 +3,80 @@
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 29.8,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 24.9,
|
| 7 |
-
"total_runtime": 449.0,
|
| 8 |
"tags": [
|
| 9 |
"swe-bench",
|
| 10 |
"Bug Fixing"
|
| 11 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
},
|
| 13 |
{
|
| 14 |
"benchmark": "swe-bench-multimodal",
|
| 15 |
"score": 25.7,
|
| 16 |
"metric": "resolve_rate",
|
| 17 |
-
"total_cost": 22.85,
|
| 18 |
-
"total_runtime": 428.5,
|
| 19 |
"tags": [
|
| 20 |
"swe-bench-multimodal",
|
| 21 |
"Bug Fixing"
|
| 22 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"benchmark": "commit0",
|
| 26 |
"score": 52.1,
|
| 27 |
"metric": "test_pass_rate",
|
| 28 |
-
"total_cost": 36.05,
|
| 29 |
-
"total_runtime": 560.5,
|
| 30 |
"tags": [
|
| 31 |
"commit0"
|
| 32 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"benchmark": "multi-swe-bench",
|
| 36 |
"score": 21.5,
|
| 37 |
"metric": "resolve_rate",
|
| 38 |
-
"total_cost": 20.75,
|
| 39 |
-
"total_runtime": 407.5,
|
| 40 |
"tags": [
|
| 41 |
"multi-swe-bench"
|
| 42 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
},
|
| 44 |
{
|
| 45 |
"benchmark": "swt-bench",
|
| 46 |
"score": 44.2,
|
| 47 |
"metric": "success_rate",
|
| 48 |
-
"total_cost": 32.1,
|
| 49 |
-
"total_runtime": 521.0,
|
| 50 |
"tags": [
|
| 51 |
"swt-bench"
|
| 52 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
},
|
| 54 |
{
|
| 55 |
"benchmark": "gaia",
|
| 56 |
"score": 39.4,
|
| 57 |
"metric": "accuracy",
|
| 58 |
-
"total_cost": 29.7,
|
| 59 |
-
"total_runtime": 497.0,
|
| 60 |
"tags": [
|
| 61 |
"gaia"
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
]
|
|
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 29.8,
|
| 5 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 6 |
"tags": [
|
| 7 |
"swe-bench",
|
| 8 |
"Bug Fixing"
|
| 9 |
+
],
|
| 10 |
+
"cost_per_instance": 24.9,
|
| 11 |
+
"average_runtime": 449.0,
|
| 12 |
+
"full_archive": "",
|
| 13 |
+
"agent_version": "v1.0.0",
|
| 14 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 15 |
},
|
| 16 |
{
|
| 17 |
"benchmark": "swe-bench-multimodal",
|
| 18 |
"score": 25.7,
|
| 19 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 20 |
"tags": [
|
| 21 |
"swe-bench-multimodal",
|
| 22 |
"Bug Fixing"
|
| 23 |
+
],
|
| 24 |
+
"cost_per_instance": 22.85,
|
| 25 |
+
"average_runtime": 428.5,
|
| 26 |
+
"full_archive": "",
|
| 27 |
+
"agent_version": "v1.0.0",
|
| 28 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"benchmark": "commit0",
|
| 32 |
"score": 52.1,
|
| 33 |
"metric": "test_pass_rate",
|
|
|
|
|
|
|
| 34 |
"tags": [
|
| 35 |
"commit0"
|
| 36 |
+
],
|
| 37 |
+
"cost_per_instance": 36.05,
|
| 38 |
+
"average_runtime": 560.5,
|
| 39 |
+
"full_archive": "",
|
| 40 |
+
"agent_version": "v1.0.0",
|
| 41 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 42 |
},
|
| 43 |
{
|
| 44 |
"benchmark": "multi-swe-bench",
|
| 45 |
"score": 21.5,
|
| 46 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 47 |
"tags": [
|
| 48 |
"multi-swe-bench"
|
| 49 |
+
],
|
| 50 |
+
"cost_per_instance": 20.75,
|
| 51 |
+
"average_runtime": 407.5,
|
| 52 |
+
"full_archive": "",
|
| 53 |
+
"agent_version": "v1.0.0",
|
| 54 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 55 |
},
|
| 56 |
{
|
| 57 |
"benchmark": "swt-bench",
|
| 58 |
"score": 44.2,
|
| 59 |
"metric": "success_rate",
|
|
|
|
|
|
|
| 60 |
"tags": [
|
| 61 |
"swt-bench"
|
| 62 |
+
],
|
| 63 |
+
"cost_per_instance": 32.1,
|
| 64 |
+
"average_runtime": 521.0,
|
| 65 |
+
"full_archive": "",
|
| 66 |
+
"agent_version": "v1.0.0",
|
| 67 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"benchmark": "gaia",
|
| 71 |
"score": 39.4,
|
| 72 |
"metric": "accuracy",
|
|
|
|
|
|
|
| 73 |
"tags": [
|
| 74 |
"gaia"
|
| 75 |
+
],
|
| 76 |
+
"cost_per_instance": 29.7,
|
| 77 |
+
"average_runtime": 497.0,
|
| 78 |
+
"full_archive": "",
|
| 79 |
+
"agent_version": "v1.0.0",
|
| 80 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 81 |
}
|
| 82 |
]
|
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json
CHANGED
|
@@ -5,5 +5,7 @@
|
|
| 5 |
"tool_usage": "standard",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092908",
|
| 7 |
"directory_name": "20251124_gpt_4_turbo_2024_04_09",
|
| 8 |
-
"release_date": "2024-04-09"
|
| 9 |
-
|
|
|
|
|
|
|
|
|
| 5 |
"tool_usage": "standard",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092908",
|
| 7 |
"directory_name": "20251124_gpt_4_turbo_2024_04_09",
|
| 8 |
+
"release_date": "2024-04-09",
|
| 9 |
+
"agent_name": "OpenHands CodeAct",
|
| 10 |
+
"country": "us"
|
| 11 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json
CHANGED
|
@@ -3,62 +3,80 @@
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 38.7,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 29.35,
|
| 7 |
-
"total_runtime": 493.5,
|
| 8 |
"tags": [
|
| 9 |
"swe-bench",
|
| 10 |
"Bug Fixing"
|
| 11 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
},
|
| 13 |
{
|
| 14 |
"benchmark": "swe-bench-multimodal",
|
| 15 |
"score": 34.2,
|
| 16 |
"metric": "resolve_rate",
|
| 17 |
-
"total_cost": 27.1,
|
| 18 |
-
"total_runtime": 471.0,
|
| 19 |
"tags": [
|
| 20 |
"swe-bench-multimodal",
|
| 21 |
"Bug Fixing"
|
| 22 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"benchmark": "commit0",
|
| 26 |
"score": 61.5,
|
| 27 |
"metric": "test_pass_rate",
|
| 28 |
-
"total_cost": 40.75,
|
| 29 |
-
"total_runtime": 607.5,
|
| 30 |
"tags": [
|
| 31 |
"commit0"
|
| 32 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"benchmark": "multi-swe-bench",
|
| 36 |
"score": 28.4,
|
| 37 |
"metric": "resolve_rate",
|
| 38 |
-
"total_cost": 24.2,
|
| 39 |
-
"total_runtime": 442.0,
|
| 40 |
"tags": [
|
| 41 |
"multi-swe-bench"
|
| 42 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
},
|
| 44 |
{
|
| 45 |
"benchmark": "swt-bench",
|
| 46 |
"score": 54.1,
|
| 47 |
"metric": "success_rate",
|
| 48 |
-
"total_cost": 37.05,
|
| 49 |
-
"total_runtime": 570.5,
|
| 50 |
"tags": [
|
| 51 |
"swt-bench"
|
| 52 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
},
|
| 54 |
{
|
| 55 |
"benchmark": "gaia",
|
| 56 |
"score": 48.3,
|
| 57 |
"metric": "accuracy",
|
| 58 |
-
"total_cost": 34.15,
|
| 59 |
-
"total_runtime": 541.5,
|
| 60 |
"tags": [
|
| 61 |
"gaia"
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
]
|
|
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 38.7,
|
| 5 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 6 |
"tags": [
|
| 7 |
"swe-bench",
|
| 8 |
"Bug Fixing"
|
| 9 |
+
],
|
| 10 |
+
"cost_per_instance": 29.35,
|
| 11 |
+
"average_runtime": 493.5,
|
| 12 |
+
"full_archive": "",
|
| 13 |
+
"agent_version": "v1.0.0",
|
| 14 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 15 |
},
|
| 16 |
{
|
| 17 |
"benchmark": "swe-bench-multimodal",
|
| 18 |
"score": 34.2,
|
| 19 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 20 |
"tags": [
|
| 21 |
"swe-bench-multimodal",
|
| 22 |
"Bug Fixing"
|
| 23 |
+
],
|
| 24 |
+
"cost_per_instance": 27.1,
|
| 25 |
+
"average_runtime": 471.0,
|
| 26 |
+
"full_archive": "",
|
| 27 |
+
"agent_version": "v1.0.0",
|
| 28 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"benchmark": "commit0",
|
| 32 |
"score": 61.5,
|
| 33 |
"metric": "test_pass_rate",
|
|
|
|
|
|
|
| 34 |
"tags": [
|
| 35 |
"commit0"
|
| 36 |
+
],
|
| 37 |
+
"cost_per_instance": 40.75,
|
| 38 |
+
"average_runtime": 607.5,
|
| 39 |
+
"full_archive": "",
|
| 40 |
+
"agent_version": "v1.0.0",
|
| 41 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 42 |
},
|
| 43 |
{
|
| 44 |
"benchmark": "multi-swe-bench",
|
| 45 |
"score": 28.4,
|
| 46 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 47 |
"tags": [
|
| 48 |
"multi-swe-bench"
|
| 49 |
+
],
|
| 50 |
+
"cost_per_instance": 24.2,
|
| 51 |
+
"average_runtime": 442.0,
|
| 52 |
+
"full_archive": "",
|
| 53 |
+
"agent_version": "v1.0.0",
|
| 54 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 55 |
},
|
| 56 |
{
|
| 57 |
"benchmark": "swt-bench",
|
| 58 |
"score": 54.1,
|
| 59 |
"metric": "success_rate",
|
|
|
|
|
|
|
| 60 |
"tags": [
|
| 61 |
"swt-bench"
|
| 62 |
+
],
|
| 63 |
+
"cost_per_instance": 37.05,
|
| 64 |
+
"average_runtime": 570.5,
|
| 65 |
+
"full_archive": "",
|
| 66 |
+
"agent_version": "v1.0.0",
|
| 67 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"benchmark": "gaia",
|
| 71 |
"score": 48.3,
|
| 72 |
"metric": "accuracy",
|
|
|
|
|
|
|
| 73 |
"tags": [
|
| 74 |
"gaia"
|
| 75 |
+
],
|
| 76 |
+
"cost_per_instance": 34.15,
|
| 77 |
+
"average_runtime": 541.5,
|
| 78 |
+
"full_archive": "",
|
| 79 |
+
"agent_version": "v1.0.0",
|
| 80 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 81 |
}
|
| 82 |
]
|
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json
CHANGED
|
@@ -5,5 +5,7 @@
|
|
| 5 |
"tool_usage": "standard",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092895",
|
| 7 |
"directory_name": "20251124_gpt_4o_2024_11_20",
|
| 8 |
-
"release_date": "2024-11-20"
|
| 9 |
-
|
|
|
|
|
|
|
|
|
| 5 |
"tool_usage": "standard",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092895",
|
| 7 |
"directory_name": "20251124_gpt_4o_2024_11_20",
|
| 8 |
+
"release_date": "2024-11-20",
|
| 9 |
+
"agent_name": "OpenHands CodeAct",
|
| 10 |
+
"country": "us"
|
| 11 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json
CHANGED
|
@@ -3,62 +3,80 @@
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 45.1,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 32.55,
|
| 7 |
-
"total_runtime": 525.5,
|
| 8 |
"tags": [
|
| 9 |
"swe-bench",
|
| 10 |
"Bug Fixing"
|
| 11 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
},
|
| 13 |
{
|
| 14 |
"benchmark": "swe-bench-multimodal",
|
| 15 |
"score": 39.5,
|
| 16 |
"metric": "resolve_rate",
|
| 17 |
-
"total_cost": 29.75,
|
| 18 |
-
"total_runtime": 497.5,
|
| 19 |
"tags": [
|
| 20 |
"swe-bench-multimodal",
|
| 21 |
"Bug Fixing"
|
| 22 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"benchmark": "commit0",
|
| 26 |
"score": 68.9,
|
| 27 |
"metric": "test_pass_rate",
|
| 28 |
-
"total_cost": 44.45,
|
| 29 |
-
"total_runtime": 644.5,
|
| 30 |
"tags": [
|
| 31 |
"commit0"
|
| 32 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"benchmark": "multi-swe-bench",
|
| 36 |
"score": 32.8,
|
| 37 |
"metric": "resolve_rate",
|
| 38 |
-
"total_cost": 26.4,
|
| 39 |
-
"total_runtime": 464.0,
|
| 40 |
"tags": [
|
| 41 |
"multi-swe-bench"
|
| 42 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
},
|
| 44 |
{
|
| 45 |
"benchmark": "swt-bench",
|
| 46 |
"score": 62.3,
|
| 47 |
"metric": "success_rate",
|
| 48 |
-
"total_cost": 41.15,
|
| 49 |
-
"total_runtime": 611.5,
|
| 50 |
"tags": [
|
| 51 |
"swt-bench"
|
| 52 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
},
|
| 54 |
{
|
| 55 |
"benchmark": "gaia",
|
| 56 |
"score": 55.2,
|
| 57 |
"metric": "accuracy",
|
| 58 |
-
"total_cost": 37.6,
|
| 59 |
-
"total_runtime": 576.0,
|
| 60 |
"tags": [
|
| 61 |
"gaia"
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
]
|
|
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 45.1,
|
| 5 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 6 |
"tags": [
|
| 7 |
"swe-bench",
|
| 8 |
"Bug Fixing"
|
| 9 |
+
],
|
| 10 |
+
"cost_per_instance": 32.55,
|
| 11 |
+
"average_runtime": 525.5,
|
| 12 |
+
"full_archive": "",
|
| 13 |
+
"agent_version": "v1.0.0",
|
| 14 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 15 |
},
|
| 16 |
{
|
| 17 |
"benchmark": "swe-bench-multimodal",
|
| 18 |
"score": 39.5,
|
| 19 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 20 |
"tags": [
|
| 21 |
"swe-bench-multimodal",
|
| 22 |
"Bug Fixing"
|
| 23 |
+
],
|
| 24 |
+
"cost_per_instance": 29.75,
|
| 25 |
+
"average_runtime": 497.5,
|
| 26 |
+
"full_archive": "",
|
| 27 |
+
"agent_version": "v1.0.0",
|
| 28 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"benchmark": "commit0",
|
| 32 |
"score": 68.9,
|
| 33 |
"metric": "test_pass_rate",
|
|
|
|
|
|
|
| 34 |
"tags": [
|
| 35 |
"commit0"
|
| 36 |
+
],
|
| 37 |
+
"cost_per_instance": 44.45,
|
| 38 |
+
"average_runtime": 644.5,
|
| 39 |
+
"full_archive": "",
|
| 40 |
+
"agent_version": "v1.0.0",
|
| 41 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 42 |
},
|
| 43 |
{
|
| 44 |
"benchmark": "multi-swe-bench",
|
| 45 |
"score": 32.8,
|
| 46 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 47 |
"tags": [
|
| 48 |
"multi-swe-bench"
|
| 49 |
+
],
|
| 50 |
+
"cost_per_instance": 26.4,
|
| 51 |
+
"average_runtime": 464.0,
|
| 52 |
+
"full_archive": "",
|
| 53 |
+
"agent_version": "v1.0.0",
|
| 54 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 55 |
},
|
| 56 |
{
|
| 57 |
"benchmark": "swt-bench",
|
| 58 |
"score": 62.3,
|
| 59 |
"metric": "success_rate",
|
|
|
|
|
|
|
| 60 |
"tags": [
|
| 61 |
"swt-bench"
|
| 62 |
+
],
|
| 63 |
+
"cost_per_instance": 41.15,
|
| 64 |
+
"average_runtime": 611.5,
|
| 65 |
+
"full_archive": "",
|
| 66 |
+
"agent_version": "v1.0.0",
|
| 67 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"benchmark": "gaia",
|
| 71 |
"score": 55.2,
|
| 72 |
"metric": "accuracy",
|
|
|
|
|
|
|
| 73 |
"tags": [
|
| 74 |
"gaia"
|
| 75 |
+
],
|
| 76 |
+
"cost_per_instance": 37.6,
|
| 77 |
+
"average_runtime": 576.0,
|
| 78 |
+
"full_archive": "",
|
| 79 |
+
"agent_version": "v1.0.0",
|
| 80 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 81 |
}
|
| 82 |
]
|
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json
CHANGED
|
@@ -5,5 +5,7 @@
|
|
| 5 |
"tool_usage": "standard",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092916",
|
| 7 |
"directory_name": "20251124_gpt_4o_mini_2024_07_18",
|
| 8 |
-
"release_date": "2024-07-18"
|
| 9 |
-
|
|
|
|
|
|
|
|
|
| 5 |
"tool_usage": "standard",
|
| 6 |
"submission_time": "2025-11-24T19:56:00.092916",
|
| 7 |
"directory_name": "20251124_gpt_4o_mini_2024_07_18",
|
| 8 |
+
"release_date": "2024-07-18",
|
| 9 |
+
"agent_name": "OpenHands CodeAct",
|
| 10 |
+
"country": "us"
|
| 11 |
+
}
|
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json
CHANGED
|
@@ -3,62 +3,80 @@
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 32.5,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
-
"total_cost": 26.25,
|
| 7 |
-
"total_runtime": 462.5,
|
| 8 |
"tags": [
|
| 9 |
"swe-bench",
|
| 10 |
"Bug Fixing"
|
| 11 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
},
|
| 13 |
{
|
| 14 |
"benchmark": "swe-bench-multimodal",
|
| 15 |
"score": 28.9,
|
| 16 |
"metric": "resolve_rate",
|
| 17 |
-
"total_cost": 24.45,
|
| 18 |
-
"total_runtime": 444.5,
|
| 19 |
"tags": [
|
| 20 |
"swe-bench-multimodal",
|
| 21 |
"Bug Fixing"
|
| 22 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"benchmark": "commit0",
|
| 26 |
"score": 55.3,
|
| 27 |
"metric": "test_pass_rate",
|
| 28 |
-
"total_cost": 37.65,
|
| 29 |
-
"total_runtime": 576.5,
|
| 30 |
"tags": [
|
| 31 |
"commit0"
|
| 32 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"benchmark": "multi-swe-bench",
|
| 36 |
"score": 24.1,
|
| 37 |
"metric": "resolve_rate",
|
| 38 |
-
"total_cost": 22.05,
|
| 39 |
-
"total_runtime": 420.5,
|
| 40 |
"tags": [
|
| 41 |
"multi-swe-bench"
|
| 42 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
},
|
| 44 |
{
|
| 45 |
"benchmark": "swt-bench",
|
| 46 |
"score": 47.8,
|
| 47 |
"metric": "success_rate",
|
| 48 |
-
"total_cost": 33.9,
|
| 49 |
-
"total_runtime": 539.0,
|
| 50 |
"tags": [
|
| 51 |
"swt-bench"
|
| 52 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
},
|
| 54 |
{
|
| 55 |
"benchmark": "gaia",
|
| 56 |
"score": 42.1,
|
| 57 |
"metric": "accuracy",
|
| 58 |
-
"total_cost": 31.05,
|
| 59 |
-
"total_runtime": 510.5,
|
| 60 |
"tags": [
|
| 61 |
"gaia"
|
| 62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
]
|
|
|
|
| 3 |
"benchmark": "swe-bench",
|
| 4 |
"score": 32.5,
|
| 5 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 6 |
"tags": [
|
| 7 |
"swe-bench",
|
| 8 |
"Bug Fixing"
|
| 9 |
+
],
|
| 10 |
+
"cost_per_instance": 26.25,
|
| 11 |
+
"average_runtime": 462.5,
|
| 12 |
+
"full_archive": "",
|
| 13 |
+
"agent_version": "v1.0.0",
|
| 14 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 15 |
},
|
| 16 |
{
|
| 17 |
"benchmark": "swe-bench-multimodal",
|
| 18 |
"score": 28.9,
|
| 19 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 20 |
"tags": [
|
| 21 |
"swe-bench-multimodal",
|
| 22 |
"Bug Fixing"
|
| 23 |
+
],
|
| 24 |
+
"cost_per_instance": 24.45,
|
| 25 |
+
"average_runtime": 444.5,
|
| 26 |
+
"full_archive": "",
|
| 27 |
+
"agent_version": "v1.0.0",
|
| 28 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 29 |
},
|
| 30 |
{
|
| 31 |
"benchmark": "commit0",
|
| 32 |
"score": 55.3,
|
| 33 |
"metric": "test_pass_rate",
|
|
|
|
|
|
|
| 34 |
"tags": [
|
| 35 |
"commit0"
|
| 36 |
+
],
|
| 37 |
+
"cost_per_instance": 37.65,
|
| 38 |
+
"average_runtime": 576.5,
|
| 39 |
+
"full_archive": "",
|
| 40 |
+
"agent_version": "v1.0.0",
|
| 41 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 42 |
},
|
| 43 |
{
|
| 44 |
"benchmark": "multi-swe-bench",
|
| 45 |
"score": 24.1,
|
| 46 |
"metric": "resolve_rate",
|
|
|
|
|
|
|
| 47 |
"tags": [
|
| 48 |
"multi-swe-bench"
|
| 49 |
+
],
|
| 50 |
+
"cost_per_instance": 22.05,
|
| 51 |
+
"average_runtime": 420.5,
|
| 52 |
+
"full_archive": "",
|
| 53 |
+
"agent_version": "v1.0.0",
|
| 54 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 55 |
},
|
| 56 |
{
|
| 57 |
"benchmark": "swt-bench",
|
| 58 |
"score": 47.8,
|
| 59 |
"metric": "success_rate",
|
|
|
|
|
|
|
| 60 |
"tags": [
|
| 61 |
"swt-bench"
|
| 62 |
+
],
|
| 63 |
+
"cost_per_instance": 33.9,
|
| 64 |
+
"average_runtime": 539.0,
|
| 65 |
+
"full_archive": "",
|
| 66 |
+
"agent_version": "v1.0.0",
|
| 67 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"benchmark": "gaia",
|
| 71 |
"score": 42.1,
|
| 72 |
"metric": "accuracy",
|
|
|
|
|
|
|
| 73 |
"tags": [
|
| 74 |
"gaia"
|
| 75 |
+
],
|
| 76 |
+
"cost_per_instance": 31.05,
|
| 77 |
+
"average_runtime": 510.5,
|
| 78 |
+
"full_archive": "",
|
| 79 |
+
"agent_version": "v1.0.0",
|
| 80 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 81 |
}
|
| 82 |
]
|
mock_results/1.0.0-dev1/results/20251201_deepseek_v3/metadata.json
CHANGED
|
@@ -6,5 +6,7 @@
|
|
| 6 |
"submission_time": "2025-12-01T10:00:00.000000",
|
| 7 |
"directory_name": "20251201_deepseek_v3",
|
| 8 |
"release_date": "2025-12-01",
|
| 9 |
-
"parameter_count_b": 685
|
| 10 |
-
|
|
|
|
|
|
|
|
|
| 6 |
"submission_time": "2025-12-01T10:00:00.000000",
|
| 7 |
"directory_name": "20251201_deepseek_v3",
|
| 8 |
"release_date": "2025-12-01",
|
| 9 |
+
"parameter_count_b": 685,
|
| 10 |
+
"agent_name": "OpenHands CodeAct",
|
| 11 |
+
"country": "cn"
|
| 12 |
+
}
|
mock_results/1.0.0-dev1/results/20251201_deepseek_v3/scores.json
CHANGED
|
@@ -4,13 +4,19 @@
|
|
| 4 |
"score": 42.5,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
"cost_per_instance": 0.15,
|
| 7 |
-
"average_runtime": 180
|
|
|
|
|
|
|
|
|
|
| 8 |
},
|
| 9 |
{
|
| 10 |
"benchmark": "gaia",
|
| 11 |
"score": 55.0,
|
| 12 |
"metric": "accuracy",
|
| 13 |
"cost_per_instance": 0.08,
|
| 14 |
-
"average_runtime": 60
|
|
|
|
|
|
|
|
|
|
| 15 |
}
|
| 16 |
-
]
|
|
|
|
| 4 |
"score": 42.5,
|
| 5 |
"metric": "resolve_rate",
|
| 6 |
"cost_per_instance": 0.15,
|
| 7 |
+
"average_runtime": 180,
|
| 8 |
+
"full_archive": "",
|
| 9 |
+
"agent_version": "v1.0.0",
|
| 10 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 11 |
},
|
| 12 |
{
|
| 13 |
"benchmark": "gaia",
|
| 14 |
"score": 55.0,
|
| 15 |
"metric": "accuracy",
|
| 16 |
"cost_per_instance": 0.08,
|
| 17 |
+
"average_runtime": 60,
|
| 18 |
+
"full_archive": "",
|
| 19 |
+
"agent_version": "v1.0.0",
|
| 20 |
+
"submission_time": "2025-11-24T19:56:00.092865+00:00"
|
| 21 |
}
|
| 22 |
+
]
|