openhands openhands committed on
Commit
f7e6b58
·
1 Parent(s): 68b25a7

Update mock data and docs to match new openhands-index-results format

Browse files

- Update scores.json: rename total_cost -> cost_per_instance, total_runtime -> average_runtime
- Add new required fields to mock data: full_archive, agent_version, submission_time
- Add agent_name and country fields to metadata.json
- Update DATA_STRUCTURE.md documentation to reflect new schema

This ensures mock data works as a proper fallback when GitHub is unavailable
and matches the production format from openhands-index-results.

Co-authored-by: openhands <openhands@all-hands.dev>

DATA_STRUCTURE.md CHANGED
@@ -21,25 +21,44 @@ openhands-index-results/
21
 
22
  ## File Formats
23
 
24
- ### JSONL Files (test.jsonl, validation.jsonl, etc.)
25
 
26
- Each line in a JSONL file should be a JSON object representing one agent's results:
27
 
 
28
  ```json
29
  {
30
- "Agent_Name": "OpenHands CodeAct v2.1",
31
- "Llm_Base": "claude-3-5-sonnet-20241022",
32
- "Openness": "closed_api_available",
33
- "Tool_Usage": "standard",
34
- "Score": 48.3,
35
- "Metric": "resolve_rate",
36
- "Submission_Time": "2025-11-24T19:56:00.092865",
37
- "Tags": ["swe-bench"],
38
- "Total_Cost": 34.15,
39
- "Total_Runtime": 541.5
 
40
  }
41
  ```
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  ### Configuration File (agenteval.json)
44
 
45
  The configuration file defines the benchmark structure:
 
21
 
22
  ## File Formats
23
 
24
+ ### Agent Directory Structure
25
 
26
+ Each agent has its own directory containing two files:
27
 
28
+ **metadata.json** - Agent and model information:
29
  ```json
30
  {
31
+ "agent_name": "OpenHands CodeAct",
32
+ "agent_version": "v1.8.3",
33
+ "model": "claude-4.5-opus",
34
+ "openness": "closed_api_available",
35
+ "country": "us",
36
+ "tool_usage": "standard",
37
+ "submission_time": "2026-01-27T01:24:15.735789+00:00",
38
+ "directory_name": "claude-4.5-opus",
39
+ "release_date": "2025-11-24",
40
+ "parameter_count_b": null,
41
+ "active_parameter_count_b": null
42
  }
43
  ```
44
 
45
+ **scores.json** - Array of benchmark results:
46
+ ```json
47
+ [
48
+ {
49
+ "benchmark": "swe-bench",
50
+ "score": 76.6,
51
+ "metric": "accuracy",
52
+ "cost_per_instance": 1.82,
53
+ "average_runtime": 325.0,
54
+ "full_archive": "https://results.eval.all-hands.dev/eval-21370451733-...",
55
+ "tags": ["swe-bench"],
56
+ "agent_version": "v1.8.3",
57
+ "submission_time": "2026-01-27T01:24:15.735789+00:00"
58
+ }
59
+ ]
60
+ ```
61
+
62
  ### Configuration File (agenteval.json)
63
 
64
  The configuration file defines the benchmark structure:
mock_results/1.0.0-dev1/results/20250723_qwen3_coder/metadata.json CHANGED
@@ -7,5 +7,7 @@
7
  "directory_name": "20250723_qwen3_coder",
8
  "release_date": "2025-07-23",
9
  "parameter_count_b": 480,
10
- "active_parameter_count_b": 35
11
- }
 
 
 
7
  "directory_name": "20250723_qwen3_coder",
8
  "release_date": "2025-07-23",
9
  "parameter_count_b": 480,
10
+ "active_parameter_count_b": 35,
11
+ "agent_name": "OpenHands CodeAct",
12
+ "country": "cn"
13
+ }
mock_results/1.0.0-dev1/results/20250723_qwen3_coder/scores.json CHANGED
@@ -4,13 +4,19 @@
4
  "score": 38.0,
5
  "metric": "resolve_rate",
6
  "cost_per_instance": 0.12,
7
- "average_runtime": 150
 
 
 
8
  },
9
  {
10
  "benchmark": "gaia",
11
  "score": 48.0,
12
  "metric": "accuracy",
13
  "cost_per_instance": 0.06,
14
- "average_runtime": 45
 
 
 
15
  }
16
- ]
 
4
  "score": 38.0,
5
  "metric": "resolve_rate",
6
  "cost_per_instance": 0.12,
7
+ "average_runtime": 150,
8
+ "full_archive": "",
9
+ "agent_version": "v1.0.0",
10
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
11
  },
12
  {
13
  "benchmark": "gaia",
14
  "score": 48.0,
15
  "metric": "accuracy",
16
  "cost_per_instance": 0.06,
17
+ "average_runtime": 45,
18
+ "full_archive": "",
19
+ "agent_version": "v1.0.0",
20
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
21
  }
22
+ ]
mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/metadata.json CHANGED
@@ -7,5 +7,7 @@
7
  "directory_name": "20251106_kimi_k2_thinking",
8
  "release_date": "2025-11-06",
9
  "parameter_count_b": 1000,
10
- "active_parameter_count_b": 32
11
- }
 
 
 
7
  "directory_name": "20251106_kimi_k2_thinking",
8
  "release_date": "2025-11-06",
9
  "parameter_count_b": 1000,
10
+ "active_parameter_count_b": 32,
11
+ "agent_name": "OpenHands CodeAct",
12
+ "country": "cn"
13
+ }
mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/scores.json CHANGED
@@ -4,13 +4,19 @@
4
  "score": 45.0,
5
  "metric": "resolve_rate",
6
  "cost_per_instance": 0.18,
7
- "average_runtime": 200
 
 
 
8
  },
9
  {
10
  "benchmark": "gaia",
11
  "score": 52.0,
12
  "metric": "accuracy",
13
- "cost_per_instance": 0.10,
14
- "average_runtime": 70
 
 
 
15
  }
16
- ]
 
4
  "score": 45.0,
5
  "metric": "resolve_rate",
6
  "cost_per_instance": 0.18,
7
+ "average_runtime": 200,
8
+ "full_archive": "",
9
+ "agent_version": "v1.0.0",
10
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
11
  },
12
  {
13
  "benchmark": "gaia",
14
  "score": 52.0,
15
  "metric": "accuracy",
16
+ "cost_per_instance": 0.1,
17
+ "average_runtime": 70,
18
+ "full_archive": "",
19
+ "agent_version": "v1.0.0",
20
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
21
  }
22
+ ]
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json CHANGED
@@ -5,5 +5,7 @@
5
  "tool_usage": "standard",
6
  "submission_time": "2025-11-24T19:56:00.092865",
7
  "directory_name": "20251124_claude_3_5_sonnet_20241022",
8
- "release_date": "2024-10-22"
9
- }
 
 
 
5
  "tool_usage": "standard",
6
  "submission_time": "2025-11-24T19:56:00.092865",
7
  "directory_name": "20251124_claude_3_5_sonnet_20241022",
8
+ "release_date": "2024-10-22",
9
+ "agent_name": "OpenHands CodeAct",
10
+ "country": "us"
11
+ }
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json CHANGED
@@ -3,62 +3,80 @@
3
  "benchmark": "swe-bench",
4
  "score": 48.3,
5
  "metric": "resolve_rate",
6
- "total_cost": 34.15,
7
- "total_runtime": 541.5,
8
  "tags": [
9
  "swe-bench",
10
  "Bug Fixing"
11
- ]
 
 
 
 
 
12
  },
13
  {
14
  "benchmark": "swe-bench-multimodal",
15
  "score": 42.1,
16
  "metric": "resolve_rate",
17
- "total_cost": 31.05,
18
- "total_runtime": 510.5,
19
  "tags": [
20
  "swe-bench-multimodal",
21
  "Bug Fixing"
22
- ]
 
 
 
 
 
23
  },
24
  {
25
  "benchmark": "commit0",
26
  "score": 71.2,
27
  "metric": "test_pass_rate",
28
- "total_cost": 45.6,
29
- "total_runtime": 656.0,
30
  "tags": [
31
  "commit0"
32
- ]
 
 
 
 
 
33
  },
34
  {
35
  "benchmark": "multi-swe-bench",
36
  "score": 35.2,
37
  "metric": "resolve_rate",
38
- "total_cost": 27.6,
39
- "total_runtime": 476.0,
40
  "tags": [
41
  "multi-swe-bench"
42
- ]
 
 
 
 
 
43
  },
44
  {
45
  "benchmark": "swt-bench",
46
  "score": 65.4,
47
  "metric": "success_rate",
48
- "total_cost": 42.7,
49
- "total_runtime": 627.0,
50
  "tags": [
51
  "swt-bench"
52
- ]
 
 
 
 
 
53
  },
54
  {
55
  "benchmark": "gaia",
56
  "score": 58.7,
57
  "metric": "accuracy",
58
- "total_cost": 39.35,
59
- "total_runtime": 593.5,
60
  "tags": [
61
  "gaia"
62
- ]
 
 
 
 
 
63
  }
64
  ]
 
3
  "benchmark": "swe-bench",
4
  "score": 48.3,
5
  "metric": "resolve_rate",
 
 
6
  "tags": [
7
  "swe-bench",
8
  "Bug Fixing"
9
+ ],
10
+ "cost_per_instance": 34.15,
11
+ "average_runtime": 541.5,
12
+ "full_archive": "",
13
+ "agent_version": "v1.0.0",
14
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
15
  },
16
  {
17
  "benchmark": "swe-bench-multimodal",
18
  "score": 42.1,
19
  "metric": "resolve_rate",
 
 
20
  "tags": [
21
  "swe-bench-multimodal",
22
  "Bug Fixing"
23
+ ],
24
+ "cost_per_instance": 31.05,
25
+ "average_runtime": 510.5,
26
+ "full_archive": "",
27
+ "agent_version": "v1.0.0",
28
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
29
  },
30
  {
31
  "benchmark": "commit0",
32
  "score": 71.2,
33
  "metric": "test_pass_rate",
 
 
34
  "tags": [
35
  "commit0"
36
+ ],
37
+ "cost_per_instance": 45.6,
38
+ "average_runtime": 656.0,
39
+ "full_archive": "",
40
+ "agent_version": "v1.0.0",
41
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
42
  },
43
  {
44
  "benchmark": "multi-swe-bench",
45
  "score": 35.2,
46
  "metric": "resolve_rate",
 
 
47
  "tags": [
48
  "multi-swe-bench"
49
+ ],
50
+ "cost_per_instance": 27.6,
51
+ "average_runtime": 476.0,
52
+ "full_archive": "",
53
+ "agent_version": "v1.0.0",
54
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
55
  },
56
  {
57
  "benchmark": "swt-bench",
58
  "score": 65.4,
59
  "metric": "success_rate",
 
 
60
  "tags": [
61
  "swt-bench"
62
+ ],
63
+ "cost_per_instance": 42.7,
64
+ "average_runtime": 627.0,
65
+ "full_archive": "",
66
+ "agent_version": "v1.0.0",
67
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
68
  },
69
  {
70
  "benchmark": "gaia",
71
  "score": 58.7,
72
  "metric": "accuracy",
 
 
73
  "tags": [
74
  "gaia"
75
+ ],
76
+ "cost_per_instance": 39.35,
77
+ "average_runtime": 593.5,
78
+ "full_archive": "",
79
+ "agent_version": "v1.0.0",
80
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
81
  }
82
  ]
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json CHANGED
@@ -5,5 +5,7 @@
5
  "tool_usage": "custom_interface",
6
  "submission_time": "2025-11-24T19:56:00.092922",
7
  "directory_name": "20251124_claude_3_opus_20240229",
8
- "release_date": "2024-02-29"
9
- }
 
 
 
5
  "tool_usage": "custom_interface",
6
  "submission_time": "2025-11-24T19:56:00.092922",
7
  "directory_name": "20251124_claude_3_opus_20240229",
8
+ "release_date": "2024-02-29",
9
+ "agent_name": "OpenHands CodeAct",
10
+ "country": "us"
11
+ }
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json CHANGED
@@ -3,62 +3,80 @@
3
  "benchmark": "swe-bench",
4
  "score": 29.8,
5
  "metric": "resolve_rate",
6
- "total_cost": 24.9,
7
- "total_runtime": 449.0,
8
  "tags": [
9
  "swe-bench",
10
  "Bug Fixing"
11
- ]
 
 
 
 
 
12
  },
13
  {
14
  "benchmark": "swe-bench-multimodal",
15
  "score": 25.7,
16
  "metric": "resolve_rate",
17
- "total_cost": 22.85,
18
- "total_runtime": 428.5,
19
  "tags": [
20
  "swe-bench-multimodal",
21
  "Bug Fixing"
22
- ]
 
 
 
 
 
23
  },
24
  {
25
  "benchmark": "commit0",
26
  "score": 52.1,
27
  "metric": "test_pass_rate",
28
- "total_cost": 36.05,
29
- "total_runtime": 560.5,
30
  "tags": [
31
  "commit0"
32
- ]
 
 
 
 
 
33
  },
34
  {
35
  "benchmark": "multi-swe-bench",
36
  "score": 21.5,
37
  "metric": "resolve_rate",
38
- "total_cost": 20.75,
39
- "total_runtime": 407.5,
40
  "tags": [
41
  "multi-swe-bench"
42
- ]
 
 
 
 
 
43
  },
44
  {
45
  "benchmark": "swt-bench",
46
  "score": 44.2,
47
  "metric": "success_rate",
48
- "total_cost": 32.1,
49
- "total_runtime": 521.0,
50
  "tags": [
51
  "swt-bench"
52
- ]
 
 
 
 
 
53
  },
54
  {
55
  "benchmark": "gaia",
56
  "score": 39.4,
57
  "metric": "accuracy",
58
- "total_cost": 29.7,
59
- "total_runtime": 497.0,
60
  "tags": [
61
  "gaia"
62
- ]
 
 
 
 
 
63
  }
64
  ]
 
3
  "benchmark": "swe-bench",
4
  "score": 29.8,
5
  "metric": "resolve_rate",
 
 
6
  "tags": [
7
  "swe-bench",
8
  "Bug Fixing"
9
+ ],
10
+ "cost_per_instance": 24.9,
11
+ "average_runtime": 449.0,
12
+ "full_archive": "",
13
+ "agent_version": "v1.0.0",
14
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
15
  },
16
  {
17
  "benchmark": "swe-bench-multimodal",
18
  "score": 25.7,
19
  "metric": "resolve_rate",
 
 
20
  "tags": [
21
  "swe-bench-multimodal",
22
  "Bug Fixing"
23
+ ],
24
+ "cost_per_instance": 22.85,
25
+ "average_runtime": 428.5,
26
+ "full_archive": "",
27
+ "agent_version": "v1.0.0",
28
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
29
  },
30
  {
31
  "benchmark": "commit0",
32
  "score": 52.1,
33
  "metric": "test_pass_rate",
 
 
34
  "tags": [
35
  "commit0"
36
+ ],
37
+ "cost_per_instance": 36.05,
38
+ "average_runtime": 560.5,
39
+ "full_archive": "",
40
+ "agent_version": "v1.0.0",
41
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
42
  },
43
  {
44
  "benchmark": "multi-swe-bench",
45
  "score": 21.5,
46
  "metric": "resolve_rate",
 
 
47
  "tags": [
48
  "multi-swe-bench"
49
+ ],
50
+ "cost_per_instance": 20.75,
51
+ "average_runtime": 407.5,
52
+ "full_archive": "",
53
+ "agent_version": "v1.0.0",
54
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
55
  },
56
  {
57
  "benchmark": "swt-bench",
58
  "score": 44.2,
59
  "metric": "success_rate",
 
 
60
  "tags": [
61
  "swt-bench"
62
+ ],
63
+ "cost_per_instance": 32.1,
64
+ "average_runtime": 521.0,
65
+ "full_archive": "",
66
+ "agent_version": "v1.0.0",
67
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
68
  },
69
  {
70
  "benchmark": "gaia",
71
  "score": 39.4,
72
  "metric": "accuracy",
 
 
73
  "tags": [
74
  "gaia"
75
+ ],
76
+ "cost_per_instance": 29.7,
77
+ "average_runtime": 497.0,
78
+ "full_archive": "",
79
+ "agent_version": "v1.0.0",
80
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
81
  }
82
  ]
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json CHANGED
@@ -5,5 +5,7 @@
5
  "tool_usage": "standard",
6
  "submission_time": "2025-11-24T19:56:00.092908",
7
  "directory_name": "20251124_gpt_4_turbo_2024_04_09",
8
- "release_date": "2024-04-09"
9
- }
 
 
 
5
  "tool_usage": "standard",
6
  "submission_time": "2025-11-24T19:56:00.092908",
7
  "directory_name": "20251124_gpt_4_turbo_2024_04_09",
8
+ "release_date": "2024-04-09",
9
+ "agent_name": "OpenHands CodeAct",
10
+ "country": "us"
11
+ }
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json CHANGED
@@ -3,62 +3,80 @@
3
  "benchmark": "swe-bench",
4
  "score": 38.7,
5
  "metric": "resolve_rate",
6
- "total_cost": 29.35,
7
- "total_runtime": 493.5,
8
  "tags": [
9
  "swe-bench",
10
  "Bug Fixing"
11
- ]
 
 
 
 
 
12
  },
13
  {
14
  "benchmark": "swe-bench-multimodal",
15
  "score": 34.2,
16
  "metric": "resolve_rate",
17
- "total_cost": 27.1,
18
- "total_runtime": 471.0,
19
  "tags": [
20
  "swe-bench-multimodal",
21
  "Bug Fixing"
22
- ]
 
 
 
 
 
23
  },
24
  {
25
  "benchmark": "commit0",
26
  "score": 61.5,
27
  "metric": "test_pass_rate",
28
- "total_cost": 40.75,
29
- "total_runtime": 607.5,
30
  "tags": [
31
  "commit0"
32
- ]
 
 
 
 
 
33
  },
34
  {
35
  "benchmark": "multi-swe-bench",
36
  "score": 28.4,
37
  "metric": "resolve_rate",
38
- "total_cost": 24.2,
39
- "total_runtime": 442.0,
40
  "tags": [
41
  "multi-swe-bench"
42
- ]
 
 
 
 
 
43
  },
44
  {
45
  "benchmark": "swt-bench",
46
  "score": 54.1,
47
  "metric": "success_rate",
48
- "total_cost": 37.05,
49
- "total_runtime": 570.5,
50
  "tags": [
51
  "swt-bench"
52
- ]
 
 
 
 
 
53
  },
54
  {
55
  "benchmark": "gaia",
56
  "score": 48.3,
57
  "metric": "accuracy",
58
- "total_cost": 34.15,
59
- "total_runtime": 541.5,
60
  "tags": [
61
  "gaia"
62
- ]
 
 
 
 
 
63
  }
64
  ]
 
3
  "benchmark": "swe-bench",
4
  "score": 38.7,
5
  "metric": "resolve_rate",
 
 
6
  "tags": [
7
  "swe-bench",
8
  "Bug Fixing"
9
+ ],
10
+ "cost_per_instance": 29.35,
11
+ "average_runtime": 493.5,
12
+ "full_archive": "",
13
+ "agent_version": "v1.0.0",
14
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
15
  },
16
  {
17
  "benchmark": "swe-bench-multimodal",
18
  "score": 34.2,
19
  "metric": "resolve_rate",
 
 
20
  "tags": [
21
  "swe-bench-multimodal",
22
  "Bug Fixing"
23
+ ],
24
+ "cost_per_instance": 27.1,
25
+ "average_runtime": 471.0,
26
+ "full_archive": "",
27
+ "agent_version": "v1.0.0",
28
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
29
  },
30
  {
31
  "benchmark": "commit0",
32
  "score": 61.5,
33
  "metric": "test_pass_rate",
 
 
34
  "tags": [
35
  "commit0"
36
+ ],
37
+ "cost_per_instance": 40.75,
38
+ "average_runtime": 607.5,
39
+ "full_archive": "",
40
+ "agent_version": "v1.0.0",
41
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
42
  },
43
  {
44
  "benchmark": "multi-swe-bench",
45
  "score": 28.4,
46
  "metric": "resolve_rate",
 
 
47
  "tags": [
48
  "multi-swe-bench"
49
+ ],
50
+ "cost_per_instance": 24.2,
51
+ "average_runtime": 442.0,
52
+ "full_archive": "",
53
+ "agent_version": "v1.0.0",
54
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
55
  },
56
  {
57
  "benchmark": "swt-bench",
58
  "score": 54.1,
59
  "metric": "success_rate",
 
 
60
  "tags": [
61
  "swt-bench"
62
+ ],
63
+ "cost_per_instance": 37.05,
64
+ "average_runtime": 570.5,
65
+ "full_archive": "",
66
+ "agent_version": "v1.0.0",
67
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
68
  },
69
  {
70
  "benchmark": "gaia",
71
  "score": 48.3,
72
  "metric": "accuracy",
 
 
73
  "tags": [
74
  "gaia"
75
+ ],
76
+ "cost_per_instance": 34.15,
77
+ "average_runtime": 541.5,
78
+ "full_archive": "",
79
+ "agent_version": "v1.0.0",
80
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
81
  }
82
  ]
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json CHANGED
@@ -5,5 +5,7 @@
5
  "tool_usage": "standard",
6
  "submission_time": "2025-11-24T19:56:00.092895",
7
  "directory_name": "20251124_gpt_4o_2024_11_20",
8
- "release_date": "2024-11-20"
9
- }
 
 
 
5
  "tool_usage": "standard",
6
  "submission_time": "2025-11-24T19:56:00.092895",
7
  "directory_name": "20251124_gpt_4o_2024_11_20",
8
+ "release_date": "2024-11-20",
9
+ "agent_name": "OpenHands CodeAct",
10
+ "country": "us"
11
+ }
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json CHANGED
@@ -3,62 +3,80 @@
3
  "benchmark": "swe-bench",
4
  "score": 45.1,
5
  "metric": "resolve_rate",
6
- "total_cost": 32.55,
7
- "total_runtime": 525.5,
8
  "tags": [
9
  "swe-bench",
10
  "Bug Fixing"
11
- ]
 
 
 
 
 
12
  },
13
  {
14
  "benchmark": "swe-bench-multimodal",
15
  "score": 39.5,
16
  "metric": "resolve_rate",
17
- "total_cost": 29.75,
18
- "total_runtime": 497.5,
19
  "tags": [
20
  "swe-bench-multimodal",
21
  "Bug Fixing"
22
- ]
 
 
 
 
 
23
  },
24
  {
25
  "benchmark": "commit0",
26
  "score": 68.9,
27
  "metric": "test_pass_rate",
28
- "total_cost": 44.45,
29
- "total_runtime": 644.5,
30
  "tags": [
31
  "commit0"
32
- ]
 
 
 
 
 
33
  },
34
  {
35
  "benchmark": "multi-swe-bench",
36
  "score": 32.8,
37
  "metric": "resolve_rate",
38
- "total_cost": 26.4,
39
- "total_runtime": 464.0,
40
  "tags": [
41
  "multi-swe-bench"
42
- ]
 
 
 
 
 
43
  },
44
  {
45
  "benchmark": "swt-bench",
46
  "score": 62.3,
47
  "metric": "success_rate",
48
- "total_cost": 41.15,
49
- "total_runtime": 611.5,
50
  "tags": [
51
  "swt-bench"
52
- ]
 
 
 
 
 
53
  },
54
  {
55
  "benchmark": "gaia",
56
  "score": 55.2,
57
  "metric": "accuracy",
58
- "total_cost": 37.6,
59
- "total_runtime": 576.0,
60
  "tags": [
61
  "gaia"
62
- ]
 
 
 
 
 
63
  }
64
  ]
 
3
  "benchmark": "swe-bench",
4
  "score": 45.1,
5
  "metric": "resolve_rate",
 
 
6
  "tags": [
7
  "swe-bench",
8
  "Bug Fixing"
9
+ ],
10
+ "cost_per_instance": 32.55,
11
+ "average_runtime": 525.5,
12
+ "full_archive": "",
13
+ "agent_version": "v1.0.0",
14
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
15
  },
16
  {
17
  "benchmark": "swe-bench-multimodal",
18
  "score": 39.5,
19
  "metric": "resolve_rate",
 
 
20
  "tags": [
21
  "swe-bench-multimodal",
22
  "Bug Fixing"
23
+ ],
24
+ "cost_per_instance": 29.75,
25
+ "average_runtime": 497.5,
26
+ "full_archive": "",
27
+ "agent_version": "v1.0.0",
28
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
29
  },
30
  {
31
  "benchmark": "commit0",
32
  "score": 68.9,
33
  "metric": "test_pass_rate",
 
 
34
  "tags": [
35
  "commit0"
36
+ ],
37
+ "cost_per_instance": 44.45,
38
+ "average_runtime": 644.5,
39
+ "full_archive": "",
40
+ "agent_version": "v1.0.0",
41
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
42
  },
43
  {
44
  "benchmark": "multi-swe-bench",
45
  "score": 32.8,
46
  "metric": "resolve_rate",
 
 
47
  "tags": [
48
  "multi-swe-bench"
49
+ ],
50
+ "cost_per_instance": 26.4,
51
+ "average_runtime": 464.0,
52
+ "full_archive": "",
53
+ "agent_version": "v1.0.0",
54
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
55
  },
56
  {
57
  "benchmark": "swt-bench",
58
  "score": 62.3,
59
  "metric": "success_rate",
 
 
60
  "tags": [
61
  "swt-bench"
62
+ ],
63
+ "cost_per_instance": 41.15,
64
+ "average_runtime": 611.5,
65
+ "full_archive": "",
66
+ "agent_version": "v1.0.0",
67
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
68
  },
69
  {
70
  "benchmark": "gaia",
71
  "score": 55.2,
72
  "metric": "accuracy",
 
 
73
  "tags": [
74
  "gaia"
75
+ ],
76
+ "cost_per_instance": 37.6,
77
+ "average_runtime": 576.0,
78
+ "full_archive": "",
79
+ "agent_version": "v1.0.0",
80
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
81
  }
82
  ]
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json CHANGED
@@ -5,5 +5,7 @@
5
  "tool_usage": "standard",
6
  "submission_time": "2025-11-24T19:56:00.092916",
7
  "directory_name": "20251124_gpt_4o_mini_2024_07_18",
8
- "release_date": "2024-07-18"
9
- }
 
 
 
5
  "tool_usage": "standard",
6
  "submission_time": "2025-11-24T19:56:00.092916",
7
  "directory_name": "20251124_gpt_4o_mini_2024_07_18",
8
+ "release_date": "2024-07-18",
9
+ "agent_name": "OpenHands CodeAct",
10
+ "country": "us"
11
+ }
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json CHANGED
@@ -3,62 +3,80 @@
3
  "benchmark": "swe-bench",
4
  "score": 32.5,
5
  "metric": "resolve_rate",
6
- "total_cost": 26.25,
7
- "total_runtime": 462.5,
8
  "tags": [
9
  "swe-bench",
10
  "Bug Fixing"
11
- ]
 
 
 
 
 
12
  },
13
  {
14
  "benchmark": "swe-bench-multimodal",
15
  "score": 28.9,
16
  "metric": "resolve_rate",
17
- "total_cost": 24.45,
18
- "total_runtime": 444.5,
19
  "tags": [
20
  "swe-bench-multimodal",
21
  "Bug Fixing"
22
- ]
 
 
 
 
 
23
  },
24
  {
25
  "benchmark": "commit0",
26
  "score": 55.3,
27
  "metric": "test_pass_rate",
28
- "total_cost": 37.65,
29
- "total_runtime": 576.5,
30
  "tags": [
31
  "commit0"
32
- ]
 
 
 
 
 
33
  },
34
  {
35
  "benchmark": "multi-swe-bench",
36
  "score": 24.1,
37
  "metric": "resolve_rate",
38
- "total_cost": 22.05,
39
- "total_runtime": 420.5,
40
  "tags": [
41
  "multi-swe-bench"
42
- ]
 
 
 
 
 
43
  },
44
  {
45
  "benchmark": "swt-bench",
46
  "score": 47.8,
47
  "metric": "success_rate",
48
- "total_cost": 33.9,
49
- "total_runtime": 539.0,
50
  "tags": [
51
  "swt-bench"
52
- ]
 
 
 
 
 
53
  },
54
  {
55
  "benchmark": "gaia",
56
  "score": 42.1,
57
  "metric": "accuracy",
58
- "total_cost": 31.05,
59
- "total_runtime": 510.5,
60
  "tags": [
61
  "gaia"
62
- ]
 
 
 
 
 
63
  }
64
  ]
 
3
  "benchmark": "swe-bench",
4
  "score": 32.5,
5
  "metric": "resolve_rate",
 
 
6
  "tags": [
7
  "swe-bench",
8
  "Bug Fixing"
9
+ ],
10
+ "cost_per_instance": 26.25,
11
+ "average_runtime": 462.5,
12
+ "full_archive": "",
13
+ "agent_version": "v1.0.0",
14
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
15
  },
16
  {
17
  "benchmark": "swe-bench-multimodal",
18
  "score": 28.9,
19
  "metric": "resolve_rate",
 
 
20
  "tags": [
21
  "swe-bench-multimodal",
22
  "Bug Fixing"
23
+ ],
24
+ "cost_per_instance": 24.45,
25
+ "average_runtime": 444.5,
26
+ "full_archive": "",
27
+ "agent_version": "v1.0.0",
28
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
29
  },
30
  {
31
  "benchmark": "commit0",
32
  "score": 55.3,
33
  "metric": "test_pass_rate",
 
 
34
  "tags": [
35
  "commit0"
36
+ ],
37
+ "cost_per_instance": 37.65,
38
+ "average_runtime": 576.5,
39
+ "full_archive": "",
40
+ "agent_version": "v1.0.0",
41
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
42
  },
43
  {
44
  "benchmark": "multi-swe-bench",
45
  "score": 24.1,
46
  "metric": "resolve_rate",
 
 
47
  "tags": [
48
  "multi-swe-bench"
49
+ ],
50
+ "cost_per_instance": 22.05,
51
+ "average_runtime": 420.5,
52
+ "full_archive": "",
53
+ "agent_version": "v1.0.0",
54
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
55
  },
56
  {
57
  "benchmark": "swt-bench",
58
  "score": 47.8,
59
  "metric": "success_rate",
 
 
60
  "tags": [
61
  "swt-bench"
62
+ ],
63
+ "cost_per_instance": 33.9,
64
+ "average_runtime": 539.0,
65
+ "full_archive": "",
66
+ "agent_version": "v1.0.0",
67
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
68
  },
69
  {
70
  "benchmark": "gaia",
71
  "score": 42.1,
72
  "metric": "accuracy",
 
 
73
  "tags": [
74
  "gaia"
75
+ ],
76
+ "cost_per_instance": 31.05,
77
+ "average_runtime": 510.5,
78
+ "full_archive": "",
79
+ "agent_version": "v1.0.0",
80
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
81
  }
82
  ]
mock_results/1.0.0-dev1/results/20251201_deepseek_v3/metadata.json CHANGED
@@ -6,5 +6,7 @@
6
  "submission_time": "2025-12-01T10:00:00.000000",
7
  "directory_name": "20251201_deepseek_v3",
8
  "release_date": "2025-12-01",
9
- "parameter_count_b": 685
10
- }
 
 
 
6
  "submission_time": "2025-12-01T10:00:00.000000",
7
  "directory_name": "20251201_deepseek_v3",
8
  "release_date": "2025-12-01",
9
+ "parameter_count_b": 685,
10
+ "agent_name": "OpenHands CodeAct",
11
+ "country": "cn"
12
+ }
mock_results/1.0.0-dev1/results/20251201_deepseek_v3/scores.json CHANGED
@@ -4,13 +4,19 @@
4
  "score": 42.5,
5
  "metric": "resolve_rate",
6
  "cost_per_instance": 0.15,
7
- "average_runtime": 180
 
 
 
8
  },
9
  {
10
  "benchmark": "gaia",
11
  "score": 55.0,
12
  "metric": "accuracy",
13
  "cost_per_instance": 0.08,
14
- "average_runtime": 60
 
 
 
15
  }
16
- ]
 
4
  "score": 42.5,
5
  "metric": "resolve_rate",
6
  "cost_per_instance": 0.15,
7
+ "average_runtime": 180,
8
+ "full_archive": "",
9
+ "agent_version": "v1.0.0",
10
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
11
  },
12
  {
13
  "benchmark": "gaia",
14
  "score": 55.0,
15
  "metric": "accuracy",
16
  "cost_per_instance": 0.08,
17
+ "average_runtime": 60,
18
+ "full_archive": "",
19
+ "agent_version": "v1.0.0",
20
+ "submission_time": "2025-11-24T19:56:00.092865+00:00"
21
  }
22
+ ]