Spaces:

OpenHands
/

openhands-index

Running

openhands openhands committed on Feb 1

Commit

f7e6b58

1 Parent(s): 68b25a7

Update mock data and docs to match new openhands-index-results format

- Update scores.json: rename total_cost -> cost_per_instance, total_runtime -> average_runtime
- Add new required fields to mock data: full_archive, agent_version, submission_time
- Add agent_name and country fields to metadata.json
- Update DATA_STRUCTURE.md documentation to reflect new schema

This ensures mock data works as a proper fallback when GitHub is unavailable
and matches the production format from openhands-index-results.

Co-authored-by: openhands <openhands@all-hands.dev>

Files changed (17) hide show

DATA_STRUCTURE.md +31 -12
mock_results/1.0.0-dev1/results/20250723_qwen3_coder/metadata.json +4 -2
mock_results/1.0.0-dev1/results/20250723_qwen3_coder/scores.json +9 -3
mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/metadata.json +4 -2
mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/scores.json +10 -4
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json +4 -2
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json +36 -18
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json +4 -2
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json +36 -18
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json +4 -2
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json +36 -18
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json +4 -2
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json +36 -18
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json +4 -2
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json +36 -18
mock_results/1.0.0-dev1/results/20251201_deepseek_v3/metadata.json +4 -2
mock_results/1.0.0-dev1/results/20251201_deepseek_v3/scores.json +9 -3

DATA_STRUCTURE.md CHANGED Viewed

@@ -21,25 +21,44 @@ openhands-index-results/
 ## File Formats
-### JSONL Files (test.jsonl, validation.jsonl, etc.)
-Each line in a JSONL file should be a JSON object representing one agent's results:
 ```json
 {
-  "Agent_Name": "OpenHands CodeAct v2.1",
-  "Llm_Base": "claude-3-5-sonnet-20241022",
-  "Openness": "closed_api_available",
-  "Tool_Usage": "standard",
-  "Score": 48.3,
-  "Metric": "resolve_rate",
-  "Submission_Time": "2025-11-24T19:56:00.092865",
-  "Tags": ["swe-bench"],
-  "Total_Cost": 34.15,
-  "Total_Runtime": 541.5
 }
 ```
 ### Configuration File (agenteval.json)
 The configuration file defines the benchmark structure:

 ## File Formats
+### Agent Directory Structure
+Each agent has its own directory containing two files:
+**metadata.json** - Agent and model information:
 ```json
 {
+  "agent_name": "OpenHands CodeAct",
+  "agent_version": "v1.8.3",
+  "model": "claude-4.5-opus",
+  "openness": "closed_api_available",
+  "country": "us",
+  "tool_usage": "standard",
+  "submission_time": "2026-01-27T01:24:15.735789+00:00",
+  "directory_name": "claude-4.5-opus",
+  "release_date": "2025-11-24",
+  "parameter_count_b": null,
+  "active_parameter_count_b": null
 }
 ```
+**scores.json** - Array of benchmark results:
+```json
+[
+  {
+    "benchmark": "swe-bench",
+    "score": 76.6,
+    "metric": "accuracy",
+    "cost_per_instance": 1.82,
+    "average_runtime": 325.0,
+    "full_archive": "https://results.eval.all-hands.dev/eval-21370451733-...",
+    "tags": ["swe-bench"],
+    "agent_version": "v1.8.3",
+    "submission_time": "2026-01-27T01:24:15.735789+00:00"
+  }
+]
+```
 ### Configuration File (agenteval.json)
 The configuration file defines the benchmark structure:

mock_results/1.0.0-dev1/results/20250723_qwen3_coder/metadata.json CHANGED Viewed

@@ -7,5 +7,7 @@
   "directory_name": "20250723_qwen3_coder",
   "release_date": "2025-07-23",
   "parameter_count_b": 480,
-  "active_parameter_count_b": 35
-}

   "directory_name": "20250723_qwen3_coder",
   "release_date": "2025-07-23",
   "parameter_count_b": 480,
+  "active_parameter_count_b": 35,
+  "agent_name": "OpenHands CodeAct",
+  "country": "cn"
+}

mock_results/1.0.0-dev1/results/20250723_qwen3_coder/scores.json CHANGED Viewed

@@ -4,13 +4,19 @@
     "score": 38.0,
     "metric": "resolve_rate",
     "cost_per_instance": 0.12,
-    "average_runtime": 150
   },
   {
     "benchmark": "gaia",
     "score": 48.0,
     "metric": "accuracy",
     "cost_per_instance": 0.06,
-    "average_runtime": 45
   }
-]

     "score": 38.0,
     "metric": "resolve_rate",
     "cost_per_instance": 0.12,
+    "average_runtime": 150,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "gaia",
     "score": 48.0,
     "metric": "accuracy",
     "cost_per_instance": 0.06,
+    "average_runtime": 45,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   }
+]

mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/metadata.json CHANGED Viewed

@@ -7,5 +7,7 @@
   "directory_name": "20251106_kimi_k2_thinking",
   "release_date": "2025-11-06",
   "parameter_count_b": 1000,
-  "active_parameter_count_b": 32
-}

   "directory_name": "20251106_kimi_k2_thinking",
   "release_date": "2025-11-06",
   "parameter_count_b": 1000,
+  "active_parameter_count_b": 32,
+  "agent_name": "OpenHands CodeAct",
+  "country": "cn"
+}

mock_results/1.0.0-dev1/results/20251106_kimi_k2_thinking/scores.json CHANGED Viewed

@@ -4,13 +4,19 @@
     "score": 45.0,
     "metric": "resolve_rate",
     "cost_per_instance": 0.18,
-    "average_runtime": 200
   },
   {
     "benchmark": "gaia",
     "score": 52.0,
     "metric": "accuracy",
-    "cost_per_instance": 0.10,
-    "average_runtime": 70
   }
-]

     "score": 45.0,
     "metric": "resolve_rate",
     "cost_per_instance": 0.18,
+    "average_runtime": 200,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "gaia",
     "score": 52.0,
     "metric": "accuracy",
+    "cost_per_instance": 0.1,
+    "average_runtime": 70,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   }
+]

mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json CHANGED Viewed

@@ -5,5 +5,7 @@
   "tool_usage": "standard",
   "submission_time": "2025-11-24T19:56:00.092865",
   "directory_name": "20251124_claude_3_5_sonnet_20241022",
-  "release_date": "2024-10-22"
-}

   "tool_usage": "standard",
   "submission_time": "2025-11-24T19:56:00.092865",
   "directory_name": "20251124_claude_3_5_sonnet_20241022",
+  "release_date": "2024-10-22",
+  "agent_name": "OpenHands CodeAct",
+  "country": "us"
+}

mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json CHANGED Viewed

@@ -3,62 +3,80 @@
     "benchmark": "swe-bench",
     "score": 48.3,
     "metric": "resolve_rate",
-    "total_cost": 34.15,
-    "total_runtime": 541.5,
     "tags": [
       "swe-bench",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 42.1,
     "metric": "resolve_rate",
-    "total_cost": 31.05,
-    "total_runtime": 510.5,
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "commit0",
     "score": 71.2,
     "metric": "test_pass_rate",
-    "total_cost": 45.6,
-    "total_runtime": 656.0,
     "tags": [
       "commit0"
-    ]
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 35.2,
     "metric": "resolve_rate",
-    "total_cost": 27.6,
-    "total_runtime": 476.0,
     "tags": [
       "multi-swe-bench"
-    ]
   },
   {
     "benchmark": "swt-bench",
     "score": 65.4,
     "metric": "success_rate",
-    "total_cost": 42.7,
-    "total_runtime": 627.0,
     "tags": [
       "swt-bench"
-    ]
   },
   {
     "benchmark": "gaia",
     "score": 58.7,
     "metric": "accuracy",
-    "total_cost": 39.35,
-    "total_runtime": 593.5,
     "tags": [
       "gaia"
-    ]
   }
 ]

     "benchmark": "swe-bench",
     "score": 48.3,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 34.15,
+    "average_runtime": 541.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 42.1,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 31.05,
+    "average_runtime": 510.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "commit0",
     "score": 71.2,
     "metric": "test_pass_rate",
     "tags": [
       "commit0"
+    ],
+    "cost_per_instance": 45.6,
+    "average_runtime": 656.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 35.2,
     "metric": "resolve_rate",
     "tags": [
       "multi-swe-bench"
+    ],
+    "cost_per_instance": 27.6,
+    "average_runtime": 476.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swt-bench",
     "score": 65.4,
     "metric": "success_rate",
     "tags": [
       "swt-bench"
+    ],
+    "cost_per_instance": 42.7,
+    "average_runtime": 627.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "gaia",
     "score": 58.7,
     "metric": "accuracy",
     "tags": [
       "gaia"
+    ],
+    "cost_per_instance": 39.35,
+    "average_runtime": 593.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   }
 ]

mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json CHANGED Viewed

@@ -5,5 +5,7 @@
   "tool_usage": "custom_interface",
   "submission_time": "2025-11-24T19:56:00.092922",
   "directory_name": "20251124_claude_3_opus_20240229",
-  "release_date": "2024-02-29"
-}

   "tool_usage": "custom_interface",
   "submission_time": "2025-11-24T19:56:00.092922",
   "directory_name": "20251124_claude_3_opus_20240229",
+  "release_date": "2024-02-29",
+  "agent_name": "OpenHands CodeAct",
+  "country": "us"
+}

mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json CHANGED Viewed

@@ -3,62 +3,80 @@
     "benchmark": "swe-bench",
     "score": 29.8,
     "metric": "resolve_rate",
-    "total_cost": 24.9,
-    "total_runtime": 449.0,
     "tags": [
       "swe-bench",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 25.7,
     "metric": "resolve_rate",
-    "total_cost": 22.85,
-    "total_runtime": 428.5,
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "commit0",
     "score": 52.1,
     "metric": "test_pass_rate",
-    "total_cost": 36.05,
-    "total_runtime": 560.5,
     "tags": [
       "commit0"
-    ]
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 21.5,
     "metric": "resolve_rate",
-    "total_cost": 20.75,
-    "total_runtime": 407.5,
     "tags": [
       "multi-swe-bench"
-    ]
   },
   {
     "benchmark": "swt-bench",
     "score": 44.2,
     "metric": "success_rate",
-    "total_cost": 32.1,
-    "total_runtime": 521.0,
     "tags": [
       "swt-bench"
-    ]
   },
   {
     "benchmark": "gaia",
     "score": 39.4,
     "metric": "accuracy",
-    "total_cost": 29.7,
-    "total_runtime": 497.0,
     "tags": [
       "gaia"
-    ]
   }
 ]

     "benchmark": "swe-bench",
     "score": 29.8,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 24.9,
+    "average_runtime": 449.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 25.7,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 22.85,
+    "average_runtime": 428.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "commit0",
     "score": 52.1,
     "metric": "test_pass_rate",
     "tags": [
       "commit0"
+    ],
+    "cost_per_instance": 36.05,
+    "average_runtime": 560.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 21.5,
     "metric": "resolve_rate",
     "tags": [
       "multi-swe-bench"
+    ],
+    "cost_per_instance": 20.75,
+    "average_runtime": 407.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swt-bench",
     "score": 44.2,
     "metric": "success_rate",
     "tags": [
       "swt-bench"
+    ],
+    "cost_per_instance": 32.1,
+    "average_runtime": 521.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "gaia",
     "score": 39.4,
     "metric": "accuracy",
     "tags": [
       "gaia"
+    ],
+    "cost_per_instance": 29.7,
+    "average_runtime": 497.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   }
 ]

mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json CHANGED Viewed

@@ -5,5 +5,7 @@
   "tool_usage": "standard",
   "submission_time": "2025-11-24T19:56:00.092908",
   "directory_name": "20251124_gpt_4_turbo_2024_04_09",
-  "release_date": "2024-04-09"
-}

   "tool_usage": "standard",
   "submission_time": "2025-11-24T19:56:00.092908",
   "directory_name": "20251124_gpt_4_turbo_2024_04_09",
+  "release_date": "2024-04-09",
+  "agent_name": "OpenHands CodeAct",
+  "country": "us"
+}

mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json CHANGED Viewed

@@ -3,62 +3,80 @@
     "benchmark": "swe-bench",
     "score": 38.7,
     "metric": "resolve_rate",
-    "total_cost": 29.35,
-    "total_runtime": 493.5,
     "tags": [
       "swe-bench",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 34.2,
     "metric": "resolve_rate",
-    "total_cost": 27.1,
-    "total_runtime": 471.0,
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "commit0",
     "score": 61.5,
     "metric": "test_pass_rate",
-    "total_cost": 40.75,
-    "total_runtime": 607.5,
     "tags": [
       "commit0"
-    ]
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 28.4,
     "metric": "resolve_rate",
-    "total_cost": 24.2,
-    "total_runtime": 442.0,
     "tags": [
       "multi-swe-bench"
-    ]
   },
   {
     "benchmark": "swt-bench",
     "score": 54.1,
     "metric": "success_rate",
-    "total_cost": 37.05,
-    "total_runtime": 570.5,
     "tags": [
       "swt-bench"
-    ]
   },
   {
     "benchmark": "gaia",
     "score": 48.3,
     "metric": "accuracy",
-    "total_cost": 34.15,
-    "total_runtime": 541.5,
     "tags": [
       "gaia"
-    ]
   }
 ]

     "benchmark": "swe-bench",
     "score": 38.7,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 29.35,
+    "average_runtime": 493.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 34.2,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 27.1,
+    "average_runtime": 471.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "commit0",
     "score": 61.5,
     "metric": "test_pass_rate",
     "tags": [
       "commit0"
+    ],
+    "cost_per_instance": 40.75,
+    "average_runtime": 607.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 28.4,
     "metric": "resolve_rate",
     "tags": [
       "multi-swe-bench"
+    ],
+    "cost_per_instance": 24.2,
+    "average_runtime": 442.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swt-bench",
     "score": 54.1,
     "metric": "success_rate",
     "tags": [
       "swt-bench"
+    ],
+    "cost_per_instance": 37.05,
+    "average_runtime": 570.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "gaia",
     "score": 48.3,
     "metric": "accuracy",
     "tags": [
       "gaia"
+    ],
+    "cost_per_instance": 34.15,
+    "average_runtime": 541.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   }
 ]

mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json CHANGED Viewed

@@ -5,5 +5,7 @@
   "tool_usage": "standard",
   "submission_time": "2025-11-24T19:56:00.092895",
   "directory_name": "20251124_gpt_4o_2024_11_20",
-  "release_date": "2024-11-20"
-}

   "tool_usage": "standard",
   "submission_time": "2025-11-24T19:56:00.092895",
   "directory_name": "20251124_gpt_4o_2024_11_20",
+  "release_date": "2024-11-20",
+  "agent_name": "OpenHands CodeAct",
+  "country": "us"
+}

mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json CHANGED Viewed

@@ -3,62 +3,80 @@
     "benchmark": "swe-bench",
     "score": 45.1,
     "metric": "resolve_rate",
-    "total_cost": 32.55,
-    "total_runtime": 525.5,
     "tags": [
       "swe-bench",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 39.5,
     "metric": "resolve_rate",
-    "total_cost": 29.75,
-    "total_runtime": 497.5,
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "commit0",
     "score": 68.9,
     "metric": "test_pass_rate",
-    "total_cost": 44.45,
-    "total_runtime": 644.5,
     "tags": [
       "commit0"
-    ]
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 32.8,
     "metric": "resolve_rate",
-    "total_cost": 26.4,
-    "total_runtime": 464.0,
     "tags": [
       "multi-swe-bench"
-    ]
   },
   {
     "benchmark": "swt-bench",
     "score": 62.3,
     "metric": "success_rate",
-    "total_cost": 41.15,
-    "total_runtime": 611.5,
     "tags": [
       "swt-bench"
-    ]
   },
   {
     "benchmark": "gaia",
     "score": 55.2,
     "metric": "accuracy",
-    "total_cost": 37.6,
-    "total_runtime": 576.0,
     "tags": [
       "gaia"
-    ]
   }
 ]

     "benchmark": "swe-bench",
     "score": 45.1,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 32.55,
+    "average_runtime": 525.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 39.5,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 29.75,
+    "average_runtime": 497.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "commit0",
     "score": 68.9,
     "metric": "test_pass_rate",
     "tags": [
       "commit0"
+    ],
+    "cost_per_instance": 44.45,
+    "average_runtime": 644.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 32.8,
     "metric": "resolve_rate",
     "tags": [
       "multi-swe-bench"
+    ],
+    "cost_per_instance": 26.4,
+    "average_runtime": 464.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swt-bench",
     "score": 62.3,
     "metric": "success_rate",
     "tags": [
       "swt-bench"
+    ],
+    "cost_per_instance": 41.15,
+    "average_runtime": 611.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "gaia",
     "score": 55.2,
     "metric": "accuracy",
     "tags": [
       "gaia"
+    ],
+    "cost_per_instance": 37.6,
+    "average_runtime": 576.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   }
 ]

mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json CHANGED Viewed

@@ -5,5 +5,7 @@
   "tool_usage": "standard",
   "submission_time": "2025-11-24T19:56:00.092916",
   "directory_name": "20251124_gpt_4o_mini_2024_07_18",
-  "release_date": "2024-07-18"
-}

   "tool_usage": "standard",
   "submission_time": "2025-11-24T19:56:00.092916",
   "directory_name": "20251124_gpt_4o_mini_2024_07_18",
+  "release_date": "2024-07-18",
+  "agent_name": "OpenHands CodeAct",
+  "country": "us"
+}

mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json CHANGED Viewed

@@ -3,62 +3,80 @@
     "benchmark": "swe-bench",
     "score": 32.5,
     "metric": "resolve_rate",
-    "total_cost": 26.25,
-    "total_runtime": 462.5,
     "tags": [
       "swe-bench",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 28.9,
     "metric": "resolve_rate",
-    "total_cost": 24.45,
-    "total_runtime": 444.5,
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
-    ]
   },
   {
     "benchmark": "commit0",
     "score": 55.3,
     "metric": "test_pass_rate",
-    "total_cost": 37.65,
-    "total_runtime": 576.5,
     "tags": [
       "commit0"
-    ]
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 24.1,
     "metric": "resolve_rate",
-    "total_cost": 22.05,
-    "total_runtime": 420.5,
     "tags": [
       "multi-swe-bench"
-    ]
   },
   {
     "benchmark": "swt-bench",
     "score": 47.8,
     "metric": "success_rate",
-    "total_cost": 33.9,
-    "total_runtime": 539.0,
     "tags": [
       "swt-bench"
-    ]
   },
   {
     "benchmark": "gaia",
     "score": 42.1,
     "metric": "accuracy",
-    "total_cost": 31.05,
-    "total_runtime": 510.5,
     "tags": [
       "gaia"
-    ]
   }
 ]

     "benchmark": "swe-bench",
     "score": 32.5,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 26.25,
+    "average_runtime": 462.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swe-bench-multimodal",
     "score": 28.9,
     "metric": "resolve_rate",
     "tags": [
       "swe-bench-multimodal",
       "Bug Fixing"
+    ],
+    "cost_per_instance": 24.45,
+    "average_runtime": 444.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "commit0",
     "score": 55.3,
     "metric": "test_pass_rate",
     "tags": [
       "commit0"
+    ],
+    "cost_per_instance": 37.65,
+    "average_runtime": 576.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "multi-swe-bench",
     "score": 24.1,
     "metric": "resolve_rate",
     "tags": [
       "multi-swe-bench"
+    ],
+    "cost_per_instance": 22.05,
+    "average_runtime": 420.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "swt-bench",
     "score": 47.8,
     "metric": "success_rate",
     "tags": [
       "swt-bench"
+    ],
+    "cost_per_instance": 33.9,
+    "average_runtime": 539.0,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "gaia",
     "score": 42.1,
     "metric": "accuracy",
     "tags": [
       "gaia"
+    ],
+    "cost_per_instance": 31.05,
+    "average_runtime": 510.5,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   }
 ]

mock_results/1.0.0-dev1/results/20251201_deepseek_v3/metadata.json CHANGED Viewed

@@ -6,5 +6,7 @@
   "submission_time": "2025-12-01T10:00:00.000000",
   "directory_name": "20251201_deepseek_v3",
   "release_date": "2025-12-01",
-  "parameter_count_b": 685
-}

   "submission_time": "2025-12-01T10:00:00.000000",
   "directory_name": "20251201_deepseek_v3",
   "release_date": "2025-12-01",
+  "parameter_count_b": 685,
+  "agent_name": "OpenHands CodeAct",
+  "country": "cn"
+}

mock_results/1.0.0-dev1/results/20251201_deepseek_v3/scores.json CHANGED Viewed

@@ -4,13 +4,19 @@
     "score": 42.5,
     "metric": "resolve_rate",
     "cost_per_instance": 0.15,
-    "average_runtime": 180
   },
   {
     "benchmark": "gaia",
     "score": 55.0,
     "metric": "accuracy",
     "cost_per_instance": 0.08,
-    "average_runtime": 60
   }
-]

     "score": 42.5,
     "metric": "resolve_rate",
     "cost_per_instance": 0.15,
+    "average_runtime": 180,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   },
   {
     "benchmark": "gaia",
     "score": 55.0,
     "metric": "accuracy",
     "cost_per_instance": 0.08,
+    "average_runtime": 60,
+    "full_archive": "",
+    "agent_version": "v1.0.0",
+    "submission_time": "2025-11-24T19:56:00.092865+00:00"
   }
+]