File size: 7,872 Bytes
085a012
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
{
  "suite_config": {
    "name": "openhands-index",
    "version": "1.0.0-dev1",
    "splits": [
      {
        "name": "validation",
        "tasks": [
          {
            "name": "swe-bench",
            "path": "openhands/swe-bench",
            "primary_metric": "resolved/mean",
            "tags": [
              "swe-bench"
            ]
          },
          {
            "name": "multi-swe-bench",
            "path": "openhands/multi-swe-bench",
            "primary_metric": "resolved/mean",
            "tags": [
              "multi-swe-bench"
            ]
          },
          {
            "name": "swe-bench-multimodal",
            "path": "openhands/swe-bench-multimodal",
            "primary_metric": "resolved/mean",
            "tags": [
              "swe-bench-multimodal"
            ]
          },
          {
            "name": "swt-bench",
            "path": "openhands/swt-bench",
            "primary_metric": "generated/mean",
            "tags": [
              "swt-bench"
            ]
          },
          {
            "name": "commit0",
            "path": "openhands/commit0",
            "primary_metric": "tests_passed/mean",
            "tags": [
              "commit0"
            ]
          },
          {
            "name": "gaia",
            "path": "openhands/gaia",
            "primary_metric": "correct/mean",
            "tags": [
              "gaia"
            ]
          }
        ]
      },
      {
        "name": "test",
        "tasks": [
          {
            "name": "swe-bench",
            "path": "openhands/swe-bench",
            "primary_metric": "resolved/mean",
            "tags": [
              "swe-bench"
            ]
          },
          {
            "name": "multi-swe-bench",
            "path": "openhands/multi-swe-bench",
            "primary_metric": "resolved/mean",
            "tags": [
              "multi-swe-bench"
            ]
          },
          {
            "name": "arxivdigestables_test",
            "path": "astabench/arxivdigestables_test",
            "primary_metric": "score_tables/mean",
            "tags": [
              "lit"
            ]
          },
          {
            "name": "litqa2_test",
            "path": "astabench/litqa2_test",
            "primary_metric": "is_correct/accuracy",
            "tags": [
              "lit"
            ]
          },
          {
            "name": "discoverybench_test",
            "path": "astabench/discoverybench_test",
            "primary_metric": "score_discoverybench/mean",
            "tags": [
              "data"
            ]
          },
          {
            "name": "core_bench_test",
            "path": "astabench/core_bench_test",
            "primary_metric": "evaluate_task_questions/accuracy",
            "tags": [
              "code"
            ]
          },
          {
            "name": "ds1000_test",
            "path": "astabench/ds1000_test",
            "primary_metric": "ds1000_scorer/accuracy",
            "tags": [
              "code"
            ]
          },
          {
            "name": "e2e_discovery_test",
            "path": "astabench/e2e_discovery_test",
            "primary_metric": "score_rubric/accuracy",
            "tags": [
              "discovery"
            ]
          },
          {
            "name": "super_test",
            "path": "astabench/super_test",
            "primary_metric": "check_super_execution/entrypoints",
            "tags": [
              "code"
            ]
          }
        ]
      }
    ]
  },
  "split": "validation",
  "results": [
    {
      "task_name": "sqa_dev",
      "metrics": [
        {
          "name": "global_avg/mean",
          "value": 0.6215245045241414
        },
        {
          "name": "global_avg/stderr",
          "value": 0.02088486499225903
        },
        {
          "name": "ingredient_recall/mean",
          "value": 0.6029178145087237
        },
        {
          "name": "ingredient_recall/stderr",
          "value": 0.026215888361291618
        },
        {
          "name": "answer_precision/mean",
          "value": 0.7960436785436785
        },
        {
          "name": "answer_precision/stderr",
          "value": 0.027692773517249983
        },
        {
          "name": "citation_precision/mean",
          "value": 0.697849041353826
        },
        {
          "name": "citation_precision/stderr",
          "value": 0.026784164936602798
        },
        {
          "name": "citation_recall/mean",
          "value": 0.3892874836903378
        },
        {
          "name": "citation_recall/stderr",
          "value": 0.015094770200171756
        }
      ],
      "model_costs": [
        1.3829150000000001,
        0.9759700000000001,
        2.2324650000000004,
        0.76631,
        0.9277900000000001,
        2.6388600000000006,
        0.8114100000000002,
        2.3263174999999996,
        2.5423725,
        1.2398675000000001,
        1.7387300000000003,
        1.2176599999999997,
        0.564655,
        0.9726750000000001,
        0.7675700000000001,
        1.5198850000000002,
        1.4726625000000002,
        2.1937650000000004,
        0.6907700000000001,
        1.39835,
        1.2598175,
        2.5373550000000002,
        2.19239,
        1.2508875000000006,
        2.2650550000000007,
        1.6047725,
        0.6525125000000003,
        1.4262200000000003,
        1.0533299999999999,
        1.7252375,
        1.407145,
        1.5408700000000004,
        2.8073224999999993,
        1.0448125000000006,
        1.7037300000000004,
        0.8650500000000001,
        1.0171225000000002,
        0.5697925000000001,
        2.7851025,
        1.0551425,
        2.9213775,
        1.7772975000000004,
        1.2753225000000001,
        0.8108325000000001,
        0.6958375000000001,
        0.8840950000000003,
        1.2028724999999998,
        1.2490475000000003,
        2.4272,
        1.95026,
        1.5352475,
        2.11181,
        2.3612249999999997,
        1.8619225000000004,
        0.7431075000000001,
        1.5189675000000002,
        1.089575,
        1.6103700000000003,
        1.4201450000000002,
        2.397835,
        1.469175,
        1.0723550000000004,
        0.7964050000000003,
        3.3733175,
        4.197085,
        4.2637675,
        1.2982124999999998,
        0.66146,
        1.1130475000000002,
        2.4393974999999997,
        2.582,
        1.7381725000000001,
        0.415025,
        1.6777325,
        1.0507825000000002,
        2.4627125000000003,
        1.017005,
        1.9210250000000002,
        1.5009025000000003,
        0.8283125000000001,
        2.9854425,
        0.4633375000000001,
        0.397685,
        1.2803425,
        3.0388200000000003,
        1.2610875000000004,
        1.798365,
        3.427287500000001,
        0.29307750000000005,
        0.37101249999999997,
        2.8046925000000003,
        0.35557000000000005,
        3.5481700000000007,
        1.1073975,
        1.5280825,
        1.1714900000000001,
        3.1791275000000003,
        3.8214725000000005,
        1.8440275,
        1.730515,
        1.9350675000000002,
        1.6592125000000002,
        1.9227124999999998,
        1.202885,
        1.2688150000000002,
        0.8819875000000001,
        0.6989325,
        1.965635,
        1.7467800000000002,
        1.6940625000000002
      ]
    }
  ],
  "submission": {
    "submit_time": "2025-06-09T20:55:35.869831Z",
    "username": "miked-ai",
    "agent_name": "Basic ReAct",
    "agent_description": null,
    "agent_url": null,
    "logs_url": "hf://datasets/allenai/asta-bench-internal-submissions/1.0.0-dev1/validation/miked-ai_Basic_ReAct__task_tools__report_editor__2025-06-09T20-55-35",
    "logs_url_public": null,
    "summary_url": null
  }
}