clemsail commited on
Commit
0a11b0a
·
verified ·
1 Parent(s): 75c22a0

chore: upload lm-eval-harness results

Browse files
evals/results_2026-04-16T03-36-07.968866.json ADDED
@@ -0,0 +1,1246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "mmlu_pro": {
4
+ "exact_match,custom-extract": 0.6191458026509573,
5
+ "exact_match_stderr,custom-extract": 0.005790896435584423,
6
+ "alias": "mmlu_pro"
7
+ },
8
+ "mmlu_pro_biology": {
9
+ "alias": " - biology",
10
+ "exact_match,custom-extract": 0.768,
11
+ "exact_match_stderr,custom-extract": 0.01889619359195203
12
+ },
13
+ "mmlu_pro_business": {
14
+ "alias": " - business",
15
+ "exact_match,custom-extract": 0.66,
16
+ "exact_match_stderr,custom-extract": 0.021206117013673063
17
+ },
18
+ "mmlu_pro_chemistry": {
19
+ "alias": " - chemistry",
20
+ "exact_match,custom-extract": 0.58,
21
+ "exact_match_stderr,custom-extract": 0.02209471322976178
22
+ },
23
+ "mmlu_pro_computer_science": {
24
+ "alias": " - computer_science",
25
+ "exact_match,custom-extract": 0.675609756097561,
26
+ "exact_match_stderr,custom-extract": 0.02314835821240817
27
+ },
28
+ "mmlu_pro_economics": {
29
+ "alias": " - economics",
30
+ "exact_match,custom-extract": 0.678,
31
+ "exact_match_stderr,custom-extract": 0.020916668330019882
32
+ },
33
+ "mmlu_pro_engineering": {
34
+ "alias": " - engineering",
35
+ "exact_match,custom-extract": 0.448,
36
+ "exact_match_stderr,custom-extract": 0.022261697292270143
37
+ },
38
+ "mmlu_pro_health": {
39
+ "alias": " - health",
40
+ "exact_match,custom-extract": 0.678,
41
+ "exact_match_stderr,custom-extract": 0.020916668330019882
42
+ },
43
+ "mmlu_pro_history": {
44
+ "alias": " - history",
45
+ "exact_match,custom-extract": 0.5748031496062992,
46
+ "exact_match_stderr,custom-extract": 0.025360790748556062
47
+ },
48
+ "mmlu_pro_law": {
49
+ "alias": " - law",
50
+ "exact_match,custom-extract": 0.432,
51
+ "exact_match_stderr,custom-extract": 0.022175109265613165
52
+ },
53
+ "mmlu_pro_math": {
54
+ "alias": " - math",
55
+ "exact_match,custom-extract": 0.678,
56
+ "exact_match_stderr,custom-extract": 0.020916668330019886
57
+ },
58
+ "mmlu_pro_other": {
59
+ "alias": " - other",
60
+ "exact_match,custom-extract": 0.612,
61
+ "exact_match_stderr,custom-extract": 0.021814300984787635
62
+ },
63
+ "mmlu_pro_philosophy": {
64
+ "alias": " - philosophy",
65
+ "exact_match,custom-extract": 0.5490981963927856,
66
+ "exact_match_stderr,custom-extract": 0.022297251037679492
67
+ },
68
+ "mmlu_pro_physics": {
69
+ "alias": " - physics",
70
+ "exact_match,custom-extract": 0.63,
71
+ "exact_match_stderr,custom-extract": 0.021613289165165788
72
+ },
73
+ "mmlu_pro_psychology": {
74
+ "alias": " - psychology",
75
+ "exact_match,custom-extract": 0.704,
76
+ "exact_match_stderr,custom-extract": 0.020435342091896135
77
+ }
78
+ },
79
+ "groups": {
80
+ "mmlu_pro": {
81
+ "exact_match,custom-extract": 0.6191458026509573,
82
+ "exact_match_stderr,custom-extract": 0.005790896435584423,
83
+ "alias": "mmlu_pro"
84
+ }
85
+ },
86
+ "group_subtasks": {
87
+ "mmlu_pro": [
88
+ "mmlu_pro_biology",
89
+ "mmlu_pro_business",
90
+ "mmlu_pro_chemistry",
91
+ "mmlu_pro_computer_science",
92
+ "mmlu_pro_economics",
93
+ "mmlu_pro_engineering",
94
+ "mmlu_pro_health",
95
+ "mmlu_pro_history",
96
+ "mmlu_pro_law",
97
+ "mmlu_pro_math",
98
+ "mmlu_pro_other",
99
+ "mmlu_pro_philosophy",
100
+ "mmlu_pro_physics",
101
+ "mmlu_pro_psychology"
102
+ ]
103
+ },
104
+ "configs": {
105
+ "mmlu_pro_biology": {
106
+ "task": "mmlu_pro_biology",
107
+ "task_alias": "biology",
108
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
109
+ "test_split": "test",
110
+ "fewshot_split": "validation",
111
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c5800>, subject='biology')",
112
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c6840>, including_answer=False)",
113
+ "doc_to_target": "answer",
114
+ "unsafe_code": false,
115
+ "description": "The following are multiple choice questions (with answers) about biology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
116
+ "target_delimiter": " ",
117
+ "fewshot_delimiter": "\n\n",
118
+ "fewshot_config": {
119
+ "sampler": "first_n",
120
+ "split": "validation",
121
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c5800>, subject='biology')",
122
+ "fewshot_indices": null,
123
+ "samples": null,
124
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c42c0>, including_answer=True)",
125
+ "doc_to_choice": null,
126
+ "doc_to_target": "",
127
+ "gen_prefix": null,
128
+ "fewshot_delimiter": "\n\n",
129
+ "target_delimiter": " "
130
+ },
131
+ "num_fewshot": 0,
132
+ "metric_list": [
133
+ {
134
+ "metric": "exact_match",
135
+ "aggregation": "mean",
136
+ "higher_is_better": true,
137
+ "ignore_case": true,
138
+ "ignore_punctuation": true
139
+ }
140
+ ],
141
+ "output_type": "generate_until",
142
+ "generation_kwargs": {
143
+ "until": [
144
+ "Question:"
145
+ ],
146
+ "max_gen_toks": 2048,
147
+ "do_sample": false,
148
+ "temperature": 0.0
149
+ },
150
+ "repeats": 1,
151
+ "filter_list": [
152
+ {
153
+ "name": "custom-extract",
154
+ "filter": [
155
+ {
156
+ "function": "regex",
157
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
158
+ },
159
+ {
160
+ "function": "take_first"
161
+ }
162
+ ]
163
+ }
164
+ ],
165
+ "should_decontaminate": false,
166
+ "metadata": {
167
+ "version": 3.0,
168
+ "base_url": "http://localhost:8000/v1/chat/completions",
169
+ "model": "devstral",
170
+ "num_concurrent": 1
171
+ }
172
+ },
173
+ "mmlu_pro_business": {
174
+ "task": "mmlu_pro_business",
175
+ "task_alias": "business",
176
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
177
+ "test_split": "test",
178
+ "fewshot_split": "validation",
179
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c5d00>, subject='business')",
180
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c6c00>, including_answer=False)",
181
+ "doc_to_target": "answer",
182
+ "unsafe_code": false,
183
+ "description": "The following are multiple choice questions (with answers) about business. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
184
+ "target_delimiter": " ",
185
+ "fewshot_delimiter": "\n\n",
186
+ "fewshot_config": {
187
+ "sampler": "first_n",
188
+ "split": "validation",
189
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c5d00>, subject='business')",
190
+ "fewshot_indices": null,
191
+ "samples": null,
192
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c6660>, including_answer=True)",
193
+ "doc_to_choice": null,
194
+ "doc_to_target": "",
195
+ "gen_prefix": null,
196
+ "fewshot_delimiter": "\n\n",
197
+ "target_delimiter": " "
198
+ },
199
+ "num_fewshot": 0,
200
+ "metric_list": [
201
+ {
202
+ "metric": "exact_match",
203
+ "aggregation": "mean",
204
+ "higher_is_better": true,
205
+ "ignore_case": true,
206
+ "ignore_punctuation": true
207
+ }
208
+ ],
209
+ "output_type": "generate_until",
210
+ "generation_kwargs": {
211
+ "until": [
212
+ "Question:"
213
+ ],
214
+ "max_gen_toks": 2048,
215
+ "do_sample": false,
216
+ "temperature": 0.0
217
+ },
218
+ "repeats": 1,
219
+ "filter_list": [
220
+ {
221
+ "name": "custom-extract",
222
+ "filter": [
223
+ {
224
+ "function": "regex",
225
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
226
+ },
227
+ {
228
+ "function": "take_first"
229
+ }
230
+ ]
231
+ }
232
+ ],
233
+ "should_decontaminate": false,
234
+ "metadata": {
235
+ "version": 3.0,
236
+ "base_url": "http://localhost:8000/v1/chat/completions",
237
+ "model": "devstral",
238
+ "num_concurrent": 1
239
+ }
240
+ },
241
+ "mmlu_pro_chemistry": {
242
+ "task": "mmlu_pro_chemistry",
243
+ "task_alias": "chemistry",
244
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
245
+ "test_split": "test",
246
+ "fewshot_split": "validation",
247
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f897eb60>, subject='chemistry')",
248
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f897ef20>, including_answer=False)",
249
+ "doc_to_target": "answer",
250
+ "unsafe_code": false,
251
+ "description": "The following are multiple choice questions (with answers) about chemistry. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
252
+ "target_delimiter": " ",
253
+ "fewshot_delimiter": "\n\n",
254
+ "fewshot_config": {
255
+ "sampler": "first_n",
256
+ "split": "validation",
257
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f897eb60>, subject='chemistry')",
258
+ "fewshot_indices": null,
259
+ "samples": null,
260
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f897ea20>, including_answer=True)",
261
+ "doc_to_choice": null,
262
+ "doc_to_target": "",
263
+ "gen_prefix": null,
264
+ "fewshot_delimiter": "\n\n",
265
+ "target_delimiter": " "
266
+ },
267
+ "num_fewshot": 0,
268
+ "metric_list": [
269
+ {
270
+ "metric": "exact_match",
271
+ "aggregation": "mean",
272
+ "higher_is_better": true,
273
+ "ignore_case": true,
274
+ "ignore_punctuation": true
275
+ }
276
+ ],
277
+ "output_type": "generate_until",
278
+ "generation_kwargs": {
279
+ "until": [
280
+ "Question:"
281
+ ],
282
+ "max_gen_toks": 2048,
283
+ "do_sample": false,
284
+ "temperature": 0.0
285
+ },
286
+ "repeats": 1,
287
+ "filter_list": [
288
+ {
289
+ "name": "custom-extract",
290
+ "filter": [
291
+ {
292
+ "function": "regex",
293
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
294
+ },
295
+ {
296
+ "function": "take_first"
297
+ }
298
+ ]
299
+ }
300
+ ],
301
+ "should_decontaminate": false,
302
+ "metadata": {
303
+ "version": 3.0,
304
+ "base_url": "http://localhost:8000/v1/chat/completions",
305
+ "model": "devstral",
306
+ "num_concurrent": 1
307
+ }
308
+ },
309
+ "mmlu_pro_computer_science": {
310
+ "task": "mmlu_pro_computer_science",
311
+ "task_alias": "computer_science",
312
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
313
+ "test_split": "test",
314
+ "fewshot_split": "validation",
315
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f897e020>, subject='computer science')",
316
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f897e2a0>, including_answer=False)",
317
+ "doc_to_target": "answer",
318
+ "unsafe_code": false,
319
+ "description": "The following are multiple choice questions (with answers) about computer science. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
320
+ "target_delimiter": " ",
321
+ "fewshot_delimiter": "\n\n",
322
+ "fewshot_config": {
323
+ "sampler": "first_n",
324
+ "split": "validation",
325
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f897e020>, subject='computer science')",
326
+ "fewshot_indices": null,
327
+ "samples": null,
328
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f897e3e0>, including_answer=True)",
329
+ "doc_to_choice": null,
330
+ "doc_to_target": "",
331
+ "gen_prefix": null,
332
+ "fewshot_delimiter": "\n\n",
333
+ "target_delimiter": " "
334
+ },
335
+ "num_fewshot": 0,
336
+ "metric_list": [
337
+ {
338
+ "metric": "exact_match",
339
+ "aggregation": "mean",
340
+ "higher_is_better": true,
341
+ "ignore_case": true,
342
+ "ignore_punctuation": true
343
+ }
344
+ ],
345
+ "output_type": "generate_until",
346
+ "generation_kwargs": {
347
+ "until": [
348
+ "Question:"
349
+ ],
350
+ "max_gen_toks": 2048,
351
+ "do_sample": false,
352
+ "temperature": 0.0
353
+ },
354
+ "repeats": 1,
355
+ "filter_list": [
356
+ {
357
+ "name": "custom-extract",
358
+ "filter": [
359
+ {
360
+ "function": "regex",
361
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
362
+ },
363
+ {
364
+ "function": "take_first"
365
+ }
366
+ ]
367
+ }
368
+ ],
369
+ "should_decontaminate": false,
370
+ "metadata": {
371
+ "version": 3.0,
372
+ "base_url": "http://localhost:8000/v1/chat/completions",
373
+ "model": "devstral",
374
+ "num_concurrent": 1
375
+ }
376
+ },
377
+ "mmlu_pro_economics": {
378
+ "task": "mmlu_pro_economics",
379
+ "task_alias": "economics",
380
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
381
+ "test_split": "test",
382
+ "fewshot_split": "validation",
383
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c7420>, subject='economics')",
384
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c79c0>, including_answer=False)",
385
+ "doc_to_target": "answer",
386
+ "unsafe_code": false,
387
+ "description": "The following are multiple choice questions (with answers) about economics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
388
+ "target_delimiter": " ",
389
+ "fewshot_delimiter": "\n\n",
390
+ "fewshot_config": {
391
+ "sampler": "first_n",
392
+ "split": "validation",
393
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c7420>, subject='economics')",
394
+ "fewshot_indices": null,
395
+ "samples": null,
396
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c44a0>, including_answer=True)",
397
+ "doc_to_choice": null,
398
+ "doc_to_target": "",
399
+ "gen_prefix": null,
400
+ "fewshot_delimiter": "\n\n",
401
+ "target_delimiter": " "
402
+ },
403
+ "num_fewshot": 0,
404
+ "metric_list": [
405
+ {
406
+ "metric": "exact_match",
407
+ "aggregation": "mean",
408
+ "higher_is_better": true,
409
+ "ignore_case": true,
410
+ "ignore_punctuation": true
411
+ }
412
+ ],
413
+ "output_type": "generate_until",
414
+ "generation_kwargs": {
415
+ "until": [
416
+ "Question:"
417
+ ],
418
+ "max_gen_toks": 2048,
419
+ "do_sample": false,
420
+ "temperature": 0.0
421
+ },
422
+ "repeats": 1,
423
+ "filter_list": [
424
+ {
425
+ "name": "custom-extract",
426
+ "filter": [
427
+ {
428
+ "function": "regex",
429
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
430
+ },
431
+ {
432
+ "function": "take_first"
433
+ }
434
+ ]
435
+ }
436
+ ],
437
+ "should_decontaminate": false,
438
+ "metadata": {
439
+ "version": 3.0,
440
+ "base_url": "http://localhost:8000/v1/chat/completions",
441
+ "model": "devstral",
442
+ "num_concurrent": 1
443
+ }
444
+ },
445
+ "mmlu_pro_engineering": {
446
+ "task": "mmlu_pro_engineering",
447
+ "task_alias": "engineering",
448
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
449
+ "test_split": "test",
450
+ "fewshot_split": "validation",
451
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c4f40>, subject='engineering')",
452
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c59e0>, including_answer=False)",
453
+ "doc_to_target": "answer",
454
+ "unsafe_code": false,
455
+ "description": "The following are multiple choice questions (with answers) about engineering. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
456
+ "target_delimiter": " ",
457
+ "fewshot_delimiter": "\n\n",
458
+ "fewshot_config": {
459
+ "sampler": "first_n",
460
+ "split": "validation",
461
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c4f40>, subject='engineering')",
462
+ "fewshot_indices": null,
463
+ "samples": null,
464
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c67a0>, including_answer=True)",
465
+ "doc_to_choice": null,
466
+ "doc_to_target": "",
467
+ "gen_prefix": null,
468
+ "fewshot_delimiter": "\n\n",
469
+ "target_delimiter": " "
470
+ },
471
+ "num_fewshot": 0,
472
+ "metric_list": [
473
+ {
474
+ "metric": "exact_match",
475
+ "aggregation": "mean",
476
+ "higher_is_better": true,
477
+ "ignore_case": true,
478
+ "ignore_punctuation": true
479
+ }
480
+ ],
481
+ "output_type": "generate_until",
482
+ "generation_kwargs": {
483
+ "until": [
484
+ "Question:"
485
+ ],
486
+ "max_gen_toks": 2048,
487
+ "do_sample": false,
488
+ "temperature": 0.0
489
+ },
490
+ "repeats": 1,
491
+ "filter_list": [
492
+ {
493
+ "name": "custom-extract",
494
+ "filter": [
495
+ {
496
+ "function": "regex",
497
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
498
+ },
499
+ {
500
+ "function": "take_first"
501
+ }
502
+ ]
503
+ }
504
+ ],
505
+ "should_decontaminate": false,
506
+ "metadata": {
507
+ "version": 3.0,
508
+ "base_url": "http://localhost:8000/v1/chat/completions",
509
+ "model": "devstral",
510
+ "num_concurrent": 1
511
+ }
512
+ },
513
+ "mmlu_pro_health": {
514
+ "task": "mmlu_pro_health",
515
+ "task_alias": "health",
516
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
517
+ "test_split": "test",
518
+ "fewshot_split": "validation",
519
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c5440>, subject='health')",
520
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c5940>, including_answer=False)",
521
+ "doc_to_target": "answer",
522
+ "unsafe_code": false,
523
+ "description": "The following are multiple choice questions (with answers) about health. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
524
+ "target_delimiter": " ",
525
+ "fewshot_delimiter": "\n\n",
526
+ "fewshot_config": {
527
+ "sampler": "first_n",
528
+ "split": "validation",
529
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c5440>, subject='health')",
530
+ "fewshot_indices": null,
531
+ "samples": null,
532
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c5620>, including_answer=True)",
533
+ "doc_to_choice": null,
534
+ "doc_to_target": "",
535
+ "gen_prefix": null,
536
+ "fewshot_delimiter": "\n\n",
537
+ "target_delimiter": " "
538
+ },
539
+ "num_fewshot": 0,
540
+ "metric_list": [
541
+ {
542
+ "metric": "exact_match",
543
+ "aggregation": "mean",
544
+ "higher_is_better": true,
545
+ "ignore_case": true,
546
+ "ignore_punctuation": true
547
+ }
548
+ ],
549
+ "output_type": "generate_until",
550
+ "generation_kwargs": {
551
+ "until": [
552
+ "Question:"
553
+ ],
554
+ "max_gen_toks": 2048,
555
+ "do_sample": false,
556
+ "temperature": 0.0
557
+ },
558
+ "repeats": 1,
559
+ "filter_list": [
560
+ {
561
+ "name": "custom-extract",
562
+ "filter": [
563
+ {
564
+ "function": "regex",
565
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
566
+ },
567
+ {
568
+ "function": "take_first"
569
+ }
570
+ ]
571
+ }
572
+ ],
573
+ "should_decontaminate": false,
574
+ "metadata": {
575
+ "version": 3.0,
576
+ "base_url": "http://localhost:8000/v1/chat/completions",
577
+ "model": "devstral",
578
+ "num_concurrent": 1
579
+ }
580
+ },
581
+ "mmlu_pro_history": {
582
+ "task": "mmlu_pro_history",
583
+ "task_alias": "history",
584
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
585
+ "test_split": "test",
586
+ "fewshot_split": "validation",
587
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c4a40>, subject='history')",
588
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c4180>, including_answer=False)",
589
+ "doc_to_target": "answer",
590
+ "unsafe_code": false,
591
+ "description": "The following are multiple choice questions (with answers) about history. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
592
+ "target_delimiter": " ",
593
+ "fewshot_delimiter": "\n\n",
594
+ "fewshot_config": {
595
+ "sampler": "first_n",
596
+ "split": "validation",
597
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f89c4a40>, subject='history')",
598
+ "fewshot_indices": null,
599
+ "samples": null,
600
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f89c4ae0>, including_answer=True)",
601
+ "doc_to_choice": null,
602
+ "doc_to_target": "",
603
+ "gen_prefix": null,
604
+ "fewshot_delimiter": "\n\n",
605
+ "target_delimiter": " "
606
+ },
607
+ "num_fewshot": 0,
608
+ "metric_list": [
609
+ {
610
+ "metric": "exact_match",
611
+ "aggregation": "mean",
612
+ "higher_is_better": true,
613
+ "ignore_case": true,
614
+ "ignore_punctuation": true
615
+ }
616
+ ],
617
+ "output_type": "generate_until",
618
+ "generation_kwargs": {
619
+ "until": [
620
+ "Question:"
621
+ ],
622
+ "max_gen_toks": 2048,
623
+ "do_sample": false,
624
+ "temperature": 0.0
625
+ },
626
+ "repeats": 1,
627
+ "filter_list": [
628
+ {
629
+ "name": "custom-extract",
630
+ "filter": [
631
+ {
632
+ "function": "regex",
633
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
634
+ },
635
+ {
636
+ "function": "take_first"
637
+ }
638
+ ]
639
+ }
640
+ ],
641
+ "should_decontaminate": false,
642
+ "metadata": {
643
+ "version": 3.0,
644
+ "base_url": "http://localhost:8000/v1/chat/completions",
645
+ "model": "devstral",
646
+ "num_concurrent": 1
647
+ }
648
+ },
649
+ "mmlu_pro_law": {
650
+ "task": "mmlu_pro_law",
651
+ "task_alias": "law",
652
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
653
+ "test_split": "test",
654
+ "fewshot_split": "validation",
655
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b971a0>, subject='law')",
656
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f9b96480>, including_answer=False)",
657
+ "doc_to_target": "answer",
658
+ "unsafe_code": false,
659
+ "description": "The following are multiple choice questions (with answers) about law. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
660
+ "target_delimiter": " ",
661
+ "fewshot_delimiter": "\n\n",
662
+ "fewshot_config": {
663
+ "sampler": "first_n",
664
+ "split": "validation",
665
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b971a0>, subject='law')",
666
+ "fewshot_indices": null,
667
+ "samples": null,
668
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef395cbc4a0>, including_answer=True)",
669
+ "doc_to_choice": null,
670
+ "doc_to_target": "",
671
+ "gen_prefix": null,
672
+ "fewshot_delimiter": "\n\n",
673
+ "target_delimiter": " "
674
+ },
675
+ "num_fewshot": 0,
676
+ "metric_list": [
677
+ {
678
+ "metric": "exact_match",
679
+ "aggregation": "mean",
680
+ "higher_is_better": true,
681
+ "ignore_case": true,
682
+ "ignore_punctuation": true
683
+ }
684
+ ],
685
+ "output_type": "generate_until",
686
+ "generation_kwargs": {
687
+ "until": [
688
+ "Question:"
689
+ ],
690
+ "max_gen_toks": 2048,
691
+ "do_sample": false,
692
+ "temperature": 0.0
693
+ },
694
+ "repeats": 1,
695
+ "filter_list": [
696
+ {
697
+ "name": "custom-extract",
698
+ "filter": [
699
+ {
700
+ "function": "regex",
701
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
702
+ },
703
+ {
704
+ "function": "take_first"
705
+ }
706
+ ]
707
+ }
708
+ ],
709
+ "should_decontaminate": false,
710
+ "metadata": {
711
+ "version": 3.0,
712
+ "base_url": "http://localhost:8000/v1/chat/completions",
713
+ "model": "devstral",
714
+ "num_concurrent": 1
715
+ }
716
+ },
717
+ "mmlu_pro_math": {
718
+ "task": "mmlu_pro_math",
719
+ "task_alias": "math",
720
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
721
+ "test_split": "test",
722
+ "fewshot_split": "validation",
723
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b953a0>, subject='math')",
724
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f9b95800>, including_answer=False)",
725
+ "doc_to_target": "answer",
726
+ "unsafe_code": false,
727
+ "description": "The following are multiple choice questions (with answers) about math. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
728
+ "target_delimiter": " ",
729
+ "fewshot_delimiter": "\n\n",
730
+ "fewshot_config": {
731
+ "sampler": "first_n",
732
+ "split": "validation",
733
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b953a0>, subject='math')",
734
+ "fewshot_indices": null,
735
+ "samples": null,
736
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f9b956c0>, including_answer=True)",
737
+ "doc_to_choice": null,
738
+ "doc_to_target": "",
739
+ "gen_prefix": null,
740
+ "fewshot_delimiter": "\n\n",
741
+ "target_delimiter": " "
742
+ },
743
+ "num_fewshot": 0,
744
+ "metric_list": [
745
+ {
746
+ "metric": "exact_match",
747
+ "aggregation": "mean",
748
+ "higher_is_better": true,
749
+ "ignore_case": true,
750
+ "ignore_punctuation": true
751
+ }
752
+ ],
753
+ "output_type": "generate_until",
754
+ "generation_kwargs": {
755
+ "until": [
756
+ "Question:"
757
+ ],
758
+ "max_gen_toks": 2048,
759
+ "do_sample": false,
760
+ "temperature": 0.0
761
+ },
762
+ "repeats": 1,
763
+ "filter_list": [
764
+ {
765
+ "name": "custom-extract",
766
+ "filter": [
767
+ {
768
+ "function": "regex",
769
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
770
+ },
771
+ {
772
+ "function": "take_first"
773
+ }
774
+ ]
775
+ }
776
+ ],
777
+ "should_decontaminate": false,
778
+ "metadata": {
779
+ "version": 3.0,
780
+ "base_url": "http://localhost:8000/v1/chat/completions",
781
+ "model": "devstral",
782
+ "num_concurrent": 1
783
+ }
784
+ },
785
+ "mmlu_pro_other": {
786
+ "task": "mmlu_pro_other",
787
+ "task_alias": "other",
788
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
789
+ "test_split": "test",
790
+ "fewshot_split": "validation",
791
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b95760>, subject='other')",
792
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f9b95940>, including_answer=False)",
793
+ "doc_to_target": "answer",
794
+ "unsafe_code": false,
795
+ "description": "The following are multiple choice questions (with answers) about other. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
796
+ "target_delimiter": " ",
797
+ "fewshot_delimiter": "\n\n",
798
+ "fewshot_config": {
799
+ "sampler": "first_n",
800
+ "split": "validation",
801
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b95760>, subject='other')",
802
+ "fewshot_indices": null,
803
+ "samples": null,
804
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f9b97060>, including_answer=True)",
805
+ "doc_to_choice": null,
806
+ "doc_to_target": "",
807
+ "gen_prefix": null,
808
+ "fewshot_delimiter": "\n\n",
809
+ "target_delimiter": " "
810
+ },
811
+ "num_fewshot": 0,
812
+ "metric_list": [
813
+ {
814
+ "metric": "exact_match",
815
+ "aggregation": "mean",
816
+ "higher_is_better": true,
817
+ "ignore_case": true,
818
+ "ignore_punctuation": true
819
+ }
820
+ ],
821
+ "output_type": "generate_until",
822
+ "generation_kwargs": {
823
+ "until": [
824
+ "Question:"
825
+ ],
826
+ "max_gen_toks": 2048,
827
+ "do_sample": false,
828
+ "temperature": 0.0
829
+ },
830
+ "repeats": 1,
831
+ "filter_list": [
832
+ {
833
+ "name": "custom-extract",
834
+ "filter": [
835
+ {
836
+ "function": "regex",
837
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
838
+ },
839
+ {
840
+ "function": "take_first"
841
+ }
842
+ ]
843
+ }
844
+ ],
845
+ "should_decontaminate": false,
846
+ "metadata": {
847
+ "version": 3.0,
848
+ "base_url": "http://localhost:8000/v1/chat/completions",
849
+ "model": "devstral",
850
+ "num_concurrent": 1
851
+ }
852
+ },
853
+ "mmlu_pro_philosophy": {
854
+ "task": "mmlu_pro_philosophy",
855
+ "task_alias": "philosophy",
856
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
857
+ "test_split": "test",
858
+ "fewshot_split": "validation",
859
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b97a60>, subject='philosophy')",
860
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f9b97240>, including_answer=False)",
861
+ "doc_to_target": "answer",
862
+ "unsafe_code": false,
863
+ "description": "The following are multiple choice questions (with answers) about philosophy. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
864
+ "target_delimiter": " ",
865
+ "fewshot_delimiter": "\n\n",
866
+ "fewshot_config": {
867
+ "sampler": "first_n",
868
+ "split": "validation",
869
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b97a60>, subject='philosophy')",
870
+ "fewshot_indices": null,
871
+ "samples": null,
872
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f9b95b20>, including_answer=True)",
873
+ "doc_to_choice": null,
874
+ "doc_to_target": "",
875
+ "gen_prefix": null,
876
+ "fewshot_delimiter": "\n\n",
877
+ "target_delimiter": " "
878
+ },
879
+ "num_fewshot": 0,
880
+ "metric_list": [
881
+ {
882
+ "metric": "exact_match",
883
+ "aggregation": "mean",
884
+ "higher_is_better": true,
885
+ "ignore_case": true,
886
+ "ignore_punctuation": true
887
+ }
888
+ ],
889
+ "output_type": "generate_until",
890
+ "generation_kwargs": {
891
+ "until": [
892
+ "Question:"
893
+ ],
894
+ "max_gen_toks": 2048,
895
+ "do_sample": false,
896
+ "temperature": 0.0
897
+ },
898
+ "repeats": 1,
899
+ "filter_list": [
900
+ {
901
+ "name": "custom-extract",
902
+ "filter": [
903
+ {
904
+ "function": "regex",
905
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
906
+ },
907
+ {
908
+ "function": "take_first"
909
+ }
910
+ ]
911
+ }
912
+ ],
913
+ "should_decontaminate": false,
914
+ "metadata": {
915
+ "version": 3.0,
916
+ "base_url": "http://localhost:8000/v1/chat/completions",
917
+ "model": "devstral",
918
+ "num_concurrent": 1
919
+ }
920
+ },
921
+ "mmlu_pro_physics": {
922
+ "task": "mmlu_pro_physics",
923
+ "task_alias": "physics",
924
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
925
+ "test_split": "test",
926
+ "fewshot_split": "validation",
927
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b96d40>, subject='physics')",
928
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f9b96de0>, including_answer=False)",
929
+ "doc_to_target": "answer",
930
+ "unsafe_code": false,
931
+ "description": "The following are multiple choice questions (with answers) about physics. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
932
+ "target_delimiter": " ",
933
+ "fewshot_delimiter": "\n\n",
934
+ "fewshot_config": {
935
+ "sampler": "first_n",
936
+ "split": "validation",
937
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2f9b96d40>, subject='physics')",
938
+ "fewshot_indices": null,
939
+ "samples": null,
940
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2f9b97920>, including_answer=True)",
941
+ "doc_to_choice": null,
942
+ "doc_to_target": "",
943
+ "gen_prefix": null,
944
+ "fewshot_delimiter": "\n\n",
945
+ "target_delimiter": " "
946
+ },
947
+ "num_fewshot": 0,
948
+ "metric_list": [
949
+ {
950
+ "metric": "exact_match",
951
+ "aggregation": "mean",
952
+ "higher_is_better": true,
953
+ "ignore_case": true,
954
+ "ignore_punctuation": true
955
+ }
956
+ ],
957
+ "output_type": "generate_until",
958
+ "generation_kwargs": {
959
+ "until": [
960
+ "Question:"
961
+ ],
962
+ "max_gen_toks": 2048,
963
+ "do_sample": false,
964
+ "temperature": 0.0
965
+ },
966
+ "repeats": 1,
967
+ "filter_list": [
968
+ {
969
+ "name": "custom-extract",
970
+ "filter": [
971
+ {
972
+ "function": "regex",
973
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
974
+ },
975
+ {
976
+ "function": "take_first"
977
+ }
978
+ ]
979
+ }
980
+ ],
981
+ "should_decontaminate": false,
982
+ "metadata": {
983
+ "version": 3.0,
984
+ "base_url": "http://localhost:8000/v1/chat/completions",
985
+ "model": "devstral",
986
+ "num_concurrent": 1
987
+ }
988
+ },
989
+ "mmlu_pro_psychology": {
990
+ "task": "mmlu_pro_psychology",
991
+ "task_alias": "psychology",
992
+ "dataset_path": "TIGER-Lab/MMLU-Pro",
993
+ "test_split": "test",
994
+ "fewshot_split": "validation",
995
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2fb5036a0>, subject='psychology')",
996
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2fb5037e0>, including_answer=False)",
997
+ "doc_to_target": "answer",
998
+ "unsafe_code": false,
999
+ "description": "The following are multiple choice questions (with answers) about psychology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n",
1000
+ "target_delimiter": " ",
1001
+ "fewshot_delimiter": "\n\n",
1002
+ "fewshot_config": {
1003
+ "sampler": "first_n",
1004
+ "split": "validation",
1005
+ "process_docs": "functools.partial(<function process_docs at 0x7ef2fb5036a0>, subject='psychology')",
1006
+ "fewshot_indices": null,
1007
+ "samples": null,
1008
+ "doc_to_text": "functools.partial(<function format_cot_example at 0x7ef2fb5039c0>, including_answer=True)",
1009
+ "doc_to_choice": null,
1010
+ "doc_to_target": "",
1011
+ "gen_prefix": null,
1012
+ "fewshot_delimiter": "\n\n",
1013
+ "target_delimiter": " "
1014
+ },
1015
+ "num_fewshot": 0,
1016
+ "metric_list": [
1017
+ {
1018
+ "metric": "exact_match",
1019
+ "aggregation": "mean",
1020
+ "higher_is_better": true,
1021
+ "ignore_case": true,
1022
+ "ignore_punctuation": true
1023
+ }
1024
+ ],
1025
+ "output_type": "generate_until",
1026
+ "generation_kwargs": {
1027
+ "until": [
1028
+ "Question:"
1029
+ ],
1030
+ "max_gen_toks": 2048,
1031
+ "do_sample": false,
1032
+ "temperature": 0.0
1033
+ },
1034
+ "repeats": 1,
1035
+ "filter_list": [
1036
+ {
1037
+ "name": "custom-extract",
1038
+ "filter": [
1039
+ {
1040
+ "function": "regex",
1041
+ "regex_pattern": "answer is \\(?([ABCDEFGHIJ])\\)?"
1042
+ },
1043
+ {
1044
+ "function": "take_first"
1045
+ }
1046
+ ]
1047
+ }
1048
+ ],
1049
+ "should_decontaminate": false,
1050
+ "metadata": {
1051
+ "version": 3.0,
1052
+ "base_url": "http://localhost:8000/v1/chat/completions",
1053
+ "model": "devstral",
1054
+ "num_concurrent": 1
1055
+ }
1056
+ }
1057
+ },
1058
+ "versions": {
1059
+ "mmlu_pro": 2.0,
1060
+ "mmlu_pro_biology": 3.0,
1061
+ "mmlu_pro_business": 3.0,
1062
+ "mmlu_pro_chemistry": 3.0,
1063
+ "mmlu_pro_computer_science": 3.0,
1064
+ "mmlu_pro_economics": 3.0,
1065
+ "mmlu_pro_engineering": 3.0,
1066
+ "mmlu_pro_health": 3.0,
1067
+ "mmlu_pro_history": 3.0,
1068
+ "mmlu_pro_law": 3.0,
1069
+ "mmlu_pro_math": 3.0,
1070
+ "mmlu_pro_other": 3.0,
1071
+ "mmlu_pro_philosophy": 3.0,
1072
+ "mmlu_pro_physics": 3.0,
1073
+ "mmlu_pro_psychology": 3.0
1074
+ },
1075
+ "n-shot": {
1076
+ "mmlu_pro_biology": 0,
1077
+ "mmlu_pro_business": 0,
1078
+ "mmlu_pro_chemistry": 0,
1079
+ "mmlu_pro_computer_science": 0,
1080
+ "mmlu_pro_economics": 0,
1081
+ "mmlu_pro_engineering": 0,
1082
+ "mmlu_pro_health": 0,
1083
+ "mmlu_pro_history": 0,
1084
+ "mmlu_pro_law": 0,
1085
+ "mmlu_pro_math": 0,
1086
+ "mmlu_pro_other": 0,
1087
+ "mmlu_pro_philosophy": 0,
1088
+ "mmlu_pro_physics": 0,
1089
+ "mmlu_pro_psychology": 0
1090
+ },
1091
+ "higher_is_better": {
1092
+ "mmlu_pro": {
1093
+ "exact_match": true
1094
+ },
1095
+ "mmlu_pro_biology": {
1096
+ "exact_match": true
1097
+ },
1098
+ "mmlu_pro_business": {
1099
+ "exact_match": true
1100
+ },
1101
+ "mmlu_pro_chemistry": {
1102
+ "exact_match": true
1103
+ },
1104
+ "mmlu_pro_computer_science": {
1105
+ "exact_match": true
1106
+ },
1107
+ "mmlu_pro_economics": {
1108
+ "exact_match": true
1109
+ },
1110
+ "mmlu_pro_engineering": {
1111
+ "exact_match": true
1112
+ },
1113
+ "mmlu_pro_health": {
1114
+ "exact_match": true
1115
+ },
1116
+ "mmlu_pro_history": {
1117
+ "exact_match": true
1118
+ },
1119
+ "mmlu_pro_law": {
1120
+ "exact_match": true
1121
+ },
1122
+ "mmlu_pro_math": {
1123
+ "exact_match": true
1124
+ },
1125
+ "mmlu_pro_other": {
1126
+ "exact_match": true
1127
+ },
1128
+ "mmlu_pro_philosophy": {
1129
+ "exact_match": true
1130
+ },
1131
+ "mmlu_pro_physics": {
1132
+ "exact_match": true
1133
+ },
1134
+ "mmlu_pro_psychology": {
1135
+ "exact_match": true
1136
+ }
1137
+ },
1138
+ "n-samples": {
1139
+ "mmlu_pro_biology": {
1140
+ "original": 717,
1141
+ "effective": 500
1142
+ },
1143
+ "mmlu_pro_business": {
1144
+ "original": 789,
1145
+ "effective": 500
1146
+ },
1147
+ "mmlu_pro_chemistry": {
1148
+ "original": 1132,
1149
+ "effective": 500
1150
+ },
1151
+ "mmlu_pro_computer_science": {
1152
+ "original": 410,
1153
+ "effective": 410
1154
+ },
1155
+ "mmlu_pro_economics": {
1156
+ "original": 844,
1157
+ "effective": 500
1158
+ },
1159
+ "mmlu_pro_engineering": {
1160
+ "original": 969,
1161
+ "effective": 500
1162
+ },
1163
+ "mmlu_pro_health": {
1164
+ "original": 818,
1165
+ "effective": 500
1166
+ },
1167
+ "mmlu_pro_history": {
1168
+ "original": 381,
1169
+ "effective": 381
1170
+ },
1171
+ "mmlu_pro_law": {
1172
+ "original": 1101,
1173
+ "effective": 500
1174
+ },
1175
+ "mmlu_pro_math": {
1176
+ "original": 1351,
1177
+ "effective": 500
1178
+ },
1179
+ "mmlu_pro_other": {
1180
+ "original": 924,
1181
+ "effective": 500
1182
+ },
1183
+ "mmlu_pro_philosophy": {
1184
+ "original": 499,
1185
+ "effective": 499
1186
+ },
1187
+ "mmlu_pro_physics": {
1188
+ "original": 1299,
1189
+ "effective": 500
1190
+ },
1191
+ "mmlu_pro_psychology": {
1192
+ "original": 798,
1193
+ "effective": 500
1194
+ }
1195
+ },
1196
+ "config": {
1197
+ "model": "local-chat-completions",
1198
+ "model_args": {
1199
+ "base_url": "http://localhost:8000/v1/chat/completions",
1200
+ "model": "devstral",
1201
+ "num_concurrent": 1
1202
+ },
1203
+ "batch_size": "1",
1204
+ "batch_sizes": [],
1205
+ "device": "cuda:0",
1206
+ "use_cache": null,
1207
+ "limit": 500.0,
1208
+ "bootstrap_iters": 100000,
1209
+ "gen_kwargs": {},
1210
+ "random_seed": 0,
1211
+ "numpy_seed": 1234,
1212
+ "torch_seed": 1234,
1213
+ "fewshot_seed": 1234
1214
+ },
1215
+ "git_hash": null,
1216
+ "date": 1776271483.5504663,
1217
+ "pretty_env_info": "N/A (torch not installed)",
1218
+ "transformers_version": "N/A",
1219
+ "lm_eval_version": "0.4.11",
1220
+ "upper_git_hash": null,
1221
+ "task_hashes": {
1222
+ "mmlu_pro_biology": "3c2112b732af5d17ee489a18a4a28b281d708ab9460d58e911fd8fcc3006287e",
1223
+ "mmlu_pro_business": "6ab127402399858d74b61427ce3d78091524aee695ff2b5de80a0ab2e9fdad52",
1224
+ "mmlu_pro_chemistry": "ec5f94016ce915cded9c6de70bbe70a5994765369d1b6c6663700c070913da39",
1225
+ "mmlu_pro_computer_science": "ae64c4e0c8e51f9bd391df78eaaa85d84c5cfa3563c5f8bedaa534b7dbe3a4f2",
1226
+ "mmlu_pro_economics": "be404104f5091690230edc2e564794f66a0eb810465e70b34256d3c4cc609b42",
1227
+ "mmlu_pro_engineering": "2f6391732a670b086c854a713daf1362bb640faa2c60fd0df8e13f626da243b0",
1228
+ "mmlu_pro_health": "ae8a2bfdcccb38c83a38bd197a4213d9465e0eeabb63448054627e2a18eab4ca",
1229
+ "mmlu_pro_history": "7d97d49dfe0d7307375fed90fce653921f1152f4d2b2a3e39284f134165b55fa",
1230
+ "mmlu_pro_law": "8ac74781703cd92166006eb3c3d140916860d48cd197d71066c0c5e376651a14",
1231
+ "mmlu_pro_math": "6a02748f6ed0d3648b1cb0443fbe48ba7e3619a8069b9ad125fbe20e2a9ebf83",
1232
+ "mmlu_pro_other": "27c2300b7edd3ccbe40e2b0b389f1b780a8d008c4c656d4be3e836bcbabca914",
1233
+ "mmlu_pro_philosophy": "5445259aa2fef05844ce456d626ed7bb89ff73beba9143a1f0b435d341313c37",
1234
+ "mmlu_pro_physics": "5d6b459fb7ae0eea42afbed221b102ac1579dbe28e7be7d0412c7680f872f2a7",
1235
+ "mmlu_pro_psychology": "68facfac7fb5b75138419bf4e545ef1e7406c5b116a19c8b50e5dda301e8f7a9"
1236
+ },
1237
+ "model_source": "local-chat-completions",
1238
+ "model_name": "devstral",
1239
+ "model_name_sanitized": "devstral",
1240
+ "system_instruction": null,
1241
+ "system_instruction_sha": null,
1242
+ "fewshot_as_multiturn": true,
1243
+ "chat_template": "",
1244
+ "chat_template_sha": null,
1245
+ "total_evaluation_time_seconds": "31885.789048001636"
1246
+ }