alplusplus committed on
Commit e5351fc · verified · 1 Parent(s): b8f867e

Upload eval_results.json with huggingface_hub
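
The commit message above points at the huggingface_hub Python client. A minimal sketch of how such an upload could be made (not necessarily the exact call used for this commit; the repo_id below is a placeholder assumption):

# Sketch: upload eval_results.json to a Hub repo with huggingface_hub.
# The repo_id is an assumption -- replace it with the actual target repo.
from huggingface_hub import HfApi

api = HfApi()  # uses the cached login or the HF_TOKEN environment variable
api.upload_file(
    path_or_fileobj="eval_results.json",   # local results file
    path_in_repo="eval_results.json",      # destination path in the repo
    repo_id="your-namespace/your-model",   # assumption: target repository
    repo_type="model",
    commit_message="Upload eval_results.json with huggingface_hub",
)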

Files changed (1): eval_results.json +354 -0
eval_results.json ADDED
@@ -0,0 +1,354 @@
+ {
+ "arc_challenge": {
+ "alias": "arc_challenge",
+ "acc,none": 0.5486348122866894,
+ "acc_stderr,none": 0.014542104569955378,
+ "acc_norm,none": 0.5674061433447098,
+ "acc_norm_stderr,none": 0.014478005694182441
+ },
+ "gsm8k": {
+ "alias": "gsm8k",
+ "exact_match,strict-match": 0.7558756633813495,
+ "exact_match_stderr,strict-match": 0.011832404674077592,
+ "exact_match,flexible-extract": 0.7915087187263078,
+ "exact_match_stderr,flexible-extract": 0.011189587985791413
+ },
+ "hellaswag": {
+ "alias": "hellaswag",
+ "acc,none": 0.5007966540529775,
+ "acc_stderr,none": 0.004989775077835642,
+ "acc_norm,none": 0.6353316072495518,
+ "acc_norm_stderr,none": 0.004803533333363877
+ },
+ "humaneval": {
+ "alias": "humaneval",
+ "pass@1,create_test": 0.0,
+ "pass@1_stderr,create_test": 0.0
+ },
+ "ifeval": {
+ "alias": "ifeval",
+ "prompt_level_strict_acc,none": 0.7855822550831792,
+ "prompt_level_strict_acc_stderr,none": 0.017661570312173906,
+ "inst_level_strict_acc,none": 0.8453237410071942,
+ "inst_level_strict_acc_stderr,none": "N/A",
+ "prompt_level_loose_acc,none": 0.8188539741219963,
+ "prompt_level_loose_acc_stderr,none": 0.01657374894371447,
+ "inst_level_loose_acc,none": 0.8717026378896883,
+ "inst_level_loose_acc_stderr,none": "N/A"
+ },
+ "mmlu": {
+ "acc,none": 0.7172055262783079,
+ "acc_stderr,none": 0.003613949324499328,
+ "alias": "mmlu"
+ },
+ "mmlu_humanities": {
+ "acc,none": 0.649096705632306,
+ "acc_stderr,none": 0.006681387824934989,
+ "alias": " - humanities"
+ },
+ "mmlu_formal_logic": {
+ "alias": " - formal_logic",
+ "acc,none": 0.6031746031746031,
+ "acc_stderr,none": 0.04375888492727054
+ },
+ "mmlu_high_school_european_history": {
+ "alias": " - high_school_european_history",
+ "acc,none": 0.8363636363636363,
+ "acc_stderr,none": 0.028887872395487978
+ },
+ "mmlu_high_school_us_history": {
+ "alias": " - high_school_us_history",
+ "acc,none": 0.8725490196078431,
+ "acc_stderr,none": 0.02340553048084634
+ },
+ "mmlu_high_school_world_history": {
+ "alias": " - high_school_world_history",
+ "acc,none": 0.8438818565400844,
+ "acc_stderr,none": 0.02362715946031866
+ },
+ "mmlu_international_law": {
+ "alias": " - international_law",
+ "acc,none": 0.8181818181818182,
+ "acc_stderr,none": 0.03520893951097652
+ },
+ "mmlu_jurisprudence": {
+ "alias": " - jurisprudence",
+ "acc,none": 0.7685185185185185,
+ "acc_stderr,none": 0.040774947092526284
+ },
+ "mmlu_logical_fallacies": {
+ "alias": " - logical_fallacies",
+ "acc,none": 0.7975460122699386,
+ "acc_stderr,none": 0.031570650789119054
+ },
+ "mmlu_moral_disputes": {
+ "alias": " - moral_disputes",
+ "acc,none": 0.7398843930635838,
+ "acc_stderr,none": 0.02361867831006933
+ },
+ "mmlu_moral_scenarios": {
+ "alias": " - moral_scenarios",
+ "acc,none": 0.5251396648044693,
+ "acc_stderr,none": 0.01670135084268273
+ },
+ "mmlu_philosophy": {
+ "alias": " - philosophy",
+ "acc,none": 0.7202572347266881,
+ "acc_stderr,none": 0.02549425935069486
+ },
+ "mmlu_prehistory": {
+ "alias": " - prehistory",
+ "acc,none": 0.7901234567901234,
+ "acc_stderr,none": 0.022658344085981406
+ },
+ "mmlu_professional_law": {
+ "alias": " - professional_law",
+ "acc,none": 0.5254237288135594,
+ "acc_stderr,none": 0.012753716929101162
+ },
+ "mmlu_world_religions": {
+ "alias": " - world_religions",
+ "acc,none": 0.8070175438596491,
+ "acc_stderr,none": 0.030267457554898448
+ },
+ "mmlu_other": {
+ "acc,none": 0.7644029610556807,
+ "acc_stderr,none": 0.007308949401897016,
+ "alias": " - other"
+ },
+ "mmlu_business_ethics": {
+ "alias": " - business_ethics",
+ "acc,none": 0.78,
+ "acc_stderr,none": 0.041633319989322654
+ },
+ "mmlu_clinical_knowledge": {
+ "alias": " - clinical_knowledge",
+ "acc,none": 0.8113207547169812,
+ "acc_stderr,none": 0.024079995130062277
+ },
+ "mmlu_college_medicine": {
+ "alias": " - college_medicine",
+ "acc,none": 0.7398843930635838,
+ "acc_stderr,none": 0.03345036916788986
+ },
+ "mmlu_global_facts": {
+ "alias": " - global_facts",
+ "acc,none": 0.41,
+ "acc_stderr,none": 0.04943110704237104
+ },
+ "mmlu_human_aging": {
+ "alias": " - human_aging",
+ "acc,none": 0.695067264573991,
+ "acc_stderr,none": 0.03089861088247754
+ },
+ "mmlu_management": {
+ "alias": " - management",
+ "acc,none": 0.8932038834951457,
+ "acc_stderr,none": 0.030581088928331335
+ },
+ "mmlu_marketing": {
+ "alias": " - marketing",
+ "acc,none": 0.9017094017094017,
+ "acc_stderr,none": 0.019503444900757564
+ },
+ "mmlu_medical_genetics": {
+ "alias": " - medical_genetics",
+ "acc,none": 0.82,
+ "acc_stderr,none": 0.03861229196653691
+ },
+ "mmlu_miscellaneous": {
+ "alias": " - miscellaneous",
+ "acc,none": 0.8390804597701149,
+ "acc_stderr,none": 0.01314022551561178
+ },
+ "mmlu_nutrition": {
+ "alias": " - nutrition",
+ "acc,none": 0.803921568627451,
+ "acc_stderr,none": 0.02273378940544767
+ },
+ "mmlu_professional_accounting": {
+ "alias": " - professional_accounting",
+ "acc,none": 0.574468085106383,
+ "acc_stderr,none": 0.02949482760014435
+ },
+ "mmlu_professional_medicine": {
+ "alias": " - professional_medicine",
+ "acc,none": 0.8125,
+ "acc_stderr,none": 0.023709788253811766
+ },
+ "mmlu_virology": {
+ "alias": " - virology",
+ "acc,none": 0.5240963855421686,
+ "acc_stderr,none": 0.03887971849597267
+ },
+ "mmlu_social_sciences": {
+ "acc,none": 0.8098797530061749,
+ "acc_stderr,none": 0.0069319469054339704,
+ "alias": " - social sciences"
+ },
+ "mmlu_econometrics": {
+ "alias": " - econometrics",
+ "acc,none": 0.6491228070175439,
+ "acc_stderr,none": 0.044895393502707
+ },
+ "mmlu_high_school_geography": {
+ "alias": " - high_school_geography",
+ "acc,none": 0.8787878787878788,
+ "acc_stderr,none": 0.02325315795194205
+ },
+ "mmlu_high_school_government_and_politics": {
+ "alias": " - high_school_government_and_politics",
+ "acc,none": 0.9326424870466321,
+ "acc_stderr,none": 0.01808839383907894
+ },
+ "mmlu_high_school_macroeconomics": {
+ "alias": " - high_school_macroeconomics",
+ "acc,none": 0.764102564102564,
+ "acc_stderr,none": 0.02152596540740865
+ },
+ "mmlu_high_school_microeconomics": {
+ "alias": " - high_school_microeconomics",
+ "acc,none": 0.8823529411764706,
+ "acc_stderr,none": 0.020928472557788826
+ },
+ "mmlu_high_school_psychology": {
+ "alias": " - high_school_psychology",
+ "acc,none": 0.9027522935779817,
+ "acc_stderr,none": 0.01270353340854044
+ },
+ "mmlu_human_sexuality": {
+ "alias": " - human_sexuality",
+ "acc,none": 0.7786259541984732,
+ "acc_stderr,none": 0.036412970813137276
+ },
+ "mmlu_professional_psychology": {
+ "alias": " - professional_psychology",
+ "acc,none": 0.7254901960784313,
+ "acc_stderr,none": 0.018054027458815132
+ },
+ "mmlu_public_relations": {
+ "alias": " - public_relations",
+ "acc,none": 0.6818181818181818,
+ "acc_stderr,none": 0.044612721759105065
+ },
+ "mmlu_security_studies": {
+ "alias": " - security_studies",
+ "acc,none": 0.7714285714285715,
+ "acc_stderr,none": 0.026882144922307713
+ },
+ "mmlu_sociology": {
+ "alias": " - sociology",
+ "acc,none": 0.835820895522388,
+ "acc_stderr,none": 0.026193923544454094
+ },
+ "mmlu_us_foreign_policy": {
+ "alias": " - us_foreign_policy",
+ "acc,none": 0.86,
+ "acc_stderr,none": 0.0348735088019777
+ },
+ "mmlu_stem": {
+ "acc,none": 0.6818902632413575,
+ "acc_stderr,none": 0.007873734738401106,
+ "alias": " - stem"
+ },
+ "mmlu_abstract_algebra": {
+ "alias": " - abstract_algebra",
+ "acc,none": 0.51,
+ "acc_stderr,none": 0.05024183937956913
+ },
+ "mmlu_anatomy": {
+ "alias": " - anatomy",
+ "acc,none": 0.7111111111111111,
+ "acc_stderr,none": 0.0391545063041425
+ },
+ "mmlu_astronomy": {
+ "alias": " - astronomy",
+ "acc,none": 0.8092105263157895,
+ "acc_stderr,none": 0.03197565821032503
+ },
+ "mmlu_college_biology": {
+ "alias": " - college_biology",
+ "acc,none": 0.8888888888888888,
+ "acc_stderr,none": 0.026280550932848118
+ },
+ "mmlu_college_chemistry": {
+ "alias": " - college_chemistry",
+ "acc,none": 0.61,
+ "acc_stderr,none": 0.04902071300001973
+ },
+ "mmlu_college_computer_science": {
+ "alias": " - college_computer_science",
+ "acc,none": 0.61,
+ "acc_stderr,none": 0.04902071300001973
+ },
+ "mmlu_college_mathematics": {
+ "alias": " - college_mathematics",
+ "acc,none": 0.5,
+ "acc_stderr,none": 0.050251890762960605
+ },
+ "mmlu_college_physics": {
+ "alias": " - college_physics",
+ "acc,none": 0.5196078431372549,
+ "acc_stderr,none": 0.049713588843674016
+ },
+ "mmlu_computer_security": {
+ "alias": " - computer_security",
+ "acc,none": 0.8,
+ "acc_stderr,none": 0.04020151261036849
+ },
+ "mmlu_conceptual_physics": {
+ "alias": " - conceptual_physics",
+ "acc,none": 0.8297872340425532,
+ "acc_stderr,none": 0.024568096561260667
+ },
+ "mmlu_electrical_engineering": {
+ "alias": " - electrical_engineering",
+ "acc,none": 0.7379310344827587,
+ "acc_stderr,none": 0.03664666337225255
+ },
+ "mmlu_elementary_mathematics": {
+ "alias": " - elementary_mathematics",
+ "acc,none": 0.5925925925925926,
+ "acc_stderr,none": 0.02530590624159063
+ },
+ "mmlu_high_school_biology": {
+ "alias": " - high_school_biology",
+ "acc,none": 0.896774193548387,
+ "acc_stderr,none": 0.017308381281034537
+ },
+ "mmlu_high_school_chemistry": {
+ "alias": " - high_school_chemistry",
+ "acc,none": 0.7635467980295566,
+ "acc_stderr,none": 0.029896114291733576
+ },
+ "mmlu_high_school_computer_science": {
+ "alias": " - high_school_computer_science",
+ "acc,none": 0.76,
+ "acc_stderr,none": 0.04292346959909278
+ },
+ "mmlu_high_school_mathematics": {
+ "alias": " - high_school_mathematics",
+ "acc,none": 0.4,
+ "acc_stderr,none": 0.029869605095316977
+ },
+ "mmlu_high_school_physics": {
+ "alias": " - high_school_physics",
+ "acc,none": 0.6423841059602649,
+ "acc_stderr,none": 0.0391345343117726
+ },
+ "mmlu_high_school_statistics": {
+ "alias": " - high_school_statistics",
+ "acc,none": 0.7083333333333334,
+ "acc_stderr,none": 0.030998666304560565
+ },
+ "mmlu_machine_learning": {
+ "alias": " - machine_learning",
+ "acc,none": 0.48214285714285715,
+ "acc_stderr,none": 0.047427623612430116
+ },
+ "truthfulqa_mc2": {
+ "alias": "truthfulqa_mc2",
+ "acc,none": 0.538773182494322,
+ "acc_stderr,none": 0.015822493198912112
+ }
+ }
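
The metric keys above ("acc,none", "acc_stderr,none", "exact_match,strict-match", "pass@1,create_test") follow a per-task layout typical of lm-evaluation-harness-style output: one JSON object per task or MMLU subgroup, each holding an accuracy-type score and its standard error. A minimal sketch for pulling the file back down and printing one headline metric per task; the repo_id is a placeholder assumption and the key list just covers the metric names seen in this file:

# Sketch: download eval_results.json from the Hub and summarize it.
import json
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="your-namespace/your-model",   # assumption: the repo this commit belongs to
    filename="eval_results.json",
)

with open(path) as f:
    results = json.load(f)

for task, metrics in results.items():
    # Tasks expose different metric keys (acc, acc_norm, exact_match, pass@1, ...),
    # so fall back across the common ones and print the first that is present.
    for key in ("acc,none", "acc_norm,none", "exact_match,strict-match",
                "pass@1,create_test", "prompt_level_strict_acc,none"):
        if key in metrics:
            print(f"{task:45s} {key:30s} {metrics[key]:.4f}")
            break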