Upload eval_results.json with huggingface_hub

#2
by GODELEV - opened
Files changed (1) hide show
  1. eval_results.json +173 -0
eval_results.json ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task": "hellaswag",
4
+ "benchmark": "HellaSwag",
5
+ "metric": "acc_norm",
6
+ "score": 0.26707827126070505,
7
+ "shots": 0,
8
+ "runtime_sec": 318.67,
9
+ "status": "success"
10
+ },
11
+ {
12
+ "task": "piqa",
13
+ "benchmark": "PIQA",
14
+ "metric": "acc_norm",
15
+ "score": 0.5386289445048966,
16
+ "shots": 0,
17
+ "runtime_sec": 38.85,
18
+ "status": "success"
19
+ },
20
+ {
21
+ "task": "winogrande",
22
+ "benchmark": "WinoGrande",
23
+ "metric": "acc",
24
+ "score": 0.5067087608524072,
25
+ "shots": 0,
26
+ "runtime_sec": 23.73,
27
+ "status": "success"
28
+ },
29
+ {
30
+ "task": "boolq",
31
+ "benchmark": "BoolQ",
32
+ "metric": "acc",
33
+ "score": 0.40214067278287463,
34
+ "shots": 0,
35
+ "runtime_sec": 144.8,
36
+ "status": "success"
37
+ },
38
+ {
39
+ "task": "arc_easy",
40
+ "benchmark": "ARC-Easy",
41
+ "metric": "acc_norm",
42
+ "score": 0.3468013468013468,
43
+ "shots": 0,
44
+ "runtime_sec": 51.41,
45
+ "status": "success"
46
+ },
47
+ {
48
+ "task": "arc_challenge",
49
+ "benchmark": "ARC-Challenge",
50
+ "metric": "acc_norm",
51
+ "score": 0.25597269624573377,
52
+ "shots": 0,
53
+ "runtime_sec": 37.69,
54
+ "status": "success"
55
+ },
56
+ {
57
+ "task": "openbookqa",
58
+ "benchmark": "OpenBookQA",
59
+ "metric": "acc_norm",
60
+ "score": 0.25,
61
+ "shots": 0,
62
+ "runtime_sec": 21.14,
63
+ "status": "success"
64
+ },
65
+ {
66
+ "task": "commonsense_qa",
67
+ "benchmark": "CommonsenseQA",
68
+ "metric": "acc",
69
+ "score": 0.2031122031122031,
70
+ "shots": 0,
71
+ "runtime_sec": 27.66,
72
+ "status": "success"
73
+ },
74
+ {
75
+ "task": "lambada_openai",
76
+ "benchmark": "LAMBADA",
77
+ "metric": "acc",
78
+ "score": 0.0023287405394915583,
79
+ "shots": 0,
80
+ "runtime_sec": 96.28,
81
+ "status": "success"
82
+ },
83
+ {
84
+ "task": "blimp",
85
+ "benchmark": "BLiMP",
86
+ "metric": "acc",
87
+ "score": 0.5923432835820895,
88
+ "shots": 0,
89
+ "runtime_sec": 354.79,
90
+ "status": "success"
91
+ },
92
+ {
93
+ "task": "mmlu",
94
+ "benchmark": "MMLU",
95
+ "metric": "acc",
96
+ "score": 0.23892607890613873,
97
+ "shots": 0,
98
+ "runtime_sec": 388.62,
99
+ "status": "success"
100
+ },
101
+ {
102
+ "task": "wikitext",
103
+ "benchmark": "WikiText-2",
104
+ "metric": "word_perplexity",
105
+ "score": 12524.42105099034,
106
+ "shots": 0,
107
+ "runtime_sec": 182.89,
108
+ "status": "success"
109
+ },
110
+ {
111
+ "task": "wikitext",
112
+ "benchmark": "WikiText-2",
113
+ "metric": "byte_perplexity",
114
+ "score": 5.838498405241562,
115
+ "shots": 0,
116
+ "runtime_sec": 181.42,
117
+ "status": "success"
118
+ },
119
+ {
120
+ "task": "wikitext",
121
+ "benchmark": "WikiText-2",
122
+ "metric": "bit_perplexity",
123
+ "score": null,
124
+ "shots": 0,
125
+ "runtime_sec": 180.88,
126
+ "status": "success"
127
+ },
128
+ {
129
+ "task": "sciq",
130
+ "benchmark": "SciQ",
131
+ "metric": "acc_norm",
132
+ "score": 0.356,
133
+ "shots": 0,
134
+ "runtime_sec": 87.15,
135
+ "status": "success"
136
+ },
137
+ {
138
+ "task": "copa",
139
+ "benchmark": "COPA",
140
+ "metric": "acc",
141
+ "score": 0.64,
142
+ "shots": 0,
143
+ "runtime_sec": 17.21,
144
+ "status": "success"
145
+ },
146
+ {
147
+ "task": "race",
148
+ "benchmark": "RACE",
149
+ "metric": "acc",
150
+ "score": 0.23157894736842105,
151
+ "shots": 0,
152
+ "runtime_sec": 334.7,
153
+ "status": "success"
154
+ },
155
+ {
156
+ "task": "swag",
157
+ "benchmark": "SWAG",
158
+ "metric": "acc_norm",
159
+ "score": 0.2912626212136359,
160
+ "shots": 0,
161
+ "runtime_sec": 252.0,
162
+ "status": "success"
163
+ },
164
+ {
165
+ "task": "truthfulqa_mc2",
166
+ "benchmark": "TruthfulQA MC2",
167
+ "metric": "acc",
168
+ "score": 0.48740972804833826,
169
+ "shots": 0,
170
+ "runtime_sec": 126.29,
171
+ "status": "success"
172
+ }
173
+ ]