Upload eval_results.json with huggingface_hub

#1
by GODELEV - opened
Files changed (1) hide show
  1. eval_results.json +164 -0
eval_results.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task": "hellaswag",
4
+ "benchmark": "HellaSwag",
5
+ "metric": "acc_norm",
6
+ "score": 0.2672774347739494,
7
+ "shots": 0,
8
+ "runtime_sec": 152.49,
9
+ "status": "success"
10
+ },
11
+ {
12
+ "task": "piqa",
13
+ "benchmark": "PIQA",
14
+ "metric": "acc_norm",
15
+ "score": 0.5032644178454843,
16
+ "shots": 0,
17
+ "runtime_sec": 29.83,
18
+ "status": "success"
19
+ },
20
+ {
21
+ "task": "winogrande",
22
+ "benchmark": "WinoGrande",
23
+ "metric": "acc",
24
+ "score": 0.4964483030781373,
25
+ "shots": 0,
26
+ "runtime_sec": 22.5,
27
+ "status": "success"
28
+ },
29
+ {
30
+ "task": "boolq",
31
+ "benchmark": "BoolQ",
32
+ "metric": "acc",
33
+ "score": 0.3782874617737003,
34
+ "shots": 0,
35
+ "runtime_sec": 54.69,
36
+ "status": "success"
37
+ },
38
+ {
39
+ "task": "arc_easy",
40
+ "benchmark": "ARC-Easy",
41
+ "metric": "acc_norm",
42
+ "score": 0.2769360269360269,
43
+ "shots": 0,
44
+ "runtime_sec": 38.91,
45
+ "status": "success"
46
+ },
47
+ {
48
+ "task": "arc_challenge",
49
+ "benchmark": "ARC-Challenge",
50
+ "metric": "acc_norm",
51
+ "score": 0.27474402730375425,
52
+ "shots": 0,
53
+ "runtime_sec": 29.25,
54
+ "status": "success"
55
+ },
56
+ {
57
+ "task": "openbookqa",
58
+ "benchmark": "OpenBookQA",
59
+ "metric": "acc_norm",
60
+ "score": 0.308,
61
+ "shots": 0,
62
+ "runtime_sec": 23.18,
63
+ "status": "success"
64
+ },
65
+ {
66
+ "task": "commonsense_qa",
67
+ "benchmark": "CommonsenseQA",
68
+ "metric": "acc",
69
+ "score": 0.18591318591318592,
70
+ "shots": 0,
71
+ "runtime_sec": 27.0,
72
+ "status": "success"
73
+ },
74
+ {
75
+ "task": "lambada_openai",
76
+ "benchmark": "LAMBADA",
77
+ "metric": "acc",
78
+ "score": 0.0,
79
+ "shots": 0,
80
+ "runtime_sec": 71.7,
81
+ "status": "success"
82
+ },
83
+ {
84
+ "task": "blimp",
85
+ "benchmark": "BLiMP",
86
+ "metric": "acc",
87
+ "score": 0.5428358208955224,
88
+ "shots": 0,
89
+ "runtime_sec": 367.52,
90
+ "status": "success"
91
+ },
92
+ {
93
+ "task": "mmlu",
94
+ "benchmark": "MMLU",
95
+ "metric": "acc",
96
+ "score": 0.2543797179888905,
97
+ "shots": 0,
98
+ "runtime_sec": 295.22,
99
+ "status": "success"
100
+ },
101
+ {
102
+ "task": "wikitext",
103
+ "benchmark": "WikiText-2",
104
+ "metric": "word_perplexity",
105
+ "score": 88520100.69650024,
106
+ "shots": 0,
107
+ "runtime_sec": 34.96,
108
+ "status": "success"
109
+ },
110
+ {
111
+ "task": "wikitext",
112
+ "benchmark": "WikiText-2",
113
+ "metric": "byte_perplexity",
114
+ "score": 30.629263941602346,
115
+ "shots": 0,
116
+ "runtime_sec": 31.49,
117
+ "status": "success"
118
+ },
119
+ {
120
+ "task": "sciq",
121
+ "benchmark": "SciQ",
122
+ "metric": "acc_norm",
123
+ "score": 0.215,
124
+ "shots": 0,
125
+ "runtime_sec": 41.48,
126
+ "status": "success"
127
+ },
128
+ {
129
+ "task": "copa",
130
+ "benchmark": "COPA",
131
+ "metric": "acc",
132
+ "score": 0.57,
133
+ "shots": 0,
134
+ "runtime_sec": 18.99,
135
+ "status": "success"
136
+ },
137
+ {
138
+ "task": "race",
139
+ "benchmark": "RACE",
140
+ "metric": "acc",
141
+ "score": 0.22775119617224882,
142
+ "shots": 0,
143
+ "runtime_sec": 100.93,
144
+ "status": "success"
145
+ },
146
+ {
147
+ "task": "swag",
148
+ "benchmark": "SWAG",
149
+ "metric": "acc_norm",
150
+ "score": 0.2575227431770469,
151
+ "shots": 0,
152
+ "runtime_sec": 153.63,
153
+ "status": "success"
154
+ },
155
+ {
156
+ "task": "truthfulqa_mc2",
157
+ "benchmark": "TruthfulQA MC2",
158
+ "metric": "acc",
159
+ "score": 0.4874513485881811,
160
+ "shots": 0,
161
+ "runtime_sec": 49.19,
162
+ "status": "success"
163
+ }
164
+ ]