Upload eval_results.json with huggingface_hub

#2
by GODELEV - opened
Files changed (1) hide show
  1. eval_results.json +164 -0
eval_results.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task": "hellaswag",
4
+ "benchmark": "HellaSwag",
5
+ "metric": "acc_norm",
6
+ "score": 0.25851424019119695,
7
+ "shots": 0,
8
+ "runtime_sec": 156.87,
9
+ "status": "success"
10
+ },
11
+ {
12
+ "task": "piqa",
13
+ "benchmark": "PIQA",
14
+ "metric": "acc_norm",
15
+ "score": 0.5495103373231773,
16
+ "shots": 0,
17
+ "runtime_sec": 27.06,
18
+ "status": "success"
19
+ },
20
+ {
21
+ "task": "winogrande",
22
+ "benchmark": "WinoGrande",
23
+ "metric": "acc",
24
+ "score": 0.5027624309392266,
25
+ "shots": 0,
26
+ "runtime_sec": 20.36,
27
+ "status": "success"
28
+ },
29
+ {
30
+ "task": "boolq",
31
+ "benchmark": "BoolQ",
32
+ "metric": "acc",
33
+ "score": 0.37920489296636084,
34
+ "shots": 0,
35
+ "runtime_sec": 47.74,
36
+ "status": "success"
37
+ },
38
+ {
39
+ "task": "arc_easy",
40
+ "benchmark": "ARC-Easy",
41
+ "metric": "acc_norm",
42
+ "score": 0.30387205387205385,
43
+ "shots": 0,
44
+ "runtime_sec": 35.14,
45
+ "status": "success"
46
+ },
47
+ {
48
+ "task": "arc_challenge",
49
+ "benchmark": "ARC-Challenge",
50
+ "metric": "acc_norm",
51
+ "score": 0.23208191126279865,
52
+ "shots": 0,
53
+ "runtime_sec": 26.46,
54
+ "status": "success"
55
+ },
56
+ {
57
+ "task": "openbookqa",
58
+ "benchmark": "OpenBookQA",
59
+ "metric": "acc_norm",
60
+ "score": 0.258,
61
+ "shots": 0,
62
+ "runtime_sec": 20.55,
63
+ "status": "success"
64
+ },
65
+ {
66
+ "task": "commonsense_qa",
67
+ "benchmark": "CommonsenseQA",
68
+ "metric": "acc",
69
+ "score": 0.21048321048321048,
70
+ "shots": 0,
71
+ "runtime_sec": 24.0,
72
+ "status": "success"
73
+ },
74
+ {
75
+ "task": "lambada_openai",
76
+ "benchmark": "LAMBADA",
77
+ "metric": "acc",
78
+ "score": 0.11003299049097613,
79
+ "shots": 0,
80
+ "runtime_sec": 65.51,
81
+ "status": "success"
82
+ },
83
+ {
84
+ "task": "blimp",
85
+ "benchmark": "BLiMP",
86
+ "metric": "acc",
87
+ "score": 0.6532537313432836,
88
+ "shots": 0,
89
+ "runtime_sec": 330.74,
90
+ "status": "success"
91
+ },
92
+ {
93
+ "task": "mmlu",
94
+ "benchmark": "MMLU",
95
+ "metric": "acc",
96
+ "score": 0.24626121635094717,
97
+ "shots": 0,
98
+ "runtime_sec": 269.48,
99
+ "status": "success"
100
+ },
101
+ {
102
+ "task": "wikitext",
103
+ "benchmark": "WikiText-2",
104
+ "metric": "word_perplexity",
105
+ "score": 217.79068002976928,
106
+ "shots": 0,
107
+ "runtime_sec": 38.93,
108
+ "status": "success"
109
+ },
110
+ {
111
+ "task": "wikitext",
112
+ "benchmark": "WikiText-2",
113
+ "metric": "byte_perplexity",
114
+ "score": 2.7366864042695744,
115
+ "shots": 0,
116
+ "runtime_sec": 36.83,
117
+ "status": "success"
118
+ },
119
+ {
120
+ "task": "sciq",
121
+ "benchmark": "SciQ",
122
+ "metric": "acc_norm",
123
+ "score": 0.425,
124
+ "shots": 0,
125
+ "runtime_sec": 41.89,
126
+ "status": "success"
127
+ },
128
+ {
129
+ "task": "copa",
130
+ "benchmark": "COPA",
131
+ "metric": "acc",
132
+ "score": 0.52,
133
+ "shots": 0,
134
+ "runtime_sec": 16.87,
135
+ "status": "success"
136
+ },
137
+ {
138
+ "task": "race",
139
+ "benchmark": "RACE",
140
+ "metric": "acc",
141
+ "score": 0.23636363636363636,
142
+ "shots": 0,
143
+ "runtime_sec": 116.74,
144
+ "status": "success"
145
+ },
146
+ {
147
+ "task": "swag",
148
+ "benchmark": "SWAG",
149
+ "metric": "acc_norm",
150
+ "score": 0.32810156952914127,
151
+ "shots": 0,
152
+ "runtime_sec": 138.44,
153
+ "status": "success"
154
+ },
155
+ {
156
+ "task": "truthfulqa_mc2",
157
+ "benchmark": "TruthfulQA MC2",
158
+ "metric": "acc",
159
+ "score": 0.48090628877613545,
160
+ "shots": 0,
161
+ "runtime_sec": 48.45,
162
+ "status": "success"
163
+ }
164
+ ]