Upload eval_results.json with huggingface_hub

#1
by GODELEV - opened
Files changed (1) hide show
  1. eval_results.json +164 -0
eval_results.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task": "hellaswag",
4
+ "benchmark": "HellaSwag",
5
+ "metric": "acc_norm",
6
+ "score": 0.2622983469428401,
7
+ "shots": 0,
8
+ "runtime_sec": 199.9,
9
+ "status": "success"
10
+ },
11
+ {
12
+ "task": "piqa",
13
+ "benchmark": "PIQA",
14
+ "metric": "acc_norm",
15
+ "score": 0.5195865070729053,
16
+ "shots": 0,
17
+ "runtime_sec": 31.25,
18
+ "status": "success"
19
+ },
20
+ {
21
+ "task": "winogrande",
22
+ "benchmark": "WinoGrande",
23
+ "metric": "acc",
24
+ "score": 0.5114443567482242,
25
+ "shots": 0,
26
+ "runtime_sec": 21.22,
27
+ "status": "success"
28
+ },
29
+ {
30
+ "task": "boolq",
31
+ "benchmark": "BoolQ",
32
+ "metric": "acc",
33
+ "score": 0.5348623853211009,
34
+ "shots": 0,
35
+ "runtime_sec": 89.55,
36
+ "status": "success"
37
+ },
38
+ {
39
+ "task": "arc_easy",
40
+ "benchmark": "ARC-Easy",
41
+ "metric": "acc_norm",
42
+ "score": 0.24621212121212122,
43
+ "shots": 0,
44
+ "runtime_sec": 39.34,
45
+ "status": "success"
46
+ },
47
+ {
48
+ "task": "arc_challenge",
49
+ "benchmark": "ARC-Challenge",
50
+ "metric": "acc_norm",
51
+ "score": 0.22781569965870307,
52
+ "shots": 0,
53
+ "runtime_sec": 29.88,
54
+ "status": "success"
55
+ },
56
+ {
57
+ "task": "openbookqa",
58
+ "benchmark": "OpenBookQA",
59
+ "metric": "acc_norm",
60
+ "score": 0.236,
61
+ "shots": 0,
62
+ "runtime_sec": 21.16,
63
+ "status": "success"
64
+ },
65
+ {
66
+ "task": "commonsense_qa",
67
+ "benchmark": "CommonsenseQA",
68
+ "metric": "acc",
69
+ "score": 0.19492219492219492,
70
+ "shots": 0,
71
+ "runtime_sec": 25.28,
72
+ "status": "success"
73
+ },
74
+ {
75
+ "task": "lambada_openai",
76
+ "benchmark": "LAMBADA",
77
+ "metric": "acc",
78
+ "score": 0.062293809431399186,
79
+ "shots": 0,
80
+ "runtime_sec": 73.36,
81
+ "status": "success"
82
+ },
83
+ {
84
+ "task": "blimp",
85
+ "benchmark": "BLiMP",
86
+ "metric": "acc",
87
+ "score": 0.5597462686567164,
88
+ "shots": 0,
89
+ "runtime_sec": 327.32,
90
+ "status": "success"
91
+ },
92
+ {
93
+ "task": "mmlu",
94
+ "benchmark": "MMLU",
95
+ "metric": "acc",
96
+ "score": 0.23949579831932774,
97
+ "shots": 0,
98
+ "runtime_sec": 328.49,
99
+ "status": "success"
100
+ },
101
+ {
102
+ "task": "wikitext",
103
+ "benchmark": "WikiText-2",
104
+ "metric": "word_perplexity",
105
+ "score": 39613.77005205672,
106
+ "shots": 0,
107
+ "runtime_sec": 103.36,
108
+ "status": "success"
109
+ },
110
+ {
111
+ "task": "wikitext",
112
+ "benchmark": "WikiText-2",
113
+ "metric": "byte_perplexity",
114
+ "score": 7.241361328166556,
115
+ "shots": 0,
116
+ "runtime_sec": 100.6,
117
+ "status": "success"
118
+ },
119
+ {
120
+ "task": "sciq",
121
+ "benchmark": "SciQ",
122
+ "metric": "acc_norm",
123
+ "score": 0.227,
124
+ "shots": 0,
125
+ "runtime_sec": 50.71,
126
+ "status": "success"
127
+ },
128
+ {
129
+ "task": "copa",
130
+ "benchmark": "COPA",
131
+ "metric": "acc",
132
+ "score": 0.58,
133
+ "shots": 0,
134
+ "runtime_sec": 18.88,
135
+ "status": "success"
136
+ },
137
+ {
138
+ "task": "race",
139
+ "benchmark": "RACE",
140
+ "metric": "acc",
141
+ "score": 0.24401913875598086,
142
+ "shots": 0,
143
+ "runtime_sec": 173.42,
144
+ "status": "success"
145
+ },
146
+ {
147
+ "task": "swag",
148
+ "benchmark": "SWAG",
149
+ "metric": "acc_norm",
150
+ "score": 0.35669299210236927,
151
+ "shots": 0,
152
+ "runtime_sec": 169.04,
153
+ "status": "success"
154
+ },
155
+ {
156
+ "task": "truthfulqa_mc2",
157
+ "benchmark": "TruthfulQA MC2",
158
+ "metric": "acc",
159
+ "score": 0.45582288523716935,
160
+ "shots": 0,
161
+ "runtime_sec": 69.19,
162
+ "status": "success"
163
+ }
164
+ ]