Upload eval_results.json with huggingface_hub

#1
by GODELEV - opened
Files changed (1) hide show
  1. eval_results.json +164 -0
eval_results.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task": "hellaswag",
4
+ "benchmark": "HellaSwag",
5
+ "metric": "acc_norm",
6
+ "score": 0.26847241585341564,
7
+ "shots": 0,
8
+ "runtime_sec": 157.33,
9
+ "status": "success"
10
+ },
11
+ {
12
+ "task": "piqa",
13
+ "benchmark": "PIQA",
14
+ "metric": "acc_norm",
15
+ "score": 0.4923830250272035,
16
+ "shots": 0,
17
+ "runtime_sec": 34.0,
18
+ "status": "success"
19
+ },
20
+ {
21
+ "task": "winogrande",
22
+ "benchmark": "WinoGrande",
23
+ "metric": "acc",
24
+ "score": 0.4996053670086819,
25
+ "shots": 0,
26
+ "runtime_sec": 26.59,
27
+ "status": "success"
28
+ },
29
+ {
30
+ "task": "boolq",
31
+ "benchmark": "BoolQ",
32
+ "metric": "acc",
33
+ "score": 0.6085626911314985,
34
+ "shots": 0,
35
+ "runtime_sec": 54.76,
36
+ "status": "success"
37
+ },
38
+ {
39
+ "task": "arc_easy",
40
+ "benchmark": "ARC-Easy",
41
+ "metric": "acc_norm",
42
+ "score": 0.27104377104377103,
43
+ "shots": 0,
44
+ "runtime_sec": 41.15,
45
+ "status": "success"
46
+ },
47
+ {
48
+ "task": "arc_challenge",
49
+ "benchmark": "ARC-Challenge",
50
+ "metric": "acc_norm",
51
+ "score": 0.27559726962457337,
52
+ "shots": 0,
53
+ "runtime_sec": 31.75,
54
+ "status": "success"
55
+ },
56
+ {
57
+ "task": "openbookqa",
58
+ "benchmark": "OpenBookQA",
59
+ "metric": "acc_norm",
60
+ "score": 0.298,
61
+ "shots": 0,
62
+ "runtime_sec": 26.69,
63
+ "status": "success"
64
+ },
65
+ {
66
+ "task": "commonsense_qa",
67
+ "benchmark": "CommonsenseQA",
68
+ "metric": "acc",
69
+ "score": 0.20802620802620803,
70
+ "shots": 0,
71
+ "runtime_sec": 30.66,
72
+ "status": "success"
73
+ },
74
+ {
75
+ "task": "lambada_openai",
76
+ "benchmark": "LAMBADA",
77
+ "metric": "acc",
78
+ "score": 0.0,
79
+ "shots": 0,
80
+ "runtime_sec": 68.38,
81
+ "status": "success"
82
+ },
83
+ {
84
+ "task": "blimp",
85
+ "benchmark": "BLiMP",
86
+ "metric": "acc",
87
+ "score": 0.536134328358209,
88
+ "shots": 0,
89
+ "runtime_sec": 471.77,
90
+ "status": "success"
91
+ },
92
+ {
93
+ "task": "mmlu",
94
+ "benchmark": "MMLU",
95
+ "metric": "acc",
96
+ "score": 0.2416322461187865,
97
+ "shots": 0,
98
+ "runtime_sec": 532.07,
99
+ "status": "success"
100
+ },
101
+ {
102
+ "task": "wikitext",
103
+ "benchmark": "WikiText-2",
104
+ "metric": "word_perplexity",
105
+ "score": 39301278.79233013,
106
+ "shots": 0,
107
+ "runtime_sec": 46.38,
108
+ "status": "success"
109
+ },
110
+ {
111
+ "task": "wikitext",
112
+ "benchmark": "WikiText-2",
113
+ "metric": "byte_perplexity",
114
+ "score": 26.31431152931224,
115
+ "shots": 0,
116
+ "runtime_sec": 39.52,
117
+ "status": "success"
118
+ },
119
+ {
120
+ "task": "sciq",
121
+ "benchmark": "SciQ",
122
+ "metric": "acc_norm",
123
+ "score": 0.2,
124
+ "shots": 0,
125
+ "runtime_sec": 48.87,
126
+ "status": "success"
127
+ },
128
+ {
129
+ "task": "copa",
130
+ "benchmark": "COPA",
131
+ "metric": "acc",
132
+ "score": 0.63,
133
+ "shots": 0,
134
+ "runtime_sec": 22.29,
135
+ "status": "success"
136
+ },
137
+ {
138
+ "task": "race",
139
+ "benchmark": "RACE",
140
+ "metric": "acc",
141
+ "score": 0.22392344497607655,
142
+ "shots": 0,
143
+ "runtime_sec": 123.26,
144
+ "status": "success"
145
+ },
146
+ {
147
+ "task": "swag",
148
+ "benchmark": "SWAG",
149
+ "metric": "acc_norm",
150
+ "score": 0.2589723083075077,
151
+ "shots": 0,
152
+ "runtime_sec": 141.34,
153
+ "status": "success"
154
+ },
155
+ {
156
+ "task": "truthfulqa_mc2",
157
+ "benchmark": "TruthfulQA MC2",
158
+ "metric": "acc",
159
+ "score": 0.4981403502526444,
160
+ "shots": 0,
161
+ "runtime_sec": 52.22,
162
+ "status": "success"
163
+ }
164
+ ]