FuryAssassin committed on
Commit
366a59f
·
verified ·
1 Parent(s): 9baec9c

Upload recalculated_scores.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. recalculated_scores.json +212 -0
recalculated_scores.json ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step_100": {
3
+ "step": 100,
4
+ "scores": {
5
+ "math_reasoning": 0.345,
6
+ "code_generation": 0.367,
7
+ "text_classification": 0.413,
8
+ "sentiment_analysis": 0.394,
9
+ "question_answering": 0.351,
10
+ "logical_reasoning": 0.444,
11
+ "common_sense": 0.353,
12
+ "reading_comprehension": 0.371,
13
+ "dialogue_generation": 0.323,
14
+ "summarization": 0.456,
15
+ "translation": 0.503,
16
+ "knowledge_retrieval": 0.311,
17
+ "creative_writing": 0.302,
18
+ "instruction_following": 0.386,
19
+ "safety_evaluation": 0.33
20
+ },
21
+ "overall": 0.377
22
+ },
23
+ "step_200": {
24
+ "step": 200,
25
+ "scores": {
26
+ "math_reasoning": 0.383,
27
+ "code_generation": 0.383,
28
+ "text_classification": 0.425,
29
+ "sentiment_analysis": 0.406,
30
+ "question_answering": 0.371,
31
+ "logical_reasoning": 0.465,
32
+ "common_sense": 0.365,
33
+ "reading_comprehension": 0.381,
34
+ "dialogue_generation": 0.335,
35
+ "summarization": 0.461,
36
+ "translation": 0.506,
37
+ "knowledge_retrieval": 0.321,
38
+ "creative_writing": 0.323,
39
+ "instruction_following": 0.4,
40
+ "safety_evaluation": 0.34
41
+ },
42
+ "overall": 0.392
43
+ },
44
+ "step_300": {
45
+ "step": 300,
46
+ "scores": {
47
+ "math_reasoning": 0.415,
48
+ "code_generation": 0.398,
49
+ "text_classification": 0.436,
50
+ "sentiment_analysis": 0.418,
51
+ "question_answering": 0.388,
52
+ "logical_reasoning": 0.484,
53
+ "common_sense": 0.377,
54
+ "reading_comprehension": 0.39,
55
+ "dialogue_generation": 0.346,
56
+ "summarization": 0.467,
57
+ "translation": 0.509,
58
+ "knowledge_retrieval": 0.331,
59
+ "creative_writing": 0.341,
60
+ "instruction_following": 0.414,
61
+ "safety_evaluation": 0.35
62
+ },
63
+ "overall": 0.405
64
+ },
65
+ "step_400": {
66
+ "step": 400,
67
+ "scores": {
68
+ "math_reasoning": 0.443,
69
+ "code_generation": 0.412,
70
+ "text_classification": 0.447,
71
+ "sentiment_analysis": 0.429,
72
+ "question_answering": 0.405,
73
+ "logical_reasoning": 0.501,
74
+ "common_sense": 0.388,
75
+ "reading_comprehension": 0.399,
76
+ "dialogue_generation": 0.357,
77
+ "summarization": 0.472,
78
+ "translation": 0.512,
79
+ "knowledge_retrieval": 0.34,
80
+ "creative_writing": 0.358,
81
+ "instruction_following": 0.427,
82
+ "safety_evaluation": 0.359
83
+ },
84
+ "overall": 0.418
85
+ },
86
+ "step_500": {
87
+ "step": 500,
88
+ "scores": {
89
+ "math_reasoning": 0.467,
90
+ "code_generation": 0.425,
91
+ "text_classification": 0.457,
92
+ "sentiment_analysis": 0.44,
93
+ "question_answering": 0.42,
94
+ "logical_reasoning": 0.517,
95
+ "common_sense": 0.398,
96
+ "reading_comprehension": 0.408,
97
+ "dialogue_generation": 0.368,
98
+ "summarization": 0.477,
99
+ "translation": 0.515,
100
+ "knowledge_retrieval": 0.348,
101
+ "creative_writing": 0.373,
102
+ "instruction_following": 0.439,
103
+ "safety_evaluation": 0.367
104
+ },
105
+ "overall": 0.429
106
+ },
107
+ "step_600": {
108
+ "step": 600,
109
+ "scores": {
110
+ "math_reasoning": 0.487,
111
+ "code_generation": 0.437,
112
+ "text_classification": 0.467,
113
+ "sentiment_analysis": 0.45,
114
+ "question_answering": 0.434,
115
+ "logical_reasoning": 0.531,
116
+ "common_sense": 0.407,
117
+ "reading_comprehension": 0.416,
118
+ "dialogue_generation": 0.378,
119
+ "summarization": 0.482,
120
+ "translation": 0.518,
121
+ "knowledge_retrieval": 0.356,
122
+ "creative_writing": 0.387,
123
+ "instruction_following": 0.45,
124
+ "safety_evaluation": 0.375
125
+ },
126
+ "overall": 0.44
127
+ },
128
+ "step_700": {
129
+ "step": 700,
130
+ "scores": {
131
+ "math_reasoning": 0.506,
132
+ "code_generation": 0.448,
133
+ "text_classification": 0.476,
134
+ "sentiment_analysis": 0.459,
135
+ "question_answering": 0.447,
136
+ "logical_reasoning": 0.543,
137
+ "common_sense": 0.416,
138
+ "reading_comprehension": 0.424,
139
+ "dialogue_generation": 0.387,
140
+ "summarization": 0.487,
141
+ "translation": 0.521,
142
+ "knowledge_retrieval": 0.364,
143
+ "creative_writing": 0.4,
144
+ "instruction_following": 0.461,
145
+ "safety_evaluation": 0.383
146
+ },
147
+ "overall": 0.45
148
+ },
149
+ "step_800": {
150
+ "step": 800,
151
+ "scores": {
152
+ "math_reasoning": 0.522,
153
+ "code_generation": 0.459,
154
+ "text_classification": 0.484,
155
+ "sentiment_analysis": 0.468,
156
+ "question_answering": 0.459,
157
+ "logical_reasoning": 0.555,
158
+ "common_sense": 0.424,
159
+ "reading_comprehension": 0.432,
160
+ "dialogue_generation": 0.396,
161
+ "summarization": 0.491,
162
+ "translation": 0.523,
163
+ "knowledge_retrieval": 0.371,
164
+ "creative_writing": 0.413,
165
+ "instruction_following": 0.471,
166
+ "safety_evaluation": 0.391
167
+ },
168
+ "overall": 0.459
169
+ },
170
+ "step_900": {
171
+ "step": 900,
172
+ "scores": {
173
+ "math_reasoning": 0.537,
174
+ "code_generation": 0.469,
175
+ "text_classification": 0.492,
176
+ "sentiment_analysis": 0.477,
177
+ "question_answering": 0.471,
178
+ "logical_reasoning": 0.566,
179
+ "common_sense": 0.432,
180
+ "reading_comprehension": 0.439,
181
+ "dialogue_generation": 0.404,
182
+ "summarization": 0.496,
183
+ "translation": 0.526,
184
+ "knowledge_retrieval": 0.378,
185
+ "creative_writing": 0.424,
186
+ "instruction_following": 0.48,
187
+ "safety_evaluation": 0.398
188
+ },
189
+ "overall": 0.468
190
+ },
191
+ "step_1000": {
192
+ "step": 1000,
193
+ "scores": {
194
+ "math_reasoning": 0.55,
195
+ "code_generation": 0.479,
196
+ "text_classification": 0.5,
197
+ "sentiment_analysis": 0.485,
198
+ "question_answering": 0.482,
199
+ "logical_reasoning": 0.576,
200
+ "common_sense": 0.44,
201
+ "reading_comprehension": 0.446,
202
+ "dialogue_generation": 0.412,
203
+ "summarization": 0.5,
204
+ "translation": 0.529,
205
+ "knowledge_retrieval": 0.385,
206
+ "creative_writing": 0.434,
207
+ "instruction_following": 0.489,
208
+ "safety_evaluation": 0.404
209
+ },
210
+ "overall": 0.476
211
+ }
212
+ }