amirali1985 commited on
Commit
320521a
·
verified ·
1 Parent(s): 177d7d5

Upload add_sub_sorl_v1_abs10_K1_25K_1L2H256d

Browse files
add_sub_sorl_v1_abs10_K1_25K_1L2H256d/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 256,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 1024,
15
+ "layer_types": [
16
+ "full_attention"
17
+ ],
18
+ "max_position_embeddings": 128,
19
+ "max_window_layers": 28,
20
+ "model_type": "qwen3",
21
+ "num_attention_heads": 2,
22
+ "num_hidden_layers": 1,
23
+ "num_key_value_heads": 2,
24
+ "pad_token_id": null,
25
+ "rms_norm_eps": 1e-06,
26
+ "rope_parameters": {
27
+ "rope_theta": 10000.0,
28
+ "rope_type": "default"
29
+ },
30
+ "sliding_window": null,
31
+ "tie_word_embeddings": false,
32
+ "transformers_version": "5.5.0",
33
+ "use_cache": true,
34
+ "use_sliding_window": false,
35
+ "vocab_size": 151654
36
+ }
add_sub_sorl_v1_abs10_K1_25K_1L2H256d/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_sorl_v1_abs10_K1_25K_1L2H256d/metrics.json ADDED
@@ -0,0 +1,1617 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 441,
12
+ 491,
13
+ 541,
14
+ 591,
15
+ 641,
16
+ 691,
17
+ 741,
18
+ 832,
19
+ 882,
20
+ 932,
21
+ 982,
22
+ 1032,
23
+ 1082,
24
+ 1132,
25
+ 1223,
26
+ 1273,
27
+ 1323,
28
+ 1373,
29
+ 1423,
30
+ 1473,
31
+ 1523,
32
+ 1614,
33
+ 1664,
34
+ 1714,
35
+ 1764,
36
+ 1814,
37
+ 1864,
38
+ 1914,
39
+ 2005,
40
+ 2055,
41
+ 2105,
42
+ 2155,
43
+ 2205,
44
+ 2255,
45
+ 2305,
46
+ 2396,
47
+ 2446,
48
+ 2496,
49
+ 2546,
50
+ 2596,
51
+ 2646,
52
+ 2696,
53
+ 2787,
54
+ 2837,
55
+ 2887,
56
+ 2937,
57
+ 2987,
58
+ 3037,
59
+ 3087,
60
+ 3178,
61
+ 3228,
62
+ 3278,
63
+ 3328,
64
+ 3378,
65
+ 3428,
66
+ 3478,
67
+ 3569,
68
+ 3619,
69
+ 3669,
70
+ 3719,
71
+ 3769,
72
+ 3819,
73
+ 3869
74
+ ],
75
+ "loss": [
76
+ 8.524430274963379,
77
+ 9.103901863098145,
78
+ 11.476999282836914,
79
+ 11.07170295715332,
80
+ 9.617006301879883,
81
+ 8.757355690002441,
82
+ 7.827155113220215,
83
+ 6.2943830490112305,
84
+ 5.568047523498535,
85
+ 4.920413970947266,
86
+ 4.179653167724609,
87
+ 4.015377998352051,
88
+ 3.8744664192199707,
89
+ 3.6563000679016113,
90
+ 3.714816093444824,
91
+ 3.138887882232666,
92
+ 3.476172924041748,
93
+ 3.5434532165527344,
94
+ 3.2397027015686035,
95
+ 3.118135929107666,
96
+ 3.188394546508789,
97
+ 3.03411602973938,
98
+ 3.202545166015625,
99
+ 3.2864937782287598,
100
+ 2.669011116027832,
101
+ 3.0694193840026855,
102
+ 2.755049705505371,
103
+ 2.658294677734375,
104
+ 3.083820343017578,
105
+ 2.523517608642578,
106
+ 2.064281463623047,
107
+ 2.1752371788024902,
108
+ 2.3656363487243652,
109
+ 1.9907090663909912,
110
+ 1.5538188219070435,
111
+ 0.5142228603363037,
112
+ 0.4625096321105957,
113
+ -0.34851527214050293,
114
+ -0.9209808111190796,
115
+ -0.848351240158081,
116
+ -1.2908415794372559,
117
+ -1.9059382677078247,
118
+ -2.5187909603118896,
119
+ -2.6178600788116455,
120
+ -2.9667201042175293,
121
+ -3.4197542667388916,
122
+ -3.4599928855895996,
123
+ -4.1904425621032715,
124
+ -5.044626235961914,
125
+ -4.426793098449707,
126
+ -4.773674011230469,
127
+ -5.006862640380859,
128
+ -5.084996223449707,
129
+ -5.224812030792236,
130
+ -5.75303316116333,
131
+ -5.572091102600098,
132
+ -5.412012100219727,
133
+ -5.735694408416748,
134
+ -4.803545951843262,
135
+ -4.7789177894592285,
136
+ -4.737234115600586,
137
+ -4.012823104858398,
138
+ -3.9731242656707764,
139
+ -4.444588661193848,
140
+ -4.273018836975098,
141
+ -4.604288101196289,
142
+ -4.26378059387207,
143
+ -4.285333156585693,
144
+ -3.932020902633667,
145
+ -4.814791202545166
146
+ ],
147
+ "base_loss": [
148
+ 11.692604064941406,
149
+ 10.586286544799805,
150
+ 9.575356483459473,
151
+ 8.626004219055176,
152
+ 7.766415119171143,
153
+ 6.867415904998779,
154
+ 5.944822788238525,
155
+ 4.488890647888184,
156
+ 3.712851047515869,
157
+ 3.1012394428253174,
158
+ 2.6632230281829834,
159
+ 2.547546863555908,
160
+ 2.3669917583465576,
161
+ 2.289210081100464,
162
+ 2.217154026031494,
163
+ 2.0566632747650146,
164
+ 2.032634735107422,
165
+ 1.9611835479736328,
166
+ 2.062990427017212,
167
+ 1.9896939992904663,
168
+ 1.9653056859970093,
169
+ 1.9700582027435303,
170
+ 1.9406718015670776,
171
+ 1.9608590602874756,
172
+ 1.9188401699066162,
173
+ 1.9645260572433472,
174
+ 1.8941099643707275,
175
+ 1.8756904602050781,
176
+ 1.9264477491378784,
177
+ 1.9439666271209717,
178
+ 1.8485826253890991,
179
+ 1.9204241037368774,
180
+ 1.8739402294158936,
181
+ 1.8911492824554443,
182
+ 1.90359628200531,
183
+ 1.8942089080810547,
184
+ 1.9322471618652344,
185
+ 1.8236247301101685,
186
+ 1.8280603885650635,
187
+ 1.8796042203903198,
188
+ 1.8643499612808228,
189
+ 1.9163367748260498,
190
+ 1.8653056621551514,
191
+ 1.8866708278656006,
192
+ 1.8857192993164062,
193
+ 1.8382318019866943,
194
+ 1.8878859281539917,
195
+ 1.8952440023422241,
196
+ 1.9479080438613892,
197
+ 1.8315229415893555,
198
+ 1.853525161743164,
199
+ 1.7897322177886963,
200
+ 1.7672021389007568,
201
+ 1.67568838596344,
202
+ 1.732727289199829,
203
+ 1.6849744319915771,
204
+ 1.6561976671218872,
205
+ 1.6894086599349976,
206
+ 1.5698803663253784,
207
+ 1.526742935180664,
208
+ 1.5828590393066406,
209
+ 1.4620859622955322,
210
+ 1.427542805671692,
211
+ 1.5105113983154297,
212
+ 1.487876296043396,
213
+ 1.4823344945907593,
214
+ 1.458412766456604,
215
+ 1.479615330696106,
216
+ 1.3868151903152466,
217
+ 1.540739893913269
218
+ ],
219
+ "info_loss": [
220
+ -1.2530698776245117,
221
+ -0.9802732467651367,
222
+ -0.4201974868774414,
223
+ -0.18035411834716797,
224
+ -0.13341093063354492,
225
+ -0.07659339904785156,
226
+ -0.04645824432373047,
227
+ -0.028890609741210938,
228
+ -0.016974687576293945,
229
+ -0.016387462615966797,
230
+ -0.04442238807678223,
231
+ -0.047545671463012695,
232
+ -0.042586565017700195,
233
+ -0.056116580963134766,
234
+ -0.04165053367614746,
235
+ -0.08310163021087646,
236
+ -0.04619729518890381,
237
+ -0.03194534778594971,
238
+ -0.07237017154693604,
239
+ -0.076835036277771,
240
+ -0.06701540946960449,
241
+ -0.08263790607452393,
242
+ -0.06275796890258789,
243
+ -0.05613744258880615,
244
+ -0.11362743377685547,
245
+ -0.07795798778533936,
246
+ -0.10230779647827148,
247
+ -0.11010193824768066,
248
+ -0.07229125499725342,
249
+ -0.13019275665283203,
250
+ -0.16647255420684814,
251
+ -0.16245436668395996,
252
+ -0.138879656791687,
253
+ -0.17805254459381104,
254
+ -0.22295153141021729,
255
+ -0.32607901096343994,
256
+ -0.3350032567977905,
257
+ -0.40525615215301514,
258
+ -0.4629077911376953,
259
+ -0.46074652671813965,
260
+ -0.5033977031707764,
261
+ -0.5704048871994019,
262
+ -0.6261879205703735,
263
+ -0.6387062072753906,
264
+ -0.6733493804931641,
265
+ -0.7140244245529175,
266
+ -0.7230769395828247,
267
+ -0.796980619430542,
268
+ -0.887682318687439,
269
+ -0.8138977289199829,
270
+ -0.8508479595184326,
271
+ -0.8680267930030823,
272
+ -0.8732814192771912,
273
+ -0.8783826231956482,
274
+ -0.9365927577018738,
275
+ -0.9135531187057495,
276
+ -0.8946322798728943,
277
+ -0.9301126599311829,
278
+ -0.8245593905448914,
279
+ -0.8180517554283142,
280
+ -0.8192586898803711,
281
+ -0.734249472618103,
282
+ -0.7269594073295593,
283
+ -0.7820543646812439,
284
+ -0.7622780799865723,
285
+ -0.7947943210601807,
286
+ -0.7579862475395203,
287
+ -0.762677013874054,
288
+ -0.7179456353187561,
289
+ -0.821166455745697
290
+ ],
291
+ "abs_loss": [
292
+ 2.301368474960327,
293
+ 2.272613763809204,
294
+ 2.1877543926239014,
295
+ 2.0855491161346436,
296
+ 2.003074884414673,
297
+ 1.9530082941055298,
298
+ 1.8817201852798462,
299
+ 1.862039566040039,
300
+ 1.8272932767868042,
301
+ 1.8210479021072388,
302
+ 1.8502413034439087,
303
+ 1.8328882455825806,
304
+ 1.8329845666885376,
305
+ 1.8578821420669556,
306
+ 1.8351255655288696,
307
+ 1.8775984048843384,
308
+ 1.8421878814697266,
309
+ 1.8364969491958618,
310
+ 1.8568716049194336,
311
+ 1.8509222269058228,
312
+ 1.840528130531311,
313
+ 1.8526434898376465,
314
+ 1.8566508293151855,
315
+ 1.8434433937072754,
316
+ 1.8501802682876587,
317
+ 1.8451181650161743,
318
+ 1.8511062860488892,
319
+ 1.8564246892929077,
320
+ 1.8391847610473633,
321
+ 1.8499897718429565,
322
+ 1.8488394021987915,
323
+ 1.8438078165054321,
324
+ 1.8546184301376343,
325
+ 1.857653021812439,
326
+ 1.8427098989486694,
327
+ 1.837051272392273,
328
+ 1.8287359476089478,
329
+ 1.8250846862792969,
330
+ 1.8261111974716187,
331
+ 1.8157788515090942,
332
+ 1.813087821006775,
333
+ 1.8383451700210571,
334
+ 1.7974052429199219,
335
+ 1.8346837759017944,
336
+ 1.8273576498031616,
337
+ 1.8329792022705078,
338
+ 1.8257931470870972,
339
+ 1.8227308988571167,
340
+ 1.8331661224365234,
341
+ 1.8032366037368774,
342
+ 1.8001683950424194,
343
+ 1.8143806457519531,
344
+ 1.8048582077026367,
345
+ 1.8383716344833374,
346
+ 1.8124042749404907,
347
+ 1.7793735265731812,
348
+ 1.7747602462768555,
349
+ 1.7881097793579102,
350
+ 1.7533124685287476,
351
+ 1.776807188987732,
352
+ 1.735535979270935,
353
+ 1.737404465675354,
354
+ 1.7615875005722046,
355
+ 1.7293590307235718,
356
+ 1.7104082107543945,
357
+ 1.7312254905700684,
358
+ 1.700640082359314,
359
+ 1.708137035369873,
360
+ 1.7061353921890259,
361
+ 1.6751855611801147
362
+ ],
363
+ "zipf_loss": [
364
+ 9.1323881149292,
365
+ 8.093086242675781,
366
+ 5.884842872619629,
367
+ 4.040685653686523,
368
+ 2.9843926429748535,
369
+ 2.4605727195739746,
370
+ 2.158742666244507,
371
+ 1.9081945419311523,
372
+ 1.8422138690948486,
373
+ 1.8009443283081055,
374
+ 1.775629997253418,
375
+ 1.75999915599823,
376
+ 1.7500418424606323,
377
+ 1.7424676418304443,
378
+ 1.7306548357009888,
379
+ 1.7254811525344849,
380
+ 1.7212923765182495,
381
+ 1.7180733680725098,
382
+ 1.7147266864776611,
383
+ 1.711700201034546,
384
+ 1.709190011024475,
385
+ 1.7051725387573242,
386
+ 1.703787922859192,
387
+ 1.702664852142334,
388
+ 1.7014273405075073,
389
+ 1.699961543083191,
390
+ 1.6989071369171143,
391
+ 1.6979811191558838,
392
+ 1.696366548538208,
393
+ 1.6964795589447021,
394
+ 1.695540428161621,
395
+ 1.6949759721755981,
396
+ 1.6950308084487915,
397
+ 1.6943199634552002,
398
+ 1.6954668760299683,
399
+ 1.697098970413208,
400
+ 1.6974214315414429,
401
+ 1.697913408279419,
402
+ 1.6974254846572876,
403
+ 1.6979317665100098,
404
+ 1.697476863861084,
405
+ 1.6979395151138306,
406
+ 1.6980416774749756,
407
+ 1.6990625858306885,
408
+ 1.6983184814453125,
409
+ 1.6989600658416748,
410
+ 1.7003107070922852,
411
+ 1.701846718788147,
412
+ 1.7009726762771606,
413
+ 1.700337290763855,
414
+ 1.7012641429901123,
415
+ 1.7022359371185303,
416
+ 1.7001292705535889,
417
+ 1.6994887590408325,
418
+ 1.6989268064498901,
419
+ 1.7005280256271362,
420
+ 1.7006367444992065,
421
+ 1.6972121000289917,
422
+ 1.6968367099761963,
423
+ 1.6971760988235474,
424
+ 1.6989400386810303,
425
+ 1.6938459873199463,
426
+ 1.6927683353424072,
427
+ 1.6925078630447388,
428
+ 1.6908442974090576,
429
+ 1.688198208808899,
430
+ 1.687605381011963,
431
+ 1.6910079717636108,
432
+ 1.6900064945220947,
433
+ 1.6886142492294312
434
+ ],
435
+ "denoise_loss": [],
436
+ "ortho_loss": [
437
+ 0.7410624623298645,
438
+ 0.5486676096916199,
439
+ 0.4325517416000366,
440
+ 0.4293130338191986,
441
+ 0.42301344871520996,
442
+ 0.43677014112472534,
443
+ 0.4322786331176758,
444
+ 0.43111148476600647,
445
+ 0.42917412519454956,
446
+ 0.4202711284160614,
447
+ 0.40449219942092896,
448
+ 0.38009271025657654,
449
+ 0.38311055302619934,
450
+ 0.38355082273483276,
451
+ 0.3772588074207306,
452
+ 0.37517693638801575,
453
+ 0.35672980546951294,
454
+ 0.3655826151371002,
455
+ 0.36484745144844055,
456
+ 0.36458733677864075,
457
+ 0.3716451823711395,
458
+ 0.3796772062778473,
459
+ 0.37268635630607605,
460
+ 0.36658456921577454,
461
+ 0.36402279138565063,
462
+ 0.36522912979125977,
463
+ 0.3712886571884155,
464
+ 0.3701127767562866,
465
+ 0.3669368028640747,
466
+ 0.35840272903442383,
467
+ 0.36034736037254333,
468
+ 0.36091148853302,
469
+ 0.35899659991264343,
470
+ 0.356486439704895,
471
+ 0.3690042495727539,
472
+ 0.3797115981578827,
473
+ 0.3873102366924286,
474
+ 0.39992478489875793,
475
+ 0.41160136461257935,
476
+ 0.42042696475982666,
477
+ 0.42867836356163025,
478
+ 0.43672993779182434,
479
+ 0.4450317621231079,
480
+ 0.45142629742622375,
481
+ 0.451881468296051,
482
+ 0.45120909810066223,
483
+ 0.4502365291118622,
484
+ 0.44785618782043457,
485
+ 0.4534400999546051,
486
+ 0.45658400654792786,
487
+ 0.45758968591690063,
488
+ 0.45804986357688904,
489
+ 0.4550400674343109,
490
+ 0.451234370470047,
491
+ 0.4503278136253357,
492
+ 0.45379236340522766,
493
+ 0.4566922187805176,
494
+ 0.4583297073841095,
495
+ 0.46125417947769165,
496
+ 0.46176251769065857,
497
+ 0.4625169634819031,
498
+ 0.46337494254112244,
499
+ 0.46481263637542725,
500
+ 0.4634028375148773,
501
+ 0.4632355272769928,
502
+ 0.46351659297943115,
503
+ 0.46328744292259216,
504
+ 0.4644387364387512,
505
+ 0.46469375491142273,
506
+ 0.4648001492023468
507
+ ],
508
+ "lr": [
509
+ 8.376068376068378e-06,
510
+ 1.6923076923076924e-05,
511
+ 2e-05,
512
+ 2e-05,
513
+ 2e-05,
514
+ 2e-05,
515
+ 2e-05,
516
+ 2e-05,
517
+ 2e-05,
518
+ 2e-05,
519
+ 2e-05,
520
+ 2e-05,
521
+ 2e-05,
522
+ 2e-05,
523
+ 2e-05,
524
+ 2e-05,
525
+ 2e-05,
526
+ 2e-05,
527
+ 2e-05,
528
+ 2e-05,
529
+ 2e-05,
530
+ 2e-05,
531
+ 2e-05,
532
+ 2e-05,
533
+ 2e-05,
534
+ 2e-05,
535
+ 2e-05,
536
+ 2e-05,
537
+ 2e-05,
538
+ 2e-05,
539
+ 2e-05,
540
+ 2e-05,
541
+ 2e-05,
542
+ 2e-05,
543
+ 2e-05,
544
+ 2e-05,
545
+ 2e-05,
546
+ 2e-05,
547
+ 2e-05,
548
+ 2e-05,
549
+ 2e-05,
550
+ 2e-05,
551
+ 1.9973899288162407e-05,
552
+ 1.9380701291853413e-05,
553
+ 1.8787503295544426e-05,
554
+ 1.8194305299235432e-05,
555
+ 1.7601107302926442e-05,
556
+ 1.7007909306617455e-05,
557
+ 1.6414711310308464e-05,
558
+ 1.5335090957026102e-05,
559
+ 1.474189296071711e-05,
560
+ 1.4148694964408121e-05,
561
+ 1.355549696809913e-05,
562
+ 1.2962298971790142e-05,
563
+ 1.2369100975481152e-05,
564
+ 1.1775902979172158e-05,
565
+ 1.0696282625889797e-05,
566
+ 1.0103084629580805e-05,
567
+ 9.50988663327182e-06,
568
+ 8.91668863696283e-06,
569
+ 8.323490640653837e-06,
570
+ 7.730292644344845e-06,
571
+ 7.137094648035855e-06,
572
+ 6.057474294753492e-06,
573
+ 5.4642762984445065e-06,
574
+ 4.871078302135514e-06,
575
+ 4.277880305826523e-06,
576
+ 3.684682309517532e-06,
577
+ 3.091484313208541e-06,
578
+ 2.4982863168995496e-06
579
+ ],
580
+ "emb_lr": [],
581
+ "eval_step": [
582
+ 350,
583
+ 741,
584
+ 1132,
585
+ 1523,
586
+ 1914,
587
+ 2305,
588
+ 2696,
589
+ 3087,
590
+ 3478,
591
+ 3869
592
+ ],
593
+ "eval_accuracy": [
594
+ 0.01,
595
+ 0.0,
596
+ 0.0,
597
+ 0.01,
598
+ 0.0,
599
+ 0.0,
600
+ 0.01,
601
+ 0.0,
602
+ 0.0,
603
+ 0.0
604
+ ]
605
+ },
606
+ "final_accuracy": 0.06291666666666666,
607
+ "sft_eval": {
608
+ "config": {
609
+ "ops": "add_sub",
610
+ "K": null,
611
+ "mode": "sft",
612
+ "n_digits": 6,
613
+ "n_per_split": 100
614
+ },
615
+ "splits": {
616
+ "add_S0": {
617
+ "full_accuracy": 0.01,
618
+ "n_examples": 100,
619
+ "per_subtask": {
620
+ "SA": {
621
+ "accuracy": 0.5652892561983471,
622
+ "count": 605
623
+ },
624
+ "SS": {
625
+ "accuracy": 0.7578947368421053,
626
+ "count": 95
627
+ }
628
+ }
629
+ },
630
+ "add_S1": {
631
+ "full_accuracy": 0.0,
632
+ "n_examples": 100,
633
+ "per_subtask": {
634
+ "SA": {
635
+ "accuracy": 0.5735294117647058,
636
+ "count": 204
637
+ },
638
+ "SC": {
639
+ "accuracy": 0.41420118343195267,
640
+ "count": 169
641
+ },
642
+ "SS": {
643
+ "accuracy": 0.5483870967741935,
644
+ "count": 31
645
+ },
646
+ "UC": {
647
+ "accuracy": 0.38175675675675674,
648
+ "count": 296
649
+ }
650
+ }
651
+ },
652
+ "add_S2": {
653
+ "full_accuracy": 0.02,
654
+ "n_examples": 100,
655
+ "per_subtask": {
656
+ "SA": {
657
+ "accuracy": 0.5828220858895705,
658
+ "count": 163
659
+ },
660
+ "SC": {
661
+ "accuracy": 0.38461538461538464,
662
+ "count": 130
663
+ },
664
+ "SS": {
665
+ "accuracy": 0.7586206896551724,
666
+ "count": 87
667
+ },
668
+ "UC": {
669
+ "accuracy": 0.4433497536945813,
670
+ "count": 203
671
+ },
672
+ "US": {
673
+ "accuracy": 0.46153846153846156,
674
+ "count": 117
675
+ }
676
+ }
677
+ },
678
+ "add_S3": {
679
+ "full_accuracy": 0.0,
680
+ "n_examples": 100,
681
+ "per_subtask": {
682
+ "SA": {
683
+ "accuracy": 0.6694214876033058,
684
+ "count": 121
685
+ },
686
+ "SC": {
687
+ "accuracy": 0.371900826446281,
688
+ "count": 121
689
+ },
690
+ "SS": {
691
+ "accuracy": 0.8775510204081632,
692
+ "count": 49
693
+ },
694
+ "UC": {
695
+ "accuracy": 0.3333333333333333,
696
+ "count": 186
697
+ },
698
+ "US": {
699
+ "accuracy": 0.23766816143497757,
700
+ "count": 223
701
+ }
702
+ }
703
+ },
704
+ "add_S4": {
705
+ "full_accuracy": 0.05,
706
+ "n_examples": 100,
707
+ "per_subtask": {
708
+ "SA": {
709
+ "accuracy": 0.7019230769230769,
710
+ "count": 104
711
+ },
712
+ "SC": {
713
+ "accuracy": 0.3584905660377358,
714
+ "count": 106
715
+ },
716
+ "SS": {
717
+ "accuracy": 0.8695652173913043,
718
+ "count": 23
719
+ },
720
+ "UC": {
721
+ "accuracy": 0.39375,
722
+ "count": 160
723
+ },
724
+ "US": {
725
+ "accuracy": 0.24104234527687296,
726
+ "count": 307
727
+ }
728
+ }
729
+ },
730
+ "add_S5": {
731
+ "full_accuracy": 0.04,
732
+ "n_examples": 100,
733
+ "per_subtask": {
734
+ "SA": {
735
+ "accuracy": 0.65,
736
+ "count": 100
737
+ },
738
+ "SC": {
739
+ "accuracy": 0.28,
740
+ "count": 100
741
+ },
742
+ "UC": {
743
+ "accuracy": 0.29,
744
+ "count": 100
745
+ },
746
+ "US": {
747
+ "accuracy": 0.22,
748
+ "count": 400
749
+ }
750
+ }
751
+ },
752
+ "add_S6": {
753
+ "full_accuracy": 0.09,
754
+ "n_examples": 100,
755
+ "per_subtask": {
756
+ "SC": {
757
+ "accuracy": 0.34,
758
+ "count": 100
759
+ },
760
+ "UC": {
761
+ "accuracy": 0.43,
762
+ "count": 100
763
+ },
764
+ "US": {
765
+ "accuracy": 0.32,
766
+ "count": 500
767
+ }
768
+ }
769
+ },
770
+ "add_random": {
771
+ "full_accuracy": 0.005,
772
+ "n_examples": 200,
773
+ "per_subtask": {
774
+ "SA": {
775
+ "accuracy": 0.5682326621923938,
776
+ "count": 447
777
+ },
778
+ "SC": {
779
+ "accuracy": 0.40625,
780
+ "count": 320
781
+ },
782
+ "SS": {
783
+ "accuracy": 0.8035714285714286,
784
+ "count": 56
785
+ },
786
+ "UC": {
787
+ "accuracy": 0.34026465028355385,
788
+ "count": 529
789
+ },
790
+ "US": {
791
+ "accuracy": 0.2916666666666667,
792
+ "count": 48
793
+ }
794
+ }
795
+ },
796
+ "add_C3": {
797
+ "full_accuracy": 0.0,
798
+ "n_examples": 100,
799
+ "per_subtask": {
800
+ "SA": {
801
+ "accuracy": 0.63,
802
+ "count": 300
803
+ },
804
+ "SC": {
805
+ "accuracy": 0.34,
806
+ "count": 100
807
+ },
808
+ "UC": {
809
+ "accuracy": 0.29533678756476683,
810
+ "count": 193
811
+ },
812
+ "US": {
813
+ "accuracy": 0.29906542056074764,
814
+ "count": 107
815
+ }
816
+ }
817
+ },
818
+ "add_C4": {
819
+ "full_accuracy": 0.01,
820
+ "n_examples": 100,
821
+ "per_subtask": {
822
+ "SA": {
823
+ "accuracy": 0.695,
824
+ "count": 200
825
+ },
826
+ "SC": {
827
+ "accuracy": 0.4,
828
+ "count": 100
829
+ },
830
+ "UC": {
831
+ "accuracy": 0.26171875,
832
+ "count": 256
833
+ },
834
+ "US": {
835
+ "accuracy": 0.2777777777777778,
836
+ "count": 144
837
+ }
838
+ }
839
+ },
840
+ "add_C5": {
841
+ "full_accuracy": 0.03,
842
+ "n_examples": 100,
843
+ "per_subtask": {
844
+ "SA": {
845
+ "accuracy": 0.74,
846
+ "count": 100
847
+ },
848
+ "SC": {
849
+ "accuracy": 0.49,
850
+ "count": 100
851
+ },
852
+ "UC": {
853
+ "accuracy": 0.30718954248366015,
854
+ "count": 306
855
+ },
856
+ "US": {
857
+ "accuracy": 0.4742268041237113,
858
+ "count": 194
859
+ }
860
+ }
861
+ },
862
+ "add_C6": {
863
+ "full_accuracy": 0.01,
864
+ "n_examples": 100,
865
+ "per_subtask": {
866
+ "SC": {
867
+ "accuracy": 0.46,
868
+ "count": 100
869
+ },
870
+ "UC": {
871
+ "accuracy": 0.37158469945355194,
872
+ "count": 366
873
+ },
874
+ "US": {
875
+ "accuracy": 0.47435897435897434,
876
+ "count": 234
877
+ }
878
+ }
879
+ },
880
+ "sub_M0": {
881
+ "full_accuracy": 0.0,
882
+ "n_examples": 100,
883
+ "per_subtask": {
884
+ "MD": {
885
+ "accuracy": 0.3876871880199667,
886
+ "count": 601
887
+ },
888
+ "ME": {
889
+ "accuracy": 0.7878787878787878,
890
+ "count": 99
891
+ }
892
+ }
893
+ },
894
+ "sub_M1": {
895
+ "full_accuracy": 0.0,
896
+ "n_examples": 100,
897
+ "per_subtask": {
898
+ "MD": {
899
+ "accuracy": 0.5376344086021505,
900
+ "count": 279
901
+ },
902
+ "MB": {
903
+ "accuracy": 0.11724137931034483,
904
+ "count": 145
905
+ },
906
+ "ME": {
907
+ "accuracy": 0.7916666666666666,
908
+ "count": 24
909
+ },
910
+ "UB": {
911
+ "accuracy": 0.14682539682539683,
912
+ "count": 252
913
+ }
914
+ }
915
+ },
916
+ "sub_M2": {
917
+ "full_accuracy": 0.0,
918
+ "n_examples": 100,
919
+ "per_subtask": {
920
+ "MD": {
921
+ "accuracy": 0.7417840375586855,
922
+ "count": 213
923
+ },
924
+ "MB": {
925
+ "accuracy": 0.061946902654867256,
926
+ "count": 113
927
+ },
928
+ "ME": {
929
+ "accuracy": 0.8235294117647058,
930
+ "count": 85
931
+ },
932
+ "UB": {
933
+ "accuracy": 0.143646408839779,
934
+ "count": 181
935
+ },
936
+ "UD": {
937
+ "accuracy": 0.07407407407407407,
938
+ "count": 108
939
+ }
940
+ }
941
+ },
942
+ "sub_M3": {
943
+ "full_accuracy": 0.0,
944
+ "n_examples": 100,
945
+ "per_subtask": {
946
+ "MD": {
947
+ "accuracy": 0.8044692737430168,
948
+ "count": 179
949
+ },
950
+ "MB": {
951
+ "accuracy": 0.02912621359223301,
952
+ "count": 103
953
+ },
954
+ "ME": {
955
+ "accuracy": 0.9107142857142857,
956
+ "count": 56
957
+ },
958
+ "UB": {
959
+ "accuracy": 0.2214765100671141,
960
+ "count": 149
961
+ },
962
+ "UD": {
963
+ "accuracy": 0.1267605633802817,
964
+ "count": 213
965
+ }
966
+ }
967
+ },
968
+ "sub_M4": {
969
+ "full_accuracy": 0.0,
970
+ "n_examples": 100,
971
+ "per_subtask": {
972
+ "MD": {
973
+ "accuracy": 0.615,
974
+ "count": 200
975
+ },
976
+ "MB": {
977
+ "accuracy": 0.11,
978
+ "count": 100
979
+ },
980
+ "UB": {
981
+ "accuracy": 0.36,
982
+ "count": 100
983
+ },
984
+ "UD": {
985
+ "accuracy": 0.14,
986
+ "count": 300
987
+ }
988
+ }
989
+ },
990
+ "sub_M5": {
991
+ "full_accuracy": 0.01,
992
+ "n_examples": 100,
993
+ "per_subtask": {
994
+ "MD": {
995
+ "accuracy": 1.0,
996
+ "count": 100
997
+ },
998
+ "MB": {
999
+ "accuracy": 0.08,
1000
+ "count": 100
1001
+ },
1002
+ "UB": {
1003
+ "accuracy": 0.43,
1004
+ "count": 100
1005
+ },
1006
+ "UD": {
1007
+ "accuracy": 0.045,
1008
+ "count": 400
1009
+ }
1010
+ }
1011
+ },
1012
+ "sub_random": {
1013
+ "full_accuracy": 0.0,
1014
+ "n_examples": 200,
1015
+ "per_subtask": {
1016
+ "MD": {
1017
+ "accuracy": 0.54,
1018
+ "count": 600
1019
+ },
1020
+ "MB": {
1021
+ "accuracy": 0.0898876404494382,
1022
+ "count": 267
1023
+ },
1024
+ "ME": {
1025
+ "accuracy": 0.8301886792452831,
1026
+ "count": 53
1027
+ },
1028
+ "UB": {
1029
+ "accuracy": 0.14578587699316628,
1030
+ "count": 439
1031
+ },
1032
+ "UD": {
1033
+ "accuracy": 0.0975609756097561,
1034
+ "count": 41
1035
+ }
1036
+ }
1037
+ },
1038
+ "sub_B3": {
1039
+ "full_accuracy": 0.0,
1040
+ "n_examples": 100,
1041
+ "per_subtask": {
1042
+ "MD": {
1043
+ "accuracy": 0.44666666666666666,
1044
+ "count": 300
1045
+ },
1046
+ "MB": {
1047
+ "accuracy": 0.01,
1048
+ "count": 100
1049
+ },
1050
+ "UB": {
1051
+ "accuracy": 0.18781725888324874,
1052
+ "count": 197
1053
+ },
1054
+ "UD": {
1055
+ "accuracy": 0.0970873786407767,
1056
+ "count": 103
1057
+ }
1058
+ }
1059
+ },
1060
+ "sub_B4": {
1061
+ "full_accuracy": 0.0,
1062
+ "n_examples": 100,
1063
+ "per_subtask": {
1064
+ "MD": {
1065
+ "accuracy": 0.65,
1066
+ "count": 200
1067
+ },
1068
+ "MB": {
1069
+ "accuracy": 0.02,
1070
+ "count": 100
1071
+ },
1072
+ "UB": {
1073
+ "accuracy": 0.1659919028340081,
1074
+ "count": 247
1075
+ },
1076
+ "UD": {
1077
+ "accuracy": 0.08496732026143791,
1078
+ "count": 153
1079
+ }
1080
+ }
1081
+ },
1082
+ "sub_B5": {
1083
+ "full_accuracy": 0.0,
1084
+ "n_examples": 100,
1085
+ "per_subtask": {
1086
+ "MD": {
1087
+ "accuracy": 1.0,
1088
+ "count": 100
1089
+ },
1090
+ "MB": {
1091
+ "accuracy": 0.04,
1092
+ "count": 100
1093
+ },
1094
+ "UB": {
1095
+ "accuracy": 0.18791946308724833,
1096
+ "count": 298
1097
+ },
1098
+ "UD": {
1099
+ "accuracy": 0.06930693069306931,
1100
+ "count": 202
1101
+ }
1102
+ }
1103
+ }
1104
+ },
1105
+ "summary": {
1106
+ "overall_accuracy": 0.011666666666666667,
1107
+ "total_examples": 2400,
1108
+ "n_splits": 22
1109
+ }
1110
+ },
1111
+ "sorl_eval": {
1112
+ "config": {
1113
+ "ops": "add_sub",
1114
+ "K": 1,
1115
+ "mode": "sorl",
1116
+ "n_digits": 6,
1117
+ "n_per_split": 100
1118
+ },
1119
+ "splits": {
1120
+ "add_S0": {
1121
+ "full_accuracy": 0.65,
1122
+ "n_examples": 100,
1123
+ "per_subtask": {
1124
+ "SA": {
1125
+ "accuracy": 0.9388429752066115,
1126
+ "count": 605
1127
+ },
1128
+ "SS": {
1129
+ "accuracy": 0.8736842105263158,
1130
+ "count": 95
1131
+ }
1132
+ }
1133
+ },
1134
+ "add_S1": {
1135
+ "full_accuracy": 0.01,
1136
+ "n_examples": 100,
1137
+ "per_subtask": {
1138
+ "SA": {
1139
+ "accuracy": 0.9509803921568627,
1140
+ "count": 204
1141
+ },
1142
+ "SC": {
1143
+ "accuracy": 0.8520710059171598,
1144
+ "count": 169
1145
+ },
1146
+ "SS": {
1147
+ "accuracy": 0.9032258064516129,
1148
+ "count": 31
1149
+ },
1150
+ "UC": {
1151
+ "accuracy": 0.24662162162162163,
1152
+ "count": 296
1153
+ }
1154
+ }
1155
+ },
1156
+ "add_S2": {
1157
+ "full_accuracy": 0.02,
1158
+ "n_examples": 100,
1159
+ "per_subtask": {
1160
+ "SA": {
1161
+ "accuracy": 0.9079754601226994,
1162
+ "count": 163
1163
+ },
1164
+ "SC": {
1165
+ "accuracy": 0.8538461538461538,
1166
+ "count": 130
1167
+ },
1168
+ "SS": {
1169
+ "accuracy": 0.896551724137931,
1170
+ "count": 87
1171
+ },
1172
+ "UC": {
1173
+ "accuracy": 0.22167487684729065,
1174
+ "count": 203
1175
+ },
1176
+ "US": {
1177
+ "accuracy": 0.21367521367521367,
1178
+ "count": 117
1179
+ }
1180
+ }
1181
+ },
1182
+ "add_S3": {
1183
+ "full_accuracy": 0.0,
1184
+ "n_examples": 100,
1185
+ "per_subtask": {
1186
+ "SA": {
1187
+ "accuracy": 0.9256198347107438,
1188
+ "count": 121
1189
+ },
1190
+ "SC": {
1191
+ "accuracy": 0.8264462809917356,
1192
+ "count": 121
1193
+ },
1194
+ "SS": {
1195
+ "accuracy": 0.8979591836734694,
1196
+ "count": 49
1197
+ },
1198
+ "UC": {
1199
+ "accuracy": 0.3172043010752688,
1200
+ "count": 186
1201
+ },
1202
+ "US": {
1203
+ "accuracy": 0.08520179372197309,
1204
+ "count": 223
1205
+ }
1206
+ }
1207
+ },
1208
+ "add_S4": {
1209
+ "full_accuracy": 0.0,
1210
+ "n_examples": 100,
1211
+ "per_subtask": {
1212
+ "SA": {
1213
+ "accuracy": 0.9711538461538461,
1214
+ "count": 104
1215
+ },
1216
+ "SC": {
1217
+ "accuracy": 0.8113207547169812,
1218
+ "count": 106
1219
+ },
1220
+ "SS": {
1221
+ "accuracy": 0.9565217391304348,
1222
+ "count": 23
1223
+ },
1224
+ "UC": {
1225
+ "accuracy": 0.3375,
1226
+ "count": 160
1227
+ },
1228
+ "US": {
1229
+ "accuracy": 0.10749185667752444,
1230
+ "count": 307
1231
+ }
1232
+ }
1233
+ },
1234
+ "add_S5": {
1235
+ "full_accuracy": 0.0,
1236
+ "n_examples": 100,
1237
+ "per_subtask": {
1238
+ "SA": {
1239
+ "accuracy": 0.94,
1240
+ "count": 100
1241
+ },
1242
+ "SC": {
1243
+ "accuracy": 0.63,
1244
+ "count": 100
1245
+ },
1246
+ "UC": {
1247
+ "accuracy": 0.05,
1248
+ "count": 100
1249
+ },
1250
+ "US": {
1251
+ "accuracy": 0.0825,
1252
+ "count": 400
1253
+ }
1254
+ }
1255
+ },
1256
+ "add_S6": {
1257
+ "full_accuracy": 0.0,
1258
+ "n_examples": 100,
1259
+ "per_subtask": {
1260
+ "SC": {
1261
+ "accuracy": 0.54,
1262
+ "count": 100
1263
+ },
1264
+ "UC": {
1265
+ "accuracy": 0.21,
1266
+ "count": 100
1267
+ },
1268
+ "US": {
1269
+ "accuracy": 0.046,
1270
+ "count": 500
1271
+ }
1272
+ }
1273
+ },
1274
+ "add_random": {
1275
+ "full_accuracy": 0.035,
1276
+ "n_examples": 200,
1277
+ "per_subtask": {
1278
+ "SA": {
1279
+ "accuracy": 0.9194630872483222,
1280
+ "count": 447
1281
+ },
1282
+ "SC": {
1283
+ "accuracy": 0.8625,
1284
+ "count": 320
1285
+ },
1286
+ "SS": {
1287
+ "accuracy": 0.8571428571428571,
1288
+ "count": 56
1289
+ },
1290
+ "UC": {
1291
+ "accuracy": 0.2608695652173913,
1292
+ "count": 529
1293
+ },
1294
+ "US": {
1295
+ "accuracy": 0.1875,
1296
+ "count": 48
1297
+ }
1298
+ }
1299
+ },
1300
+ "add_C3": {
1301
+ "full_accuracy": 0.0,
1302
+ "n_examples": 100,
1303
+ "per_subtask": {
1304
+ "SA": {
1305
+ "accuracy": 0.95,
1306
+ "count": 300
1307
+ },
1308
+ "SC": {
1309
+ "accuracy": 0.81,
1310
+ "count": 100
1311
+ },
1312
+ "UC": {
1313
+ "accuracy": 0.20207253886010362,
1314
+ "count": 193
1315
+ },
1316
+ "US": {
1317
+ "accuracy": 0.14953271028037382,
1318
+ "count": 107
1319
+ }
1320
+ }
1321
+ },
1322
+ "add_C4": {
1323
+ "full_accuracy": 0.0,
1324
+ "n_examples": 100,
1325
+ "per_subtask": {
1326
+ "SA": {
1327
+ "accuracy": 0.935,
1328
+ "count": 200
1329
+ },
1330
+ "SC": {
1331
+ "accuracy": 0.76,
1332
+ "count": 100
1333
+ },
1334
+ "UC": {
1335
+ "accuracy": 0.19140625,
1336
+ "count": 256
1337
+ },
1338
+ "US": {
1339
+ "accuracy": 0.19444444444444445,
1340
+ "count": 144
1341
+ }
1342
+ }
1343
+ },
1344
+ "add_C5": {
1345
+ "full_accuracy": 0.0,
1346
+ "n_examples": 100,
1347
+ "per_subtask": {
1348
+ "SA": {
1349
+ "accuracy": 0.94,
1350
+ "count": 100
1351
+ },
1352
+ "SC": {
1353
+ "accuracy": 0.81,
1354
+ "count": 100
1355
+ },
1356
+ "UC": {
1357
+ "accuracy": 0.19607843137254902,
1358
+ "count": 306
1359
+ },
1360
+ "US": {
1361
+ "accuracy": 0.10309278350515463,
1362
+ "count": 194
1363
+ }
1364
+ }
1365
+ },
1366
+ "add_C6": {
1367
+ "full_accuracy": 0.0,
1368
+ "n_examples": 100,
1369
+ "per_subtask": {
1370
+ "SC": {
1371
+ "accuracy": 0.62,
1372
+ "count": 100
1373
+ },
1374
+ "UC": {
1375
+ "accuracy": 0.31693989071038253,
1376
+ "count": 366
1377
+ },
1378
+ "US": {
1379
+ "accuracy": 0.14957264957264957,
1380
+ "count": 234
1381
+ }
1382
+ }
1383
+ },
1384
+ "sub_M0": {
1385
+ "full_accuracy": 0.66,
1386
+ "n_examples": 100,
1387
+ "per_subtask": {
1388
+ "MD": {
1389
+ "accuracy": 0.9384359400998337,
1390
+ "count": 601
1391
+ },
1392
+ "ME": {
1393
+ "accuracy": 0.9696969696969697,
1394
+ "count": 99
1395
+ }
1396
+ }
1397
+ },
1398
+ "sub_M1": {
1399
+ "full_accuracy": 0.01,
1400
+ "n_examples": 100,
1401
+ "per_subtask": {
1402
+ "MD": {
1403
+ "accuracy": 0.9283154121863799,
1404
+ "count": 279
1405
+ },
1406
+ "MB": {
1407
+ "accuracy": 0.8,
1408
+ "count": 145
1409
+ },
1410
+ "ME": {
1411
+ "accuracy": 1.0,
1412
+ "count": 24
1413
+ },
1414
+ "UB": {
1415
+ "accuracy": 0.12301587301587301,
1416
+ "count": 252
1417
+ }
1418
+ }
1419
+ },
1420
+ "sub_M2": {
1421
+ "full_accuracy": 0.0,
1422
+ "n_examples": 100,
1423
+ "per_subtask": {
1424
+ "MD": {
1425
+ "accuracy": 0.9765258215962441,
1426
+ "count": 213
1427
+ },
1428
+ "MB": {
1429
+ "accuracy": 0.7699115044247787,
1430
+ "count": 113
1431
+ },
1432
+ "ME": {
1433
+ "accuracy": 0.9882352941176471,
1434
+ "count": 85
1435
+ },
1436
+ "UB": {
1437
+ "accuracy": 0.15469613259668508,
1438
+ "count": 181
1439
+ },
1440
+ "UD": {
1441
+ "accuracy": 0.027777777777777776,
1442
+ "count": 108
1443
+ }
1444
+ }
1445
+ },
1446
+ "sub_M3": {
1447
+ "full_accuracy": 0.0,
1448
+ "n_examples": 100,
1449
+ "per_subtask": {
1450
+ "MD": {
1451
+ "accuracy": 0.9832402234636871,
1452
+ "count": 179
1453
+ },
1454
+ "MB": {
1455
+ "accuracy": 0.6796116504854369,
1456
+ "count": 103
1457
+ },
1458
+ "ME": {
1459
+ "accuracy": 0.9642857142857143,
1460
+ "count": 56
1461
+ },
1462
+ "UB": {
1463
+ "accuracy": 0.16778523489932887,
1464
+ "count": 149
1465
+ },
1466
+ "UD": {
1467
+ "accuracy": 0.06572769953051644,
1468
+ "count": 213
1469
+ }
1470
+ }
1471
+ },
1472
+ "sub_M4": {
1473
+ "full_accuracy": 0.0,
1474
+ "n_examples": 100,
1475
+ "per_subtask": {
1476
+ "MD": {
1477
+ "accuracy": 0.95,
1478
+ "count": 200
1479
+ },
1480
+ "MB": {
1481
+ "accuracy": 0.68,
1482
+ "count": 100
1483
+ },
1484
+ "UB": {
1485
+ "accuracy": 0.16,
1486
+ "count": 100
1487
+ },
1488
+ "UD": {
1489
+ "accuracy": 0.05333333333333334,
1490
+ "count": 300
1491
+ }
1492
+ }
1493
+ },
1494
+ "sub_M5": {
1495
+ "full_accuracy": 0.0,
1496
+ "n_examples": 100,
1497
+ "per_subtask": {
1498
+ "MD": {
1499
+ "accuracy": 1.0,
1500
+ "count": 100
1501
+ },
1502
+ "MB": {
1503
+ "accuracy": 0.45,
1504
+ "count": 100
1505
+ },
1506
+ "UB": {
1507
+ "accuracy": 0.2,
1508
+ "count": 100
1509
+ },
1510
+ "UD": {
1511
+ "accuracy": 0.045,
1512
+ "count": 400
1513
+ }
1514
+ }
1515
+ },
1516
+ "sub_random": {
1517
+ "full_accuracy": 0.045,
1518
+ "n_examples": 200,
1519
+ "per_subtask": {
1520
+ "MD": {
1521
+ "accuracy": 0.9166666666666666,
1522
+ "count": 600
1523
+ },
1524
+ "MB": {
1525
+ "accuracy": 0.8389513108614233,
1526
+ "count": 267
1527
+ },
1528
+ "ME": {
1529
+ "accuracy": 0.9811320754716981,
1530
+ "count": 53
1531
+ },
1532
+ "UB": {
1533
+ "accuracy": 0.10022779043280182,
1534
+ "count": 439
1535
+ },
1536
+ "UD": {
1537
+ "accuracy": 0.0,
1538
+ "count": 41
1539
+ }
1540
+ }
1541
+ },
1542
+ "sub_B3": {
1543
+ "full_accuracy": 0.0,
1544
+ "n_examples": 100,
1545
+ "per_subtask": {
1546
+ "MD": {
1547
+ "accuracy": 0.9333333333333333,
1548
+ "count": 300
1549
+ },
1550
+ "MB": {
1551
+ "accuracy": 0.84,
1552
+ "count": 100
1553
+ },
1554
+ "UB": {
1555
+ "accuracy": 0.15736040609137056,
1556
+ "count": 197
1557
+ },
1558
+ "UD": {
1559
+ "accuracy": 0.038834951456310676,
1560
+ "count": 103
1561
+ }
1562
+ }
1563
+ },
1564
+ "sub_B4": {
1565
+ "full_accuracy": 0.0,
1566
+ "n_examples": 100,
1567
+ "per_subtask": {
1568
+ "MD": {
1569
+ "accuracy": 0.96,
1570
+ "count": 200
1571
+ },
1572
+ "MB": {
1573
+ "accuracy": 0.84,
1574
+ "count": 100
1575
+ },
1576
+ "UB": {
1577
+ "accuracy": 0.13360323886639677,
1578
+ "count": 247
1579
+ },
1580
+ "UD": {
1581
+ "accuracy": 0.06535947712418301,
1582
+ "count": 153
1583
+ }
1584
+ }
1585
+ },
1586
+ "sub_B5": {
1587
+ "full_accuracy": 0.0,
1588
+ "n_examples": 100,
1589
+ "per_subtask": {
1590
+ "MD": {
1591
+ "accuracy": 1.0,
1592
+ "count": 100
1593
+ },
1594
+ "MB": {
1595
+ "accuracy": 0.64,
1596
+ "count": 100
1597
+ },
1598
+ "UB": {
1599
+ "accuracy": 0.20469798657718122,
1600
+ "count": 298
1601
+ },
1602
+ "UD": {
1603
+ "accuracy": 0.06930693069306931,
1604
+ "count": 202
1605
+ }
1606
+ }
1607
+ }
1608
+ },
1609
+ "summary": {
1610
+ "overall_accuracy": 0.06291666666666666,
1611
+ "total_examples": 2400,
1612
+ "n_splits": 22
1613
+ }
1614
+ },
1615
+ "sorl_overall_accuracy": 0.06291666666666666,
1616
+ "sft_overall_accuracy": 0.011666666666666667
1617
+ }
add_sub_sorl_v1_abs10_K1_25K_1L2H256d/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a10bb1f5b2eb181b8afa8885bc9617b44e94d4a7398d421530a00ee21fe002b3
3
+ size 315091124
add_sub_sorl_v1_abs10_K1_25K_1L2H256d/train_config.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_rollouts": 4,
3
+ "K": 1,
4
+ "max_iterations": 2,
5
+ "memory_span_abs": 1792,
6
+ "memory_span_traj": 1792,
7
+ "temperature": 1.0,
8
+ "ar_search": false,
9
+ "response_only_abs": false,
10
+ "alpha_info_gain": 10.0,
11
+ "alpha_abs": 0.1,
12
+ "alpha_soft_zipf": 1.0,
13
+ "alpha_ortho": 0.0,
14
+ "alpha_anchor": 0.0,
15
+ "alpha_jacobi": 0.0,
16
+ "decay": 0.8,
17
+ "target_vocab_util": 0.8,
18
+ "min_abs_ppl": 0.0,
19
+ "zipf_alpha": 1.0,
20
+ "lr": 2e-05,
21
+ "emb_lr_mult": 1.0,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 117,
24
+ "cooldown_frac": 0.4,
25
+ "max_grad_norm": 1.0,
26
+ "vq_abs_pretrain_steps": 0,
27
+ "vq_abs_pretrain_lr": 0.001,
28
+ "vq_abs_pretrain_layer": -1,
29
+ "vq_abs_pretrain_batch_size": 256,
30
+ "vq_abs_pretrain_target_vectors": 20000,
31
+ "batch_size": 64,
32
+ "gradient_accumulation_steps": 1,
33
+ "num_epochs": 10,
34
+ "emb_warmup_steps": 0,
35
+ "log_every": 50,
36
+ "eval_every": 390,
37
+ "save_every": 999999,
38
+ "eval_samples": 100,
39
+ "output_dir": "ckpt/sweep/as_sorl_abs10_K1_25K_1L2H256d",
40
+ "eval_K": 4,
41
+ "alpha_traj": 0.0,
42
+ "corrupt_method": "shuffle",
43
+ "corrupt_ratio": 0.3,
44
+ "alpha_contrastive": 1.0,
45
+ "gamma_contrastive": 0.5,
46
+ "alpha_masked_traj": 0.0,
47
+ "mask_nl_ratio": 0.3,
48
+ "mask_nl_mode": "fixed",
49
+ "mask_nl_fixed_id": 0,
50
+ "use_ste": true,
51
+ "n_inner": 1,
52
+ "random_K": null,
53
+ "strip_suffix": null,
54
+ "compress_prefix": null,
55
+ "random_mem_span": null,
56
+ "warmup_ratio": 0.03,
57
+ "beta2": 0.999,
58
+ "seed": 42,
59
+ "n_digits": 6,
60
+ "n_layer": 1,
61
+ "n_head": 2,
62
+ "n_embd": 256,
63
+ "ops": "add_sub",
64
+ "abs_vocab": 10,
65
+ "dataset_size": 25000,
66
+ "mode": "sorl",
67
+ "device": "cuda",
68
+ "push_to_hub": true,
69
+ "no_wandb": false,
70
+ "n_params": 78696448,
71
+ "run_name": "add_sub_sorl_v1_abs10_K1_25K_1L2H256d",
72
+ "git_commit": "3ae8ca0d6b88706715f25991b1e1acd0e3a6e0a6",
73
+ "timestamp": "2026-04-12T20:10:10.507089+00:00",
74
+ "tokenizer": "Qwen/Qwen3-0.6B",
75
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
+ "dataset_config": "add_sub_6digit",
77
+ "model_repo": "thoughtworks/arithmetic-sorl",
78
+ "trainer_version": "v1",
79
+ "wandb_run_id": "8olk3u7a",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/8olk3u7a",
81
+ "final_accuracy": 0.06291666666666666,
82
+ "sft_accuracy": 0.011666666666666667,
83
+ "eval_method": "ArithmeticEvaluator"
84
+ }