amirali1985 commited on
Commit
af46ec0
·
verified ·
1 Parent(s): b3215ae

Upload add_sub_sorl_v1_abs10_K1_25K_2L1H128d

Browse files
add_sub_sorl_v1_abs10_K1_25K_2L1H128d/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 128,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 512,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention"
18
+ ],
19
+ "max_position_embeddings": 128,
20
+ "max_window_layers": 28,
21
+ "model_type": "qwen3",
22
+ "num_attention_heads": 1,
23
+ "num_hidden_layers": 2,
24
+ "num_key_value_heads": 1,
25
+ "pad_token_id": null,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_parameters": {
28
+ "rope_theta": 10000.0,
29
+ "rope_type": "default"
30
+ },
31
+ "sliding_window": null,
32
+ "tie_word_embeddings": false,
33
+ "transformers_version": "5.5.0",
34
+ "use_cache": true,
35
+ "use_sliding_window": false,
36
+ "vocab_size": 151654
37
+ }
add_sub_sorl_v1_abs10_K1_25K_2L1H128d/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_sorl_v1_abs10_K1_25K_2L1H128d/metrics.json ADDED
@@ -0,0 +1,1617 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 441,
12
+ 491,
13
+ 541,
14
+ 591,
15
+ 641,
16
+ 691,
17
+ 741,
18
+ 832,
19
+ 882,
20
+ 932,
21
+ 982,
22
+ 1032,
23
+ 1082,
24
+ 1132,
25
+ 1223,
26
+ 1273,
27
+ 1323,
28
+ 1373,
29
+ 1423,
30
+ 1473,
31
+ 1523,
32
+ 1614,
33
+ 1664,
34
+ 1714,
35
+ 1764,
36
+ 1814,
37
+ 1864,
38
+ 1914,
39
+ 2005,
40
+ 2055,
41
+ 2105,
42
+ 2155,
43
+ 2205,
44
+ 2255,
45
+ 2305,
46
+ 2396,
47
+ 2446,
48
+ 2496,
49
+ 2546,
50
+ 2596,
51
+ 2646,
52
+ 2696,
53
+ 2787,
54
+ 2837,
55
+ 2887,
56
+ 2937,
57
+ 2987,
58
+ 3037,
59
+ 3087,
60
+ 3178,
61
+ 3228,
62
+ 3278,
63
+ 3328,
64
+ 3378,
65
+ 3428,
66
+ 3478,
67
+ 3569,
68
+ 3619,
69
+ 3669,
70
+ 3719,
71
+ 3769,
72
+ 3819,
73
+ 3869
74
+ ],
75
+ "loss": [
76
+ 13.454885482788086,
77
+ 11.478792190551758,
78
+ 13.050385475158691,
79
+ 14.577110290527344,
80
+ 14.17066764831543,
81
+ 13.036783218383789,
82
+ 11.886457443237305,
83
+ 10.250679016113281,
84
+ 9.566696166992188,
85
+ 8.962414741516113,
86
+ 8.374322891235352,
87
+ 7.972871780395508,
88
+ 7.557723045349121,
89
+ 7.117671012878418,
90
+ 6.462696075439453,
91
+ 6.050843238830566,
92
+ 5.741342067718506,
93
+ 5.391325950622559,
94
+ 5.264486789703369,
95
+ 4.956804275512695,
96
+ 4.797525405883789,
97
+ 4.500096321105957,
98
+ 4.433718681335449,
99
+ 4.317060470581055,
100
+ 4.279264450073242,
101
+ 4.234098434448242,
102
+ 4.042954921722412,
103
+ 4.035151481628418,
104
+ 4.045926094055176,
105
+ 3.9358861446380615,
106
+ 3.753159761428833,
107
+ 3.7763824462890625,
108
+ 3.8419253826141357,
109
+ 3.6180100440979004,
110
+ 3.3646740913391113,
111
+ 3.16855525970459,
112
+ 3.360339641571045,
113
+ 2.8317813873291016,
114
+ 2.9290781021118164,
115
+ 2.785353660583496,
116
+ 2.54134464263916,
117
+ 2.228949785232544,
118
+ 2.0700716972351074,
119
+ 1.8415426015853882,
120
+ 1.5988508462905884,
121
+ 1.569923758506775,
122
+ 1.2837797403335571,
123
+ 1.1172637939453125,
124
+ 0.7710850834846497,
125
+ 0.6829363107681274,
126
+ 0.49595534801483154,
127
+ 0.17024922370910645,
128
+ 0.21459567546844482,
129
+ -0.10922026634216309,
130
+ -0.38342714309692383,
131
+ -0.10504281520843506,
132
+ -0.5501455068588257,
133
+ -0.7160437107086182,
134
+ -0.707771897315979,
135
+ -0.9133639335632324,
136
+ -1.1110361814498901,
137
+ -0.8491053581237793,
138
+ -0.9960892200469971,
139
+ -1.0281953811645508,
140
+ -1.2926632165908813,
141
+ -1.2633980512619019,
142
+ -1.2680187225341797,
143
+ -1.4050925970077515,
144
+ -1.2741034030914307,
145
+ -1.4242706298828125
146
+ ],
147
+ "base_loss": [
148
+ 11.880859375,
149
+ 11.633416175842285,
150
+ 11.221420288085938,
151
+ 10.539042472839355,
152
+ 10.006148338317871,
153
+ 9.441154479980469,
154
+ 8.854585647583008,
155
+ 7.859694480895996,
156
+ 7.329966068267822,
157
+ 6.851498603820801,
158
+ 6.352517127990723,
159
+ 6.018871784210205,
160
+ 5.628158092498779,
161
+ 5.2450385093688965,
162
+ 4.57428503036499,
163
+ 4.198834419250488,
164
+ 3.868682861328125,
165
+ 3.5653653144836426,
166
+ 3.412782907485962,
167
+ 3.165587902069092,
168
+ 2.9751205444335938,
169
+ 2.7248756885528564,
170
+ 2.6281933784484863,
171
+ 2.598451852798462,
172
+ 2.5072898864746094,
173
+ 2.5161983966827393,
174
+ 2.393761157989502,
175
+ 2.3599026203155518,
176
+ 2.3197877407073975,
177
+ 2.3430163860321045,
178
+ 2.2583587169647217,
179
+ 2.2844526767730713,
180
+ 2.254804849624634,
181
+ 2.290438413619995,
182
+ 2.2097489833831787,
183
+ 2.197057008743286,
184
+ 2.2262990474700928,
185
+ 2.1319544315338135,
186
+ 2.107748031616211,
187
+ 2.173170328140259,
188
+ 2.1438701152801514,
189
+ 2.181342363357544,
190
+ 2.0934159755706787,
191
+ 2.126227617263794,
192
+ 2.09466814994812,
193
+ 2.0434868335723877,
194
+ 2.1309261322021484,
195
+ 2.075498342514038,
196
+ 2.1343188285827637,
197
+ 2.0033481121063232,
198
+ 2.0886590480804443,
199
+ 2.092684507369995,
200
+ 2.020324945449829,
201
+ 1.984065055847168,
202
+ 2.0311124324798584,
203
+ 2.026329755783081,
204
+ 1.9996259212493896,
205
+ 2.0779316425323486,
206
+ 1.9736812114715576,
207
+ 2.030266284942627,
208
+ 2.1000616550445557,
209
+ 2.038893461227417,
210
+ 1.9640614986419678,
211
+ 2.036902904510498,
212
+ 2.0785582065582275,
213
+ 2.0229272842407227,
214
+ 2.0268771648406982,
215
+ 2.0155155658721924,
216
+ 1.9266163110733032,
217
+ 2.076498508453369
218
+ ],
219
+ "info_loss": [
220
+ -0.7863264083862305,
221
+ -0.9126081466674805,
222
+ -0.61785888671875,
223
+ -0.2595643997192383,
224
+ -0.1060333251953125,
225
+ -0.05567169189453125,
226
+ -0.03524017333984375,
227
+ -0.020551681518554688,
228
+ -0.01478719711303711,
229
+ -0.013951778411865234,
230
+ -0.01464700698852539,
231
+ -0.015613079071044922,
232
+ -0.013948440551757812,
233
+ -0.016277790069580078,
234
+ -0.010252952575683594,
235
+ -0.012321949005126953,
236
+ -0.008607149124145508,
237
+ -0.012037992477416992,
238
+ -0.008737802505493164,
239
+ -0.013904571533203125,
240
+ -0.010097980499267578,
241
+ -0.013910055160522461,
242
+ -0.010562896728515625,
243
+ -0.018952369689941406,
244
+ -0.013309478759765625,
245
+ -0.018421173095703125,
246
+ -0.025274276733398438,
247
+ -0.02254462242126465,
248
+ -0.01696610450744629,
249
+ -0.03021550178527832,
250
+ -0.03984832763671875,
251
+ -0.0401759147644043,
252
+ -0.030680418014526367,
253
+ -0.05649089813232422,
254
+ -0.07359719276428223,
255
+ -0.09176802635192871,
256
+ -0.07550787925720215,
257
+ -0.11891603469848633,
258
+ -0.10670852661132812,
259
+ -0.1276233196258545,
260
+ -0.14898061752319336,
261
+ -0.1840728521347046,
262
+ -0.19084644317626953,
263
+ -0.2173391580581665,
264
+ -0.2383211851119995,
265
+ -0.23623204231262207,
266
+ -0.27347636222839355,
267
+ -0.28460466861724854,
268
+ -0.32501447200775146,
269
+ -0.32063066959381104,
270
+ -0.34785377979278564,
271
+ -0.38096821308135986,
272
+ -0.3693283796310425,
273
+ -0.39820408821105957,
274
+ -0.4302417039871216,
275
+ -0.40181398391723633,
276
+ -0.4437370300292969,
277
+ -0.46821415424346924,
278
+ -0.4569128751754761,
279
+ -0.4830566644668579,
280
+ -0.5098972320556641,
281
+ -0.4775048494338989,
282
+ -0.4846118688583374,
283
+ -0.4952908754348755,
284
+ -0.5258979797363281,
285
+ -0.5176137685775757,
286
+ -0.518396258354187,
287
+ -0.5309580564498901,
288
+ -0.5087277889251709,
289
+ -0.5388277769088745
290
+ ],
291
+ "abs_loss": [
292
+ 2.3025481700897217,
293
+ 2.2900214195251465,
294
+ 2.254666566848755,
295
+ 2.190378189086914,
296
+ 2.0967636108398438,
297
+ 2.037475347518921,
298
+ 1.9807038307189941,
299
+ 1.8861985206604004,
300
+ 1.8569782972335815,
301
+ 1.824637532234192,
302
+ 1.8228012323379517,
303
+ 1.818422794342041,
304
+ 1.8206357955932617,
305
+ 1.822617530822754,
306
+ 1.804533839225769,
307
+ 1.8139501810073853,
308
+ 1.793150544166565,
309
+ 1.7828887701034546,
310
+ 1.8064988851547241,
311
+ 1.8026375770568848,
312
+ 1.797040343284607,
313
+ 1.8011388778686523,
314
+ 1.8015633821487427,
315
+ 1.8015815019607544,
316
+ 1.799573302268982,
317
+ 1.7933950424194336,
318
+ 1.805014967918396,
319
+ 1.8085817098617554,
320
+ 1.7863329648971558,
321
+ 1.7919312715530396,
322
+ 1.783199429512024,
323
+ 1.7947336435317993,
324
+ 1.8047763109207153,
325
+ 1.8011966943740845,
326
+ 1.788076400756836,
327
+ 1.7798686027526855,
328
+ 1.7843881845474243,
329
+ 1.7868894338607788,
330
+ 1.7851332426071167,
331
+ 1.7916141748428345,
332
+ 1.7841373682022095,
333
+ 1.7945789098739624,
334
+ 1.7695688009262085,
335
+ 1.8075765371322632,
336
+ 1.8012295961380005,
337
+ 1.8081436157226562,
338
+ 1.7965914011001587,
339
+ 1.7999669313430786,
340
+ 1.7932742834091187,
341
+ 1.7773023843765259,
342
+ 1.7790831327438354,
343
+ 1.7893775701522827,
344
+ 1.7931427955627441,
345
+ 1.8046531677246094,
346
+ 1.7975425720214844,
347
+ 1.7841027975082397,
348
+ 1.7952299118041992,
349
+ 1.8000589609146118,
350
+ 1.7956489324569702,
351
+ 1.7838338613510132,
352
+ 1.7932010889053345,
353
+ 1.785744547843933,
354
+ 1.773536205291748,
355
+ 1.7905513048171997,
356
+ 1.792868971824646,
357
+ 1.8099616765975952,
358
+ 1.801303505897522,
359
+ 1.800384521484375,
360
+ 1.7764925956726074,
361
+ 1.7878471612930298
362
+ ],
363
+ "zipf_loss": [
364
+ 9.207036018371582,
365
+ 8.74245548248291,
366
+ 7.782087326049805,
367
+ 6.414673805236816,
368
+ 5.0151753425598145,
369
+ 3.9485976696014404,
370
+ 3.1862025260925293,
371
+ 2.4078822135925293,
372
+ 2.198904514312744,
373
+ 2.067970037460327,
374
+ 1.9859960079193115,
375
+ 1.928288459777832,
376
+ 1.8869855403900146,
377
+ 1.853148341178894,
378
+ 1.8104872703552246,
379
+ 1.7938334941864014,
380
+ 1.7794156074523926,
381
+ 1.7680517435073853,
382
+ 1.7584320306777954,
383
+ 1.7499982118606567,
384
+ 1.743680715560913,
385
+ 1.7342076301574707,
386
+ 1.730997920036316,
387
+ 1.7279741764068604,
388
+ 1.7251121997833252,
389
+ 1.7227725982666016,
390
+ 1.7214350700378418,
391
+ 1.7198371887207031,
392
+ 1.7171660661697388,
393
+ 1.7158317565917969,
394
+ 1.7149643898010254,
395
+ 1.7142155170440674,
396
+ 1.713447093963623,
397
+ 1.7123608589172363,
398
+ 1.7120893001556396,
399
+ 1.7111916542053223,
400
+ 1.7106807231903076,
401
+ 1.7102984189987183,
402
+ 1.7099019289016724,
403
+ 1.7092549800872803,
404
+ 1.7088669538497925,
405
+ 1.7088780403137207,
406
+ 1.7081632614135742,
407
+ 1.707948923110962,
408
+ 1.7072715759277344,
409
+ 1.7079429626464844,
410
+ 1.7079581022262573,
411
+ 1.707815408706665,
412
+ 1.7075835466384888,
413
+ 1.7081646919250488,
414
+ 1.707925796508789,
415
+ 1.7083090543746948,
416
+ 1.708240270614624,
417
+ 1.7082902193069458,
418
+ 1.708122968673706,
419
+ 1.7083569765090942,
420
+ 1.7080758810043335,
421
+ 1.708160161972046,
422
+ 1.7081109285354614,
423
+ 1.7085528373718262,
424
+ 1.7085543870925903,
425
+ 1.70847487449646,
426
+ 1.7086145877838135,
427
+ 1.7087550163269043,
428
+ 1.708471417427063,
429
+ 1.708816409111023,
430
+ 1.7089366912841797,
431
+ 1.7089341878890991,
432
+ 1.708909034729004,
433
+ 1.7087242603302002
434
+ ],
435
+ "denoise_loss": [],
436
+ "ortho_loss": [
437
+ 0.6665944457054138,
438
+ 0.6689225435256958,
439
+ 0.6665652394294739,
440
+ 0.5638207197189331,
441
+ 0.5186919569969177,
442
+ 0.4937678277492523,
443
+ 0.440208375453949,
444
+ 0.44368138909339905,
445
+ 0.4490883946418762,
446
+ 0.46086809039115906,
447
+ 0.47744476795196533,
448
+ 0.4832924008369446,
449
+ 0.5013949871063232,
450
+ 0.49298223853111267,
451
+ 0.48781320452690125,
452
+ 0.4863438904285431,
453
+ 0.4936782121658325,
454
+ 0.49258145689964294,
455
+ 0.49533185362815857,
456
+ 0.507675290107727,
457
+ 0.5198795795440674,
458
+ 0.49237918853759766,
459
+ 0.5015400648117065,
460
+ 0.4975549578666687,
461
+ 0.4957054555416107,
462
+ 0.49398577213287354,
463
+ 0.507213294506073,
464
+ 0.5123801827430725,
465
+ 0.5207350254058838,
466
+ 0.5060194134712219,
467
+ 0.5115739107131958,
468
+ 0.5285096764564514,
469
+ 0.5283535122871399,
470
+ 0.524570643901825,
471
+ 0.5327265858650208,
472
+ 0.549440324306488,
473
+ 0.5494626760482788,
474
+ 0.558219313621521,
475
+ 0.5633650422096252,
476
+ 0.5702787637710571,
477
+ 0.5571073293685913,
478
+ 0.5566506385803223,
479
+ 0.5707675814628601,
480
+ 0.5843377113342285,
481
+ 0.584276556968689,
482
+ 0.5893025994300842,
483
+ 0.6109833121299744,
484
+ 0.6101059317588806,
485
+ 0.6098511815071106,
486
+ 0.6212247610092163,
487
+ 0.6309865713119507,
488
+ 0.6344567537307739,
489
+ 0.6354795098304749,
490
+ 0.647710919380188,
491
+ 0.6540753841400146,
492
+ 0.6593545079231262,
493
+ 0.6664237976074219,
494
+ 0.6748871803283691,
495
+ 0.6771064400672913,
496
+ 0.6822501420974731,
497
+ 0.6884104013442993,
498
+ 0.6926484704017639,
499
+ 0.6983345150947571,
500
+ 0.7035016417503357,
501
+ 0.7052139639854431,
502
+ 0.7094832062721252,
503
+ 0.7137283682823181,
504
+ 0.7142598628997803,
505
+ 0.7140603065490723,
506
+ 0.716733455657959
507
+ ],
508
+ "lr": [
509
+ 8.376068376068378e-06,
510
+ 1.6923076923076924e-05,
511
+ 2e-05,
512
+ 2e-05,
513
+ 2e-05,
514
+ 2e-05,
515
+ 2e-05,
516
+ 2e-05,
517
+ 2e-05,
518
+ 2e-05,
519
+ 2e-05,
520
+ 2e-05,
521
+ 2e-05,
522
+ 2e-05,
523
+ 2e-05,
524
+ 2e-05,
525
+ 2e-05,
526
+ 2e-05,
527
+ 2e-05,
528
+ 2e-05,
529
+ 2e-05,
530
+ 2e-05,
531
+ 2e-05,
532
+ 2e-05,
533
+ 2e-05,
534
+ 2e-05,
535
+ 2e-05,
536
+ 2e-05,
537
+ 2e-05,
538
+ 2e-05,
539
+ 2e-05,
540
+ 2e-05,
541
+ 2e-05,
542
+ 2e-05,
543
+ 2e-05,
544
+ 2e-05,
545
+ 2e-05,
546
+ 2e-05,
547
+ 2e-05,
548
+ 2e-05,
549
+ 2e-05,
550
+ 2e-05,
551
+ 1.9973899288162407e-05,
552
+ 1.9380701291853413e-05,
553
+ 1.8787503295544426e-05,
554
+ 1.8194305299235432e-05,
555
+ 1.7601107302926442e-05,
556
+ 1.7007909306617455e-05,
557
+ 1.6414711310308464e-05,
558
+ 1.5335090957026102e-05,
559
+ 1.474189296071711e-05,
560
+ 1.4148694964408121e-05,
561
+ 1.355549696809913e-05,
562
+ 1.2962298971790142e-05,
563
+ 1.2369100975481152e-05,
564
+ 1.1775902979172158e-05,
565
+ 1.0696282625889797e-05,
566
+ 1.0103084629580805e-05,
567
+ 9.50988663327182e-06,
568
+ 8.91668863696283e-06,
569
+ 8.323490640653837e-06,
570
+ 7.730292644344845e-06,
571
+ 7.137094648035855e-06,
572
+ 6.057474294753492e-06,
573
+ 5.4642762984445065e-06,
574
+ 4.871078302135514e-06,
575
+ 4.277880305826523e-06,
576
+ 3.684682309517532e-06,
577
+ 3.091484313208541e-06,
578
+ 2.4982863168995496e-06
579
+ ],
580
+ "emb_lr": [],
581
+ "eval_step": [
582
+ 350,
583
+ 741,
584
+ 1132,
585
+ 1523,
586
+ 1914,
587
+ 2305,
588
+ 2696,
589
+ 3087,
590
+ 3478,
591
+ 3869
592
+ ],
593
+ "eval_accuracy": [
594
+ 0.01,
595
+ 0.0,
596
+ 0.0,
597
+ 0.01,
598
+ 0.0,
599
+ 0.0,
600
+ 0.01,
601
+ 0.0,
602
+ 0.0,
603
+ 0.01
604
+ ]
605
+ },
606
+ "final_accuracy": 0.020416666666666666,
607
+ "sft_eval": {
608
+ "config": {
609
+ "ops": "add_sub",
610
+ "K": null,
611
+ "mode": "sft",
612
+ "n_digits": 6,
613
+ "n_per_split": 100
614
+ },
615
+ "splits": {
616
+ "add_S0": {
617
+ "full_accuracy": 0.0,
618
+ "n_examples": 100,
619
+ "per_subtask": {
620
+ "SA": {
621
+ "accuracy": 0.2066115702479339,
622
+ "count": 605
623
+ },
624
+ "SS": {
625
+ "accuracy": 1.0,
626
+ "count": 95
627
+ }
628
+ }
629
+ },
630
+ "add_S1": {
631
+ "full_accuracy": 0.0,
632
+ "n_examples": 100,
633
+ "per_subtask": {
634
+ "SA": {
635
+ "accuracy": 0.2549019607843137,
636
+ "count": 204
637
+ },
638
+ "SC": {
639
+ "accuracy": 0.03550295857988166,
640
+ "count": 169
641
+ },
642
+ "SS": {
643
+ "accuracy": 0.8064516129032258,
644
+ "count": 31
645
+ },
646
+ "UC": {
647
+ "accuracy": 0.14527027027027026,
648
+ "count": 296
649
+ }
650
+ }
651
+ },
652
+ "add_S2": {
653
+ "full_accuracy": 0.0,
654
+ "n_examples": 100,
655
+ "per_subtask": {
656
+ "SA": {
657
+ "accuracy": 0.4662576687116564,
658
+ "count": 163
659
+ },
660
+ "SC": {
661
+ "accuracy": 0.07692307692307693,
662
+ "count": 130
663
+ },
664
+ "SS": {
665
+ "accuracy": 0.5977011494252874,
666
+ "count": 87
667
+ },
668
+ "UC": {
669
+ "accuracy": 0.26108374384236455,
670
+ "count": 203
671
+ },
672
+ "US": {
673
+ "accuracy": 0.4017094017094017,
674
+ "count": 117
675
+ }
676
+ }
677
+ },
678
+ "add_S3": {
679
+ "full_accuracy": 0.0,
680
+ "n_examples": 100,
681
+ "per_subtask": {
682
+ "SA": {
683
+ "accuracy": 0.5206611570247934,
684
+ "count": 121
685
+ },
686
+ "SC": {
687
+ "accuracy": 0.024793388429752067,
688
+ "count": 121
689
+ },
690
+ "SS": {
691
+ "accuracy": 0.8571428571428571,
692
+ "count": 49
693
+ },
694
+ "UC": {
695
+ "accuracy": 0.24731182795698925,
696
+ "count": 186
697
+ },
698
+ "US": {
699
+ "accuracy": 0.35874439461883406,
700
+ "count": 223
701
+ }
702
+ }
703
+ },
704
+ "add_S4": {
705
+ "full_accuracy": 0.0,
706
+ "n_examples": 100,
707
+ "per_subtask": {
708
+ "SA": {
709
+ "accuracy": 0.5576923076923077,
710
+ "count": 104
711
+ },
712
+ "SC": {
713
+ "accuracy": 0.04716981132075472,
714
+ "count": 106
715
+ },
716
+ "SS": {
717
+ "accuracy": 0.8695652173913043,
718
+ "count": 23
719
+ },
720
+ "UC": {
721
+ "accuracy": 0.25,
722
+ "count": 160
723
+ },
724
+ "US": {
725
+ "accuracy": 0.3583061889250814,
726
+ "count": 307
727
+ }
728
+ }
729
+ },
730
+ "add_S5": {
731
+ "full_accuracy": 0.0,
732
+ "n_examples": 100,
733
+ "per_subtask": {
734
+ "SA": {
735
+ "accuracy": 0.45,
736
+ "count": 100
737
+ },
738
+ "SC": {
739
+ "accuracy": 0.0,
740
+ "count": 100
741
+ },
742
+ "UC": {
743
+ "accuracy": 0.11,
744
+ "count": 100
745
+ },
746
+ "US": {
747
+ "accuracy": 0.04,
748
+ "count": 400
749
+ }
750
+ }
751
+ },
752
+ "add_S6": {
753
+ "full_accuracy": 0.0,
754
+ "n_examples": 100,
755
+ "per_subtask": {
756
+ "SC": {
757
+ "accuracy": 0.0,
758
+ "count": 100
759
+ },
760
+ "UC": {
761
+ "accuracy": 0.26,
762
+ "count": 100
763
+ },
764
+ "US": {
765
+ "accuracy": 0.26,
766
+ "count": 500
767
+ }
768
+ }
769
+ },
770
+ "add_random": {
771
+ "full_accuracy": 0.0,
772
+ "n_examples": 200,
773
+ "per_subtask": {
774
+ "SA": {
775
+ "accuracy": 0.27069351230425054,
776
+ "count": 447
777
+ },
778
+ "SC": {
779
+ "accuracy": 0.053125,
780
+ "count": 320
781
+ },
782
+ "SS": {
783
+ "accuracy": 0.7321428571428571,
784
+ "count": 56
785
+ },
786
+ "UC": {
787
+ "accuracy": 0.166351606805293,
788
+ "count": 529
789
+ },
790
+ "US": {
791
+ "accuracy": 0.3333333333333333,
792
+ "count": 48
793
+ }
794
+ }
795
+ },
796
+ "add_C3": {
797
+ "full_accuracy": 0.0,
798
+ "n_examples": 100,
799
+ "per_subtask": {
800
+ "SA": {
801
+ "accuracy": 0.24,
802
+ "count": 300
803
+ },
804
+ "SC": {
805
+ "accuracy": 0.02,
806
+ "count": 100
807
+ },
808
+ "UC": {
809
+ "accuracy": 0.06735751295336788,
810
+ "count": 193
811
+ },
812
+ "US": {
813
+ "accuracy": 0.06542056074766354,
814
+ "count": 107
815
+ }
816
+ }
817
+ },
818
+ "add_C4": {
819
+ "full_accuracy": 0.0,
820
+ "n_examples": 100,
821
+ "per_subtask": {
822
+ "SA": {
823
+ "accuracy": 0.335,
824
+ "count": 200
825
+ },
826
+ "SC": {
827
+ "accuracy": 0.01,
828
+ "count": 100
829
+ },
830
+ "UC": {
831
+ "accuracy": 0.04296875,
832
+ "count": 256
833
+ },
834
+ "US": {
835
+ "accuracy": 0.06944444444444445,
836
+ "count": 144
837
+ }
838
+ }
839
+ },
840
+ "add_C5": {
841
+ "full_accuracy": 0.0,
842
+ "n_examples": 100,
843
+ "per_subtask": {
844
+ "SA": {
845
+ "accuracy": 0.49,
846
+ "count": 100
847
+ },
848
+ "SC": {
849
+ "accuracy": 0.05,
850
+ "count": 100
851
+ },
852
+ "UC": {
853
+ "accuracy": 0.08169934640522876,
854
+ "count": 306
855
+ },
856
+ "US": {
857
+ "accuracy": 0.23195876288659795,
858
+ "count": 194
859
+ }
860
+ }
861
+ },
862
+ "add_C6": {
863
+ "full_accuracy": 0.0,
864
+ "n_examples": 100,
865
+ "per_subtask": {
866
+ "SC": {
867
+ "accuracy": 0.01,
868
+ "count": 100
869
+ },
870
+ "UC": {
871
+ "accuracy": 0.1912568306010929,
872
+ "count": 366
873
+ },
874
+ "US": {
875
+ "accuracy": 0.6239316239316239,
876
+ "count": 234
877
+ }
878
+ }
879
+ },
880
+ "sub_M0": {
881
+ "full_accuracy": 0.0,
882
+ "n_examples": 100,
883
+ "per_subtask": {
884
+ "MD": {
885
+ "accuracy": 0.20465890183028287,
886
+ "count": 601
887
+ },
888
+ "ME": {
889
+ "accuracy": 1.0,
890
+ "count": 99
891
+ }
892
+ }
893
+ },
894
+ "sub_M1": {
895
+ "full_accuracy": 0.0,
896
+ "n_examples": 100,
897
+ "per_subtask": {
898
+ "MD": {
899
+ "accuracy": 0.3835125448028674,
900
+ "count": 279
901
+ },
902
+ "MB": {
903
+ "accuracy": 0.0,
904
+ "count": 145
905
+ },
906
+ "ME": {
907
+ "accuracy": 1.0,
908
+ "count": 24
909
+ },
910
+ "UB": {
911
+ "accuracy": 0.09523809523809523,
912
+ "count": 252
913
+ }
914
+ }
915
+ },
916
+ "sub_M2": {
917
+ "full_accuracy": 0.0,
918
+ "n_examples": 100,
919
+ "per_subtask": {
920
+ "MD": {
921
+ "accuracy": 0.6150234741784038,
922
+ "count": 213
923
+ },
924
+ "MB": {
925
+ "accuracy": 0.0,
926
+ "count": 113
927
+ },
928
+ "ME": {
929
+ "accuracy": 1.0,
930
+ "count": 85
931
+ },
932
+ "UB": {
933
+ "accuracy": 0.16574585635359115,
934
+ "count": 181
935
+ },
936
+ "UD": {
937
+ "accuracy": 0.0,
938
+ "count": 108
939
+ }
940
+ }
941
+ },
942
+ "sub_M3": {
943
+ "full_accuracy": 0.0,
944
+ "n_examples": 100,
945
+ "per_subtask": {
946
+ "MD": {
947
+ "accuracy": 0.7597765363128491,
948
+ "count": 179
949
+ },
950
+ "MB": {
951
+ "accuracy": 0.0,
952
+ "count": 103
953
+ },
954
+ "ME": {
955
+ "accuracy": 1.0,
956
+ "count": 56
957
+ },
958
+ "UB": {
959
+ "accuracy": 0.12080536912751678,
960
+ "count": 149
961
+ },
962
+ "UD": {
963
+ "accuracy": 0.0,
964
+ "count": 213
965
+ }
966
+ }
967
+ },
968
+ "sub_M4": {
969
+ "full_accuracy": 0.0,
970
+ "n_examples": 100,
971
+ "per_subtask": {
972
+ "MD": {
973
+ "accuracy": 0.5,
974
+ "count": 200
975
+ },
976
+ "MB": {
977
+ "accuracy": 0.0,
978
+ "count": 100
979
+ },
980
+ "UB": {
981
+ "accuracy": 0.3,
982
+ "count": 100
983
+ },
984
+ "UD": {
985
+ "accuracy": 0.0,
986
+ "count": 300
987
+ }
988
+ }
989
+ },
990
+ "sub_M5": {
991
+ "full_accuracy": 0.0,
992
+ "n_examples": 100,
993
+ "per_subtask": {
994
+ "MD": {
995
+ "accuracy": 1.0,
996
+ "count": 100
997
+ },
998
+ "MB": {
999
+ "accuracy": 0.0,
1000
+ "count": 100
1001
+ },
1002
+ "UB": {
1003
+ "accuracy": 0.31,
1004
+ "count": 100
1005
+ },
1006
+ "UD": {
1007
+ "accuracy": 0.0,
1008
+ "count": 400
1009
+ }
1010
+ }
1011
+ },
1012
+ "sub_random": {
1013
+ "full_accuracy": 0.0,
1014
+ "n_examples": 200,
1015
+ "per_subtask": {
1016
+ "MD": {
1017
+ "accuracy": 0.3616666666666667,
1018
+ "count": 600
1019
+ },
1020
+ "MB": {
1021
+ "accuracy": 0.0,
1022
+ "count": 267
1023
+ },
1024
+ "ME": {
1025
+ "accuracy": 1.0,
1026
+ "count": 53
1027
+ },
1028
+ "UB": {
1029
+ "accuracy": 0.12072892938496584,
1030
+ "count": 439
1031
+ },
1032
+ "UD": {
1033
+ "accuracy": 0.0,
1034
+ "count": 41
1035
+ }
1036
+ }
1037
+ },
1038
+ "sub_B3": {
1039
+ "full_accuracy": 0.0,
1040
+ "n_examples": 100,
1041
+ "per_subtask": {
1042
+ "MD": {
1043
+ "accuracy": 0.3333333333333333,
1044
+ "count": 300
1045
+ },
1046
+ "MB": {
1047
+ "accuracy": 0.0,
1048
+ "count": 100
1049
+ },
1050
+ "UB": {
1051
+ "accuracy": 0.17766497461928935,
1052
+ "count": 197
1053
+ },
1054
+ "UD": {
1055
+ "accuracy": 0.0,
1056
+ "count": 103
1057
+ }
1058
+ }
1059
+ },
1060
+ "sub_B4": {
1061
+ "full_accuracy": 0.0,
1062
+ "n_examples": 100,
1063
+ "per_subtask": {
1064
+ "MD": {
1065
+ "accuracy": 0.5,
1066
+ "count": 200
1067
+ },
1068
+ "MB": {
1069
+ "accuracy": 0.0,
1070
+ "count": 100
1071
+ },
1072
+ "UB": {
1073
+ "accuracy": 0.145748987854251,
1074
+ "count": 247
1075
+ },
1076
+ "UD": {
1077
+ "accuracy": 0.0,
1078
+ "count": 153
1079
+ }
1080
+ }
1081
+ },
1082
+ "sub_B5": {
1083
+ "full_accuracy": 0.0,
1084
+ "n_examples": 100,
1085
+ "per_subtask": {
1086
+ "MD": {
1087
+ "accuracy": 1.0,
1088
+ "count": 100
1089
+ },
1090
+ "MB": {
1091
+ "accuracy": 0.0,
1092
+ "count": 100
1093
+ },
1094
+ "UB": {
1095
+ "accuracy": 0.11073825503355705,
1096
+ "count": 298
1097
+ },
1098
+ "UD": {
1099
+ "accuracy": 0.0,
1100
+ "count": 202
1101
+ }
1102
+ }
1103
+ }
1104
+ },
1105
+ "summary": {
1106
+ "overall_accuracy": 0.0,
1107
+ "total_examples": 2400,
1108
+ "n_splits": 22
1109
+ }
1110
+ },
1111
+ "sorl_eval": {
1112
+ "config": {
1113
+ "ops": "add_sub",
1114
+ "K": 1,
1115
+ "mode": "sorl",
1116
+ "n_digits": 6,
1117
+ "n_per_split": 100
1118
+ },
1119
+ "splits": {
1120
+ "add_S0": {
1121
+ "full_accuracy": 0.09,
1122
+ "n_examples": 100,
1123
+ "per_subtask": {
1124
+ "SA": {
1125
+ "accuracy": 0.6776859504132231,
1126
+ "count": 605
1127
+ },
1128
+ "SS": {
1129
+ "accuracy": 0.8947368421052632,
1130
+ "count": 95
1131
+ }
1132
+ }
1133
+ },
1134
+ "add_S1": {
1135
+ "full_accuracy": 0.03,
1136
+ "n_examples": 100,
1137
+ "per_subtask": {
1138
+ "SA": {
1139
+ "accuracy": 0.696078431372549,
1140
+ "count": 204
1141
+ },
1142
+ "SC": {
1143
+ "accuracy": 0.6804733727810651,
1144
+ "count": 169
1145
+ },
1146
+ "SS": {
1147
+ "accuracy": 0.8387096774193549,
1148
+ "count": 31
1149
+ },
1150
+ "UC": {
1151
+ "accuracy": 0.4831081081081081,
1152
+ "count": 296
1153
+ }
1154
+ }
1155
+ },
1156
+ "add_S2": {
1157
+ "full_accuracy": 0.05,
1158
+ "n_examples": 100,
1159
+ "per_subtask": {
1160
+ "SA": {
1161
+ "accuracy": 0.7423312883435583,
1162
+ "count": 163
1163
+ },
1164
+ "SC": {
1165
+ "accuracy": 0.6615384615384615,
1166
+ "count": 130
1167
+ },
1168
+ "SS": {
1169
+ "accuracy": 0.7471264367816092,
1170
+ "count": 87
1171
+ },
1172
+ "UC": {
1173
+ "accuracy": 0.37438423645320196,
1174
+ "count": 203
1175
+ },
1176
+ "US": {
1177
+ "accuracy": 0.49572649572649574,
1178
+ "count": 117
1179
+ }
1180
+ }
1181
+ },
1182
+ "add_S3": {
1183
+ "full_accuracy": 0.01,
1184
+ "n_examples": 100,
1185
+ "per_subtask": {
1186
+ "SA": {
1187
+ "accuracy": 0.743801652892562,
1188
+ "count": 121
1189
+ },
1190
+ "SC": {
1191
+ "accuracy": 0.6611570247933884,
1192
+ "count": 121
1193
+ },
1194
+ "SS": {
1195
+ "accuracy": 0.8367346938775511,
1196
+ "count": 49
1197
+ },
1198
+ "UC": {
1199
+ "accuracy": 0.3709677419354839,
1200
+ "count": 186
1201
+ },
1202
+ "US": {
1203
+ "accuracy": 0.3811659192825112,
1204
+ "count": 223
1205
+ }
1206
+ }
1207
+ },
1208
+ "add_S4": {
1209
+ "full_accuracy": 0.03,
1210
+ "n_examples": 100,
1211
+ "per_subtask": {
1212
+ "SA": {
1213
+ "accuracy": 0.7307692307692307,
1214
+ "count": 104
1215
+ },
1216
+ "SC": {
1217
+ "accuracy": 0.7075471698113207,
1218
+ "count": 106
1219
+ },
1220
+ "SS": {
1221
+ "accuracy": 0.9565217391304348,
1222
+ "count": 23
1223
+ },
1224
+ "UC": {
1225
+ "accuracy": 0.325,
1226
+ "count": 160
1227
+ },
1228
+ "US": {
1229
+ "accuracy": 0.3322475570032573,
1230
+ "count": 307
1231
+ }
1232
+ }
1233
+ },
1234
+ "add_S5": {
1235
+ "full_accuracy": 0.03,
1236
+ "n_examples": 100,
1237
+ "per_subtask": {
1238
+ "SA": {
1239
+ "accuracy": 0.65,
1240
+ "count": 100
1241
+ },
1242
+ "SC": {
1243
+ "accuracy": 0.63,
1244
+ "count": 100
1245
+ },
1246
+ "UC": {
1247
+ "accuracy": 0.13,
1248
+ "count": 100
1249
+ },
1250
+ "US": {
1251
+ "accuracy": 0.285,
1252
+ "count": 400
1253
+ }
1254
+ }
1255
+ },
1256
+ "add_S6": {
1257
+ "full_accuracy": 0.0,
1258
+ "n_examples": 100,
1259
+ "per_subtask": {
1260
+ "SC": {
1261
+ "accuracy": 0.48,
1262
+ "count": 100
1263
+ },
1264
+ "UC": {
1265
+ "accuracy": 0.0,
1266
+ "count": 100
1267
+ },
1268
+ "US": {
1269
+ "accuracy": 0.036,
1270
+ "count": 500
1271
+ }
1272
+ }
1273
+ },
1274
+ "add_random": {
1275
+ "full_accuracy": 0.04,
1276
+ "n_examples": 200,
1277
+ "per_subtask": {
1278
+ "SA": {
1279
+ "accuracy": 0.6487695749440716,
1280
+ "count": 447
1281
+ },
1282
+ "SC": {
1283
+ "accuracy": 0.70625,
1284
+ "count": 320
1285
+ },
1286
+ "SS": {
1287
+ "accuracy": 0.8571428571428571,
1288
+ "count": 56
1289
+ },
1290
+ "UC": {
1291
+ "accuracy": 0.46313799621928164,
1292
+ "count": 529
1293
+ },
1294
+ "US": {
1295
+ "accuracy": 0.625,
1296
+ "count": 48
1297
+ }
1298
+ }
1299
+ },
1300
+ "add_C3": {
1301
+ "full_accuracy": 0.02,
1302
+ "n_examples": 100,
1303
+ "per_subtask": {
1304
+ "SA": {
1305
+ "accuracy": 0.8033333333333333,
1306
+ "count": 300
1307
+ },
1308
+ "SC": {
1309
+ "accuracy": 0.78,
1310
+ "count": 100
1311
+ },
1312
+ "UC": {
1313
+ "accuracy": 0.19170984455958548,
1314
+ "count": 193
1315
+ },
1316
+ "US": {
1317
+ "accuracy": 0.5327102803738317,
1318
+ "count": 107
1319
+ }
1320
+ }
1321
+ },
1322
+ "add_C4": {
1323
+ "full_accuracy": 0.01,
1324
+ "n_examples": 100,
1325
+ "per_subtask": {
1326
+ "SA": {
1327
+ "accuracy": 0.8,
1328
+ "count": 200
1329
+ },
1330
+ "SC": {
1331
+ "accuracy": 0.75,
1332
+ "count": 100
1333
+ },
1334
+ "UC": {
1335
+ "accuracy": 0.1875,
1336
+ "count": 256
1337
+ },
1338
+ "US": {
1339
+ "accuracy": 0.5833333333333334,
1340
+ "count": 144
1341
+ }
1342
+ }
1343
+ },
1344
+ "add_C5": {
1345
+ "full_accuracy": 0.02,
1346
+ "n_examples": 100,
1347
+ "per_subtask": {
1348
+ "SA": {
1349
+ "accuracy": 0.79,
1350
+ "count": 100
1351
+ },
1352
+ "SC": {
1353
+ "accuracy": 0.65,
1354
+ "count": 100
1355
+ },
1356
+ "UC": {
1357
+ "accuracy": 0.23529411764705882,
1358
+ "count": 306
1359
+ },
1360
+ "US": {
1361
+ "accuracy": 0.5979381443298969,
1362
+ "count": 194
1363
+ }
1364
+ }
1365
+ },
1366
+ "add_C6": {
1367
+ "full_accuracy": 0.01,
1368
+ "n_examples": 100,
1369
+ "per_subtask": {
1370
+ "SC": {
1371
+ "accuracy": 0.6,
1372
+ "count": 100
1373
+ },
1374
+ "UC": {
1375
+ "accuracy": 0.319672131147541,
1376
+ "count": 366
1377
+ },
1378
+ "US": {
1379
+ "accuracy": 0.6282051282051282,
1380
+ "count": 234
1381
+ }
1382
+ }
1383
+ },
1384
+ "sub_M0": {
1385
+ "full_accuracy": 0.01,
1386
+ "n_examples": 100,
1387
+ "per_subtask": {
1388
+ "MD": {
1389
+ "accuracy": 0.5391014975041597,
1390
+ "count": 601
1391
+ },
1392
+ "ME": {
1393
+ "accuracy": 0.9595959595959596,
1394
+ "count": 99
1395
+ }
1396
+ }
1397
+ },
1398
+ "sub_M1": {
1399
+ "full_accuracy": 0.0,
1400
+ "n_examples": 100,
1401
+ "per_subtask": {
1402
+ "MD": {
1403
+ "accuracy": 0.6666666666666666,
1404
+ "count": 279
1405
+ },
1406
+ "MB": {
1407
+ "accuracy": 0.38620689655172413,
1408
+ "count": 145
1409
+ },
1410
+ "ME": {
1411
+ "accuracy": 0.9583333333333334,
1412
+ "count": 24
1413
+ },
1414
+ "UB": {
1415
+ "accuracy": 0.5198412698412699,
1416
+ "count": 252
1417
+ }
1418
+ }
1419
+ },
1420
+ "sub_M2": {
1421
+ "full_accuracy": 0.0,
1422
+ "n_examples": 100,
1423
+ "per_subtask": {
1424
+ "MD": {
1425
+ "accuracy": 0.755868544600939,
1426
+ "count": 213
1427
+ },
1428
+ "MB": {
1429
+ "accuracy": 0.26548672566371684,
1430
+ "count": 113
1431
+ },
1432
+ "ME": {
1433
+ "accuracy": 0.9764705882352941,
1434
+ "count": 85
1435
+ },
1436
+ "UB": {
1437
+ "accuracy": 0.5524861878453039,
1438
+ "count": 181
1439
+ },
1440
+ "UD": {
1441
+ "accuracy": 0.2037037037037037,
1442
+ "count": 108
1443
+ }
1444
+ }
1445
+ },
1446
+ "sub_M3": {
1447
+ "full_accuracy": 0.01,
1448
+ "n_examples": 100,
1449
+ "per_subtask": {
1450
+ "MD": {
1451
+ "accuracy": 0.8547486033519553,
1452
+ "count": 179
1453
+ },
1454
+ "MB": {
1455
+ "accuracy": 0.3300970873786408,
1456
+ "count": 103
1457
+ },
1458
+ "ME": {
1459
+ "accuracy": 0.9821428571428571,
1460
+ "count": 56
1461
+ },
1462
+ "UB": {
1463
+ "accuracy": 0.5838926174496645,
1464
+ "count": 149
1465
+ },
1466
+ "UD": {
1467
+ "accuracy": 0.08450704225352113,
1468
+ "count": 213
1469
+ }
1470
+ }
1471
+ },
1472
+ "sub_M4": {
1473
+ "full_accuracy": 0.01,
1474
+ "n_examples": 100,
1475
+ "per_subtask": {
1476
+ "MD": {
1477
+ "accuracy": 0.65,
1478
+ "count": 200
1479
+ },
1480
+ "MB": {
1481
+ "accuracy": 0.21,
1482
+ "count": 100
1483
+ },
1484
+ "UB": {
1485
+ "accuracy": 0.67,
1486
+ "count": 100
1487
+ },
1488
+ "UD": {
1489
+ "accuracy": 0.06333333333333334,
1490
+ "count": 300
1491
+ }
1492
+ }
1493
+ },
1494
+ "sub_M5": {
1495
+ "full_accuracy": 0.06,
1496
+ "n_examples": 100,
1497
+ "per_subtask": {
1498
+ "MD": {
1499
+ "accuracy": 1.0,
1500
+ "count": 100
1501
+ },
1502
+ "MB": {
1503
+ "accuracy": 0.13,
1504
+ "count": 100
1505
+ },
1506
+ "UB": {
1507
+ "accuracy": 0.72,
1508
+ "count": 100
1509
+ },
1510
+ "UD": {
1511
+ "accuracy": 0.145,
1512
+ "count": 400
1513
+ }
1514
+ }
1515
+ },
1516
+ "sub_random": {
1517
+ "full_accuracy": 0.01,
1518
+ "n_examples": 200,
1519
+ "per_subtask": {
1520
+ "MD": {
1521
+ "accuracy": 0.635,
1522
+ "count": 600
1523
+ },
1524
+ "MB": {
1525
+ "accuracy": 0.45318352059925093,
1526
+ "count": 267
1527
+ },
1528
+ "ME": {
1529
+ "accuracy": 0.9811320754716981,
1530
+ "count": 53
1531
+ },
1532
+ "UB": {
1533
+ "accuracy": 0.5444191343963554,
1534
+ "count": 439
1535
+ },
1536
+ "UD": {
1537
+ "accuracy": 0.1951219512195122,
1538
+ "count": 41
1539
+ }
1540
+ }
1541
+ },
1542
+ "sub_B3": {
1543
+ "full_accuracy": 0.0,
1544
+ "n_examples": 100,
1545
+ "per_subtask": {
1546
+ "MD": {
1547
+ "accuracy": 0.5266666666666666,
1548
+ "count": 300
1549
+ },
1550
+ "MB": {
1551
+ "accuracy": 0.41,
1552
+ "count": 100
1553
+ },
1554
+ "UB": {
1555
+ "accuracy": 0.49746192893401014,
1556
+ "count": 197
1557
+ },
1558
+ "UD": {
1559
+ "accuracy": 0.11650485436893204,
1560
+ "count": 103
1561
+ }
1562
+ }
1563
+ },
1564
+ "sub_B4": {
1565
+ "full_accuracy": 0.0,
1566
+ "n_examples": 100,
1567
+ "per_subtask": {
1568
+ "MD": {
1569
+ "accuracy": 0.67,
1570
+ "count": 200
1571
+ },
1572
+ "MB": {
1573
+ "accuracy": 0.35,
1574
+ "count": 100
1575
+ },
1576
+ "UB": {
1577
+ "accuracy": 0.4898785425101215,
1578
+ "count": 247
1579
+ },
1580
+ "UD": {
1581
+ "accuracy": 0.16993464052287582,
1582
+ "count": 153
1583
+ }
1584
+ }
1585
+ },
1586
+ "sub_B5": {
1587
+ "full_accuracy": 0.0,
1588
+ "n_examples": 100,
1589
+ "per_subtask": {
1590
+ "MD": {
1591
+ "accuracy": 1.0,
1592
+ "count": 100
1593
+ },
1594
+ "MB": {
1595
+ "accuracy": 0.15,
1596
+ "count": 100
1597
+ },
1598
+ "UB": {
1599
+ "accuracy": 0.4395973154362416,
1600
+ "count": 298
1601
+ },
1602
+ "UD": {
1603
+ "accuracy": 0.1782178217821782,
1604
+ "count": 202
1605
+ }
1606
+ }
1607
+ }
1608
+ },
1609
+ "summary": {
1610
+ "overall_accuracy": 0.020416666666666666,
1611
+ "total_examples": 2400,
1612
+ "n_splits": 22
1613
+ }
1614
+ },
1615
+ "sorl_overall_accuracy": 0.020416666666666666,
1616
+ "sft_overall_accuracy": 0.0
1617
+ }
add_sub_sorl_v1_abs10_K1_25K_2L1H128d/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:552c3c376c464a283388ace103d56b77840cec872425d931fcab7533aaebd6fd
3
+ size 157702060
add_sub_sorl_v1_abs10_K1_25K_2L1H128d/train_config.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_rollouts": 4,
3
+ "K": 1,
4
+ "max_iterations": 2,
5
+ "memory_span_abs": 1792,
6
+ "memory_span_traj": 1792,
7
+ "temperature": 1.0,
8
+ "ar_search": false,
9
+ "response_only_abs": false,
10
+ "alpha_info_gain": 10.0,
11
+ "alpha_abs": 0.1,
12
+ "alpha_soft_zipf": 1.0,
13
+ "alpha_ortho": 0.0,
14
+ "alpha_anchor": 0.0,
15
+ "alpha_jacobi": 0.0,
16
+ "decay": 0.8,
17
+ "target_vocab_util": 0.8,
18
+ "min_abs_ppl": 0.0,
19
+ "zipf_alpha": 1.0,
20
+ "lr": 2e-05,
21
+ "emb_lr_mult": 1.0,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 117,
24
+ "cooldown_frac": 0.4,
25
+ "max_grad_norm": 1.0,
26
+ "vq_abs_pretrain_steps": 0,
27
+ "vq_abs_pretrain_lr": 0.001,
28
+ "vq_abs_pretrain_layer": -1,
29
+ "vq_abs_pretrain_batch_size": 256,
30
+ "vq_abs_pretrain_target_vectors": 20000,
31
+ "batch_size": 64,
32
+ "gradient_accumulation_steps": 1,
33
+ "num_epochs": 10,
34
+ "emb_warmup_steps": 0,
35
+ "log_every": 50,
36
+ "eval_every": 390,
37
+ "save_every": 999999,
38
+ "eval_samples": 100,
39
+ "output_dir": "ckpt/sweep/as_sorl_abs10_K1_25K_2L1H128d",
40
+ "eval_K": 4,
41
+ "alpha_traj": 0.0,
42
+ "corrupt_method": "shuffle",
43
+ "corrupt_ratio": 0.3,
44
+ "alpha_contrastive": 1.0,
45
+ "gamma_contrastive": 0.5,
46
+ "alpha_masked_traj": 0.0,
47
+ "mask_nl_ratio": 0.3,
48
+ "mask_nl_mode": "fixed",
49
+ "mask_nl_fixed_id": 0,
50
+ "use_ste": true,
51
+ "n_inner": 1,
52
+ "random_K": null,
53
+ "strip_suffix": null,
54
+ "compress_prefix": null,
55
+ "random_mem_span": null,
56
+ "warmup_ratio": 0.03,
57
+ "beta2": 0.999,
58
+ "seed": 42,
59
+ "n_digits": 6,
60
+ "n_layer": 2,
61
+ "n_head": 1,
62
+ "n_embd": 128,
63
+ "ops": "add_sub",
64
+ "abs_vocab": 10,
65
+ "dataset_size": 25000,
66
+ "mode": "sorl",
67
+ "device": "cuda",
68
+ "push_to_hub": true,
69
+ "no_wandb": false,
70
+ "n_params": 39348864,
71
+ "run_name": "add_sub_sorl_v1_abs10_K1_25K_2L1H128d",
72
+ "git_commit": "f447da529caceac8c7d256cbb2cd185cbc50feac",
73
+ "timestamp": "2026-04-12T13:04:02.986587+00:00",
74
+ "tokenizer": "Qwen/Qwen3-0.6B",
75
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
+ "dataset_config": "add_sub_6digit",
77
+ "model_repo": "thoughtworks/arithmetic-sorl",
78
+ "trainer_version": "v1",
79
+ "wandb_run_id": "r858hl5t",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/r858hl5t",
81
+ "final_accuracy": 0.020416666666666666,
82
+ "sft_accuracy": 0.0,
83
+ "eval_method": "ArithmeticEvaluator"
84
+ }