amirali1985 commited on
Commit
9db8d82
·
verified ·
1 Parent(s): 23816ca

Upload add_sub_baseline_25K_1L2H256d

Browse files
add_sub_baseline_25K_1L2H256d/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 256,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 1024,
15
+ "layer_types": [
16
+ "full_attention"
17
+ ],
18
+ "max_position_embeddings": 128,
19
+ "max_window_layers": 28,
20
+ "model_type": "qwen3",
21
+ "num_attention_heads": 2,
22
+ "num_hidden_layers": 1,
23
+ "num_key_value_heads": 2,
24
+ "pad_token_id": null,
25
+ "rms_norm_eps": 1e-06,
26
+ "rope_parameters": {
27
+ "rope_theta": 10000.0,
28
+ "rope_type": "default"
29
+ },
30
+ "sliding_window": null,
31
+ "tie_word_embeddings": false,
32
+ "transformers_version": "5.5.0",
33
+ "use_cache": true,
34
+ "use_sliding_window": false,
35
+ "vocab_size": 151645
36
+ }
add_sub_baseline_25K_1L2H256d/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_baseline_25K_1L2H256d/metrics.json ADDED
@@ -0,0 +1,1207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 400,
12
+ 450,
13
+ 500,
14
+ 550,
15
+ 600,
16
+ 650,
17
+ 700,
18
+ 750,
19
+ 800,
20
+ 850,
21
+ 900,
22
+ 950,
23
+ 1000,
24
+ 1050,
25
+ 1100,
26
+ 1150,
27
+ 1200,
28
+ 1250,
29
+ 1300,
30
+ 1350,
31
+ 1400,
32
+ 1450,
33
+ 1500,
34
+ 1550,
35
+ 1600,
36
+ 1650,
37
+ 1700,
38
+ 1750,
39
+ 1800,
40
+ 1850,
41
+ 1900,
42
+ 1950,
43
+ 2000,
44
+ 2050,
45
+ 2100,
46
+ 2150,
47
+ 2200,
48
+ 2250,
49
+ 2300,
50
+ 2350,
51
+ 2400,
52
+ 2450,
53
+ 2500,
54
+ 2550,
55
+ 2600,
56
+ 2650,
57
+ 2700,
58
+ 2750,
59
+ 2800,
60
+ 2850,
61
+ 2900,
62
+ 2950,
63
+ 3000,
64
+ 3050,
65
+ 3100,
66
+ 3150,
67
+ 3200,
68
+ 3250,
69
+ 3300,
70
+ 3350,
71
+ 3400,
72
+ 3450,
73
+ 3500,
74
+ 3550,
75
+ 3600,
76
+ 3650,
77
+ 3700,
78
+ 3750,
79
+ 3800,
80
+ 3850,
81
+ 3900,
82
+ 3950,
83
+ 4000,
84
+ 4050,
85
+ 4100,
86
+ 4150,
87
+ 4200,
88
+ 4250,
89
+ 4300,
90
+ 4350,
91
+ 4400,
92
+ 4450,
93
+ 4500,
94
+ 4550,
95
+ 4600,
96
+ 4650,
97
+ 4700,
98
+ 4750,
99
+ 4800,
100
+ 4850,
101
+ 4900,
102
+ 4950,
103
+ 5000,
104
+ 5050,
105
+ 5100,
106
+ 5150,
107
+ 5200,
108
+ 5250,
109
+ 5300,
110
+ 5350,
111
+ 5400,
112
+ 5450,
113
+ 5500,
114
+ 5550,
115
+ 5600,
116
+ 5650,
117
+ 5700,
118
+ 5750,
119
+ 5800,
120
+ 5850,
121
+ 5900,
122
+ 5950,
123
+ 6000,
124
+ 6050,
125
+ 6100,
126
+ 6150,
127
+ 6200,
128
+ 6250,
129
+ 6300,
130
+ 6350,
131
+ 6400,
132
+ 6450,
133
+ 6500,
134
+ 6550,
135
+ 6600,
136
+ 6650,
137
+ 6700,
138
+ 6750,
139
+ 6800,
140
+ 6850,
141
+ 6900,
142
+ 6950,
143
+ 7000,
144
+ 7050,
145
+ 7100,
146
+ 7150,
147
+ 7200,
148
+ 7250,
149
+ 7300,
150
+ 7350,
151
+ 7400,
152
+ 7450,
153
+ 7500,
154
+ 7550,
155
+ 7600,
156
+ 7650,
157
+ 7700,
158
+ 7750,
159
+ 7800
160
+ ],
161
+ "loss": [
162
+ 11.671636581420898,
163
+ 10.659524917602539,
164
+ 9.829411506652832,
165
+ 9.199652671813965,
166
+ 8.631672859191895,
167
+ 7.977789878845215,
168
+ 7.31129789352417,
169
+ 6.896409034729004,
170
+ 6.216288089752197,
171
+ 5.601308345794678,
172
+ 5.060961723327637,
173
+ 4.576255798339844,
174
+ 4.051487922668457,
175
+ 3.5682461261749268,
176
+ 3.152874231338501,
177
+ 2.812526226043701,
178
+ 2.522318124771118,
179
+ 2.3712620735168457,
180
+ 2.1769790649414062,
181
+ 2.142094135284424,
182
+ 2.0678162574768066,
183
+ 1.915800929069519,
184
+ 2.0273938179016113,
185
+ 1.892726182937622,
186
+ 1.8822896480560303,
187
+ 1.9177898168563843,
188
+ 1.9286824464797974,
189
+ 1.8752065896987915,
190
+ 1.8638038635253906,
191
+ 1.8708428144454956,
192
+ 1.8120969533920288,
193
+ 1.827845811843872,
194
+ 1.7674380540847778,
195
+ 1.86089026927948,
196
+ 1.7976375818252563,
197
+ 1.7858340740203857,
198
+ 1.8120166063308716,
199
+ 1.7935630083084106,
200
+ 1.8080732822418213,
201
+ 1.772081971168518,
202
+ 1.7113384008407593,
203
+ 1.8078746795654297,
204
+ 1.7796181440353394,
205
+ 1.7605429887771606,
206
+ 1.749908685684204,
207
+ 1.8526047468185425,
208
+ 1.809884786605835,
209
+ 1.744711995124817,
210
+ 1.7835290431976318,
211
+ 1.7533817291259766,
212
+ 1.7311333417892456,
213
+ 1.7385872602462769,
214
+ 1.7065470218658447,
215
+ 1.7380945682525635,
216
+ 1.6556047201156616,
217
+ 1.6597269773483276,
218
+ 1.5854241847991943,
219
+ 1.6379982233047485,
220
+ 1.636677861213684,
221
+ 1.708864450454712,
222
+ 1.61447274684906,
223
+ 1.7286317348480225,
224
+ 1.5406564474105835,
225
+ 1.5738743543624878,
226
+ 1.599366307258606,
227
+ 1.551311731338501,
228
+ 1.5821548700332642,
229
+ 1.4954397678375244,
230
+ 1.6357351541519165,
231
+ 1.610660433769226,
232
+ 1.5579359531402588,
233
+ 1.585968017578125,
234
+ 1.4483888149261475,
235
+ 1.5140514373779297,
236
+ 1.4171068668365479,
237
+ 1.3210866451263428,
238
+ 1.3247480392456055,
239
+ 1.3959987163543701,
240
+ 1.2662498950958252,
241
+ 1.314728856086731,
242
+ 1.2179921865463257,
243
+ 1.162067174911499,
244
+ 1.156597375869751,
245
+ 1.180772066116333,
246
+ 1.093071460723877,
247
+ 1.0152738094329834,
248
+ 0.9924407601356506,
249
+ 1.028732180595398,
250
+ 1.0052201747894287,
251
+ 1.0063453912734985,
252
+ 0.9981479644775391,
253
+ 0.8756313920021057,
254
+ 0.9030128717422485,
255
+ 0.9033327102661133,
256
+ 0.9253665804862976,
257
+ 0.9430330991744995,
258
+ 0.9484753608703613,
259
+ 0.8658924698829651,
260
+ 0.8693656921386719,
261
+ 0.8428267240524292,
262
+ 0.8409420847892761,
263
+ 0.8913314938545227,
264
+ 0.7889167070388794,
265
+ 0.786769449710846,
266
+ 0.8332729339599609,
267
+ 0.8155103325843811,
268
+ 0.8029834628105164,
269
+ 0.7777830362319946,
270
+ 0.8465943932533264,
271
+ 0.7783411741256714,
272
+ 0.7732486724853516,
273
+ 0.7279804944992065,
274
+ 0.7854341864585876,
275
+ 0.6857983469963074,
276
+ 0.7444823384284973,
277
+ 0.7447658181190491,
278
+ 0.7315057516098022,
279
+ 0.7095687985420227,
280
+ 0.7285814881324768,
281
+ 0.7516853213310242,
282
+ 0.7119408845901489,
283
+ 0.7268796563148499,
284
+ 0.7778827548027039,
285
+ 0.7403841018676758,
286
+ 0.7479632496833801,
287
+ 0.7538272738456726,
288
+ 0.6861394643783569,
289
+ 0.7120760679244995,
290
+ 0.701282799243927,
291
+ 0.6962236762046814,
292
+ 0.7168822884559631,
293
+ 0.7258835434913635,
294
+ 0.7194567918777466,
295
+ 0.702591061592102,
296
+ 0.7357982993125916,
297
+ 0.7088969349861145,
298
+ 0.6972709894180298,
299
+ 0.7118088603019714,
300
+ 0.6672061085700989,
301
+ 0.7101794481277466,
302
+ 0.704630434513092,
303
+ 0.678624153137207,
304
+ 0.7467736005783081,
305
+ 0.7612290978431702,
306
+ 0.6677433848381042,
307
+ 0.7306165099143982,
308
+ 0.6715617179870605,
309
+ 0.6923214197158813,
310
+ 0.6501680016517639,
311
+ 0.6605051755905151,
312
+ 0.6703984141349792,
313
+ 0.7571574449539185,
314
+ 0.7139632105827332,
315
+ 0.74234539270401,
316
+ 0.776197612285614,
317
+ 0.6756425499916077
318
+ ],
319
+ "base_loss": [
320
+ 11.671636581420898,
321
+ 10.659524917602539,
322
+ 9.829411506652832,
323
+ 9.199652671813965,
324
+ 8.631672859191895,
325
+ 7.977789878845215,
326
+ 7.31129789352417,
327
+ 6.896409034729004,
328
+ 6.216288089752197,
329
+ 5.601308345794678,
330
+ 5.060961723327637,
331
+ 4.576255798339844,
332
+ 4.051487922668457,
333
+ 3.5682461261749268,
334
+ 3.152874231338501,
335
+ 2.812526226043701,
336
+ 2.522318124771118,
337
+ 2.3712620735168457,
338
+ 2.1769790649414062,
339
+ 2.142094135284424,
340
+ 2.0678162574768066,
341
+ 1.915800929069519,
342
+ 2.0273938179016113,
343
+ 1.892726182937622,
344
+ 1.8822896480560303,
345
+ 1.9177898168563843,
346
+ 1.9286824464797974,
347
+ 1.8752065896987915,
348
+ 1.8638038635253906,
349
+ 1.8708428144454956,
350
+ 1.8120969533920288,
351
+ 1.827845811843872,
352
+ 1.7674380540847778,
353
+ 1.86089026927948,
354
+ 1.7976375818252563,
355
+ 1.7858340740203857,
356
+ 1.8120166063308716,
357
+ 1.7935630083084106,
358
+ 1.8080732822418213,
359
+ 1.772081971168518,
360
+ 1.7113384008407593,
361
+ 1.8078746795654297,
362
+ 1.7796181440353394,
363
+ 1.7605429887771606,
364
+ 1.749908685684204,
365
+ 1.8526047468185425,
366
+ 1.809884786605835,
367
+ 1.744711995124817,
368
+ 1.7835290431976318,
369
+ 1.7533817291259766,
370
+ 1.7311333417892456,
371
+ 1.7385872602462769,
372
+ 1.7065470218658447,
373
+ 1.7380945682525635,
374
+ 1.6556047201156616,
375
+ 1.6597269773483276,
376
+ 1.5854241847991943,
377
+ 1.6379982233047485,
378
+ 1.636677861213684,
379
+ 1.708864450454712,
380
+ 1.61447274684906,
381
+ 1.7286317348480225,
382
+ 1.5406564474105835,
383
+ 1.5738743543624878,
384
+ 1.599366307258606,
385
+ 1.551311731338501,
386
+ 1.5821548700332642,
387
+ 1.4954397678375244,
388
+ 1.6357351541519165,
389
+ 1.610660433769226,
390
+ 1.5579359531402588,
391
+ 1.585968017578125,
392
+ 1.4483888149261475,
393
+ 1.5140514373779297,
394
+ 1.4171068668365479,
395
+ 1.3210866451263428,
396
+ 1.3247480392456055,
397
+ 1.3959987163543701,
398
+ 1.2662498950958252,
399
+ 1.314728856086731,
400
+ 1.2179921865463257,
401
+ 1.162067174911499,
402
+ 1.156597375869751,
403
+ 1.180772066116333,
404
+ 1.093071460723877,
405
+ 1.0152738094329834,
406
+ 0.9924407601356506,
407
+ 1.028732180595398,
408
+ 1.0052201747894287,
409
+ 1.0063453912734985,
410
+ 0.9981479644775391,
411
+ 0.8756313920021057,
412
+ 0.9030128717422485,
413
+ 0.9033327102661133,
414
+ 0.9253665804862976,
415
+ 0.9430330991744995,
416
+ 0.9484753608703613,
417
+ 0.8658924698829651,
418
+ 0.8693656921386719,
419
+ 0.8428267240524292,
420
+ 0.8409420847892761,
421
+ 0.8913314938545227,
422
+ 0.7889167070388794,
423
+ 0.786769449710846,
424
+ 0.8332729339599609,
425
+ 0.8155103325843811,
426
+ 0.8029834628105164,
427
+ 0.7777830362319946,
428
+ 0.8465943932533264,
429
+ 0.7783411741256714,
430
+ 0.7732486724853516,
431
+ 0.7279804944992065,
432
+ 0.7854341864585876,
433
+ 0.6857983469963074,
434
+ 0.7444823384284973,
435
+ 0.7447658181190491,
436
+ 0.7315057516098022,
437
+ 0.7095687985420227,
438
+ 0.7285814881324768,
439
+ 0.7516853213310242,
440
+ 0.7119408845901489,
441
+ 0.7268796563148499,
442
+ 0.7778827548027039,
443
+ 0.7403841018676758,
444
+ 0.7479632496833801,
445
+ 0.7538272738456726,
446
+ 0.6861394643783569,
447
+ 0.7120760679244995,
448
+ 0.701282799243927,
449
+ 0.6962236762046814,
450
+ 0.7168822884559631,
451
+ 0.7258835434913635,
452
+ 0.7194567918777466,
453
+ 0.702591061592102,
454
+ 0.7357982993125916,
455
+ 0.7088969349861145,
456
+ 0.6972709894180298,
457
+ 0.7118088603019714,
458
+ 0.6672061085700989,
459
+ 0.7101794481277466,
460
+ 0.704630434513092,
461
+ 0.678624153137207,
462
+ 0.7467736005783081,
463
+ 0.7612290978431702,
464
+ 0.6677433848381042,
465
+ 0.7306165099143982,
466
+ 0.6715617179870605,
467
+ 0.6923214197158813,
468
+ 0.6501680016517639,
469
+ 0.6605051755905151,
470
+ 0.6703984141349792,
471
+ 0.7571574449539185,
472
+ 0.7139632105827332,
473
+ 0.74234539270401,
474
+ 0.776197612285614,
475
+ 0.6756425499916077
476
+ ],
477
+ "lr": [
478
+ 4.188034188034189e-06,
479
+ 8.461538461538462e-06,
480
+ 1.2735042735042738e-05,
481
+ 1.700854700854701e-05,
482
+ 1.99998070587026e-05,
483
+ 1.9996377198308693e-05,
484
+ 1.9988661446190374e-05,
485
+ 1.9976663110433913e-05,
486
+ 1.996038733525975e-05,
487
+ 1.9939841098816945e-05,
488
+ 1.9915033210191323e-05,
489
+ 1.988597430562865e-05,
490
+ 1.9852676843974367e-05,
491
+ 1.9815155101331968e-05,
492
+ 1.9773425164942158e-05,
493
+ 1.972750492628558e-05,
494
+ 1.9677414073411902e-05,
495
+ 1.9623174082498696e-05,
496
+ 1.9564808208643647e-05,
497
+ 1.9502341475894044e-05,
498
+ 1.943580066651786e-05,
499
+ 1.9365214309521018e-05,
500
+ 1.929061266841572e-05,
501
+ 1.921202772824513e-05,
502
+ 1.9129493181869986e-05,
503
+ 1.9043044415522936e-05,
504
+ 1.895271849363693e-05,
505
+ 1.8858554142954e-05,
506
+ 1.876059173592142e-05,
507
+ 1.8658873273382223e-05,
508
+ 1.855344236656757e-05,
509
+ 1.844434421839869e-05,
510
+ 1.833162560410636e-05,
511
+ 1.821533485117628e-05,
512
+ 1.8095521818628955e-05,
513
+ 1.7972237875642883e-05,
514
+ 1.784553587953032e-05,
515
+ 1.7715470153074993e-05,
516
+ 1.7582096461241506e-05,
517
+ 1.7445471987266426e-05,
518
+ 1.7305655308141265e-05,
519
+ 1.7162706369497937e-05,
520
+ 1.7016686459907395e-05,
521
+ 1.686765818460248e-05,
522
+ 1.671568543863629e-05,
523
+ 1.6560833379487507e-05,
524
+ 1.6403168399124515e-05,
525
+ 1.624275809554018e-05,
526
+ 1.60796712437696e-05,
527
+ 1.5913977766403167e-05,
528
+ 1.5745748703607686e-05,
529
+ 1.5575056182668234e-05,
530
+ 1.540197338706403e-05,
531
+ 1.5226574525091396e-05,
532
+ 1.5048934798047377e-05,
533
+ 1.4869130367987577e-05,
534
+ 1.4687238325072114e-05,
535
+ 1.4503336654513631e-05,
536
+ 1.4317504203141566e-05,
537
+ 1.4129820645597008e-05,
538
+ 1.3940366450172631e-05,
539
+ 1.3749222844312363e-05,
540
+ 1.3556471779785555e-05,
541
+ 1.3362195897550621e-05,
542
+ 1.3166478492323176e-05,
543
+ 1.2969403476863889e-05,
544
+ 1.2771055346001343e-05,
545
+ 1.2571519140405352e-05,
546
+ 1.2370880410126232e-05,
547
+ 1.2169225177915685e-05,
548
+ 1.1966639902345032e-05,
549
+ 1.176321144073656e-05,
550
+ 1.1559027011923938e-05,
551
+ 1.1354174158857612e-05,
552
+ 1.1148740711071244e-05,
553
+ 1.0942814747025295e-05,
554
+ 1.0736484556343858e-05,
555
+ 1.0529838601960978e-05,
556
+ 1.032296548219264e-05,
557
+ 1.0115953892750747e-05,
558
+ 9.908892588715302e-06,
559
+ 9.701870346481186e-06,
560
+ 9.494975925695724e-06,
561
+ 9.288298031203511e-06,
562
+ 9.08192527501469e-06,
563
+ 8.875946138313021e-06,
564
+ 8.670448933520089e-06,
565
+ 8.465521766431812e-06,
566
+ 8.261252498443584e-06,
567
+ 8.057728708880185e-06,
568
+ 7.855037657446609e-06,
569
+ 7.653266246815977e-06,
570
+ 7.452500985370459e-06,
571
+ 7.252827950111301e-06,
572
+ 7.054332749753787e-06,
573
+ 6.85710048802296e-06,
574
+ 6.661215727165885e-06,
575
+ 6.466762451696027e-06,
576
+ 6.273824032385387e-06,
577
+ 6.082483190519702e-06,
578
+ 5.89282196243218e-06,
579
+ 5.70492166433086e-06,
580
+ 5.518862857434741e-06,
581
+ 5.334725313433595e-06,
582
+ 5.152587980286308e-06,
583
+ 4.9725289483723496e-06,
584
+ 4.794625417010984e-06,
585
+ 4.618953661362472e-06,
586
+ 4.445588999725514e-06,
587
+ 4.274605761244949e-06,
588
+ 4.1060772540435276e-06,
589
+ 3.940075733791495e-06,
590
+ 3.7766723727273204e-06,
591
+ 3.6159372291430285e-06,
592
+ 3.4579392173470783e-06,
593
+ 3.3027460781177336e-06,
594
+ 3.15042434965958e-06,
595
+ 3.001039339075622e-06,
596
+ 2.8546550943672535e-06,
597
+ 2.7113343769739875e-06,
598
+ 2.57113863486487e-06,
599
+ 2.4341279761929824e-06,
600
+ 2.300361143524409e-06,
601
+ 2.169895488652681e-06,
602
+ 2.0427869480095065e-06,
603
+ 1.9190900186823515e-06,
604
+ 1.7988577350490876e-06,
605
+ 1.6821416460398155e-06,
606
+ 1.5689917930355325e-06,
607
+ 1.4594566884131744e-06,
608
+ 1.3535832947461935e-06,
609
+ 1.2514170046696128e-06,
610
+ 1.1530016214181938e-06,
611
+ 1.0583793400460306e-06,
612
+ 9.675907293356556e-07,
613
+ 8.806747144044004e-07,
614
+ 7.976685600154599e-07,
615
+ 7.186078546008335e-07,
616
+ 6.435264950029818e-07,
617
+ 5.724566719417401e-07,
618
+ 5.054288562127241e-07,
619
+ 4.424717856231486e-07,
620
+ 3.836124526706375e-07,
621
+ 3.288760929703605e-07,
622
+ 2.782861744353793e-07,
623
+ 2.318643872149251e-07,
624
+ 1.896306343948595e-07,
625
+ 1.516030234643362e-07,
626
+ 1.1779785855231873e-07,
627
+ 8.822963343726454e-08,
628
+ 6.291102533301563e-08,
629
+ 4.1852889453490864e-08,
630
+ 2.5064254358589946e-08,
631
+ 1.2552318083239868e-08,
632
+ 4.32244505127799e-09,
633
+ 3.781637754884937e-10
634
+ ],
635
+ "eval_step": [
636
+ 390,
637
+ 780,
638
+ 1170,
639
+ 1560,
640
+ 1950,
641
+ 2340,
642
+ 2730,
643
+ 3120,
644
+ 3510,
645
+ 3900,
646
+ 4290,
647
+ 4680,
648
+ 5070,
649
+ 5460,
650
+ 5850,
651
+ 6240,
652
+ 6630,
653
+ 7020,
654
+ 7410,
655
+ 7800
656
+ ],
657
+ "eval_epoch": [
658
+ 1,
659
+ 2,
660
+ 3,
661
+ 4,
662
+ 5,
663
+ 6,
664
+ 7,
665
+ 8,
666
+ 9,
667
+ 10,
668
+ 11,
669
+ 12,
670
+ 13,
671
+ 14,
672
+ 15,
673
+ 16,
674
+ 17,
675
+ 18,
676
+ 19,
677
+ 20
678
+ ],
679
+ "eval_accuracy": [
680
+ 0.0,
681
+ 0.0,
682
+ 0.0077777777777777776,
683
+ 0.011111111111111112,
684
+ 0.0033333333333333335,
685
+ 0.006666666666666667,
686
+ 0.005555555555555556,
687
+ 0.0033333333333333335,
688
+ 0.01,
689
+ 0.015555555555555555,
690
+ 0.024444444444444446,
691
+ 0.03333333333333333,
692
+ 0.056666666666666664,
693
+ 0.06,
694
+ 0.05555555555555555,
695
+ 0.05555555555555555,
696
+ 0.057777777777777775,
697
+ 0.056666666666666664,
698
+ 0.058888888888888886,
699
+ 0.06444444444444444
700
+ ]
701
+ },
702
+ "final_accuracy": 0.06916666666666667,
703
+ "sft_eval": {
704
+ "config": {
705
+ "ops": "add_sub",
706
+ "K": null,
707
+ "mode": "sft",
708
+ "n_digits": 6,
709
+ "n_per_split": 100
710
+ },
711
+ "splits": {
712
+ "add_S0": {
713
+ "full_accuracy": 0.42,
714
+ "n_examples": 100,
715
+ "per_subtask": {
716
+ "SA": {
717
+ "accuracy": 0.8628099173553719,
718
+ "count": 605
719
+ },
720
+ "SS": {
721
+ "accuracy": 0.9473684210526315,
722
+ "count": 95
723
+ }
724
+ }
725
+ },
726
+ "add_S1": {
727
+ "full_accuracy": 0.01,
728
+ "n_examples": 100,
729
+ "per_subtask": {
730
+ "SA": {
731
+ "accuracy": 0.9019607843137255,
732
+ "count": 204
733
+ },
734
+ "SC": {
735
+ "accuracy": 0.7692307692307693,
736
+ "count": 169
737
+ },
738
+ "SS": {
739
+ "accuracy": 0.8709677419354839,
740
+ "count": 31
741
+ },
742
+ "UC": {
743
+ "accuracy": 0.3783783783783784,
744
+ "count": 296
745
+ }
746
+ }
747
+ },
748
+ "add_S2": {
749
+ "full_accuracy": 0.11,
750
+ "n_examples": 100,
751
+ "per_subtask": {
752
+ "SA": {
753
+ "accuracy": 0.9447852760736196,
754
+ "count": 163
755
+ },
756
+ "SC": {
757
+ "accuracy": 0.7307692307692307,
758
+ "count": 130
759
+ },
760
+ "SS": {
761
+ "accuracy": 0.8275862068965517,
762
+ "count": 87
763
+ },
764
+ "UC": {
765
+ "accuracy": 0.4433497536945813,
766
+ "count": 203
767
+ },
768
+ "US": {
769
+ "accuracy": 0.5213675213675214,
770
+ "count": 117
771
+ }
772
+ }
773
+ },
774
+ "add_S3": {
775
+ "full_accuracy": 0.07,
776
+ "n_examples": 100,
777
+ "per_subtask": {
778
+ "SA": {
779
+ "accuracy": 0.9173553719008265,
780
+ "count": 121
781
+ },
782
+ "SC": {
783
+ "accuracy": 0.7107438016528925,
784
+ "count": 121
785
+ },
786
+ "SS": {
787
+ "accuracy": 0.9183673469387755,
788
+ "count": 49
789
+ },
790
+ "UC": {
791
+ "accuracy": 0.3655913978494624,
792
+ "count": 186
793
+ },
794
+ "US": {
795
+ "accuracy": 0.2062780269058296,
796
+ "count": 223
797
+ }
798
+ }
799
+ },
800
+ "add_S4": {
801
+ "full_accuracy": 0.1,
802
+ "n_examples": 100,
803
+ "per_subtask": {
804
+ "SA": {
805
+ "accuracy": 0.9519230769230769,
806
+ "count": 104
807
+ },
808
+ "SC": {
809
+ "accuracy": 0.7358490566037735,
810
+ "count": 106
811
+ },
812
+ "SS": {
813
+ "accuracy": 0.9130434782608695,
814
+ "count": 23
815
+ },
816
+ "UC": {
817
+ "accuracy": 0.46875,
818
+ "count": 160
819
+ },
820
+ "US": {
821
+ "accuracy": 0.20846905537459284,
822
+ "count": 307
823
+ }
824
+ }
825
+ },
826
+ "add_S5": {
827
+ "full_accuracy": 0.17,
828
+ "n_examples": 100,
829
+ "per_subtask": {
830
+ "SA": {
831
+ "accuracy": 0.99,
832
+ "count": 100
833
+ },
834
+ "SC": {
835
+ "accuracy": 0.62,
836
+ "count": 100
837
+ },
838
+ "UC": {
839
+ "accuracy": 0.36,
840
+ "count": 100
841
+ },
842
+ "US": {
843
+ "accuracy": 0.19,
844
+ "count": 400
845
+ }
846
+ }
847
+ },
848
+ "add_S6": {
849
+ "full_accuracy": 0.22,
850
+ "n_examples": 100,
851
+ "per_subtask": {
852
+ "SC": {
853
+ "accuracy": 0.67,
854
+ "count": 100
855
+ },
856
+ "UC": {
857
+ "accuracy": 0.38,
858
+ "count": 100
859
+ },
860
+ "US": {
861
+ "accuracy": 0.324,
862
+ "count": 500
863
+ }
864
+ }
865
+ },
866
+ "add_random": {
867
+ "full_accuracy": 0.025,
868
+ "n_examples": 200,
869
+ "per_subtask": {
870
+ "SA": {
871
+ "accuracy": 0.8814317673378076,
872
+ "count": 447
873
+ },
874
+ "SC": {
875
+ "accuracy": 0.7875,
876
+ "count": 320
877
+ },
878
+ "SS": {
879
+ "accuracy": 0.9285714285714286,
880
+ "count": 56
881
+ },
882
+ "UC": {
883
+ "accuracy": 0.3516068052930057,
884
+ "count": 529
885
+ },
886
+ "US": {
887
+ "accuracy": 0.3333333333333333,
888
+ "count": 48
889
+ }
890
+ }
891
+ },
892
+ "add_C3": {
893
+ "full_accuracy": 0.07,
894
+ "n_examples": 100,
895
+ "per_subtask": {
896
+ "SA": {
897
+ "accuracy": 0.9266666666666666,
898
+ "count": 300
899
+ },
900
+ "SC": {
901
+ "accuracy": 0.76,
902
+ "count": 100
903
+ },
904
+ "UC": {
905
+ "accuracy": 0.29015544041450775,
906
+ "count": 193
907
+ },
908
+ "US": {
909
+ "accuracy": 0.3644859813084112,
910
+ "count": 107
911
+ }
912
+ }
913
+ },
914
+ "add_C4": {
915
+ "full_accuracy": 0.05,
916
+ "n_examples": 100,
917
+ "per_subtask": {
918
+ "SA": {
919
+ "accuracy": 0.94,
920
+ "count": 200
921
+ },
922
+ "SC": {
923
+ "accuracy": 0.73,
924
+ "count": 100
925
+ },
926
+ "UC": {
927
+ "accuracy": 0.3125,
928
+ "count": 256
929
+ },
930
+ "US": {
931
+ "accuracy": 0.3819444444444444,
932
+ "count": 144
933
+ }
934
+ }
935
+ },
936
+ "add_C5": {
937
+ "full_accuracy": 0.01,
938
+ "n_examples": 100,
939
+ "per_subtask": {
940
+ "SA": {
941
+ "accuracy": 0.96,
942
+ "count": 100
943
+ },
944
+ "SC": {
945
+ "accuracy": 0.71,
946
+ "count": 100
947
+ },
948
+ "UC": {
949
+ "accuracy": 0.35294117647058826,
950
+ "count": 306
951
+ },
952
+ "US": {
953
+ "accuracy": 0.32989690721649484,
954
+ "count": 194
955
+ }
956
+ }
957
+ },
958
+ "add_C6": {
959
+ "full_accuracy": 0.0,
960
+ "n_examples": 100,
961
+ "per_subtask": {
962
+ "SC": {
963
+ "accuracy": 0.7,
964
+ "count": 100
965
+ },
966
+ "UC": {
967
+ "accuracy": 0.4453551912568306,
968
+ "count": 366
969
+ },
970
+ "US": {
971
+ "accuracy": 0.4444444444444444,
972
+ "count": 234
973
+ }
974
+ }
975
+ },
976
+ "sub_M0": {
977
+ "full_accuracy": 0.25,
978
+ "n_examples": 100,
979
+ "per_subtask": {
980
+ "MD": {
981
+ "accuracy": 0.8419301164725458,
982
+ "count": 601
983
+ },
984
+ "ME": {
985
+ "accuracy": 0.9595959595959596,
986
+ "count": 99
987
+ }
988
+ }
989
+ },
990
+ "sub_M1": {
991
+ "full_accuracy": 0.02,
992
+ "n_examples": 100,
993
+ "per_subtask": {
994
+ "MD": {
995
+ "accuracy": 0.8781362007168458,
996
+ "count": 279
997
+ },
998
+ "MB": {
999
+ "accuracy": 0.7448275862068966,
1000
+ "count": 145
1001
+ },
1002
+ "ME": {
1003
+ "accuracy": 0.9166666666666666,
1004
+ "count": 24
1005
+ },
1006
+ "UB": {
1007
+ "accuracy": 0.2896825396825397,
1008
+ "count": 252
1009
+ }
1010
+ }
1011
+ },
1012
+ "sub_M2": {
1013
+ "full_accuracy": 0.01,
1014
+ "n_examples": 100,
1015
+ "per_subtask": {
1016
+ "MD": {
1017
+ "accuracy": 0.9436619718309859,
1018
+ "count": 213
1019
+ },
1020
+ "MB": {
1021
+ "accuracy": 0.5752212389380531,
1022
+ "count": 113
1023
+ },
1024
+ "ME": {
1025
+ "accuracy": 0.9529411764705882,
1026
+ "count": 85
1027
+ },
1028
+ "UB": {
1029
+ "accuracy": 0.281767955801105,
1030
+ "count": 181
1031
+ },
1032
+ "UD": {
1033
+ "accuracy": 0.18518518518518517,
1034
+ "count": 108
1035
+ }
1036
+ }
1037
+ },
1038
+ "sub_M3": {
1039
+ "full_accuracy": 0.0,
1040
+ "n_examples": 100,
1041
+ "per_subtask": {
1042
+ "MD": {
1043
+ "accuracy": 0.9664804469273743,
1044
+ "count": 179
1045
+ },
1046
+ "MB": {
1047
+ "accuracy": 0.5242718446601942,
1048
+ "count": 103
1049
+ },
1050
+ "ME": {
1051
+ "accuracy": 0.9642857142857143,
1052
+ "count": 56
1053
+ },
1054
+ "UB": {
1055
+ "accuracy": 0.2953020134228188,
1056
+ "count": 149
1057
+ },
1058
+ "UD": {
1059
+ "accuracy": 0.08450704225352113,
1060
+ "count": 213
1061
+ }
1062
+ }
1063
+ },
1064
+ "sub_M4": {
1065
+ "full_accuracy": 0.0,
1066
+ "n_examples": 100,
1067
+ "per_subtask": {
1068
+ "MD": {
1069
+ "accuracy": 0.89,
1070
+ "count": 200
1071
+ },
1072
+ "MB": {
1073
+ "accuracy": 0.75,
1074
+ "count": 100
1075
+ },
1076
+ "UB": {
1077
+ "accuracy": 0.29,
1078
+ "count": 100
1079
+ },
1080
+ "UD": {
1081
+ "accuracy": 0.006666666666666667,
1082
+ "count": 300
1083
+ }
1084
+ }
1085
+ },
1086
+ "sub_M5": {
1087
+ "full_accuracy": 0.0,
1088
+ "n_examples": 100,
1089
+ "per_subtask": {
1090
+ "MD": {
1091
+ "accuracy": 1.0,
1092
+ "count": 100
1093
+ },
1094
+ "MB": {
1095
+ "accuracy": 0.74,
1096
+ "count": 100
1097
+ },
1098
+ "UB": {
1099
+ "accuracy": 0.35,
1100
+ "count": 100
1101
+ },
1102
+ "UD": {
1103
+ "accuracy": 0.025,
1104
+ "count": 400
1105
+ }
1106
+ }
1107
+ },
1108
+ "sub_random": {
1109
+ "full_accuracy": 0.04,
1110
+ "n_examples": 200,
1111
+ "per_subtask": {
1112
+ "MD": {
1113
+ "accuracy": 0.875,
1114
+ "count": 600
1115
+ },
1116
+ "MB": {
1117
+ "accuracy": 0.6629213483146067,
1118
+ "count": 267
1119
+ },
1120
+ "ME": {
1121
+ "accuracy": 0.9811320754716981,
1122
+ "count": 53
1123
+ },
1124
+ "UB": {
1125
+ "accuracy": 0.2437357630979499,
1126
+ "count": 439
1127
+ },
1128
+ "UD": {
1129
+ "accuracy": 0.12195121951219512,
1130
+ "count": 41
1131
+ }
1132
+ }
1133
+ },
1134
+ "sub_B3": {
1135
+ "full_accuracy": 0.02,
1136
+ "n_examples": 100,
1137
+ "per_subtask": {
1138
+ "MD": {
1139
+ "accuracy": 0.87,
1140
+ "count": 300
1141
+ },
1142
+ "MB": {
1143
+ "accuracy": 0.75,
1144
+ "count": 100
1145
+ },
1146
+ "UB": {
1147
+ "accuracy": 0.26903553299492383,
1148
+ "count": 197
1149
+ },
1150
+ "UD": {
1151
+ "accuracy": 0.11650485436893204,
1152
+ "count": 103
1153
+ }
1154
+ }
1155
+ },
1156
+ "sub_B4": {
1157
+ "full_accuracy": 0.0,
1158
+ "n_examples": 100,
1159
+ "per_subtask": {
1160
+ "MD": {
1161
+ "accuracy": 0.915,
1162
+ "count": 200
1163
+ },
1164
+ "MB": {
1165
+ "accuracy": 0.79,
1166
+ "count": 100
1167
+ },
1168
+ "UB": {
1169
+ "accuracy": 0.291497975708502,
1170
+ "count": 247
1171
+ },
1172
+ "UD": {
1173
+ "accuracy": 0.10457516339869281,
1174
+ "count": 153
1175
+ }
1176
+ }
1177
+ },
1178
+ "sub_B5": {
1179
+ "full_accuracy": 0.0,
1180
+ "n_examples": 100,
1181
+ "per_subtask": {
1182
+ "MD": {
1183
+ "accuracy": 1.0,
1184
+ "count": 100
1185
+ },
1186
+ "MB": {
1187
+ "accuracy": 0.76,
1188
+ "count": 100
1189
+ },
1190
+ "UB": {
1191
+ "accuracy": 0.29194630872483224,
1192
+ "count": 298
1193
+ },
1194
+ "UD": {
1195
+ "accuracy": 0.16336633663366337,
1196
+ "count": 202
1197
+ }
1198
+ }
1199
+ }
1200
+ },
1201
+ "summary": {
1202
+ "overall_accuracy": 0.06916666666666667,
1203
+ "total_examples": 2400,
1204
+ "n_splits": 22
1205
+ }
1206
+ }
1207
+ }
add_sub_baseline_25K_1L2H256d/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2419df932e118212192de7abd45dde4a7ef65934c47e2f930ab92a12790c2905
3
+ size 315072674
add_sub_baseline_25K_1L2H256d/train_config.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_rollouts": 4,
3
+ "K": 4,
4
+ "max_iterations": 2,
5
+ "memory_span_abs": 1792,
6
+ "memory_span_traj": 1792,
7
+ "temperature": 1.0,
8
+ "ar_search": false,
9
+ "response_only_abs": false,
10
+ "alpha_info_gain": 10.0,
11
+ "alpha_abs": 0.1,
12
+ "alpha_soft_zipf": 1.0,
13
+ "alpha_ortho": 0.0,
14
+ "alpha_anchor": 0.0,
15
+ "alpha_jacobi": 0.0,
16
+ "decay": 0.8,
17
+ "target_vocab_util": 0.8,
18
+ "min_abs_ppl": 0.0,
19
+ "zipf_alpha": 1.0,
20
+ "lr": 2e-05,
21
+ "emb_lr_mult": 1.0,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 234,
24
+ "cooldown_frac": 0.4,
25
+ "max_grad_norm": 1.0,
26
+ "vq_abs_pretrain_steps": 0,
27
+ "vq_abs_pretrain_lr": 0.001,
28
+ "vq_abs_pretrain_layer": -1,
29
+ "vq_abs_pretrain_batch_size": 256,
30
+ "vq_abs_pretrain_target_vectors": 20000,
31
+ "batch_size": 64,
32
+ "gradient_accumulation_steps": 1,
33
+ "num_epochs": 20,
34
+ "emb_warmup_steps": 0,
35
+ "log_every": 50,
36
+ "eval_every": 390,
37
+ "save_every": 999999,
38
+ "eval_samples": 100,
39
+ "output_dir": "ckpt/sweep/as_baseline_25K_1L2H256d",
40
+ "eval_K": 4,
41
+ "alpha_traj": 0.0,
42
+ "corrupt_method": "shuffle",
43
+ "corrupt_ratio": 0.3,
44
+ "alpha_contrastive": 1.0,
45
+ "gamma_contrastive": 0.5,
46
+ "alpha_masked_traj": 0.0,
47
+ "mask_nl_ratio": 0.3,
48
+ "mask_nl_mode": "fixed",
49
+ "mask_nl_fixed_id": 0,
50
+ "use_ste": true,
51
+ "n_inner": 1,
52
+ "random_K": null,
53
+ "strip_suffix": null,
54
+ "compress_prefix": null,
55
+ "random_mem_span": null,
56
+ "warmup_ratio": 0.03,
57
+ "beta2": 0.999,
58
+ "seed": 42,
59
+ "n_digits": 6,
60
+ "n_layer": 1,
61
+ "n_head": 2,
62
+ "n_embd": 256,
63
+ "ops": "add_sub",
64
+ "abs_vocab": 0,
65
+ "dataset_size": 25000,
66
+ "mode": "baseline",
67
+ "device": "cuda",
68
+ "push_to_hub": true,
69
+ "no_wandb": false,
70
+ "n_params": 78691840,
71
+ "run_name": "add_sub_baseline_25K_1L2H256d",
72
+ "git_commit": "f447da529caceac8c7d256cbb2cd185cbc50feac",
73
+ "timestamp": "2026-04-12T18:21:44.884269+00:00",
74
+ "tokenizer": "Qwen/Qwen3-0.6B",
75
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
+ "dataset_config": "add_sub_6digit",
77
+ "model_repo": "thoughtworks/arithmetic-sorl",
78
+ "trainer_version": "sft",
79
+ "wandb_run_id": "122jv7wp",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/122jv7wp",
81
+ "final_accuracy": 0.06916666666666667,
82
+ "sft_accuracy": 0.06916666666666667,
83
+ "eval_method": "ArithmeticEvaluator"
84
+ }