amirali1985 commited on
Commit
73e6b5f
·
verified ·
1 Parent(s): 62d71b3

Upload add_sub_baseline_10K

Browse files
add_sub_baseline_10K/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 510,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2040,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention"
18
+ ],
19
+ "max_position_embeddings": 128,
20
+ "max_window_layers": 28,
21
+ "model_type": "qwen3",
22
+ "num_attention_heads": 3,
23
+ "num_hidden_layers": 2,
24
+ "num_key_value_heads": 3,
25
+ "pad_token_id": null,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_parameters": {
28
+ "rope_theta": 10000.0,
29
+ "rope_type": "default"
30
+ },
31
+ "sliding_window": null,
32
+ "tie_word_embeddings": false,
33
+ "transformers_version": "5.5.0",
34
+ "use_cache": true,
35
+ "use_sliding_window": false,
36
+ "vocab_size": 151645
37
+ }
add_sub_baseline_10K/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_baseline_10K/metrics.json ADDED
@@ -0,0 +1,831 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 400,
12
+ 450,
13
+ 500,
14
+ 550,
15
+ 600,
16
+ 650,
17
+ 700,
18
+ 750,
19
+ 800,
20
+ 850,
21
+ 900,
22
+ 950,
23
+ 1000,
24
+ 1050,
25
+ 1100,
26
+ 1150,
27
+ 1200,
28
+ 1250,
29
+ 1300,
30
+ 1350,
31
+ 1400,
32
+ 1450,
33
+ 1500,
34
+ 1550,
35
+ 1600,
36
+ 1650,
37
+ 1700,
38
+ 1750,
39
+ 1800,
40
+ 1850,
41
+ 1900,
42
+ 1950,
43
+ 2000,
44
+ 2050,
45
+ 2100,
46
+ 2150,
47
+ 2200,
48
+ 2250,
49
+ 2300,
50
+ 2350,
51
+ 2400,
52
+ 2450,
53
+ 2500,
54
+ 2550,
55
+ 2600,
56
+ 2650,
57
+ 2700,
58
+ 2750,
59
+ 2800,
60
+ 2850,
61
+ 2900,
62
+ 2950,
63
+ 3000,
64
+ 3050,
65
+ 3100
66
+ ],
67
+ "loss": [
68
+ 10.090532302856445,
69
+ 7.41793155670166,
70
+ 6.670701503753662,
71
+ 5.4355902671813965,
72
+ 4.486183166503906,
73
+ 3.106839418411255,
74
+ 2.181303024291992,
75
+ 1.9472637176513672,
76
+ 1.6879997253417969,
77
+ 1.7242902517318726,
78
+ 1.3950544595718384,
79
+ 0.9311097860336304,
80
+ 0.593478798866272,
81
+ 0.4981410801410675,
82
+ 0.24719169735908508,
83
+ 0.10993476957082748,
84
+ 0.09566618502140045,
85
+ 0.08340698480606079,
86
+ 0.07292894273996353,
87
+ 0.05232112854719162,
88
+ 0.04302288219332695,
89
+ 0.04004356637597084,
90
+ 0.04495292156934738,
91
+ 0.048470307141542435,
92
+ 0.033653225749731064,
93
+ 0.030521346256136894,
94
+ 0.06012653931975365,
95
+ 0.05565373972058296,
96
+ 0.04759713634848595,
97
+ 0.054727476090192795,
98
+ 0.034593965858221054,
99
+ 0.027987422421574593,
100
+ 0.034813184291124344,
101
+ 0.03215349465608597,
102
+ 0.02478446625173092,
103
+ 0.022883405908942223,
104
+ 0.02932637929916382,
105
+ 0.017739301547408104,
106
+ 0.024523159489035606,
107
+ 0.02478833682835102,
108
+ 0.01739186979830265,
109
+ 0.01981291174888611,
110
+ 0.038495369255542755,
111
+ 0.02020339109003544,
112
+ 0.035459987819194794,
113
+ 0.033973682671785355,
114
+ 0.03820496052503586,
115
+ 0.01859247498214245,
116
+ 0.02102138102054596,
117
+ 0.021527009084820747,
118
+ 0.02982836402952671,
119
+ 0.026523862034082413,
120
+ 0.0283384770154953,
121
+ 0.023189542815089226,
122
+ 0.022154804319143295,
123
+ 0.02203175611793995,
124
+ 0.01161875855177641,
125
+ 0.01782594434916973,
126
+ 0.014171228744089603,
127
+ 0.014905942603945732,
128
+ 0.019558599218726158,
129
+ 0.012723379768431187
130
+ ],
131
+ "base_loss": [
132
+ 10.090532302856445,
133
+ 7.41793155670166,
134
+ 6.670701503753662,
135
+ 5.4355902671813965,
136
+ 4.486183166503906,
137
+ 3.106839418411255,
138
+ 2.181303024291992,
139
+ 1.9472637176513672,
140
+ 1.6879997253417969,
141
+ 1.7242902517318726,
142
+ 1.3950544595718384,
143
+ 0.9311097860336304,
144
+ 0.593478798866272,
145
+ 0.4981410801410675,
146
+ 0.24719169735908508,
147
+ 0.10993476957082748,
148
+ 0.09566618502140045,
149
+ 0.08340698480606079,
150
+ 0.07292894273996353,
151
+ 0.05232112854719162,
152
+ 0.04302288219332695,
153
+ 0.04004356637597084,
154
+ 0.04495292156934738,
155
+ 0.048470307141542435,
156
+ 0.033653225749731064,
157
+ 0.030521346256136894,
158
+ 0.06012653931975365,
159
+ 0.05565373972058296,
160
+ 0.04759713634848595,
161
+ 0.054727476090192795,
162
+ 0.034593965858221054,
163
+ 0.027987422421574593,
164
+ 0.034813184291124344,
165
+ 0.03215349465608597,
166
+ 0.02478446625173092,
167
+ 0.022883405908942223,
168
+ 0.02932637929916382,
169
+ 0.017739301547408104,
170
+ 0.024523159489035606,
171
+ 0.02478833682835102,
172
+ 0.01739186979830265,
173
+ 0.01981291174888611,
174
+ 0.038495369255542755,
175
+ 0.02020339109003544,
176
+ 0.035459987819194794,
177
+ 0.033973682671785355,
178
+ 0.03820496052503586,
179
+ 0.01859247498214245,
180
+ 0.02102138102054596,
181
+ 0.021527009084820747,
182
+ 0.02982836402952671,
183
+ 0.026523862034082413,
184
+ 0.0283384770154953,
185
+ 0.023189542815089226,
186
+ 0.022154804319143295,
187
+ 0.02203175611793995,
188
+ 0.01161875855177641,
189
+ 0.01782594434916973,
190
+ 0.014171228744089603,
191
+ 0.014905942603945732,
192
+ 0.019558599218726158,
193
+ 0.012723379768431187
194
+ ],
195
+ "lr": [
196
+ 6.242038216560511e-06,
197
+ 1.2611464968152866e-05,
198
+ 1.8980891719745225e-05,
199
+ 2.5350318471337578e-05,
200
+ 3.1719745222929934e-05,
201
+ 3.808917197452229e-05,
202
+ 4.445859872611465e-05,
203
+ 5.082802547770701e-05,
204
+ 5.7197452229299365e-05,
205
+ 6.356687898089173e-05,
206
+ 6.993630573248408e-05,
207
+ 7.630573248407644e-05,
208
+ 7.99862055592881e-05,
209
+ 7.984241248831029e-05,
210
+ 7.954287783192742e-05,
211
+ 7.90887724530305e-05,
212
+ 7.848187142213441e-05,
213
+ 7.772454707873448e-05,
214
+ 7.681975975797462e-05,
215
+ 7.57710462188759e-05,
216
+ 7.458250581935905e-05,
217
+ 7.325878449210182e-05,
218
+ 7.180505658386849e-05,
219
+ 7.022700462930083e-05,
220
+ 6.853079713823312e-05,
221
+ 6.672306448335957e-05,
222
+ 6.481087298250779e-05,
223
+ 6.280169727682872e-05,
224
+ 6.070339111287581e-05,
225
+ 5.8524156642783655e-05,
226
+ 5.627251236255051e-05,
227
+ 5.3957259813751526e-05,
228
+ 5.1587449178844164e-05,
229
+ 4.917234390455111e-05,
230
+ 4.672138449160635e-05,
231
+ 4.424415159240753e-05,
232
+ 4.175032856082417e-05,
233
+ 3.924966360055181e-05,
234
+ 3.675193165997228e-05,
235
+ 3.42668962224704e-05,
236
+ 3.180427114156694e-05,
237
+ 2.9373682670051437e-05,
238
+ 2.6984631831541183e-05,
239
+ 2.4646457281553407e-05,
240
+ 2.2368298803264487e-05,
241
+ 2.0159061580649347e-05,
242
+ 1.8027381388654794e-05,
243
+ 1.5981590836476463e-05,
244
+ 1.4029686795892575e-05,
245
+ 1.2179299141974771e-05,
246
+ 1.0437660928367057e-05,
247
+ 8.811580113715755e-06,
248
+ 7.307412949770034e-06,
249
+ 5.9310391351775455e-06,
250
+ 4.687838832097362e-06,
251
+ 3.5826716354707645e-06,
252
+ 2.6198575771580583e-06,
253
+ 1.8031602391947344e-06,
254
+ 1.1357720421765062e-06,
255
+ 6.203017662798872e-07,
256
+ 2.5876435369797334e-07,
257
+ 5.257303235302935e-08
258
+ ],
259
+ "eval_step": [
260
+ 156,
261
+ 312,
262
+ 468,
263
+ 624,
264
+ 780,
265
+ 936,
266
+ 1092,
267
+ 1248,
268
+ 1404,
269
+ 1560,
270
+ 1716,
271
+ 1872,
272
+ 2028,
273
+ 2184,
274
+ 2340,
275
+ 2496,
276
+ 2652,
277
+ 2808,
278
+ 2964,
279
+ 3120
280
+ ],
281
+ "eval_epoch": [
282
+ 1,
283
+ 2,
284
+ 3,
285
+ 4,
286
+ 5,
287
+ 6,
288
+ 7,
289
+ 8,
290
+ 9,
291
+ 10,
292
+ 11,
293
+ 12,
294
+ 13,
295
+ 14,
296
+ 15,
297
+ 16,
298
+ 17,
299
+ 18,
300
+ 19,
301
+ 20
302
+ ],
303
+ "eval_accuracy": [
304
+ 0.0,
305
+ 0.0064285714285714285,
306
+ 0.007142857142857143,
307
+ 0.04,
308
+ 0.6357142857142857,
309
+ 0.7271428571428571,
310
+ 0.7485714285714286,
311
+ 0.7471428571428571,
312
+ 0.75,
313
+ 0.7878571428571428,
314
+ 0.7842857142857143,
315
+ 0.7714285714285715,
316
+ 0.8028571428571428,
317
+ 0.8085714285714286,
318
+ 0.8228571428571428,
319
+ 0.8342857142857143,
320
+ 0.8307142857142857,
321
+ 0.8635714285714285,
322
+ 0.8814285714285715,
323
+ 0.8628571428571429
324
+ ]
325
+ },
326
+ "final_accuracy": 0.8628571428571429,
327
+ "sft_eval": {
328
+ "config": {
329
+ "ops": "add_sub",
330
+ "K": null,
331
+ "mode": "sft",
332
+ "n_digits": 6,
333
+ "n_per_split": 50
334
+ },
335
+ "splits": {
336
+ "add_S0": {
337
+ "full_accuracy": 1.0,
338
+ "n_examples": 50,
339
+ "per_subtask": {
340
+ "SA": {
341
+ "accuracy": 1.0,
342
+ "count": 295
343
+ },
344
+ "SS": {
345
+ "accuracy": 1.0,
346
+ "count": 55
347
+ }
348
+ }
349
+ },
350
+ "add_S1": {
351
+ "full_accuracy": 1.0,
352
+ "n_examples": 50,
353
+ "per_subtask": {
354
+ "SA": {
355
+ "accuracy": 1.0,
356
+ "count": 126
357
+ },
358
+ "SC": {
359
+ "accuracy": 1.0,
360
+ "count": 79
361
+ },
362
+ "SS": {
363
+ "accuracy": 1.0,
364
+ "count": 21
365
+ },
366
+ "UC": {
367
+ "accuracy": 1.0,
368
+ "count": 124
369
+ }
370
+ }
371
+ },
372
+ "add_S2": {
373
+ "full_accuracy": 0.96,
374
+ "n_examples": 50,
375
+ "per_subtask": {
376
+ "SA": {
377
+ "accuracy": 1.0,
378
+ "count": 75
379
+ },
380
+ "SC": {
381
+ "accuracy": 0.967741935483871,
382
+ "count": 62
383
+ },
384
+ "SS": {
385
+ "accuracy": 1.0,
386
+ "count": 39
387
+ },
388
+ "UC": {
389
+ "accuracy": 1.0,
390
+ "count": 111
391
+ },
392
+ "US": {
393
+ "accuracy": 1.0,
394
+ "count": 63
395
+ }
396
+ }
397
+ },
398
+ "add_S3": {
399
+ "full_accuracy": 0.88,
400
+ "n_examples": 50,
401
+ "per_subtask": {
402
+ "SA": {
403
+ "accuracy": 1.0,
404
+ "count": 60
405
+ },
406
+ "SC": {
407
+ "accuracy": 1.0,
408
+ "count": 57
409
+ },
410
+ "SS": {
411
+ "accuracy": 1.0,
412
+ "count": 19
413
+ },
414
+ "UC": {
415
+ "accuracy": 0.9423076923076923,
416
+ "count": 104
417
+ },
418
+ "US": {
419
+ "accuracy": 1.0,
420
+ "count": 110
421
+ }
422
+ }
423
+ },
424
+ "add_S4": {
425
+ "full_accuracy": 0.7,
426
+ "n_examples": 50,
427
+ "per_subtask": {
428
+ "SA": {
429
+ "accuracy": 1.0,
430
+ "count": 48
431
+ },
432
+ "SC": {
433
+ "accuracy": 1.0,
434
+ "count": 52
435
+ },
436
+ "SS": {
437
+ "accuracy": 1.0,
438
+ "count": 7
439
+ },
440
+ "UC": {
441
+ "accuracy": 0.8314606741573034,
442
+ "count": 89
443
+ },
444
+ "US": {
445
+ "accuracy": 0.9025974025974026,
446
+ "count": 154
447
+ }
448
+ }
449
+ },
450
+ "add_S5": {
451
+ "full_accuracy": 0.58,
452
+ "n_examples": 50,
453
+ "per_subtask": {
454
+ "SA": {
455
+ "accuracy": 1.0,
456
+ "count": 50
457
+ },
458
+ "SC": {
459
+ "accuracy": 1.0,
460
+ "count": 50
461
+ },
462
+ "UC": {
463
+ "accuracy": 0.6,
464
+ "count": 50
465
+ },
466
+ "US": {
467
+ "accuracy": 0.79,
468
+ "count": 200
469
+ }
470
+ }
471
+ },
472
+ "add_S6": {
473
+ "full_accuracy": 1.0,
474
+ "n_examples": 50,
475
+ "per_subtask": {
476
+ "SC": {
477
+ "accuracy": 1.0,
478
+ "count": 50
479
+ },
480
+ "UC": {
481
+ "accuracy": 1.0,
482
+ "count": 50
483
+ },
484
+ "US": {
485
+ "accuracy": 1.0,
486
+ "count": 250
487
+ }
488
+ }
489
+ },
490
+ "add_random": {
491
+ "full_accuracy": 0.99,
492
+ "n_examples": 200,
493
+ "per_subtask": {
494
+ "SA": {
495
+ "accuracy": 1.0,
496
+ "count": 431
497
+ },
498
+ "SC": {
499
+ "accuracy": 0.9968354430379747,
500
+ "count": 316
501
+ },
502
+ "SS": {
503
+ "accuracy": 1.0,
504
+ "count": 39
505
+ },
506
+ "UC": {
507
+ "accuracy": 0.9982142857142857,
508
+ "count": 560
509
+ },
510
+ "US": {
511
+ "accuracy": 1.0,
512
+ "count": 54
513
+ }
514
+ }
515
+ },
516
+ "add_C3": {
517
+ "full_accuracy": 0.88,
518
+ "n_examples": 50,
519
+ "per_subtask": {
520
+ "SA": {
521
+ "accuracy": 1.0,
522
+ "count": 150
523
+ },
524
+ "SC": {
525
+ "accuracy": 1.0,
526
+ "count": 50
527
+ },
528
+ "UC": {
529
+ "accuracy": 0.9423076923076923,
530
+ "count": 104
531
+ },
532
+ "US": {
533
+ "accuracy": 1.0,
534
+ "count": 46
535
+ }
536
+ }
537
+ },
538
+ "add_C4": {
539
+ "full_accuracy": 0.86,
540
+ "n_examples": 50,
541
+ "per_subtask": {
542
+ "SA": {
543
+ "accuracy": 1.0,
544
+ "count": 100
545
+ },
546
+ "SC": {
547
+ "accuracy": 1.0,
548
+ "count": 50
549
+ },
550
+ "UC": {
551
+ "accuracy": 0.943089430894309,
552
+ "count": 123
553
+ },
554
+ "US": {
555
+ "accuracy": 0.948051948051948,
556
+ "count": 77
557
+ }
558
+ }
559
+ },
560
+ "add_C5": {
561
+ "full_accuracy": 0.86,
562
+ "n_examples": 50,
563
+ "per_subtask": {
564
+ "SA": {
565
+ "accuracy": 1.0,
566
+ "count": 50
567
+ },
568
+ "SC": {
569
+ "accuracy": 1.0,
570
+ "count": 50
571
+ },
572
+ "UC": {
573
+ "accuracy": 0.9545454545454546,
574
+ "count": 154
575
+ },
576
+ "US": {
577
+ "accuracy": 0.9895833333333334,
578
+ "count": 96
579
+ }
580
+ }
581
+ },
582
+ "add_C6": {
583
+ "full_accuracy": 0.88,
584
+ "n_examples": 50,
585
+ "per_subtask": {
586
+ "SC": {
587
+ "accuracy": 1.0,
588
+ "count": 50
589
+ },
590
+ "UC": {
591
+ "accuracy": 0.967032967032967,
592
+ "count": 182
593
+ },
594
+ "US": {
595
+ "accuracy": 0.940677966101695,
596
+ "count": 118
597
+ }
598
+ }
599
+ },
600
+ "sub_M0": {
601
+ "full_accuracy": 1.0,
602
+ "n_examples": 50,
603
+ "per_subtask": {
604
+ "MD": {
605
+ "accuracy": 1.0,
606
+ "count": 294
607
+ },
608
+ "ME": {
609
+ "accuracy": 1.0,
610
+ "count": 56
611
+ }
612
+ }
613
+ },
614
+ "sub_M1": {
615
+ "full_accuracy": 1.0,
616
+ "n_examples": 50,
617
+ "per_subtask": {
618
+ "MD": {
619
+ "accuracy": 1.0,
620
+ "count": 143
621
+ },
622
+ "MB": {
623
+ "accuracy": 1.0,
624
+ "count": 69
625
+ },
626
+ "ME": {
627
+ "accuracy": 1.0,
628
+ "count": 15
629
+ },
630
+ "UB": {
631
+ "accuracy": 1.0,
632
+ "count": 123
633
+ }
634
+ }
635
+ },
636
+ "sub_M2": {
637
+ "full_accuracy": 1.0,
638
+ "n_examples": 50,
639
+ "per_subtask": {
640
+ "MD": {
641
+ "accuracy": 1.0,
642
+ "count": 108
643
+ },
644
+ "MB": {
645
+ "accuracy": 1.0,
646
+ "count": 52
647
+ },
648
+ "ME": {
649
+ "accuracy": 1.0,
650
+ "count": 52
651
+ },
652
+ "UB": {
653
+ "accuracy": 1.0,
654
+ "count": 87
655
+ },
656
+ "UD": {
657
+ "accuracy": 1.0,
658
+ "count": 51
659
+ }
660
+ }
661
+ },
662
+ "sub_M3": {
663
+ "full_accuracy": 0.46,
664
+ "n_examples": 50,
665
+ "per_subtask": {
666
+ "MD": {
667
+ "accuracy": 1.0,
668
+ "count": 94
669
+ },
670
+ "MB": {
671
+ "accuracy": 1.0,
672
+ "count": 51
673
+ },
674
+ "ME": {
675
+ "accuracy": 1.0,
676
+ "count": 25
677
+ },
678
+ "UB": {
679
+ "accuracy": 0.6538461538461539,
680
+ "count": 78
681
+ },
682
+ "UD": {
683
+ "accuracy": 1.0,
684
+ "count": 102
685
+ }
686
+ }
687
+ },
688
+ "sub_M4": {
689
+ "full_accuracy": 0.48,
690
+ "n_examples": 50,
691
+ "per_subtask": {
692
+ "MD": {
693
+ "accuracy": 1.0,
694
+ "count": 100
695
+ },
696
+ "MB": {
697
+ "accuracy": 1.0,
698
+ "count": 50
699
+ },
700
+ "UB": {
701
+ "accuracy": 0.48,
702
+ "count": 50
703
+ },
704
+ "UD": {
705
+ "accuracy": 0.8333333333333334,
706
+ "count": 150
707
+ }
708
+ }
709
+ },
710
+ "sub_M5": {
711
+ "full_accuracy": 0.36,
712
+ "n_examples": 50,
713
+ "per_subtask": {
714
+ "MD": {
715
+ "accuracy": 1.0,
716
+ "count": 50
717
+ },
718
+ "MB": {
719
+ "accuracy": 1.0,
720
+ "count": 50
721
+ },
722
+ "UB": {
723
+ "accuracy": 0.44,
724
+ "count": 50
725
+ },
726
+ "UD": {
727
+ "accuracy": 0.69,
728
+ "count": 200
729
+ }
730
+ }
731
+ },
732
+ "sub_random": {
733
+ "full_accuracy": 0.995,
734
+ "n_examples": 200,
735
+ "per_subtask": {
736
+ "MD": {
737
+ "accuracy": 1.0,
738
+ "count": 588
739
+ },
740
+ "MB": {
741
+ "accuracy": 1.0,
742
+ "count": 268
743
+ },
744
+ "ME": {
745
+ "accuracy": 1.0,
746
+ "count": 60
747
+ },
748
+ "UB": {
749
+ "accuracy": 0.9977628635346756,
750
+ "count": 447
751
+ },
752
+ "UD": {
753
+ "accuracy": 1.0,
754
+ "count": 37
755
+ }
756
+ }
757
+ },
758
+ "sub_B3": {
759
+ "full_accuracy": 0.92,
760
+ "n_examples": 50,
761
+ "per_subtask": {
762
+ "MD": {
763
+ "accuracy": 1.0,
764
+ "count": 150
765
+ },
766
+ "MB": {
767
+ "accuracy": 1.0,
768
+ "count": 50
769
+ },
770
+ "UB": {
771
+ "accuracy": 0.9626168224299065,
772
+ "count": 107
773
+ },
774
+ "UD": {
775
+ "accuracy": 1.0,
776
+ "count": 43
777
+ }
778
+ }
779
+ },
780
+ "sub_B4": {
781
+ "full_accuracy": 0.72,
782
+ "n_examples": 50,
783
+ "per_subtask": {
784
+ "MD": {
785
+ "accuracy": 1.0,
786
+ "count": 100
787
+ },
788
+ "MB": {
789
+ "accuracy": 1.0,
790
+ "count": 50
791
+ },
792
+ "UB": {
793
+ "accuracy": 0.8771929824561403,
794
+ "count": 114
795
+ },
796
+ "UD": {
797
+ "accuracy": 0.9302325581395349,
798
+ "count": 86
799
+ }
800
+ }
801
+ },
802
+ "sub_B5": {
803
+ "full_accuracy": 0.7,
804
+ "n_examples": 50,
805
+ "per_subtask": {
806
+ "MD": {
807
+ "accuracy": 1.0,
808
+ "count": 50
809
+ },
810
+ "MB": {
811
+ "accuracy": 1.0,
812
+ "count": 50
813
+ },
814
+ "UB": {
815
+ "accuracy": 0.9019607843137255,
816
+ "count": 153
817
+ },
818
+ "UD": {
819
+ "accuracy": 0.865979381443299,
820
+ "count": 97
821
+ }
822
+ }
823
+ }
824
+ },
825
+ "summary": {
826
+ "overall_accuracy": 0.8628571428571429,
827
+ "total_examples": 1400,
828
+ "n_splits": 22
829
+ }
830
+ }
831
+ }
add_sub_baseline_10K/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc2ceca0901e7a6c1a4fdcedf14bf615dda905c45631f688fe16182a3dabbdcd
3
+ size 650266922
add_sub_baseline_10K/train_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "baseline",
3
+ "ops": "add_sub",
4
+ "n_digits": 6,
5
+ "n_layer": 2,
6
+ "n_head": 3,
7
+ "n_embd": 510,
8
+ "abs_vocab": 0,
9
+ "K": 4,
10
+ "alpha_info_gain": 10.0,
11
+ "alpha_abs": 0.1,
12
+ "alpha_soft_zipf": 1.0,
13
+ "batch_size": 64,
14
+ "num_epochs": 20,
15
+ "dataset_size": 10000,
16
+ "lr": 8e-05,
17
+ "output_dir": "ckpt/sweep/add_sub_baseline_10K",
18
+ "device": "cuda",
19
+ "push_to_hub": true,
20
+ "no_wandb": false,
21
+ "n_params": 162490082,
22
+ "run_name": "add_sub_baseline_10K",
23
+ "git_commit": "800625019270114adcda289bbd550c4f1109a514",
24
+ "timestamp": "2026-04-12T01:58:10.796905+00:00",
25
+ "tokenizer": "Qwen/Qwen3-0.6B",
26
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
27
+ "dataset_config": "add_sub_6digit",
28
+ "model_repo": "thoughtworks/arithmetic-sorl",
29
+ "trainer_version": "sft",
30
+ "wandb_run_id": "04vy23fy",
31
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/04vy23fy",
32
+ "final_accuracy": 0.8628571428571429,
33
+ "sft_accuracy": 0.8628571428571429,
34
+ "eval_method": "ArithmeticEvaluator"
35
+ }