amirali1985 commited on
Commit
73638b8
·
verified ·
1 Parent(s): 5c68be0

Upload add_sub_baseline_10K

Browse files
add_sub_baseline_10K/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 510,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2040,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention"
18
+ ],
19
+ "max_position_embeddings": 128,
20
+ "max_window_layers": 28,
21
+ "model_type": "qwen3",
22
+ "num_attention_heads": 3,
23
+ "num_hidden_layers": 2,
24
+ "num_key_value_heads": 3,
25
+ "pad_token_id": null,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_parameters": {
28
+ "rope_theta": 10000.0,
29
+ "rope_type": "default"
30
+ },
31
+ "sliding_window": null,
32
+ "tie_word_embeddings": false,
33
+ "transformers_version": "5.5.0",
34
+ "use_cache": true,
35
+ "use_sliding_window": false,
36
+ "vocab_size": 151645
37
+ }
add_sub_baseline_10K/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_baseline_10K/metrics.json ADDED
@@ -0,0 +1,831 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 400,
12
+ 450,
13
+ 500,
14
+ 550,
15
+ 600,
16
+ 650,
17
+ 700,
18
+ 750,
19
+ 800,
20
+ 850,
21
+ 900,
22
+ 950,
23
+ 1000,
24
+ 1050,
25
+ 1100,
26
+ 1150,
27
+ 1200,
28
+ 1250,
29
+ 1300,
30
+ 1350,
31
+ 1400,
32
+ 1450,
33
+ 1500,
34
+ 1550,
35
+ 1600,
36
+ 1650,
37
+ 1700,
38
+ 1750,
39
+ 1800,
40
+ 1850,
41
+ 1900,
42
+ 1950,
43
+ 2000,
44
+ 2050,
45
+ 2100,
46
+ 2150,
47
+ 2200,
48
+ 2250,
49
+ 2300,
50
+ 2350,
51
+ 2400,
52
+ 2450,
53
+ 2500,
54
+ 2550,
55
+ 2600,
56
+ 2650,
57
+ 2700,
58
+ 2750,
59
+ 2800,
60
+ 2850,
61
+ 2900,
62
+ 2950,
63
+ 3000,
64
+ 3050,
65
+ 3100
66
+ ],
67
+ "loss": [
68
+ 10.006766319274902,
69
+ 7.251689434051514,
70
+ 6.329094886779785,
71
+ 5.457243919372559,
72
+ 4.436343669891357,
73
+ 3.0709240436553955,
74
+ 2.1984894275665283,
75
+ 1.9786336421966553,
76
+ 1.8310412168502808,
77
+ 1.7493754625320435,
78
+ 1.7336280345916748,
79
+ 1.668373942375183,
80
+ 1.656901478767395,
81
+ 1.5222007036209106,
82
+ 1.1166151762008667,
83
+ 0.8200947642326355,
84
+ 0.5962659120559692,
85
+ 0.4486698508262634,
86
+ 0.3298042118549347,
87
+ 0.21894291043281555,
88
+ 0.1570301353931427,
89
+ 0.12834718823432922,
90
+ 0.09281289577484131,
91
+ 0.0865481048822403,
92
+ 0.10090114921331406,
93
+ 0.05000796541571617,
94
+ 0.05903910472989082,
95
+ 0.041255030781030655,
96
+ 0.03712769225239754,
97
+ 0.05814942717552185,
98
+ 0.04444698244333267,
99
+ 0.03615949675440788,
100
+ 0.027622858062386513,
101
+ 0.04681888222694397,
102
+ 0.06002098321914673,
103
+ 0.049318160861730576,
104
+ 0.059921007603406906,
105
+ 0.032836705446243286,
106
+ 0.061555344611406326,
107
+ 0.03171056881546974,
108
+ 0.04210656136274338,
109
+ 0.04413192719221115,
110
+ 0.047493066638708115,
111
+ 0.050301071256399155,
112
+ 0.02659599296748638,
113
+ 0.022891702130436897,
114
+ 0.05459100008010864,
115
+ 0.017184091731905937,
116
+ 0.013001530431210995,
117
+ 0.03498239442706108,
118
+ 0.02405695430934429,
119
+ 0.03834375739097595,
120
+ 0.03403574228286743,
121
+ 0.014610671438276768,
122
+ 0.044524967670440674,
123
+ 0.017222251743078232,
124
+ 0.03136735409498215,
125
+ 0.02747531048953533,
126
+ 0.030222097411751747,
127
+ 0.02000907063484192,
128
+ 0.02265077270567417,
129
+ 0.026362445205450058
130
+ ],
131
+ "base_loss": [
132
+ 10.006766319274902,
133
+ 7.251689434051514,
134
+ 6.329094886779785,
135
+ 5.457243919372559,
136
+ 4.436343669891357,
137
+ 3.0709240436553955,
138
+ 2.1984894275665283,
139
+ 1.9786336421966553,
140
+ 1.8310412168502808,
141
+ 1.7493754625320435,
142
+ 1.7336280345916748,
143
+ 1.668373942375183,
144
+ 1.656901478767395,
145
+ 1.5222007036209106,
146
+ 1.1166151762008667,
147
+ 0.8200947642326355,
148
+ 0.5962659120559692,
149
+ 0.4486698508262634,
150
+ 0.3298042118549347,
151
+ 0.21894291043281555,
152
+ 0.1570301353931427,
153
+ 0.12834718823432922,
154
+ 0.09281289577484131,
155
+ 0.0865481048822403,
156
+ 0.10090114921331406,
157
+ 0.05000796541571617,
158
+ 0.05903910472989082,
159
+ 0.041255030781030655,
160
+ 0.03712769225239754,
161
+ 0.05814942717552185,
162
+ 0.04444698244333267,
163
+ 0.03615949675440788,
164
+ 0.027622858062386513,
165
+ 0.04681888222694397,
166
+ 0.06002098321914673,
167
+ 0.049318160861730576,
168
+ 0.059921007603406906,
169
+ 0.032836705446243286,
170
+ 0.061555344611406326,
171
+ 0.03171056881546974,
172
+ 0.04210656136274338,
173
+ 0.04413192719221115,
174
+ 0.047493066638708115,
175
+ 0.050301071256399155,
176
+ 0.02659599296748638,
177
+ 0.022891702130436897,
178
+ 0.05459100008010864,
179
+ 0.017184091731905937,
180
+ 0.013001530431210995,
181
+ 0.03498239442706108,
182
+ 0.02405695430934429,
183
+ 0.03834375739097595,
184
+ 0.03403574228286743,
185
+ 0.014610671438276768,
186
+ 0.044524967670440674,
187
+ 0.017222251743078232,
188
+ 0.03136735409498215,
189
+ 0.02747531048953533,
190
+ 0.030222097411751747,
191
+ 0.02000907063484192,
192
+ 0.02265077270567417,
193
+ 0.026362445205450058
194
+ ],
195
+ "lr": [
196
+ 6.242038216560511e-06,
197
+ 1.2611464968152866e-05,
198
+ 1.8980891719745225e-05,
199
+ 2.5350318471337578e-05,
200
+ 3.1719745222929934e-05,
201
+ 3.808917197452229e-05,
202
+ 4.445859872611465e-05,
203
+ 5.082802547770701e-05,
204
+ 5.7197452229299365e-05,
205
+ 6.356687898089173e-05,
206
+ 6.993630573248408e-05,
207
+ 7.630573248407644e-05,
208
+ 7.99862055592881e-05,
209
+ 7.984241248831029e-05,
210
+ 7.954287783192742e-05,
211
+ 7.90887724530305e-05,
212
+ 7.848187142213441e-05,
213
+ 7.772454707873448e-05,
214
+ 7.681975975797462e-05,
215
+ 7.57710462188759e-05,
216
+ 7.458250581935905e-05,
217
+ 7.325878449210182e-05,
218
+ 7.180505658386849e-05,
219
+ 7.022700462930083e-05,
220
+ 6.853079713823312e-05,
221
+ 6.672306448335957e-05,
222
+ 6.481087298250779e-05,
223
+ 6.280169727682872e-05,
224
+ 6.070339111287581e-05,
225
+ 5.8524156642783655e-05,
226
+ 5.627251236255051e-05,
227
+ 5.3957259813751526e-05,
228
+ 5.1587449178844164e-05,
229
+ 4.917234390455111e-05,
230
+ 4.672138449160635e-05,
231
+ 4.424415159240753e-05,
232
+ 4.175032856082417e-05,
233
+ 3.924966360055181e-05,
234
+ 3.675193165997228e-05,
235
+ 3.42668962224704e-05,
236
+ 3.180427114156694e-05,
237
+ 2.9373682670051437e-05,
238
+ 2.6984631831541183e-05,
239
+ 2.4646457281553407e-05,
240
+ 2.2368298803264487e-05,
241
+ 2.0159061580649347e-05,
242
+ 1.8027381388654794e-05,
243
+ 1.5981590836476463e-05,
244
+ 1.4029686795892575e-05,
245
+ 1.2179299141974771e-05,
246
+ 1.0437660928367057e-05,
247
+ 8.811580113715755e-06,
248
+ 7.307412949770034e-06,
249
+ 5.9310391351775455e-06,
250
+ 4.687838832097362e-06,
251
+ 3.5826716354707645e-06,
252
+ 2.6198575771580583e-06,
253
+ 1.8031602391947344e-06,
254
+ 1.1357720421765062e-06,
255
+ 6.203017662798872e-07,
256
+ 2.5876435369797334e-07,
257
+ 5.257303235302935e-08
258
+ ],
259
+ "eval_step": [
260
+ 156,
261
+ 312,
262
+ 468,
263
+ 624,
264
+ 780,
265
+ 936,
266
+ 1092,
267
+ 1248,
268
+ 1404,
269
+ 1560,
270
+ 1716,
271
+ 1872,
272
+ 2028,
273
+ 2184,
274
+ 2340,
275
+ 2496,
276
+ 2652,
277
+ 2808,
278
+ 2964,
279
+ 3120
280
+ ],
281
+ "eval_epoch": [
282
+ 1,
283
+ 2,
284
+ 3,
285
+ 4,
286
+ 5,
287
+ 6,
288
+ 7,
289
+ 8,
290
+ 9,
291
+ 10,
292
+ 11,
293
+ 12,
294
+ 13,
295
+ 14,
296
+ 15,
297
+ 16,
298
+ 17,
299
+ 18,
300
+ 19,
301
+ 20
302
+ ],
303
+ "eval_accuracy": [
304
+ 0.0014285714285714286,
305
+ 0.012142857142857143,
306
+ 0.004285714285714286,
307
+ 0.007142857142857143,
308
+ 0.035,
309
+ 0.26285714285714284,
310
+ 0.6278571428571429,
311
+ 0.6864285714285714,
312
+ 0.6885714285714286,
313
+ 0.7314285714285714,
314
+ 0.7442857142857143,
315
+ 0.73,
316
+ 0.735,
317
+ 0.75,
318
+ 0.7807142857142857,
319
+ 0.7964285714285714,
320
+ 0.8185714285714286,
321
+ 0.8185714285714286,
322
+ 0.8221428571428572,
323
+ 0.8235714285714286
324
+ ]
325
+ },
326
+ "final_accuracy": 0.7609259259259259,
327
+ "sft_eval": {
328
+ "config": {
329
+ "ops": "add_sub",
330
+ "K": null,
331
+ "mode": "sft",
332
+ "n_digits": 6,
333
+ "n_per_split": 250
334
+ },
335
+ "splits": {
336
+ "add_S0": {
337
+ "full_accuracy": 0.988,
338
+ "n_examples": 250,
339
+ "per_subtask": {
340
+ "SA": {
341
+ "accuracy": 0.9980302035456337,
342
+ "count": 1523
343
+ },
344
+ "SS": {
345
+ "accuracy": 0.9955947136563876,
346
+ "count": 227
347
+ }
348
+ }
349
+ },
350
+ "add_S1": {
351
+ "full_accuracy": 0.996,
352
+ "n_examples": 250,
353
+ "per_subtask": {
354
+ "SA": {
355
+ "accuracy": 1.0,
356
+ "count": 542
357
+ },
358
+ "SC": {
359
+ "accuracy": 0.9976019184652278,
360
+ "count": 417
361
+ },
362
+ "SS": {
363
+ "accuracy": 1.0,
364
+ "count": 70
365
+ },
366
+ "UC": {
367
+ "accuracy": 1.0,
368
+ "count": 721
369
+ }
370
+ }
371
+ },
372
+ "add_S2": {
373
+ "full_accuracy": 0.968,
374
+ "n_examples": 250,
375
+ "per_subtask": {
376
+ "SA": {
377
+ "accuracy": 1.0,
378
+ "count": 368
379
+ },
380
+ "SC": {
381
+ "accuracy": 0.9750778816199377,
382
+ "count": 321
383
+ },
384
+ "SS": {
385
+ "accuracy": 1.0,
386
+ "count": 228
387
+ },
388
+ "UC": {
389
+ "accuracy": 1.0,
390
+ "count": 531
391
+ },
392
+ "US": {
393
+ "accuracy": 1.0,
394
+ "count": 302
395
+ }
396
+ }
397
+ },
398
+ "add_S3": {
399
+ "full_accuracy": 0.74,
400
+ "n_examples": 250,
401
+ "per_subtask": {
402
+ "SA": {
403
+ "accuracy": 1.0,
404
+ "count": 307
405
+ },
406
+ "SC": {
407
+ "accuracy": 0.9965635738831615,
408
+ "count": 291
409
+ },
410
+ "SS": {
411
+ "accuracy": 1.0,
412
+ "count": 113
413
+ },
414
+ "UC": {
415
+ "accuracy": 0.8674948240165632,
416
+ "count": 483
417
+ },
418
+ "US": {
419
+ "accuracy": 1.0,
420
+ "count": 556
421
+ }
422
+ }
423
+ },
424
+ "add_S4": {
425
+ "full_accuracy": 0.704,
426
+ "n_examples": 250,
427
+ "per_subtask": {
428
+ "SA": {
429
+ "accuracy": 1.0,
430
+ "count": 238
431
+ },
432
+ "SC": {
433
+ "accuracy": 1.0,
434
+ "count": 271
435
+ },
436
+ "SS": {
437
+ "accuracy": 1.0,
438
+ "count": 59
439
+ },
440
+ "UC": {
441
+ "accuracy": 0.8395061728395061,
442
+ "count": 405
443
+ },
444
+ "US": {
445
+ "accuracy": 0.9317889317889317,
446
+ "count": 777
447
+ }
448
+ }
449
+ },
450
+ "add_S5": {
451
+ "full_accuracy": 0.5,
452
+ "n_examples": 250,
453
+ "per_subtask": {
454
+ "SA": {
455
+ "accuracy": 1.0,
456
+ "count": 250
457
+ },
458
+ "SC": {
459
+ "accuracy": 1.0,
460
+ "count": 250
461
+ },
462
+ "UC": {
463
+ "accuracy": 0.644,
464
+ "count": 250
465
+ },
466
+ "US": {
467
+ "accuracy": 0.795,
468
+ "count": 1000
469
+ }
470
+ }
471
+ },
472
+ "add_S6": {
473
+ "full_accuracy": 0.916,
474
+ "n_examples": 250,
475
+ "per_subtask": {
476
+ "SC": {
477
+ "accuracy": 1.0,
478
+ "count": 250
479
+ },
480
+ "UC": {
481
+ "accuracy": 1.0,
482
+ "count": 250
483
+ },
484
+ "US": {
485
+ "accuracy": 0.9824,
486
+ "count": 1250
487
+ }
488
+ }
489
+ },
490
+ "add_random": {
491
+ "full_accuracy": 0.985,
492
+ "n_examples": 200,
493
+ "per_subtask": {
494
+ "SA": {
495
+ "accuracy": 0.9977324263038548,
496
+ "count": 441
497
+ },
498
+ "SC": {
499
+ "accuracy": 1.0,
500
+ "count": 317
501
+ },
502
+ "SS": {
503
+ "accuracy": 1.0,
504
+ "count": 54
505
+ },
506
+ "UC": {
507
+ "accuracy": 0.9962406015037594,
508
+ "count": 532
509
+ },
510
+ "US": {
511
+ "accuracy": 0.9821428571428571,
512
+ "count": 56
513
+ }
514
+ }
515
+ },
516
+ "add_C3": {
517
+ "full_accuracy": 0.812,
518
+ "n_examples": 250,
519
+ "per_subtask": {
520
+ "SA": {
521
+ "accuracy": 1.0,
522
+ "count": 750
523
+ },
524
+ "SC": {
525
+ "accuracy": 1.0,
526
+ "count": 250
527
+ },
528
+ "UC": {
529
+ "accuracy": 0.9012605042016807,
530
+ "count": 476
531
+ },
532
+ "US": {
533
+ "accuracy": 1.0,
534
+ "count": 274
535
+ }
536
+ }
537
+ },
538
+ "add_C4": {
539
+ "full_accuracy": 0.888,
540
+ "n_examples": 250,
541
+ "per_subtask": {
542
+ "SA": {
543
+ "accuracy": 1.0,
544
+ "count": 500
545
+ },
546
+ "SC": {
547
+ "accuracy": 1.0,
548
+ "count": 250
549
+ },
550
+ "UC": {
551
+ "accuracy": 0.9606656580937972,
552
+ "count": 661
553
+ },
554
+ "US": {
555
+ "accuracy": 0.967551622418879,
556
+ "count": 339
557
+ }
558
+ }
559
+ },
560
+ "add_C5": {
561
+ "full_accuracy": 0.812,
562
+ "n_examples": 250,
563
+ "per_subtask": {
564
+ "SA": {
565
+ "accuracy": 1.0,
566
+ "count": 250
567
+ },
568
+ "SC": {
569
+ "accuracy": 1.0,
570
+ "count": 250
571
+ },
572
+ "UC": {
573
+ "accuracy": 0.9385245901639344,
574
+ "count": 732
575
+ },
576
+ "US": {
577
+ "accuracy": 0.9498069498069498,
578
+ "count": 518
579
+ }
580
+ }
581
+ },
582
+ "add_C6": {
583
+ "full_accuracy": 0.824,
584
+ "n_examples": 250,
585
+ "per_subtask": {
586
+ "SC": {
587
+ "accuracy": 1.0,
588
+ "count": 250
589
+ },
590
+ "UC": {
591
+ "accuracy": 0.954954954954955,
592
+ "count": 888
593
+ },
594
+ "US": {
595
+ "accuracy": 0.9624183006535948,
596
+ "count": 612
597
+ }
598
+ }
599
+ },
600
+ "sub_M0": {
601
+ "full_accuracy": 0.968,
602
+ "n_examples": 250,
603
+ "per_subtask": {
604
+ "MD": {
605
+ "accuracy": 0.9946843853820598,
606
+ "count": 1505
607
+ },
608
+ "ME": {
609
+ "accuracy": 1.0,
610
+ "count": 245
611
+ }
612
+ }
613
+ },
614
+ "sub_M1": {
615
+ "full_accuracy": 1.0,
616
+ "n_examples": 250,
617
+ "per_subtask": {
618
+ "MD": {
619
+ "accuracy": 1.0,
620
+ "count": 714
621
+ },
622
+ "MB": {
623
+ "accuracy": 1.0,
624
+ "count": 374
625
+ },
626
+ "ME": {
627
+ "accuracy": 1.0,
628
+ "count": 75
629
+ },
630
+ "UB": {
631
+ "accuracy": 1.0,
632
+ "count": 587
633
+ }
634
+ }
635
+ },
636
+ "sub_M2": {
637
+ "full_accuracy": 0.996,
638
+ "n_examples": 250,
639
+ "per_subtask": {
640
+ "MD": {
641
+ "accuracy": 0.9981949458483754,
642
+ "count": 554
643
+ },
644
+ "MB": {
645
+ "accuracy": 1.0,
646
+ "count": 273
647
+ },
648
+ "ME": {
649
+ "accuracy": 1.0,
650
+ "count": 219
651
+ },
652
+ "UB": {
653
+ "accuracy": 1.0,
654
+ "count": 430
655
+ },
656
+ "UD": {
657
+ "accuracy": 1.0,
658
+ "count": 274
659
+ }
660
+ }
661
+ },
662
+ "sub_M3": {
663
+ "full_accuracy": 0.34,
664
+ "n_examples": 250,
665
+ "per_subtask": {
666
+ "MD": {
667
+ "accuracy": 1.0,
668
+ "count": 458
669
+ },
670
+ "MB": {
671
+ "accuracy": 1.0,
672
+ "count": 261
673
+ },
674
+ "ME": {
675
+ "accuracy": 1.0,
676
+ "count": 124
677
+ },
678
+ "UB": {
679
+ "accuracy": 0.5747422680412371,
680
+ "count": 388
681
+ },
682
+ "UD": {
683
+ "accuracy": 1.0,
684
+ "count": 519
685
+ }
686
+ }
687
+ },
688
+ "sub_M4": {
689
+ "full_accuracy": 0.116,
690
+ "n_examples": 250,
691
+ "per_subtask": {
692
+ "MD": {
693
+ "accuracy": 1.0,
694
+ "count": 500
695
+ },
696
+ "MB": {
697
+ "accuracy": 1.0,
698
+ "count": 250
699
+ },
700
+ "UB": {
701
+ "accuracy": 0.312,
702
+ "count": 250
703
+ },
704
+ "UD": {
705
+ "accuracy": 0.7466666666666667,
706
+ "count": 750
707
+ }
708
+ }
709
+ },
710
+ "sub_M5": {
711
+ "full_accuracy": 0.052,
712
+ "n_examples": 250,
713
+ "per_subtask": {
714
+ "MD": {
715
+ "accuracy": 1.0,
716
+ "count": 250
717
+ },
718
+ "MB": {
719
+ "accuracy": 1.0,
720
+ "count": 250
721
+ },
722
+ "UB": {
723
+ "accuracy": 0.368,
724
+ "count": 250
725
+ },
726
+ "UD": {
727
+ "accuracy": 0.547,
728
+ "count": 1000
729
+ }
730
+ }
731
+ },
732
+ "sub_random": {
733
+ "full_accuracy": 0.99,
734
+ "n_examples": 200,
735
+ "per_subtask": {
736
+ "MD": {
737
+ "accuracy": 1.0,
738
+ "count": 580
739
+ },
740
+ "MB": {
741
+ "accuracy": 1.0,
742
+ "count": 267
743
+ },
744
+ "ME": {
745
+ "accuracy": 1.0,
746
+ "count": 63
747
+ },
748
+ "UB": {
749
+ "accuracy": 0.9955357142857143,
750
+ "count": 448
751
+ },
752
+ "UD": {
753
+ "accuracy": 1.0,
754
+ "count": 42
755
+ }
756
+ }
757
+ },
758
+ "sub_B3": {
759
+ "full_accuracy": 0.868,
760
+ "n_examples": 250,
761
+ "per_subtask": {
762
+ "MD": {
763
+ "accuracy": 1.0,
764
+ "count": 750
765
+ },
766
+ "MB": {
767
+ "accuracy": 1.0,
768
+ "count": 250
769
+ },
770
+ "UB": {
771
+ "accuracy": 0.9350393700787402,
772
+ "count": 508
773
+ },
774
+ "UD": {
775
+ "accuracy": 1.0,
776
+ "count": 242
777
+ }
778
+ }
779
+ },
780
+ "sub_B4": {
781
+ "full_accuracy": 0.716,
782
+ "n_examples": 250,
783
+ "per_subtask": {
784
+ "MD": {
785
+ "accuracy": 1.0,
786
+ "count": 500
787
+ },
788
+ "MB": {
789
+ "accuracy": 1.0,
790
+ "count": 250
791
+ },
792
+ "UB": {
793
+ "accuracy": 0.8973941368078175,
794
+ "count": 614
795
+ },
796
+ "UD": {
797
+ "accuracy": 0.9300518134715026,
798
+ "count": 386
799
+ }
800
+ }
801
+ },
802
+ "sub_B5": {
803
+ "full_accuracy": 0.652,
804
+ "n_examples": 250,
805
+ "per_subtask": {
806
+ "MD": {
807
+ "accuracy": 1.0,
808
+ "count": 250
809
+ },
810
+ "MB": {
811
+ "accuracy": 1.0,
812
+ "count": 250
813
+ },
814
+ "UB": {
815
+ "accuracy": 0.8957528957528957,
816
+ "count": 777
817
+ },
818
+ "UD": {
819
+ "accuracy": 0.9090909090909091,
820
+ "count": 473
821
+ }
822
+ }
823
+ }
824
+ },
825
+ "summary": {
826
+ "overall_accuracy": 0.7609259259259259,
827
+ "total_examples": 5400,
828
+ "n_splits": 22
829
+ }
830
+ }
831
+ }
add_sub_baseline_10K/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd67ff426b2d74cb962e75cb59ccc6e89358ef5a3fa76bfe35640b18cc9c69ee
3
+ size 650266922
add_sub_baseline_10K/train_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "baseline",
3
+ "ops": "add_sub",
4
+ "n_digits": 6,
5
+ "n_layer": 2,
6
+ "n_head": 3,
7
+ "n_embd": 510,
8
+ "abs_vocab": 0,
9
+ "K": 4,
10
+ "alpha_info_gain": 10.0,
11
+ "alpha_abs": 0.1,
12
+ "alpha_soft_zipf": 1.0,
13
+ "batch_size": 64,
14
+ "num_epochs": 20,
15
+ "dataset_size": 10000,
16
+ "lr": 8e-05,
17
+ "output_dir": "ckpt/sweep/add_sub_baseline_10K",
18
+ "device": "cuda",
19
+ "push_to_hub": true,
20
+ "no_wandb": false,
21
+ "n_params": 162490082,
22
+ "run_name": "add_sub_baseline_10K",
23
+ "git_commit": "800625019270114adcda289bbd550c4f1109a514",
24
+ "timestamp": "2026-04-11T21:33:55.773262+00:00",
25
+ "tokenizer": "Qwen/Qwen3-0.6B",
26
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
27
+ "dataset_config": "add_sub_6digit",
28
+ "model_repo": "thoughtworks/arithmetic-sorl",
29
+ "trainer_version": "sft",
30
+ "wandb_run_id": "1e2dttjb",
31
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/1e2dttjb",
32
+ "final_accuracy": 0.7609259259259259,
33
+ "sft_accuracy": 0.7609259259259259,
34
+ "eval_method": "ArithmeticEvaluator"
35
+ }