amirali1985 commited on
Commit
0757c15
·
verified ·
1 Parent(s): 4194189

Delete folder add_sub_baseline_10K with huggingface_hub

Browse files
add_sub_baseline_10K/config.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "architectures": [
3
- "SorlModelWrapper"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": null,
8
- "dtype": "float32",
9
- "eos_token_id": null,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 510,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 2040,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention"
18
- ],
19
- "max_position_embeddings": 128,
20
- "max_window_layers": 28,
21
- "model_type": "qwen3",
22
- "num_attention_heads": 3,
23
- "num_hidden_layers": 2,
24
- "num_key_value_heads": 3,
25
- "pad_token_id": null,
26
- "rms_norm_eps": 1e-06,
27
- "rope_parameters": {
28
- "rope_theta": 10000.0,
29
- "rope_type": "default"
30
- },
31
- "sliding_window": null,
32
- "tie_word_embeddings": false,
33
- "transformers_version": "5.5.0",
34
- "use_cache": true,
35
- "use_sliding_window": false,
36
- "vocab_size": 151645
37
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
add_sub_baseline_10K/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "output_attentions": false,
4
- "output_hidden_states": false,
5
- "transformers_version": "5.5.0",
6
- "use_cache": true
7
- }
 
 
 
 
 
 
 
 
add_sub_baseline_10K/metrics.json DELETED
@@ -1,831 +0,0 @@
1
- {
2
- "history": {
3
- "step": [
4
- 50,
5
- 100,
6
- 150,
7
- 200,
8
- 250,
9
- 300,
10
- 350,
11
- 400,
12
- 450,
13
- 500,
14
- 550,
15
- 600,
16
- 650,
17
- 700,
18
- 750,
19
- 800,
20
- 850,
21
- 900,
22
- 950,
23
- 1000,
24
- 1050,
25
- 1100,
26
- 1150,
27
- 1200,
28
- 1250,
29
- 1300,
30
- 1350,
31
- 1400,
32
- 1450,
33
- 1500,
34
- 1550,
35
- 1600,
36
- 1650,
37
- 1700,
38
- 1750,
39
- 1800,
40
- 1850,
41
- 1900,
42
- 1950,
43
- 2000,
44
- 2050,
45
- 2100,
46
- 2150,
47
- 2200,
48
- 2250,
49
- 2300,
50
- 2350,
51
- 2400,
52
- 2450,
53
- 2500,
54
- 2550,
55
- 2600,
56
- 2650,
57
- 2700,
58
- 2750,
59
- 2800,
60
- 2850,
61
- 2900,
62
- 2950,
63
- 3000,
64
- 3050,
65
- 3100
66
- ],
67
- "loss": [
68
- 10.006766319274902,
69
- 7.251689434051514,
70
- 6.329094886779785,
71
- 5.457243919372559,
72
- 4.436343669891357,
73
- 3.0709240436553955,
74
- 2.1984894275665283,
75
- 1.9786336421966553,
76
- 1.8310412168502808,
77
- 1.7493754625320435,
78
- 1.7336280345916748,
79
- 1.668373942375183,
80
- 1.656901478767395,
81
- 1.5222007036209106,
82
- 1.1166151762008667,
83
- 0.8200947642326355,
84
- 0.5962659120559692,
85
- 0.4486698508262634,
86
- 0.3298042118549347,
87
- 0.21894291043281555,
88
- 0.1570301353931427,
89
- 0.12834718823432922,
90
- 0.09281289577484131,
91
- 0.0865481048822403,
92
- 0.10090114921331406,
93
- 0.05000796541571617,
94
- 0.05903910472989082,
95
- 0.041255030781030655,
96
- 0.03712769225239754,
97
- 0.05814942717552185,
98
- 0.04444698244333267,
99
- 0.03615949675440788,
100
- 0.027622858062386513,
101
- 0.04681888222694397,
102
- 0.06002098321914673,
103
- 0.049318160861730576,
104
- 0.059921007603406906,
105
- 0.032836705446243286,
106
- 0.061555344611406326,
107
- 0.03171056881546974,
108
- 0.04210656136274338,
109
- 0.04413192719221115,
110
- 0.047493066638708115,
111
- 0.050301071256399155,
112
- 0.02659599296748638,
113
- 0.022891702130436897,
114
- 0.05459100008010864,
115
- 0.017184091731905937,
116
- 0.013001530431210995,
117
- 0.03498239442706108,
118
- 0.02405695430934429,
119
- 0.03834375739097595,
120
- 0.03403574228286743,
121
- 0.014610671438276768,
122
- 0.044524967670440674,
123
- 0.017222251743078232,
124
- 0.03136735409498215,
125
- 0.02747531048953533,
126
- 0.030222097411751747,
127
- 0.02000907063484192,
128
- 0.02265077270567417,
129
- 0.026362445205450058
130
- ],
131
- "base_loss": [
132
- 10.006766319274902,
133
- 7.251689434051514,
134
- 6.329094886779785,
135
- 5.457243919372559,
136
- 4.436343669891357,
137
- 3.0709240436553955,
138
- 2.1984894275665283,
139
- 1.9786336421966553,
140
- 1.8310412168502808,
141
- 1.7493754625320435,
142
- 1.7336280345916748,
143
- 1.668373942375183,
144
- 1.656901478767395,
145
- 1.5222007036209106,
146
- 1.1166151762008667,
147
- 0.8200947642326355,
148
- 0.5962659120559692,
149
- 0.4486698508262634,
150
- 0.3298042118549347,
151
- 0.21894291043281555,
152
- 0.1570301353931427,
153
- 0.12834718823432922,
154
- 0.09281289577484131,
155
- 0.0865481048822403,
156
- 0.10090114921331406,
157
- 0.05000796541571617,
158
- 0.05903910472989082,
159
- 0.041255030781030655,
160
- 0.03712769225239754,
161
- 0.05814942717552185,
162
- 0.04444698244333267,
163
- 0.03615949675440788,
164
- 0.027622858062386513,
165
- 0.04681888222694397,
166
- 0.06002098321914673,
167
- 0.049318160861730576,
168
- 0.059921007603406906,
169
- 0.032836705446243286,
170
- 0.061555344611406326,
171
- 0.03171056881546974,
172
- 0.04210656136274338,
173
- 0.04413192719221115,
174
- 0.047493066638708115,
175
- 0.050301071256399155,
176
- 0.02659599296748638,
177
- 0.022891702130436897,
178
- 0.05459100008010864,
179
- 0.017184091731905937,
180
- 0.013001530431210995,
181
- 0.03498239442706108,
182
- 0.02405695430934429,
183
- 0.03834375739097595,
184
- 0.03403574228286743,
185
- 0.014610671438276768,
186
- 0.044524967670440674,
187
- 0.017222251743078232,
188
- 0.03136735409498215,
189
- 0.02747531048953533,
190
- 0.030222097411751747,
191
- 0.02000907063484192,
192
- 0.02265077270567417,
193
- 0.026362445205450058
194
- ],
195
- "lr": [
196
- 6.242038216560511e-06,
197
- 1.2611464968152866e-05,
198
- 1.8980891719745225e-05,
199
- 2.5350318471337578e-05,
200
- 3.1719745222929934e-05,
201
- 3.808917197452229e-05,
202
- 4.445859872611465e-05,
203
- 5.082802547770701e-05,
204
- 5.7197452229299365e-05,
205
- 6.356687898089173e-05,
206
- 6.993630573248408e-05,
207
- 7.630573248407644e-05,
208
- 7.99862055592881e-05,
209
- 7.984241248831029e-05,
210
- 7.954287783192742e-05,
211
- 7.90887724530305e-05,
212
- 7.848187142213441e-05,
213
- 7.772454707873448e-05,
214
- 7.681975975797462e-05,
215
- 7.57710462188759e-05,
216
- 7.458250581935905e-05,
217
- 7.325878449210182e-05,
218
- 7.180505658386849e-05,
219
- 7.022700462930083e-05,
220
- 6.853079713823312e-05,
221
- 6.672306448335957e-05,
222
- 6.481087298250779e-05,
223
- 6.280169727682872e-05,
224
- 6.070339111287581e-05,
225
- 5.8524156642783655e-05,
226
- 5.627251236255051e-05,
227
- 5.3957259813751526e-05,
228
- 5.1587449178844164e-05,
229
- 4.917234390455111e-05,
230
- 4.672138449160635e-05,
231
- 4.424415159240753e-05,
232
- 4.175032856082417e-05,
233
- 3.924966360055181e-05,
234
- 3.675193165997228e-05,
235
- 3.42668962224704e-05,
236
- 3.180427114156694e-05,
237
- 2.9373682670051437e-05,
238
- 2.6984631831541183e-05,
239
- 2.4646457281553407e-05,
240
- 2.2368298803264487e-05,
241
- 2.0159061580649347e-05,
242
- 1.8027381388654794e-05,
243
- 1.5981590836476463e-05,
244
- 1.4029686795892575e-05,
245
- 1.2179299141974771e-05,
246
- 1.0437660928367057e-05,
247
- 8.811580113715755e-06,
248
- 7.307412949770034e-06,
249
- 5.9310391351775455e-06,
250
- 4.687838832097362e-06,
251
- 3.5826716354707645e-06,
252
- 2.6198575771580583e-06,
253
- 1.8031602391947344e-06,
254
- 1.1357720421765062e-06,
255
- 6.203017662798872e-07,
256
- 2.5876435369797334e-07,
257
- 5.257303235302935e-08
258
- ],
259
- "eval_step": [
260
- 156,
261
- 312,
262
- 468,
263
- 624,
264
- 780,
265
- 936,
266
- 1092,
267
- 1248,
268
- 1404,
269
- 1560,
270
- 1716,
271
- 1872,
272
- 2028,
273
- 2184,
274
- 2340,
275
- 2496,
276
- 2652,
277
- 2808,
278
- 2964,
279
- 3120
280
- ],
281
- "eval_epoch": [
282
- 1,
283
- 2,
284
- 3,
285
- 4,
286
- 5,
287
- 6,
288
- 7,
289
- 8,
290
- 9,
291
- 10,
292
- 11,
293
- 12,
294
- 13,
295
- 14,
296
- 15,
297
- 16,
298
- 17,
299
- 18,
300
- 19,
301
- 20
302
- ],
303
- "eval_accuracy": [
304
- 0.0014285714285714286,
305
- 0.012142857142857143,
306
- 0.004285714285714286,
307
- 0.007142857142857143,
308
- 0.035,
309
- 0.26285714285714284,
310
- 0.6278571428571429,
311
- 0.6864285714285714,
312
- 0.6885714285714286,
313
- 0.7314285714285714,
314
- 0.7442857142857143,
315
- 0.73,
316
- 0.735,
317
- 0.75,
318
- 0.7807142857142857,
319
- 0.7964285714285714,
320
- 0.8185714285714286,
321
- 0.8185714285714286,
322
- 0.8221428571428572,
323
- 0.8235714285714286
324
- ]
325
- },
326
- "final_accuracy": 0.7609259259259259,
327
- "sft_eval": {
328
- "config": {
329
- "ops": "add_sub",
330
- "K": null,
331
- "mode": "sft",
332
- "n_digits": 6,
333
- "n_per_split": 250
334
- },
335
- "splits": {
336
- "add_S0": {
337
- "full_accuracy": 0.988,
338
- "n_examples": 250,
339
- "per_subtask": {
340
- "SA": {
341
- "accuracy": 0.9980302035456337,
342
- "count": 1523
343
- },
344
- "SS": {
345
- "accuracy": 0.9955947136563876,
346
- "count": 227
347
- }
348
- }
349
- },
350
- "add_S1": {
351
- "full_accuracy": 0.996,
352
- "n_examples": 250,
353
- "per_subtask": {
354
- "SA": {
355
- "accuracy": 1.0,
356
- "count": 542
357
- },
358
- "SC": {
359
- "accuracy": 0.9976019184652278,
360
- "count": 417
361
- },
362
- "SS": {
363
- "accuracy": 1.0,
364
- "count": 70
365
- },
366
- "UC": {
367
- "accuracy": 1.0,
368
- "count": 721
369
- }
370
- }
371
- },
372
- "add_S2": {
373
- "full_accuracy": 0.968,
374
- "n_examples": 250,
375
- "per_subtask": {
376
- "SA": {
377
- "accuracy": 1.0,
378
- "count": 368
379
- },
380
- "SC": {
381
- "accuracy": 0.9750778816199377,
382
- "count": 321
383
- },
384
- "SS": {
385
- "accuracy": 1.0,
386
- "count": 228
387
- },
388
- "UC": {
389
- "accuracy": 1.0,
390
- "count": 531
391
- },
392
- "US": {
393
- "accuracy": 1.0,
394
- "count": 302
395
- }
396
- }
397
- },
398
- "add_S3": {
399
- "full_accuracy": 0.74,
400
- "n_examples": 250,
401
- "per_subtask": {
402
- "SA": {
403
- "accuracy": 1.0,
404
- "count": 307
405
- },
406
- "SC": {
407
- "accuracy": 0.9965635738831615,
408
- "count": 291
409
- },
410
- "SS": {
411
- "accuracy": 1.0,
412
- "count": 113
413
- },
414
- "UC": {
415
- "accuracy": 0.8674948240165632,
416
- "count": 483
417
- },
418
- "US": {
419
- "accuracy": 1.0,
420
- "count": 556
421
- }
422
- }
423
- },
424
- "add_S4": {
425
- "full_accuracy": 0.704,
426
- "n_examples": 250,
427
- "per_subtask": {
428
- "SA": {
429
- "accuracy": 1.0,
430
- "count": 238
431
- },
432
- "SC": {
433
- "accuracy": 1.0,
434
- "count": 271
435
- },
436
- "SS": {
437
- "accuracy": 1.0,
438
- "count": 59
439
- },
440
- "UC": {
441
- "accuracy": 0.8395061728395061,
442
- "count": 405
443
- },
444
- "US": {
445
- "accuracy": 0.9317889317889317,
446
- "count": 777
447
- }
448
- }
449
- },
450
- "add_S5": {
451
- "full_accuracy": 0.5,
452
- "n_examples": 250,
453
- "per_subtask": {
454
- "SA": {
455
- "accuracy": 1.0,
456
- "count": 250
457
- },
458
- "SC": {
459
- "accuracy": 1.0,
460
- "count": 250
461
- },
462
- "UC": {
463
- "accuracy": 0.644,
464
- "count": 250
465
- },
466
- "US": {
467
- "accuracy": 0.795,
468
- "count": 1000
469
- }
470
- }
471
- },
472
- "add_S6": {
473
- "full_accuracy": 0.916,
474
- "n_examples": 250,
475
- "per_subtask": {
476
- "SC": {
477
- "accuracy": 1.0,
478
- "count": 250
479
- },
480
- "UC": {
481
- "accuracy": 1.0,
482
- "count": 250
483
- },
484
- "US": {
485
- "accuracy": 0.9824,
486
- "count": 1250
487
- }
488
- }
489
- },
490
- "add_random": {
491
- "full_accuracy": 0.985,
492
- "n_examples": 200,
493
- "per_subtask": {
494
- "SA": {
495
- "accuracy": 0.9977324263038548,
496
- "count": 441
497
- },
498
- "SC": {
499
- "accuracy": 1.0,
500
- "count": 317
501
- },
502
- "SS": {
503
- "accuracy": 1.0,
504
- "count": 54
505
- },
506
- "UC": {
507
- "accuracy": 0.9962406015037594,
508
- "count": 532
509
- },
510
- "US": {
511
- "accuracy": 0.9821428571428571,
512
- "count": 56
513
- }
514
- }
515
- },
516
- "add_C3": {
517
- "full_accuracy": 0.812,
518
- "n_examples": 250,
519
- "per_subtask": {
520
- "SA": {
521
- "accuracy": 1.0,
522
- "count": 750
523
- },
524
- "SC": {
525
- "accuracy": 1.0,
526
- "count": 250
527
- },
528
- "UC": {
529
- "accuracy": 0.9012605042016807,
530
- "count": 476
531
- },
532
- "US": {
533
- "accuracy": 1.0,
534
- "count": 274
535
- }
536
- }
537
- },
538
- "add_C4": {
539
- "full_accuracy": 0.888,
540
- "n_examples": 250,
541
- "per_subtask": {
542
- "SA": {
543
- "accuracy": 1.0,
544
- "count": 500
545
- },
546
- "SC": {
547
- "accuracy": 1.0,
548
- "count": 250
549
- },
550
- "UC": {
551
- "accuracy": 0.9606656580937972,
552
- "count": 661
553
- },
554
- "US": {
555
- "accuracy": 0.967551622418879,
556
- "count": 339
557
- }
558
- }
559
- },
560
- "add_C5": {
561
- "full_accuracy": 0.812,
562
- "n_examples": 250,
563
- "per_subtask": {
564
- "SA": {
565
- "accuracy": 1.0,
566
- "count": 250
567
- },
568
- "SC": {
569
- "accuracy": 1.0,
570
- "count": 250
571
- },
572
- "UC": {
573
- "accuracy": 0.9385245901639344,
574
- "count": 732
575
- },
576
- "US": {
577
- "accuracy": 0.9498069498069498,
578
- "count": 518
579
- }
580
- }
581
- },
582
- "add_C6": {
583
- "full_accuracy": 0.824,
584
- "n_examples": 250,
585
- "per_subtask": {
586
- "SC": {
587
- "accuracy": 1.0,
588
- "count": 250
589
- },
590
- "UC": {
591
- "accuracy": 0.954954954954955,
592
- "count": 888
593
- },
594
- "US": {
595
- "accuracy": 0.9624183006535948,
596
- "count": 612
597
- }
598
- }
599
- },
600
- "sub_M0": {
601
- "full_accuracy": 0.968,
602
- "n_examples": 250,
603
- "per_subtask": {
604
- "MD": {
605
- "accuracy": 0.9946843853820598,
606
- "count": 1505
607
- },
608
- "ME": {
609
- "accuracy": 1.0,
610
- "count": 245
611
- }
612
- }
613
- },
614
- "sub_M1": {
615
- "full_accuracy": 1.0,
616
- "n_examples": 250,
617
- "per_subtask": {
618
- "MD": {
619
- "accuracy": 1.0,
620
- "count": 714
621
- },
622
- "MB": {
623
- "accuracy": 1.0,
624
- "count": 374
625
- },
626
- "ME": {
627
- "accuracy": 1.0,
628
- "count": 75
629
- },
630
- "UB": {
631
- "accuracy": 1.0,
632
- "count": 587
633
- }
634
- }
635
- },
636
- "sub_M2": {
637
- "full_accuracy": 0.996,
638
- "n_examples": 250,
639
- "per_subtask": {
640
- "MD": {
641
- "accuracy": 0.9981949458483754,
642
- "count": 554
643
- },
644
- "MB": {
645
- "accuracy": 1.0,
646
- "count": 273
647
- },
648
- "ME": {
649
- "accuracy": 1.0,
650
- "count": 219
651
- },
652
- "UB": {
653
- "accuracy": 1.0,
654
- "count": 430
655
- },
656
- "UD": {
657
- "accuracy": 1.0,
658
- "count": 274
659
- }
660
- }
661
- },
662
- "sub_M3": {
663
- "full_accuracy": 0.34,
664
- "n_examples": 250,
665
- "per_subtask": {
666
- "MD": {
667
- "accuracy": 1.0,
668
- "count": 458
669
- },
670
- "MB": {
671
- "accuracy": 1.0,
672
- "count": 261
673
- },
674
- "ME": {
675
- "accuracy": 1.0,
676
- "count": 124
677
- },
678
- "UB": {
679
- "accuracy": 0.5747422680412371,
680
- "count": 388
681
- },
682
- "UD": {
683
- "accuracy": 1.0,
684
- "count": 519
685
- }
686
- }
687
- },
688
- "sub_M4": {
689
- "full_accuracy": 0.116,
690
- "n_examples": 250,
691
- "per_subtask": {
692
- "MD": {
693
- "accuracy": 1.0,
694
- "count": 500
695
- },
696
- "MB": {
697
- "accuracy": 1.0,
698
- "count": 250
699
- },
700
- "UB": {
701
- "accuracy": 0.312,
702
- "count": 250
703
- },
704
- "UD": {
705
- "accuracy": 0.7466666666666667,
706
- "count": 750
707
- }
708
- }
709
- },
710
- "sub_M5": {
711
- "full_accuracy": 0.052,
712
- "n_examples": 250,
713
- "per_subtask": {
714
- "MD": {
715
- "accuracy": 1.0,
716
- "count": 250
717
- },
718
- "MB": {
719
- "accuracy": 1.0,
720
- "count": 250
721
- },
722
- "UB": {
723
- "accuracy": 0.368,
724
- "count": 250
725
- },
726
- "UD": {
727
- "accuracy": 0.547,
728
- "count": 1000
729
- }
730
- }
731
- },
732
- "sub_random": {
733
- "full_accuracy": 0.99,
734
- "n_examples": 200,
735
- "per_subtask": {
736
- "MD": {
737
- "accuracy": 1.0,
738
- "count": 580
739
- },
740
- "MB": {
741
- "accuracy": 1.0,
742
- "count": 267
743
- },
744
- "ME": {
745
- "accuracy": 1.0,
746
- "count": 63
747
- },
748
- "UB": {
749
- "accuracy": 0.9955357142857143,
750
- "count": 448
751
- },
752
- "UD": {
753
- "accuracy": 1.0,
754
- "count": 42
755
- }
756
- }
757
- },
758
- "sub_B3": {
759
- "full_accuracy": 0.868,
760
- "n_examples": 250,
761
- "per_subtask": {
762
- "MD": {
763
- "accuracy": 1.0,
764
- "count": 750
765
- },
766
- "MB": {
767
- "accuracy": 1.0,
768
- "count": 250
769
- },
770
- "UB": {
771
- "accuracy": 0.9350393700787402,
772
- "count": 508
773
- },
774
- "UD": {
775
- "accuracy": 1.0,
776
- "count": 242
777
- }
778
- }
779
- },
780
- "sub_B4": {
781
- "full_accuracy": 0.716,
782
- "n_examples": 250,
783
- "per_subtask": {
784
- "MD": {
785
- "accuracy": 1.0,
786
- "count": 500
787
- },
788
- "MB": {
789
- "accuracy": 1.0,
790
- "count": 250
791
- },
792
- "UB": {
793
- "accuracy": 0.8973941368078175,
794
- "count": 614
795
- },
796
- "UD": {
797
- "accuracy": 0.9300518134715026,
798
- "count": 386
799
- }
800
- }
801
- },
802
- "sub_B5": {
803
- "full_accuracy": 0.652,
804
- "n_examples": 250,
805
- "per_subtask": {
806
- "MD": {
807
- "accuracy": 1.0,
808
- "count": 250
809
- },
810
- "MB": {
811
- "accuracy": 1.0,
812
- "count": 250
813
- },
814
- "UB": {
815
- "accuracy": 0.8957528957528957,
816
- "count": 777
817
- },
818
- "UD": {
819
- "accuracy": 0.9090909090909091,
820
- "count": 473
821
- }
822
- }
823
- }
824
- },
825
- "summary": {
826
- "overall_accuracy": 0.7609259259259259,
827
- "total_examples": 5400,
828
- "n_splits": 22
829
- }
830
- }
831
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
add_sub_baseline_10K/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd67ff426b2d74cb962e75cb59ccc6e89358ef5a3fa76bfe35640b18cc9c69ee
3
- size 650266922
 
 
 
 
add_sub_baseline_10K/train_config.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "mode": "baseline",
3
- "ops": "add_sub",
4
- "n_digits": 6,
5
- "n_layer": 2,
6
- "n_head": 3,
7
- "n_embd": 510,
8
- "abs_vocab": 0,
9
- "K": 4,
10
- "alpha_info_gain": 10.0,
11
- "alpha_abs": 0.1,
12
- "alpha_soft_zipf": 1.0,
13
- "batch_size": 64,
14
- "num_epochs": 20,
15
- "dataset_size": 10000,
16
- "lr": 8e-05,
17
- "output_dir": "ckpt/sweep/add_sub_baseline_10K",
18
- "device": "cuda",
19
- "push_to_hub": true,
20
- "no_wandb": false,
21
- "n_params": 162490082,
22
- "run_name": "add_sub_baseline_10K",
23
- "git_commit": "800625019270114adcda289bbd550c4f1109a514",
24
- "timestamp": "2026-04-11T21:33:55.773262+00:00",
25
- "tokenizer": "Qwen/Qwen3-0.6B",
26
- "dataset_repo": "thoughtworks/arithmetic-sorl-data",
27
- "dataset_config": "add_sub_6digit",
28
- "model_repo": "thoughtworks/arithmetic-sorl",
29
- "trainer_version": "sft",
30
- "wandb_run_id": "1e2dttjb",
31
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/1e2dttjb",
32
- "final_accuracy": 0.7609259259259259,
33
- "sft_accuracy": 0.7609259259259259,
34
- "eval_method": "ArithmeticEvaluator"
35
- }