amirali1985 commited on
Commit
809e1b9
·
verified ·
1 Parent(s): 9044f4b

Upload add_sub_baseline_10K

Browse files
add_sub_baseline_10K/metrics.json CHANGED
@@ -65,132 +65,132 @@
65
  3100
66
  ],
67
  "loss": [
68
- 7.068239688873291,
69
- 4.189955234527588,
70
- 2.0407068729400635,
71
- 1.8267366886138916,
72
- 1.7952507734298706,
73
- 1.7645628452301025,
74
- 1.595243215560913,
75
- 1.4900548458099365,
76
- 1.0821096897125244,
77
- 0.7350229024887085,
78
- 0.586064875125885,
79
- 0.43922391533851624,
80
- 0.40184348821640015,
81
- 0.31044527888298035,
82
- 0.26080775260925293,
83
- 0.30810627341270447,
84
- 0.2941185534000397,
85
- 0.2382466048002243,
86
- 0.19741371273994446,
87
- 0.24084995687007904,
88
- 0.16109716892242432,
89
- 0.16342215240001678,
90
- 0.15979401767253876,
91
- 0.15030500292778015,
92
- 0.1553926020860672,
93
- 0.1616949737071991,
94
- 0.15501153469085693,
95
- 0.15730805695056915,
96
- 0.1374218612909317,
97
- 0.12141235917806625,
98
- 0.15107305347919464,
99
- 0.12318699806928635,
100
- 0.08949179947376251,
101
- 0.11493054777383804,
102
- 0.09638842195272446,
103
- 0.1008308082818985,
104
- 0.11183486878871918,
105
- 0.08597197383642197,
106
- 0.10642959177494049,
107
- 0.07882422208786011,
108
- 0.09636445343494415,
109
- 0.06600546836853027,
110
- 0.09191834181547165,
111
- 0.058375708758831024,
112
- 0.08556245267391205,
113
- 0.09554620832204819,
114
- 0.05065993592143059,
115
- 0.06083018705248833,
116
- 0.06561274826526642,
117
- 0.05150684341788292,
118
- 0.04865796118974686,
119
- 0.05840981379151344,
120
- 0.06329210847616196,
121
- 0.05198041722178459,
122
- 0.041428614407777786,
123
- 0.04235445708036423,
124
- 0.0411263071000576,
125
- 0.0459698922932148,
126
- 0.04036566987633705,
127
- 0.042875390499830246,
128
- 0.05301470309495926,
129
- 0.05982867628335953
130
  ],
131
  "base_loss": [
132
- 7.068239688873291,
133
- 4.189955234527588,
134
- 2.0407068729400635,
135
- 1.8267366886138916,
136
- 1.7952507734298706,
137
- 1.7645628452301025,
138
- 1.595243215560913,
139
- 1.4900548458099365,
140
- 1.0821096897125244,
141
- 0.7350229024887085,
142
- 0.586064875125885,
143
- 0.43922391533851624,
144
- 0.40184348821640015,
145
- 0.31044527888298035,
146
- 0.26080775260925293,
147
- 0.30810627341270447,
148
- 0.2941185534000397,
149
- 0.2382466048002243,
150
- 0.19741371273994446,
151
- 0.24084995687007904,
152
- 0.16109716892242432,
153
- 0.16342215240001678,
154
- 0.15979401767253876,
155
- 0.15030500292778015,
156
- 0.1553926020860672,
157
- 0.1616949737071991,
158
- 0.15501153469085693,
159
- 0.15730805695056915,
160
- 0.1374218612909317,
161
- 0.12141235917806625,
162
- 0.15107305347919464,
163
- 0.12318699806928635,
164
- 0.08949179947376251,
165
- 0.11493054777383804,
166
- 0.09638842195272446,
167
- 0.1008308082818985,
168
- 0.11183486878871918,
169
- 0.08597197383642197,
170
- 0.10642959177494049,
171
- 0.07882422208786011,
172
- 0.09636445343494415,
173
- 0.06600546836853027,
174
- 0.09191834181547165,
175
- 0.058375708758831024,
176
- 0.08556245267391205,
177
- 0.09554620832204819,
178
- 0.05065993592143059,
179
- 0.06083018705248833,
180
- 0.06561274826526642,
181
- 0.05150684341788292,
182
- 0.04865796118974686,
183
- 0.05840981379151344,
184
- 0.06329210847616196,
185
- 0.05198041722178459,
186
- 0.041428614407777786,
187
- 0.04235445708036423,
188
- 0.0411263071000576,
189
- 0.0459698922932148,
190
- 0.04036566987633705,
191
- 0.042875390499830246,
192
- 0.05301470309495926,
193
- 0.05982867628335953
194
  ],
195
  "lr": [
196
  3.9200000000000004e-05,
@@ -301,595 +301,595 @@
301
  20
302
  ],
303
  "eval_accuracy": [
304
- 0.008888888888888889,
305
- 0.005555555555555556,
306
- 0.024444444444444446,
307
- 0.21333333333333335,
308
- 0.3522222222222222,
309
- 0.4311111111111111,
310
- 0.57,
311
- 0.5322222222222223,
312
- 0.6111111111111112,
313
- 0.6211111111111111,
314
- 0.6477777777777778,
315
- 0.7,
316
- 0.7322222222222222,
317
- 0.7133333333333334,
318
- 0.7388888888888889,
319
- 0.7644444444444445,
320
- 0.7766666666666666,
321
- 0.7833333333333333,
322
- 0.7866666666666666,
323
- 0.7855555555555556
324
  ]
325
  },
326
- "final_accuracy": 0.7241666666666666,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
- "n_per_split": 50
334
  },
335
  "splits": {
336
  "add_S0": {
337
- "full_accuracy": 0.94,
338
- "digit_accuracy": 0.9914285714285714,
339
- "n_examples": 50,
340
  "per_subtask": {
341
  "SA": {
342
- "accuracy": 0.9898305084745763,
343
- "count": 295
344
  },
345
  "SS": {
346
  "accuracy": 1.0,
347
- "count": 55
348
  }
349
  }
350
  },
351
  "add_S1": {
352
- "full_accuracy": 0.98,
353
- "digit_accuracy": 0.9971428571428571,
354
- "n_examples": 50,
355
  "per_subtask": {
356
  "SA": {
357
  "accuracy": 1.0,
358
- "count": 126
359
  },
360
  "SC": {
361
  "accuracy": 1.0,
362
- "count": 79
363
  },
364
  "SS": {
365
  "accuracy": 1.0,
366
- "count": 21
367
  },
368
  "UC": {
369
- "accuracy": 0.9919354838709677,
370
- "count": 124
371
  }
372
  }
373
  },
374
  "add_S2": {
375
- "full_accuracy": 0.9,
376
- "digit_accuracy": 0.9828571428571429,
377
- "n_examples": 50,
378
  "per_subtask": {
379
  "SA": {
380
- "accuracy": 0.9866666666666667,
381
- "count": 75
382
  },
383
  "SC": {
384
- "accuracy": 1.0,
385
- "count": 62
386
  },
387
  "SS": {
388
- "accuracy": 1.0,
389
- "count": 39
390
  },
391
  "UC": {
392
- "accuracy": 0.954954954954955,
393
- "count": 111
394
  },
395
  "US": {
396
  "accuracy": 1.0,
397
- "count": 63
398
  }
399
  }
400
  },
401
  "add_S3": {
402
- "full_accuracy": 0.7,
403
- "digit_accuracy": 0.9571428571428572,
404
- "n_examples": 50,
405
  "per_subtask": {
406
  "SA": {
407
  "accuracy": 1.0,
408
- "count": 60
409
  },
410
  "SC": {
411
- "accuracy": 1.0,
412
- "count": 57
413
  },
414
  "SS": {
415
  "accuracy": 1.0,
416
- "count": 19
417
  },
418
  "UC": {
419
- "accuracy": 0.8557692307692307,
420
- "count": 104
421
  },
422
  "US": {
423
  "accuracy": 1.0,
424
- "count": 110
425
  }
426
  }
427
  },
428
  "add_S4": {
429
- "full_accuracy": 0.64,
430
- "digit_accuracy": 0.9342857142857143,
431
- "n_examples": 50,
432
  "per_subtask": {
433
  "SA": {
434
  "accuracy": 1.0,
435
- "count": 48
436
  },
437
  "SC": {
438
  "accuracy": 1.0,
439
- "count": 52
440
  },
441
  "SS": {
442
  "accuracy": 1.0,
443
- "count": 7
444
  },
445
  "UC": {
446
- "accuracy": 0.8202247191011236,
447
- "count": 89
448
  },
449
  "US": {
450
- "accuracy": 0.9545454545454546,
451
- "count": 154
452
  }
453
  }
454
  },
455
  "add_S5": {
456
- "full_accuracy": 0.3,
457
- "digit_accuracy": 0.7342857142857143,
458
- "n_examples": 50,
459
  "per_subtask": {
460
  "SA": {
461
  "accuracy": 1.0,
462
- "count": 50
463
  },
464
  "SC": {
465
  "accuracy": 1.0,
466
- "count": 50
467
  },
468
  "UC": {
469
- "accuracy": 0.38,
470
- "count": 50
471
  },
472
  "US": {
473
- "accuracy": 0.69,
474
- "count": 200
475
  }
476
  }
477
  },
478
  "add_S6": {
479
- "full_accuracy": 0.36,
480
- "digit_accuracy": 0.6685714285714286,
481
- "n_examples": 50,
482
  "per_subtask": {
483
  "SC": {
484
  "accuracy": 1.0,
485
- "count": 50
486
  },
487
  "UC": {
488
- "accuracy": 0.46,
489
- "count": 50
490
  },
491
  "US": {
492
- "accuracy": 0.644,
493
- "count": 250
494
  }
495
  }
496
  },
497
  "add_random": {
498
- "full_accuracy": 0.96,
499
- "digit_accuracy": 0.9935714285714285,
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 0.9953596287703016,
504
- "count": 431
505
  },
506
  "SC": {
507
- "accuracy": 1.0,
508
- "count": 316
509
  },
510
  "SS": {
511
  "accuracy": 1.0,
512
- "count": 39
513
  },
514
  "UC": {
515
- "accuracy": 0.9875,
516
- "count": 560
517
  },
518
  "US": {
519
  "accuracy": 1.0,
520
- "count": 54
521
  }
522
  }
523
  },
524
  "add_C1": {
525
- "full_accuracy": 1.0,
526
- "digit_accuracy": 1.0,
527
- "n_examples": 50,
528
  "per_subtask": {
529
  "SA": {
530
- "accuracy": 1.0,
531
- "count": 250
532
  },
533
  "SC": {
534
  "accuracy": 1.0,
535
- "count": 50
536
  },
537
  "UC": {
538
  "accuracy": 1.0,
539
- "count": 50
540
  }
541
  }
542
  },
543
  "add_C2": {
544
- "full_accuracy": 0.98,
545
- "digit_accuracy": 0.9971428571428571,
546
- "n_examples": 50,
547
  "per_subtask": {
548
  "SA": {
549
- "accuracy": 1.0,
550
- "count": 200
551
  },
552
  "SC": {
553
  "accuracy": 1.0,
554
- "count": 50
555
  },
556
  "UC": {
557
- "accuracy": 0.9879518072289156,
558
- "count": 83
559
  },
560
  "US": {
561
- "accuracy": 1.0,
562
- "count": 17
563
  }
564
  }
565
  },
566
  "add_C3": {
567
- "full_accuracy": 0.78,
568
- "digit_accuracy": 0.9685714285714285,
569
- "n_examples": 50,
570
  "per_subtask": {
571
  "SA": {
572
- "accuracy": 0.9933333333333333,
573
- "count": 150
574
  },
575
  "SC": {
576
  "accuracy": 1.0,
577
- "count": 50
578
  },
579
  "UC": {
580
- "accuracy": 0.9,
581
- "count": 100
582
  },
583
  "US": {
584
  "accuracy": 1.0,
585
- "count": 50
586
  }
587
  }
588
  },
589
  "add_C4": {
590
- "full_accuracy": 0.88,
591
- "digit_accuracy": 0.9828571428571429,
592
- "n_examples": 50,
593
  "per_subtask": {
594
  "SA": {
595
  "accuracy": 1.0,
596
- "count": 100
597
  },
598
  "SC": {
599
  "accuracy": 1.0,
600
- "count": 50
601
  },
602
  "UC": {
603
- "accuracy": 0.9545454545454546,
604
- "count": 132
605
  },
606
  "US": {
607
- "accuracy": 1.0,
608
- "count": 68
609
  }
610
  }
611
  },
612
  "add_C5": {
613
- "full_accuracy": 0.76,
614
- "digit_accuracy": 0.9571428571428572,
615
- "n_examples": 50,
616
  "per_subtask": {
617
  "SA": {
618
  "accuracy": 1.0,
619
- "count": 50
620
  },
621
  "SC": {
622
  "accuracy": 1.0,
623
- "count": 50
624
  },
625
  "UC": {
626
- "accuracy": 0.9178082191780822,
627
- "count": 146
628
  },
629
  "US": {
630
- "accuracy": 0.9711538461538461,
631
- "count": 104
632
  }
633
  }
634
  },
635
  "add_C6": {
636
- "full_accuracy": 0.92,
637
- "digit_accuracy": 0.9885714285714285,
638
- "n_examples": 50,
639
  "per_subtask": {
640
  "SC": {
641
  "accuracy": 1.0,
642
- "count": 50
643
  },
644
  "UC": {
645
- "accuracy": 0.9788359788359788,
646
- "count": 189
647
  },
648
  "US": {
649
- "accuracy": 1.0,
650
- "count": 111
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
- "full_accuracy": 1.0,
656
- "digit_accuracy": 1.0,
657
- "n_examples": 50,
658
  "per_subtask": {
659
  "MD": {
660
- "accuracy": 1.0,
661
- "count": 303
662
  },
663
  "ME": {
664
  "accuracy": 1.0,
665
- "count": 47
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
- "full_accuracy": 0.92,
671
- "digit_accuracy": 0.9885714285714285,
672
- "n_examples": 50,
673
  "per_subtask": {
674
  "MD": {
675
- "accuracy": 0.9858156028368794,
676
- "count": 141
677
  },
678
  "MB": {
679
- "accuracy": 0.9722222222222222,
680
- "count": 72
681
  },
682
  "ME": {
683
  "accuracy": 1.0,
684
- "count": 18
685
  },
686
  "UB": {
687
  "accuracy": 1.0,
688
- "count": 119
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
- "full_accuracy": 0.76,
694
- "digit_accuracy": 0.9657142857142857,
695
- "n_examples": 50,
696
  "per_subtask": {
697
  "MD": {
698
  "accuracy": 1.0,
699
- "count": 112
700
  },
701
  "MB": {
702
- "accuracy": 0.9056603773584906,
703
- "count": 53
704
  },
705
  "ME": {
706
- "accuracy": 1.0,
707
- "count": 47
708
  },
709
  "UB": {
710
- "accuracy": 0.9176470588235294,
711
- "count": 85
712
  },
713
  "UD": {
714
  "accuracy": 1.0,
715
- "count": 53
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
- "full_accuracy": 0.42,
721
- "digit_accuracy": 0.9171428571428571,
722
- "n_examples": 50,
723
  "per_subtask": {
724
  "MD": {
725
  "accuracy": 1.0,
726
- "count": 97
727
  },
728
  "MB": {
729
  "accuracy": 1.0,
730
- "count": 51
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
- "count": 27
735
  },
736
  "UB": {
737
- "accuracy": 0.6081081081081081,
738
- "count": 74
739
  },
740
  "UD": {
741
- "accuracy": 1.0,
742
- "count": 101
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
- "full_accuracy": 0.1,
748
- "digit_accuracy": 0.8228571428571428,
749
- "n_examples": 50,
750
  "per_subtask": {
751
  "MD": {
752
  "accuracy": 1.0,
753
- "count": 100
754
  },
755
  "MB": {
756
  "accuracy": 1.0,
757
- "count": 50
758
  },
759
  "UB": {
760
- "accuracy": 0.44,
761
- "count": 50
762
  },
763
  "UD": {
764
- "accuracy": 0.7733333333333333,
765
- "count": 150
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
- "full_accuracy": 0.06,
771
- "digit_accuracy": 0.6885714285714286,
772
- "n_examples": 50,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
- "count": 50
777
  },
778
  "MB": {
779
  "accuracy": 1.0,
780
- "count": 50
781
  },
782
  "UB": {
783
- "accuracy": 0.38,
784
- "count": 50
785
  },
786
  "UD": {
787
- "accuracy": 0.61,
788
- "count": 200
789
  }
790
  }
791
  },
792
  "sub_random": {
793
- "full_accuracy": 0.96,
794
- "digit_accuracy": 0.9942857142857143,
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
- "accuracy": 0.9964912280701754,
799
- "count": 570
800
  },
801
  "MB": {
802
- "accuracy": 0.9819494584837545,
803
- "count": 277
804
  },
805
  "ME": {
806
  "accuracy": 1.0,
807
  "count": 53
808
  },
809
  "UB": {
810
- "accuracy": 0.9978768577494692,
811
- "count": 471
812
  },
813
  "UD": {
814
  "accuracy": 1.0,
815
- "count": 29
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
- "full_accuracy": 0.82,
821
- "digit_accuracy": 0.9742857142857143,
822
- "n_examples": 50,
823
  "per_subtask": {
824
  "MD": {
825
- "accuracy": 1.0,
826
- "count": 150
827
  },
828
  "MB": {
829
  "accuracy": 1.0,
830
- "count": 50
831
  },
832
  "UB": {
833
- "accuracy": 0.9108910891089109,
834
- "count": 101
835
  },
836
  "UD": {
837
- "accuracy": 1.0,
838
- "count": 49
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
- "full_accuracy": 0.62,
844
- "digit_accuracy": 0.9371428571428572,
845
- "n_examples": 50,
846
  "per_subtask": {
847
  "MD": {
848
- "accuracy": 1.0,
849
- "count": 100
850
  },
851
  "MB": {
852
  "accuracy": 1.0,
853
- "count": 50
854
  },
855
  "UB": {
856
- "accuracy": 0.859504132231405,
857
- "count": 121
858
  },
859
  "UD": {
860
- "accuracy": 0.9367088607594937,
861
- "count": 79
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
- "full_accuracy": 0.46,
867
- "digit_accuracy": 0.9142857142857143,
868
- "n_examples": 50,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
- "count": 50
873
  },
874
  "MB": {
875
  "accuracy": 1.0,
876
- "count": 50
877
  },
878
  "UB": {
879
- "accuracy": 0.8289473684210527,
880
- "count": 152
881
  },
882
  "UD": {
883
- "accuracy": 0.9591836734693877,
884
- "count": 98
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
- "overall_accuracy": 0.766,
891
- "digit_accuracy": 0.9439047619047619,
892
- "total_examples": 1500,
893
  "n_splits": 24
894
  }
895
  }
 
65
  3100
66
  ],
67
  "loss": [
68
+ 7.406970977783203,
69
+ 4.287904262542725,
70
+ 2.028409719467163,
71
+ 1.9122778177261353,
72
+ 1.7641465663909912,
73
+ 1.6944198608398438,
74
+ 1.5549181699752808,
75
+ 1.4273613691329956,
76
+ 1.1411476135253906,
77
+ 0.7707713842391968,
78
+ 0.652624249458313,
79
+ 0.5090309977531433,
80
+ 0.4863796830177307,
81
+ 0.4217025935649872,
82
+ 0.3141794204711914,
83
+ 0.26072242856025696,
84
+ 0.20925267040729523,
85
+ 0.1722976416349411,
86
+ 0.16701845824718475,
87
+ 0.14609448611736298,
88
+ 0.15161116421222687,
89
+ 0.11793585866689682,
90
+ 0.12784767150878906,
91
+ 0.17903172969818115,
92
+ 0.07992793619632721,
93
+ 0.09357694536447525,
94
+ 0.07849029451608658,
95
+ 0.08744285255670547,
96
+ 0.047625500708818436,
97
+ 0.06857192516326904,
98
+ 0.058801162987947464,
99
+ 0.08054264634847641,
100
+ 0.0829831138253212,
101
+ 0.08300244808197021,
102
+ 0.04127458110451698,
103
+ 0.045694876462221146,
104
+ 0.0545954629778862,
105
+ 0.04024777188897133,
106
+ 0.07766951620578766,
107
+ 0.024591432884335518,
108
+ 0.06888258457183838,
109
+ 0.03272942453622818,
110
+ 0.05081992596387863,
111
+ 0.07547096163034439,
112
+ 0.04898810759186745,
113
+ 0.04332876205444336,
114
+ 0.018458085134625435,
115
+ 0.03549834340810776,
116
+ 0.020787600427865982,
117
+ 0.01855045184493065,
118
+ 0.018782520666718483,
119
+ 0.013612723909318447,
120
+ 0.0217971820384264,
121
+ 0.01920527033507824,
122
+ 0.01275918073952198,
123
+ 0.023383263498544693,
124
+ 0.017817409709095955,
125
+ 0.01498924009501934,
126
+ 0.021258754655718803,
127
+ 0.022163305431604385,
128
+ 0.012838104739785194,
129
+ 0.015554169192910194
130
  ],
131
  "base_loss": [
132
+ 7.406970977783203,
133
+ 4.287904262542725,
134
+ 2.028409719467163,
135
+ 1.9122778177261353,
136
+ 1.7641465663909912,
137
+ 1.6944198608398438,
138
+ 1.5549181699752808,
139
+ 1.4273613691329956,
140
+ 1.1411476135253906,
141
+ 0.7707713842391968,
142
+ 0.652624249458313,
143
+ 0.5090309977531433,
144
+ 0.4863796830177307,
145
+ 0.4217025935649872,
146
+ 0.3141794204711914,
147
+ 0.26072242856025696,
148
+ 0.20925267040729523,
149
+ 0.1722976416349411,
150
+ 0.16701845824718475,
151
+ 0.14609448611736298,
152
+ 0.15161116421222687,
153
+ 0.11793585866689682,
154
+ 0.12784767150878906,
155
+ 0.17903172969818115,
156
+ 0.07992793619632721,
157
+ 0.09357694536447525,
158
+ 0.07849029451608658,
159
+ 0.08744285255670547,
160
+ 0.047625500708818436,
161
+ 0.06857192516326904,
162
+ 0.058801162987947464,
163
+ 0.08054264634847641,
164
+ 0.0829831138253212,
165
+ 0.08300244808197021,
166
+ 0.04127458110451698,
167
+ 0.045694876462221146,
168
+ 0.0545954629778862,
169
+ 0.04024777188897133,
170
+ 0.07766951620578766,
171
+ 0.024591432884335518,
172
+ 0.06888258457183838,
173
+ 0.03272942453622818,
174
+ 0.05081992596387863,
175
+ 0.07547096163034439,
176
+ 0.04898810759186745,
177
+ 0.04332876205444336,
178
+ 0.018458085134625435,
179
+ 0.03549834340810776,
180
+ 0.020787600427865982,
181
+ 0.01855045184493065,
182
+ 0.018782520666718483,
183
+ 0.013612723909318447,
184
+ 0.0217971820384264,
185
+ 0.01920527033507824,
186
+ 0.01275918073952198,
187
+ 0.023383263498544693,
188
+ 0.017817409709095955,
189
+ 0.01498924009501934,
190
+ 0.021258754655718803,
191
+ 0.022163305431604385,
192
+ 0.012838104739785194,
193
+ 0.015554169192910194
194
  ],
195
  "lr": [
196
  3.9200000000000004e-05,
 
301
  20
302
  ],
303
  "eval_accuracy": [
304
+ 0.005263157894736842,
305
+ 0.005263157894736842,
306
+ 0.023157894736842106,
307
+ 0.10421052631578948,
308
+ 0.4073684210526316,
309
+ 0.6189473684210526,
310
+ 0.6663157894736842,
311
+ 0.7210526315789474,
312
+ 0.7315789473684211,
313
+ 0.76,
314
+ 0.7884210526315789,
315
+ 0.783157894736842,
316
+ 0.7631578947368421,
317
+ 0.8126315789473684,
318
+ 0.8147368421052632,
319
+ 0.8126315789473684,
320
+ 0.8105263157894737,
321
+ 0.8189473684210526,
322
+ 0.82,
323
+ 0.8094736842105263
324
  ]
325
  },
326
+ "final_accuracy": 0.76,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
+ "n_per_split": 100
334
  },
335
  "splits": {
336
  "add_S0": {
337
+ "full_accuracy": 0.97,
338
+ "digit_accuracy": 0.9957142857142857,
339
+ "n_examples": 100,
340
  "per_subtask": {
341
  "SA": {
342
+ "accuracy": 0.9950413223140496,
343
+ "count": 605
344
  },
345
  "SS": {
346
  "accuracy": 1.0,
347
+ "count": 95
348
  }
349
  }
350
  },
351
  "add_S1": {
352
+ "full_accuracy": 1.0,
353
+ "digit_accuracy": 1.0,
354
+ "n_examples": 100,
355
  "per_subtask": {
356
  "SA": {
357
  "accuracy": 1.0,
358
+ "count": 204
359
  },
360
  "SC": {
361
  "accuracy": 1.0,
362
+ "count": 169
363
  },
364
  "SS": {
365
  "accuracy": 1.0,
366
+ "count": 31
367
  },
368
  "UC": {
369
+ "accuracy": 1.0,
370
+ "count": 296
371
  }
372
  }
373
  },
374
  "add_S2": {
375
+ "full_accuracy": 0.97,
376
+ "digit_accuracy": 0.9957142857142857,
377
+ "n_examples": 100,
378
  "per_subtask": {
379
  "SA": {
380
+ "accuracy": 1.0,
381
+ "count": 163
382
  },
383
  "SC": {
384
+ "accuracy": 0.9846153846153847,
385
+ "count": 130
386
  },
387
  "SS": {
388
+ "accuracy": 0.9885057471264368,
389
+ "count": 87
390
  },
391
  "UC": {
392
+ "accuracy": 1.0,
393
+ "count": 203
394
  },
395
  "US": {
396
  "accuracy": 1.0,
397
+ "count": 117
398
  }
399
  }
400
  },
401
  "add_S3": {
402
+ "full_accuracy": 0.67,
403
+ "digit_accuracy": 0.9528571428571428,
404
+ "n_examples": 100,
405
  "per_subtask": {
406
  "SA": {
407
  "accuracy": 1.0,
408
+ "count": 121
409
  },
410
  "SC": {
411
+ "accuracy": 0.9834710743801653,
412
+ "count": 121
413
  },
414
  "SS": {
415
  "accuracy": 1.0,
416
+ "count": 49
417
  },
418
  "UC": {
419
+ "accuracy": 0.8333333333333334,
420
+ "count": 186
421
  },
422
  "US": {
423
  "accuracy": 1.0,
424
+ "count": 223
425
  }
426
  }
427
  },
428
  "add_S4": {
429
+ "full_accuracy": 0.6,
430
+ "digit_accuracy": 0.9171428571428571,
431
+ "n_examples": 100,
432
  "per_subtask": {
433
  "SA": {
434
  "accuracy": 1.0,
435
+ "count": 104
436
  },
437
  "SC": {
438
  "accuracy": 1.0,
439
+ "count": 106
440
  },
441
  "SS": {
442
  "accuracy": 1.0,
443
+ "count": 23
444
  },
445
  "UC": {
446
+ "accuracy": 0.8,
447
+ "count": 160
448
  },
449
  "US": {
450
+ "accuracy": 0.9153094462540716,
451
+ "count": 307
452
  }
453
  }
454
  },
455
  "add_S5": {
456
+ "full_accuracy": 0.5,
457
+ "digit_accuracy": 0.8371428571428572,
458
+ "n_examples": 100,
459
  "per_subtask": {
460
  "SA": {
461
  "accuracy": 1.0,
462
+ "count": 100
463
  },
464
  "SC": {
465
  "accuracy": 1.0,
466
+ "count": 100
467
  },
468
  "UC": {
469
+ "accuracy": 0.65,
470
+ "count": 100
471
  },
472
  "US": {
473
+ "accuracy": 0.8025,
474
+ "count": 400
475
  }
476
  }
477
  },
478
  "add_S6": {
479
+ "full_accuracy": 0.5,
480
+ "digit_accuracy": 0.8142857142857143,
481
+ "n_examples": 100,
482
  "per_subtask": {
483
  "SC": {
484
  "accuracy": 1.0,
485
+ "count": 100
486
  },
487
  "UC": {
488
+ "accuracy": 0.76,
489
+ "count": 100
490
  },
491
  "US": {
492
+ "accuracy": 0.788,
493
+ "count": 500
494
  }
495
  }
496
  },
497
  "add_random": {
498
+ "full_accuracy": 0.97,
499
+ "digit_accuracy": 0.9957142857142857,
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
+ "accuracy": 0.9955257270693513,
504
+ "count": 447
505
  },
506
  "SC": {
507
+ "accuracy": 0.99375,
508
+ "count": 320
509
  },
510
  "SS": {
511
  "accuracy": 1.0,
512
+ "count": 56
513
  },
514
  "UC": {
515
+ "accuracy": 0.996219281663516,
516
+ "count": 529
517
  },
518
  "US": {
519
  "accuracy": 1.0,
520
+ "count": 48
521
  }
522
  }
523
  },
524
  "add_C1": {
525
+ "full_accuracy": 0.99,
526
+ "digit_accuracy": 0.9985714285714286,
527
+ "n_examples": 100,
528
  "per_subtask": {
529
  "SA": {
530
+ "accuracy": 0.998,
531
+ "count": 500
532
  },
533
  "SC": {
534
  "accuracy": 1.0,
535
+ "count": 100
536
  },
537
  "UC": {
538
  "accuracy": 1.0,
539
+ "count": 100
540
  }
541
  }
542
  },
543
  "add_C2": {
544
+ "full_accuracy": 0.93,
545
+ "digit_accuracy": 0.99,
546
+ "n_examples": 100,
547
  "per_subtask": {
548
  "SA": {
549
+ "accuracy": 0.995,
550
+ "count": 400
551
  },
552
  "SC": {
553
  "accuracy": 1.0,
554
+ "count": 100
555
  },
556
  "UC": {
557
+ "accuracy": 0.9743589743589743,
558
+ "count": 156
559
  },
560
  "US": {
561
+ "accuracy": 0.9772727272727273,
562
+ "count": 44
563
  }
564
  }
565
  },
566
  "add_C3": {
567
+ "full_accuracy": 0.85,
568
+ "digit_accuracy": 0.9785714285714285,
569
+ "n_examples": 100,
570
  "per_subtask": {
571
  "SA": {
572
+ "accuracy": 1.0,
573
+ "count": 300
574
  },
575
  "SC": {
576
  "accuracy": 1.0,
577
+ "count": 100
578
  },
579
  "UC": {
580
+ "accuracy": 0.9246231155778895,
581
+ "count": 199
582
  },
583
  "US": {
584
  "accuracy": 1.0,
585
+ "count": 101
586
  }
587
  }
588
  },
589
  "add_C4": {
590
+ "full_accuracy": 0.78,
591
+ "digit_accuracy": 0.9657142857142857,
592
+ "n_examples": 100,
593
  "per_subtask": {
594
  "SA": {
595
  "accuracy": 1.0,
596
+ "count": 200
597
  },
598
  "SC": {
599
  "accuracy": 1.0,
600
+ "count": 100
601
  },
602
  "UC": {
603
+ "accuracy": 0.9242424242424242,
604
+ "count": 264
605
  },
606
  "US": {
607
+ "accuracy": 0.9705882352941176,
608
+ "count": 136
609
  }
610
  }
611
  },
612
  "add_C5": {
613
+ "full_accuracy": 0.79,
614
+ "digit_accuracy": 0.9614285714285714,
615
+ "n_examples": 100,
616
  "per_subtask": {
617
  "SA": {
618
  "accuracy": 1.0,
619
+ "count": 100
620
  },
621
  "SC": {
622
  "accuracy": 1.0,
623
+ "count": 100
624
  },
625
  "UC": {
626
+ "accuracy": 0.9354838709677419,
627
+ "count": 310
628
  },
629
  "US": {
630
+ "accuracy": 0.9631578947368421,
631
+ "count": 190
632
  }
633
  }
634
  },
635
  "add_C6": {
636
+ "full_accuracy": 0.74,
637
+ "digit_accuracy": 0.9542857142857143,
638
+ "n_examples": 100,
639
  "per_subtask": {
640
  "SC": {
641
  "accuracy": 1.0,
642
+ "count": 100
643
  },
644
  "UC": {
645
+ "accuracy": 0.9459459459459459,
646
+ "count": 370
647
  },
648
  "US": {
649
+ "accuracy": 0.9478260869565217,
650
+ "count": 230
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
+ "full_accuracy": 0.97,
656
+ "digit_accuracy": 0.9957142857142857,
657
+ "n_examples": 100,
658
  "per_subtask": {
659
  "MD": {
660
+ "accuracy": 0.9951219512195122,
661
+ "count": 615
662
  },
663
  "ME": {
664
  "accuracy": 1.0,
665
+ "count": 85
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
+ "full_accuracy": 0.97,
671
+ "digit_accuracy": 0.9957142857142857,
672
+ "n_examples": 100,
673
  "per_subtask": {
674
  "MD": {
675
+ "accuracy": 0.9931506849315068,
676
+ "count": 292
677
  },
678
  "MB": {
679
+ "accuracy": 0.9930555555555556,
680
+ "count": 144
681
  },
682
  "ME": {
683
  "accuracy": 1.0,
684
+ "count": 25
685
  },
686
  "UB": {
687
  "accuracy": 1.0,
688
+ "count": 239
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
+ "full_accuracy": 0.94,
694
+ "digit_accuracy": 0.9914285714285714,
695
+ "n_examples": 100,
696
  "per_subtask": {
697
  "MD": {
698
  "accuracy": 1.0,
699
+ "count": 211
700
  },
701
  "MB": {
702
+ "accuracy": 1.0,
703
+ "count": 115
704
  },
705
  "ME": {
706
+ "accuracy": 0.9882352941176471,
707
+ "count": 85
708
  },
709
  "UB": {
710
+ "accuracy": 0.9723756906077348,
711
+ "count": 181
712
  },
713
  "UD": {
714
  "accuracy": 1.0,
715
+ "count": 108
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
+ "full_accuracy": 0.38,
721
+ "digit_accuracy": 0.9028571428571428,
722
+ "n_examples": 100,
723
  "per_subtask": {
724
  "MD": {
725
  "accuracy": 1.0,
726
+ "count": 179
727
  },
728
  "MB": {
729
  "accuracy": 1.0,
730
+ "count": 103
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
+ "count": 56
735
  },
736
  "UB": {
737
+ "accuracy": 0.610738255033557,
738
+ "count": 149
739
  },
740
  "UD": {
741
+ "accuracy": 0.9530516431924883,
742
+ "count": 213
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
+ "full_accuracy": 0.18,
748
+ "digit_accuracy": 0.7942857142857143,
749
+ "n_examples": 100,
750
  "per_subtask": {
751
  "MD": {
752
  "accuracy": 1.0,
753
+ "count": 200
754
  },
755
  "MB": {
756
  "accuracy": 1.0,
757
+ "count": 100
758
  },
759
  "UB": {
760
+ "accuracy": 0.47,
761
+ "count": 100
762
  },
763
  "UD": {
764
+ "accuracy": 0.6966666666666667,
765
+ "count": 300
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
+ "full_accuracy": 0.13,
771
+ "digit_accuracy": 0.6685714285714286,
772
+ "n_examples": 100,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
+ "count": 100
777
  },
778
  "MB": {
779
  "accuracy": 1.0,
780
+ "count": 100
781
  },
782
  "UB": {
783
+ "accuracy": 0.46,
784
+ "count": 100
785
  },
786
  "UD": {
787
+ "accuracy": 0.555,
788
+ "count": 400
789
  }
790
  }
791
  },
792
  "sub_random": {
793
+ "full_accuracy": 0.95,
794
+ "digit_accuracy": 0.9928571428571429,
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
+ "accuracy": 0.9883333333333333,
799
+ "count": 600
800
  },
801
  "MB": {
802
+ "accuracy": 0.9962546816479401,
803
+ "count": 267
804
  },
805
  "ME": {
806
  "accuracy": 1.0,
807
  "count": 53
808
  },
809
  "UB": {
810
+ "accuracy": 0.9954441913439636,
811
+ "count": 439
812
  },
813
  "UD": {
814
  "accuracy": 1.0,
815
+ "count": 41
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
+ "full_accuracy": 0.79,
821
+ "digit_accuracy": 0.9685714285714285,
822
+ "n_examples": 100,
823
  "per_subtask": {
824
  "MD": {
825
+ "accuracy": 0.9966666666666667,
826
+ "count": 300
827
  },
828
  "MB": {
829
  "accuracy": 1.0,
830
+ "count": 100
831
  },
832
  "UB": {
833
+ "accuracy": 0.8984771573604061,
834
+ "count": 197
835
  },
836
  "UD": {
837
+ "accuracy": 0.9902912621359223,
838
+ "count": 103
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
+ "full_accuracy": 0.61,
844
+ "digit_accuracy": 0.9342857142857143,
845
+ "n_examples": 100,
846
  "per_subtask": {
847
  "MD": {
848
+ "accuracy": 0.98,
849
+ "count": 200
850
  },
851
  "MB": {
852
  "accuracy": 1.0,
853
+ "count": 100
854
  },
855
  "UB": {
856
+ "accuracy": 0.8744939271255061,
857
+ "count": 247
858
  },
859
  "UD": {
860
+ "accuracy": 0.9281045751633987,
861
+ "count": 153
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
+ "full_accuracy": 0.66,
867
+ "digit_accuracy": 0.9342857142857143,
868
+ "n_examples": 100,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
+ "count": 100
873
  },
874
  "MB": {
875
  "accuracy": 1.0,
876
+ "count": 100
877
  },
878
  "UB": {
879
+ "accuracy": 0.8993288590604027,
880
+ "count": 298
881
  },
882
  "UD": {
883
+ "accuracy": 0.9207920792079208,
884
+ "count": 202
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
+ "overall_accuracy": 0.76,
891
+ "digit_accuracy": 0.9431318681318681,
892
+ "total_examples": 2600,
893
  "n_splits": 24
894
  }
895
  }
add_sub_baseline_10K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99090d5772338ac21e3d890eee50dfa2204eeafe4c53123e495571b22267bec9
3
  size 650266922
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9024dda0f3d40530abfd72b50df46a8e6a8a4dde5fed1a429b758171421411ab
3
  size 650266922
add_sub_baseline_10K/train_config.json CHANGED
@@ -36,7 +36,7 @@
36
  "eval_every": 156,
37
  "save_every": 999999,
38
  "eval_samples": 100,
39
- "output_dir": "ckpt/sweep/add_sub_baseline_10K",
40
  "eval_K": 4,
41
  "alpha_traj": 0.0,
42
  "corrupt_method": "shuffle",
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 162490082,
71
  "run_name": "add_sub_baseline_10K",
72
- "git_commit": "8d5ee5420119746ef4e2c87570eb250c9718f643",
73
- "timestamp": "2026-04-12T22:03:24.880179+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "j0499k85",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/j0499k85",
81
- "final_accuracy": 0.7241666666666666,
82
- "sft_accuracy": 0.7241666666666666,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
36
  "eval_every": 156,
37
  "save_every": 999999,
38
  "eval_samples": 100,
39
+ "output_dir": "ckpt/sweep/as_baseline_10K_2L3H510d",
40
  "eval_K": 4,
41
  "alpha_traj": 0.0,
42
  "corrupt_method": "shuffle",
 
69
  "no_wandb": false,
70
  "n_params": 162490082,
71
  "run_name": "add_sub_baseline_10K",
72
+ "git_commit": "f835493c19eb98267697007042c9d440cad2afbb",
73
+ "timestamp": "2026-04-16T01:10:30.945547+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_10K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": "dvsplayi",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/dvsplayi",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.76,
86
+ "sft_accuracy": 0.76,
87
  "eval_method": "ArithmeticEvaluator"
88
  }