amirali1985 commited on
Commit
d2ea635
·
verified ·
1 Parent(s): bd32447

Upload add_sub_baseline_10K_1L3H510d

Browse files
add_sub_baseline_10K_1L3H510d/metrics.json CHANGED
@@ -65,132 +65,132 @@
65
  3100
66
  ],
67
  "loss": [
68
- 7.182804584503174,
69
- 4.20333194732666,
70
- 2.0677404403686523,
71
- 1.829144835472107,
72
- 1.8410581350326538,
73
- 1.820699691772461,
74
- 1.7079013586044312,
75
- 1.655383825302124,
76
- 1.6272556781768799,
77
- 1.56789231300354,
78
- 1.2935447692871094,
79
- 0.8198140263557434,
80
- 0.7896751761436462,
81
- 0.7217035293579102,
82
- 0.6449242830276489,
83
- 0.6155810356140137,
84
- 0.6273579001426697,
85
- 0.5691636800765991,
86
- 0.5195116400718689,
87
- 0.5188413262367249,
88
- 0.4571366012096405,
89
- 0.46536850929260254,
90
- 0.45766669511795044,
91
- 0.4275817275047302,
92
- 0.40487247705459595,
93
- 0.4464508593082428,
94
- 0.39323002099990845,
95
- 0.39669889211654663,
96
- 0.34110480546951294,
97
- 0.3717530369758606,
98
- 0.3865724503993988,
99
- 0.36133044958114624,
100
- 0.3357429802417755,
101
- 0.3121393620967865,
102
- 0.3670870363712311,
103
- 0.2978810667991638,
104
- 0.3120233118534088,
105
- 0.34596773982048035,
106
- 0.29005879163742065,
107
- 0.34079042077064514,
108
- 0.29599788784980774,
109
- 0.30136188864707947,
110
- 0.2756097614765167,
111
- 0.3011065125465393,
112
- 0.30379655957221985,
113
- 0.2766466736793518,
114
- 0.2690426707267761,
115
- 0.3143273890018463,
116
- 0.26247408986091614,
117
- 0.27483493089675903,
118
- 0.2900548279285431,
119
- 0.2740040123462677,
120
- 0.2515677511692047,
121
- 0.2737451195716858,
122
- 0.246206596493721,
123
- 0.2460457980632782,
124
- 0.2329309731721878,
125
- 0.25465384125709534,
126
- 0.2698599696159363,
127
- 0.2485286146402359,
128
- 0.24830453097820282,
129
- 0.2897026538848877
130
  ],
131
  "base_loss": [
132
- 7.182804584503174,
133
- 4.20333194732666,
134
- 2.0677404403686523,
135
- 1.829144835472107,
136
- 1.8410581350326538,
137
- 1.820699691772461,
138
- 1.7079013586044312,
139
- 1.655383825302124,
140
- 1.6272556781768799,
141
- 1.56789231300354,
142
- 1.2935447692871094,
143
- 0.8198140263557434,
144
- 0.7896751761436462,
145
- 0.7217035293579102,
146
- 0.6449242830276489,
147
- 0.6155810356140137,
148
- 0.6273579001426697,
149
- 0.5691636800765991,
150
- 0.5195116400718689,
151
- 0.5188413262367249,
152
- 0.4571366012096405,
153
- 0.46536850929260254,
154
- 0.45766669511795044,
155
- 0.4275817275047302,
156
- 0.40487247705459595,
157
- 0.4464508593082428,
158
- 0.39323002099990845,
159
- 0.39669889211654663,
160
- 0.34110480546951294,
161
- 0.3717530369758606,
162
- 0.3865724503993988,
163
- 0.36133044958114624,
164
- 0.3357429802417755,
165
- 0.3121393620967865,
166
- 0.3670870363712311,
167
- 0.2978810667991638,
168
- 0.3120233118534088,
169
- 0.34596773982048035,
170
- 0.29005879163742065,
171
- 0.34079042077064514,
172
- 0.29599788784980774,
173
- 0.30136188864707947,
174
- 0.2756097614765167,
175
- 0.3011065125465393,
176
- 0.30379655957221985,
177
- 0.2766466736793518,
178
- 0.2690426707267761,
179
- 0.3143273890018463,
180
- 0.26247408986091614,
181
- 0.27483493089675903,
182
- 0.2900548279285431,
183
- 0.2740040123462677,
184
- 0.2515677511692047,
185
- 0.2737451195716858,
186
- 0.246206596493721,
187
- 0.2460457980632782,
188
- 0.2329309731721878,
189
- 0.25465384125709534,
190
- 0.2698599696159363,
191
- 0.2485286146402359,
192
- 0.24830453097820282,
193
- 0.2897026538848877
194
  ],
195
  "lr": [
196
  3.9200000000000004e-05,
@@ -301,595 +301,595 @@
301
  20
302
  ],
303
  "eval_accuracy": [
304
- 0.0,
305
- 0.0044444444444444444,
306
- 0.006666666666666667,
307
- 0.01888888888888889,
308
- 0.057777777777777775,
309
- 0.09444444444444444,
310
- 0.11888888888888889,
311
- 0.20222222222222222,
312
- 0.24555555555555555,
313
- 0.2611111111111111,
314
- 0.32666666666666666,
315
- 0.3011111111111111,
316
- 0.33555555555555555,
317
- 0.39222222222222225,
318
- 0.41888888888888887,
319
- 0.3888888888888889,
320
- 0.4111111111111111,
321
- 0.39666666666666667,
322
- 0.41,
323
- 0.42444444444444446
324
  ]
325
  },
326
- "final_accuracy": 0.33625,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
- "n_per_split": 50
334
  },
335
  "splits": {
336
  "add_S0": {
337
- "full_accuracy": 0.56,
338
- "digit_accuracy": 0.92,
339
- "n_examples": 50,
340
  "per_subtask": {
341
  "SA": {
342
- "accuracy": 0.9220338983050848,
343
- "count": 295
344
  },
345
  "SS": {
346
- "accuracy": 0.9090909090909091,
347
- "count": 55
348
  }
349
  }
350
  },
351
  "add_S1": {
352
- "full_accuracy": 0.56,
353
- "digit_accuracy": 0.9285714285714286,
354
- "n_examples": 50,
355
  "per_subtask": {
356
  "SA": {
357
- "accuracy": 0.9365079365079365,
358
- "count": 126
359
  },
360
  "SC": {
361
- "accuracy": 0.9746835443037974,
362
- "count": 79
363
  },
364
  "SS": {
365
- "accuracy": 0.9523809523809523,
366
- "count": 21
367
  },
368
  "UC": {
369
- "accuracy": 0.8870967741935484,
370
- "count": 124
371
  }
372
  }
373
  },
374
  "add_S2": {
375
- "full_accuracy": 0.32,
376
- "digit_accuracy": 0.8714285714285714,
377
- "n_examples": 50,
378
  "per_subtask": {
379
  "SA": {
380
- "accuracy": 0.9733333333333334,
381
- "count": 75
382
  },
383
  "SC": {
384
- "accuracy": 0.8870967741935484,
385
- "count": 62
386
  },
387
  "SS": {
388
- "accuracy": 0.9743589743589743,
389
- "count": 39
390
  },
391
  "UC": {
392
- "accuracy": 0.7387387387387387,
393
- "count": 111
394
  },
395
  "US": {
396
- "accuracy": 0.9047619047619048,
397
- "count": 63
398
  }
399
  }
400
  },
401
  "add_S3": {
402
- "full_accuracy": 0.26,
403
- "digit_accuracy": 0.7771428571428571,
404
- "n_examples": 50,
405
  "per_subtask": {
406
  "SA": {
407
- "accuracy": 0.9666666666666667,
408
- "count": 60
409
  },
410
  "SC": {
411
- "accuracy": 0.9649122807017544,
412
- "count": 57
413
  },
414
  "SS": {
415
- "accuracy": 0.9473684210526315,
416
- "count": 19
417
  },
418
  "UC": {
419
- "accuracy": 0.6153846153846154,
420
- "count": 104
421
  },
422
  "US": {
423
- "accuracy": 0.7,
424
- "count": 110
425
  }
426
  }
427
  },
428
  "add_S4": {
429
- "full_accuracy": 0.28,
430
- "digit_accuracy": 0.7,
431
- "n_examples": 50,
432
  "per_subtask": {
433
  "SA": {
434
- "accuracy": 1.0,
435
- "count": 48
436
  },
437
  "SC": {
438
- "accuracy": 0.9807692307692307,
439
- "count": 52
440
  },
441
  "SS": {
442
- "accuracy": 1.0,
443
- "count": 7
444
  },
445
  "UC": {
446
- "accuracy": 0.651685393258427,
447
- "count": 89
448
  },
449
  "US": {
450
- "accuracy": 0.525974025974026,
451
- "count": 154
452
  }
453
  }
454
  },
455
  "add_S5": {
456
- "full_accuracy": 0.26,
457
- "digit_accuracy": 0.6142857142857143,
458
- "n_examples": 50,
459
  "per_subtask": {
460
  "SA": {
461
  "accuracy": 1.0,
462
- "count": 50
463
  },
464
  "SC": {
465
- "accuracy": 0.98,
466
- "count": 50
467
  },
468
  "UC": {
469
- "accuracy": 0.44,
470
- "count": 50
471
  },
472
  "US": {
473
- "accuracy": 0.47,
474
- "count": 200
475
  }
476
  }
477
  },
478
  "add_S6": {
479
- "full_accuracy": 0.3,
480
- "digit_accuracy": 0.4685714285714286,
481
- "n_examples": 50,
482
  "per_subtask": {
483
  "SC": {
484
  "accuracy": 1.0,
485
- "count": 50
486
  },
487
  "UC": {
488
- "accuracy": 0.36,
489
- "count": 50
490
  },
491
  "US": {
492
- "accuracy": 0.384,
493
- "count": 250
494
  }
495
  }
496
  },
497
  "add_random": {
498
- "full_accuracy": 0.54,
499
- "digit_accuracy": 0.9214285714285714,
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 0.9675174013921114,
504
- "count": 431
505
  },
506
  "SC": {
507
- "accuracy": 0.9715189873417721,
508
- "count": 316
509
  },
510
  "SS": {
511
- "accuracy": 0.9487179487179487,
512
- "count": 39
513
  },
514
  "UC": {
515
- "accuracy": 0.8589285714285714,
516
- "count": 560
517
  },
518
  "US": {
519
- "accuracy": 0.8888888888888888,
520
- "count": 54
521
  }
522
  }
523
  },
524
  "add_C1": {
525
- "full_accuracy": 0.56,
526
- "digit_accuracy": 0.9342857142857143,
527
- "n_examples": 50,
528
  "per_subtask": {
529
  "SA": {
530
- "accuracy": 0.952,
531
- "count": 250
532
  },
533
  "SC": {
534
- "accuracy": 1.0,
535
- "count": 50
536
  },
537
  "UC": {
538
- "accuracy": 0.78,
539
- "count": 50
540
  }
541
  }
542
  },
543
  "add_C2": {
544
- "full_accuracy": 0.44,
545
- "digit_accuracy": 0.9028571428571428,
546
- "n_examples": 50,
547
  "per_subtask": {
548
  "SA": {
549
- "accuracy": 0.945,
550
- "count": 200
551
  },
552
  "SC": {
553
- "accuracy": 0.96,
554
- "count": 50
555
  },
556
  "UC": {
557
- "accuracy": 0.7590361445783133,
558
- "count": 83
559
  },
560
  "US": {
561
- "accuracy": 0.9411764705882353,
562
- "count": 17
563
  }
564
  }
565
  },
566
  "add_C3": {
567
- "full_accuracy": 0.32,
568
- "digit_accuracy": 0.8571428571428571,
569
- "n_examples": 50,
570
  "per_subtask": {
571
  "SA": {
572
- "accuracy": 0.9466666666666667,
573
- "count": 150
574
  },
575
  "SC": {
576
- "accuracy": 0.94,
577
- "count": 50
578
  },
579
  "UC": {
580
- "accuracy": 0.76,
581
- "count": 100
582
  },
583
  "US": {
584
- "accuracy": 0.7,
585
- "count": 50
586
  }
587
  }
588
  },
589
  "add_C4": {
590
  "full_accuracy": 0.34,
591
- "digit_accuracy": 0.8371428571428572,
592
- "n_examples": 50,
593
  "per_subtask": {
594
  "SA": {
595
- "accuracy": 0.99,
596
- "count": 100
597
  },
598
  "SC": {
599
  "accuracy": 0.96,
600
- "count": 50
601
  },
602
  "UC": {
603
- "accuracy": 0.7348484848484849,
604
- "count": 132
605
  },
606
  "US": {
607
- "accuracy": 0.7205882352941176,
608
- "count": 68
609
  }
610
  }
611
  },
612
  "add_C5": {
613
  "full_accuracy": 0.28,
614
- "digit_accuracy": 0.7828571428571428,
615
- "n_examples": 50,
616
  "per_subtask": {
617
  "SA": {
618
  "accuracy": 1.0,
619
- "count": 50
620
  },
621
  "SC": {
622
- "accuracy": 1.0,
623
- "count": 50
624
  },
625
  "UC": {
626
- "accuracy": 0.7123287671232876,
627
- "count": 146
628
  },
629
  "US": {
630
- "accuracy": 0.6730769230769231,
631
- "count": 104
632
  }
633
  }
634
  },
635
  "add_C6": {
636
- "full_accuracy": 0.12,
637
- "digit_accuracy": 0.7628571428571429,
638
- "n_examples": 50,
639
  "per_subtask": {
640
  "SC": {
641
  "accuracy": 1.0,
642
- "count": 50
643
  },
644
  "UC": {
645
- "accuracy": 0.7037037037037037,
646
- "count": 189
647
  },
648
  "US": {
649
- "accuracy": 0.7567567567567568,
650
- "count": 111
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
- "full_accuracy": 0.74,
656
- "digit_accuracy": 0.96,
657
- "n_examples": 50,
658
  "per_subtask": {
659
  "MD": {
660
- "accuracy": 0.9537953795379538,
661
- "count": 303
662
  },
663
  "ME": {
664
- "accuracy": 1.0,
665
- "count": 47
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
- "full_accuracy": 0.44,
671
- "digit_accuracy": 0.8971428571428571,
672
- "n_examples": 50,
673
  "per_subtask": {
674
  "MD": {
675
- "accuracy": 0.9716312056737588,
676
- "count": 141
677
  },
678
  "MB": {
679
- "accuracy": 0.9305555555555556,
680
- "count": 72
681
  },
682
  "ME": {
683
- "accuracy": 0.9444444444444444,
684
- "count": 18
685
  },
686
  "UB": {
687
- "accuracy": 0.7815126050420168,
688
- "count": 119
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
- "full_accuracy": 0.24,
694
- "digit_accuracy": 0.8228571428571428,
695
- "n_examples": 50,
696
  "per_subtask": {
697
  "MD": {
698
- "accuracy": 0.9196428571428571,
699
- "count": 112
700
  },
701
  "MB": {
702
- "accuracy": 0.9056603773584906,
703
- "count": 53
704
  },
705
  "ME": {
706
- "accuracy": 0.9574468085106383,
707
- "count": 47
708
  },
709
  "UB": {
710
- "accuracy": 0.5764705882352941,
711
- "count": 85
712
  },
713
  "UD": {
714
- "accuracy": 0.8113207547169812,
715
- "count": 53
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
- "full_accuracy": 0.1,
721
- "digit_accuracy": 0.7628571428571429,
722
- "n_examples": 50,
723
  "per_subtask": {
724
  "MD": {
725
- "accuracy": 0.979381443298969,
726
- "count": 97
727
  },
728
  "MB": {
729
- "accuracy": 0.9411764705882353,
730
- "count": 51
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
- "count": 27
735
  },
736
  "UB": {
737
- "accuracy": 0.581081081081081,
738
- "count": 74
739
  },
740
  "UD": {
741
- "accuracy": 0.5346534653465347,
742
- "count": 101
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
- "full_accuracy": 0.04,
748
- "digit_accuracy": 0.6,
749
- "n_examples": 50,
750
  "per_subtask": {
751
  "MD": {
752
- "accuracy": 0.97,
753
- "count": 100
754
  },
755
  "MB": {
756
- "accuracy": 1.0,
757
- "count": 50
758
  },
759
  "UB": {
760
- "accuracy": 0.46,
761
- "count": 50
762
  },
763
  "UD": {
764
- "accuracy": 0.26666666666666666,
765
- "count": 150
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
- "full_accuracy": 0.08,
771
- "digit_accuracy": 0.5314285714285715,
772
- "n_examples": 50,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
- "count": 50
777
  },
778
  "MB": {
779
- "accuracy": 0.98,
780
- "count": 50
781
  },
782
  "UB": {
783
- "accuracy": 0.46,
784
- "count": 50
785
  },
786
  "UD": {
787
- "accuracy": 0.32,
788
- "count": 200
789
  }
790
  }
791
  },
792
  "sub_random": {
793
- "full_accuracy": 0.515,
794
- "digit_accuracy": 0.9107142857142857,
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
- "accuracy": 0.9719298245614035,
799
- "count": 570
800
  },
801
  "MB": {
802
- "accuracy": 0.9422382671480144,
803
- "count": 277
804
  },
805
  "ME": {
806
  "accuracy": 0.9433962264150944,
807
  "count": 53
808
  },
809
  "UB": {
810
- "accuracy": 0.8131634819532909,
811
- "count": 471
812
  },
813
  "UD": {
814
- "accuracy": 0.9310344827586207,
815
- "count": 29
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
- "full_accuracy": 0.26,
821
- "digit_accuracy": 0.8142857142857143,
822
- "n_examples": 50,
823
  "per_subtask": {
824
  "MD": {
825
- "accuracy": 0.96,
826
- "count": 150
827
  },
828
  "MB": {
829
- "accuracy": 1.0,
830
- "count": 50
831
  },
832
  "UB": {
833
- "accuracy": 0.6435643564356436,
834
- "count": 101
835
  },
836
  "UD": {
837
- "accuracy": 0.5306122448979592,
838
- "count": 49
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
- "full_accuracy": 0.22,
844
- "digit_accuracy": 0.7514285714285714,
845
- "n_examples": 50,
846
  "per_subtask": {
847
  "MD": {
848
- "accuracy": 0.98,
849
- "count": 100
850
  },
851
  "MB": {
852
- "accuracy": 0.98,
853
- "count": 50
854
  },
855
  "UB": {
856
- "accuracy": 0.6611570247933884,
857
- "count": 121
858
  },
859
  "UD": {
860
- "accuracy": 0.45569620253164556,
861
- "count": 79
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
- "full_accuracy": 0.18,
867
- "digit_accuracy": 0.6971428571428572,
868
- "n_examples": 50,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
- "count": 50
873
  },
874
  "MB": {
875
  "accuracy": 1.0,
876
- "count": 50
877
  },
878
  "UB": {
879
- "accuracy": 0.6052631578947368,
880
- "count": 152
881
  },
882
  "UD": {
883
- "accuracy": 0.5306122448979592,
884
- "count": 98
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
- "overall_accuracy": 0.37066666666666664,
891
- "digit_accuracy": 0.8174285714285714,
892
- "total_examples": 1500,
893
  "n_splits": 24
894
  }
895
  }
 
65
  3100
66
  ],
67
  "loss": [
68
+ 7.394570350646973,
69
+ 4.051821708679199,
70
+ 2.021190881729126,
71
+ 1.8420897722244263,
72
+ 1.9143022298812866,
73
+ 1.7933646440505981,
74
+ 1.7442551851272583,
75
+ 1.6997934579849243,
76
+ 1.6858255863189697,
77
+ 1.603048324584961,
78
+ 1.1169459819793701,
79
+ 0.9371060729026794,
80
+ 0.7447897791862488,
81
+ 0.6269596815109253,
82
+ 0.6037188768386841,
83
+ 0.5636410713195801,
84
+ 0.5894449949264526,
85
+ 0.5685290694236755,
86
+ 0.5134449601173401,
87
+ 0.49189919233322144,
88
+ 0.4935498833656311,
89
+ 0.45771291851997375,
90
+ 0.4145750105381012,
91
+ 0.409036785364151,
92
+ 0.445596843957901,
93
+ 0.39751124382019043,
94
+ 0.38312414288520813,
95
+ 0.3672215938568115,
96
+ 0.3891475200653076,
97
+ 0.364010751247406,
98
+ 0.34130558371543884,
99
+ 0.3457995057106018,
100
+ 0.3009479343891144,
101
+ 0.33354875445365906,
102
+ 0.31102973222732544,
103
+ 0.2982536852359772,
104
+ 0.3054274022579193,
105
+ 0.28427326679229736,
106
+ 0.266096830368042,
107
+ 0.29840412735939026,
108
+ 0.29664090275764465,
109
+ 0.2837351858615875,
110
+ 0.2988941967487335,
111
+ 0.3007279932498932,
112
+ 0.26320937275886536,
113
+ 0.2581765055656433,
114
+ 0.28365617990493774,
115
+ 0.2829630970954895,
116
+ 0.2507253885269165,
117
+ 0.2723925709724426,
118
+ 0.24912698566913605,
119
+ 0.24788996577262878,
120
+ 0.26047465205192566,
121
+ 0.2179279625415802,
122
+ 0.2571420967578888,
123
+ 0.26797670125961304,
124
+ 0.24822230637073517,
125
+ 0.23186878859996796,
126
+ 0.27942460775375366,
127
+ 0.21844741702079773,
128
+ 0.24789755046367645,
129
+ 0.2550739645957947
130
  ],
131
  "base_loss": [
132
+ 7.394570350646973,
133
+ 4.051821708679199,
134
+ 2.021190881729126,
135
+ 1.8420897722244263,
136
+ 1.9143022298812866,
137
+ 1.7933646440505981,
138
+ 1.7442551851272583,
139
+ 1.6997934579849243,
140
+ 1.6858255863189697,
141
+ 1.603048324584961,
142
+ 1.1169459819793701,
143
+ 0.9371060729026794,
144
+ 0.7447897791862488,
145
+ 0.6269596815109253,
146
+ 0.6037188768386841,
147
+ 0.5636410713195801,
148
+ 0.5894449949264526,
149
+ 0.5685290694236755,
150
+ 0.5134449601173401,
151
+ 0.49189919233322144,
152
+ 0.4935498833656311,
153
+ 0.45771291851997375,
154
+ 0.4145750105381012,
155
+ 0.409036785364151,
156
+ 0.445596843957901,
157
+ 0.39751124382019043,
158
+ 0.38312414288520813,
159
+ 0.3672215938568115,
160
+ 0.3891475200653076,
161
+ 0.364010751247406,
162
+ 0.34130558371543884,
163
+ 0.3457995057106018,
164
+ 0.3009479343891144,
165
+ 0.33354875445365906,
166
+ 0.31102973222732544,
167
+ 0.2982536852359772,
168
+ 0.3054274022579193,
169
+ 0.28427326679229736,
170
+ 0.266096830368042,
171
+ 0.29840412735939026,
172
+ 0.29664090275764465,
173
+ 0.2837351858615875,
174
+ 0.2988941967487335,
175
+ 0.3007279932498932,
176
+ 0.26320937275886536,
177
+ 0.2581765055656433,
178
+ 0.28365617990493774,
179
+ 0.2829630970954895,
180
+ 0.2507253885269165,
181
+ 0.2723925709724426,
182
+ 0.24912698566913605,
183
+ 0.24788996577262878,
184
+ 0.26047465205192566,
185
+ 0.2179279625415802,
186
+ 0.2571420967578888,
187
+ 0.26797670125961304,
188
+ 0.24822230637073517,
189
+ 0.23186878859996796,
190
+ 0.27942460775375366,
191
+ 0.21844741702079773,
192
+ 0.24789755046367645,
193
+ 0.2550739645957947
194
  ],
195
  "lr": [
196
  3.9200000000000004e-05,
 
301
  20
302
  ],
303
  "eval_accuracy": [
304
+ 0.01368421052631579,
305
+ 0.002105263157894737,
306
+ 0.005263157894736842,
307
+ 0.0431578947368421,
308
+ 0.06315789473684211,
309
+ 0.09473684210526316,
310
+ 0.17578947368421052,
311
+ 0.1968421052631579,
312
+ 0.2831578947368421,
313
+ 0.27052631578947367,
314
+ 0.3105263157894737,
315
+ 0.33789473684210525,
316
+ 0.39473684210526316,
317
+ 0.3368421052631579,
318
+ 0.3831578947368421,
319
+ 0.4073684210526316,
320
+ 0.41789473684210526,
321
+ 0.42736842105263156,
322
+ 0.41789473684210526,
323
+ 0.41578947368421054
324
  ]
325
  },
326
+ "final_accuracy": 0.35846153846153844,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
+ "n_per_split": 100
334
  },
335
  "splits": {
336
  "add_S0": {
337
+ "full_accuracy": 0.66,
338
+ "digit_accuracy": 0.94,
339
+ "n_examples": 100,
340
  "per_subtask": {
341
  "SA": {
342
+ "accuracy": 0.9388429752066115,
343
+ "count": 605
344
  },
345
  "SS": {
346
+ "accuracy": 0.9473684210526315,
347
+ "count": 95
348
  }
349
  }
350
  },
351
  "add_S1": {
352
+ "full_accuracy": 0.53,
353
+ "digit_accuracy": 0.9185714285714286,
354
+ "n_examples": 100,
355
  "per_subtask": {
356
  "SA": {
357
+ "accuracy": 0.9509803921568627,
358
+ "count": 204
359
  },
360
  "SC": {
361
+ "accuracy": 0.9585798816568047,
362
+ "count": 169
363
  },
364
  "SS": {
365
+ "accuracy": 0.9354838709677419,
366
+ "count": 31
367
  },
368
  "UC": {
369
+ "accuracy": 0.8716216216216216,
370
+ "count": 296
371
  }
372
  }
373
  },
374
  "add_S2": {
375
+ "full_accuracy": 0.28,
376
+ "digit_accuracy": 0.8585714285714285,
377
+ "n_examples": 100,
378
  "per_subtask": {
379
  "SA": {
380
+ "accuracy": 0.9447852760736196,
381
+ "count": 163
382
  },
383
  "SC": {
384
+ "accuracy": 0.8615384615384616,
385
+ "count": 130
386
  },
387
  "SS": {
388
+ "accuracy": 0.9080459770114943,
389
+ "count": 87
390
  },
391
  "UC": {
392
+ "accuracy": 0.7241379310344828,
393
+ "count": 203
394
  },
395
  "US": {
396
+ "accuracy": 0.9316239316239316,
397
+ "count": 117
398
  }
399
  }
400
  },
401
  "add_S3": {
402
+ "full_accuracy": 0.23,
403
+ "digit_accuracy": 0.78,
404
+ "n_examples": 100,
405
  "per_subtask": {
406
  "SA": {
407
+ "accuracy": 0.9834710743801653,
408
+ "count": 121
409
  },
410
  "SC": {
411
+ "accuracy": 0.9256198347107438,
412
+ "count": 121
413
  },
414
  "SS": {
415
+ "accuracy": 0.8979591836734694,
416
+ "count": 49
417
  },
418
  "UC": {
419
+ "accuracy": 0.6129032258064516,
420
+ "count": 186
421
  },
422
  "US": {
423
+ "accuracy": 0.7040358744394619,
424
+ "count": 223
425
  }
426
  }
427
  },
428
  "add_S4": {
429
+ "full_accuracy": 0.23,
430
+ "digit_accuracy": 0.7242857142857143,
431
+ "n_examples": 100,
432
  "per_subtask": {
433
  "SA": {
434
+ "accuracy": 0.9903846153846154,
435
+ "count": 104
436
  },
437
  "SC": {
438
+ "accuracy": 0.8867924528301887,
439
+ "count": 106
440
  },
441
  "SS": {
442
+ "accuracy": 0.9565217391304348,
443
+ "count": 23
444
  },
445
  "UC": {
446
+ "accuracy": 0.69375,
447
+ "count": 160
448
  },
449
  "US": {
450
+ "accuracy": 0.5765472312703583,
451
+ "count": 307
452
  }
453
  }
454
  },
455
  "add_S5": {
456
+ "full_accuracy": 0.17,
457
+ "digit_accuracy": 0.5471428571428572,
458
+ "n_examples": 100,
459
  "per_subtask": {
460
  "SA": {
461
  "accuracy": 1.0,
462
+ "count": 100
463
  },
464
  "SC": {
465
+ "accuracy": 0.96,
466
+ "count": 100
467
  },
468
  "UC": {
469
+ "accuracy": 0.39,
470
+ "count": 100
471
  },
472
  "US": {
473
+ "accuracy": 0.37,
474
+ "count": 400
475
  }
476
  }
477
  },
478
  "add_S6": {
479
+ "full_accuracy": 0.39,
480
+ "digit_accuracy": 0.5628571428571428,
481
+ "n_examples": 100,
482
  "per_subtask": {
483
  "SC": {
484
  "accuracy": 1.0,
485
+ "count": 100
486
  },
487
  "UC": {
488
+ "accuracy": 0.5,
489
+ "count": 100
490
  },
491
  "US": {
492
+ "accuracy": 0.488,
493
+ "count": 500
494
  }
495
  }
496
  },
497
  "add_random": {
498
+ "full_accuracy": 0.595,
499
+ "digit_accuracy": 0.925,
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
+ "accuracy": 0.9574944071588367,
504
+ "count": 447
505
  },
506
  "SC": {
507
+ "accuracy": 0.946875,
508
+ "count": 320
509
  },
510
  "SS": {
511
+ "accuracy": 0.9464285714285714,
512
+ "count": 56
513
  },
514
  "UC": {
515
+ "accuracy": 0.8865784499054821,
516
+ "count": 529
517
  },
518
  "US": {
519
+ "accuracy": 0.875,
520
+ "count": 48
521
  }
522
  }
523
  },
524
  "add_C1": {
525
+ "full_accuracy": 0.73,
526
+ "digit_accuracy": 0.9514285714285714,
527
+ "n_examples": 100,
528
  "per_subtask": {
529
  "SA": {
530
+ "accuracy": 0.968,
531
+ "count": 500
532
  },
533
  "SC": {
534
+ "accuracy": 0.97,
535
+ "count": 100
536
  },
537
  "UC": {
538
+ "accuracy": 0.85,
539
+ "count": 100
540
  }
541
  }
542
  },
543
  "add_C2": {
544
+ "full_accuracy": 0.56,
545
+ "digit_accuracy": 0.9257142857142857,
546
+ "n_examples": 100,
547
  "per_subtask": {
548
  "SA": {
549
+ "accuracy": 0.9675,
550
+ "count": 400
551
  },
552
  "SC": {
553
+ "accuracy": 0.97,
554
+ "count": 100
555
  },
556
  "UC": {
557
+ "accuracy": 0.8012820512820513,
558
+ "count": 156
559
  },
560
  "US": {
561
+ "accuracy": 0.8863636363636364,
562
+ "count": 44
563
  }
564
  }
565
  },
566
  "add_C3": {
567
+ "full_accuracy": 0.28,
568
+ "digit_accuracy": 0.8442857142857143,
569
+ "n_examples": 100,
570
  "per_subtask": {
571
  "SA": {
572
+ "accuracy": 0.9666666666666667,
573
+ "count": 300
574
  },
575
  "SC": {
576
+ "accuracy": 0.96,
577
+ "count": 100
578
  },
579
  "UC": {
580
+ "accuracy": 0.6582914572864321,
581
+ "count": 199
582
  },
583
  "US": {
584
+ "accuracy": 0.7326732673267327,
585
+ "count": 101
586
  }
587
  }
588
  },
589
  "add_C4": {
590
  "full_accuracy": 0.34,
591
+ "digit_accuracy": 0.8485714285714285,
592
+ "n_examples": 100,
593
  "per_subtask": {
594
  "SA": {
595
+ "accuracy": 0.995,
596
+ "count": 200
597
  },
598
  "SC": {
599
  "accuracy": 0.96,
600
+ "count": 100
601
  },
602
  "UC": {
603
+ "accuracy": 0.7462121212121212,
604
+ "count": 264
605
  },
606
  "US": {
607
+ "accuracy": 0.75,
608
+ "count": 136
609
  }
610
  }
611
  },
612
  "add_C5": {
613
  "full_accuracy": 0.28,
614
+ "digit_accuracy": 0.8271428571428572,
615
+ "n_examples": 100,
616
  "per_subtask": {
617
  "SA": {
618
  "accuracy": 1.0,
619
+ "count": 100
620
  },
621
  "SC": {
622
+ "accuracy": 0.99,
623
+ "count": 100
624
  },
625
  "UC": {
626
+ "accuracy": 0.7709677419354839,
627
+ "count": 310
628
  },
629
  "US": {
630
+ "accuracy": 0.7421052631578947,
631
+ "count": 190
632
  }
633
  }
634
  },
635
  "add_C6": {
636
+ "full_accuracy": 0.26,
637
+ "digit_accuracy": 0.7914285714285715,
638
+ "n_examples": 100,
639
  "per_subtask": {
640
  "SC": {
641
  "accuracy": 1.0,
642
+ "count": 100
643
  },
644
  "UC": {
645
+ "accuracy": 0.7702702702702703,
646
+ "count": 370
647
  },
648
  "US": {
649
+ "accuracy": 0.7347826086956522,
650
+ "count": 230
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
+ "full_accuracy": 0.71,
656
+ "digit_accuracy": 0.9514285714285714,
657
+ "n_examples": 100,
658
  "per_subtask": {
659
  "MD": {
660
+ "accuracy": 0.9495934959349593,
661
+ "count": 615
662
  },
663
  "ME": {
664
+ "accuracy": 0.9647058823529412,
665
+ "count": 85
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
+ "full_accuracy": 0.5,
671
+ "digit_accuracy": 0.9171428571428571,
672
+ "n_examples": 100,
673
  "per_subtask": {
674
  "MD": {
675
+ "accuracy": 0.952054794520548,
676
+ "count": 292
677
  },
678
  "MB": {
679
+ "accuracy": 0.9722222222222222,
680
+ "count": 144
681
  },
682
  "ME": {
683
+ "accuracy": 0.96,
684
+ "count": 25
685
  },
686
  "UB": {
687
+ "accuracy": 0.8368200836820083,
688
+ "count": 239
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
+ "full_accuracy": 0.26,
694
+ "digit_accuracy": 0.85,
695
+ "n_examples": 100,
696
  "per_subtask": {
697
  "MD": {
698
+ "accuracy": 0.966824644549763,
699
+ "count": 211
700
  },
701
  "MB": {
702
+ "accuracy": 0.9652173913043478,
703
+ "count": 115
704
  },
705
  "ME": {
706
+ "accuracy": 0.9529411764705882,
707
+ "count": 85
708
  },
709
  "UB": {
710
+ "accuracy": 0.6187845303867403,
711
+ "count": 181
712
  },
713
  "UD": {
714
+ "accuracy": 0.8055555555555556,
715
+ "count": 108
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
+ "full_accuracy": 0.14,
721
+ "digit_accuracy": 0.7714285714285715,
722
+ "n_examples": 100,
723
  "per_subtask": {
724
  "MD": {
725
+ "accuracy": 0.9888268156424581,
726
+ "count": 179
727
  },
728
  "MB": {
729
+ "accuracy": 0.9320388349514563,
730
+ "count": 103
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
+ "count": 56
735
  },
736
  "UB": {
737
+ "accuracy": 0.5436241610738255,
738
+ "count": 149
739
  },
740
  "UD": {
741
+ "accuracy": 0.6103286384976526,
742
+ "count": 213
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
+ "full_accuracy": 0.09,
748
+ "digit_accuracy": 0.6271428571428571,
749
+ "n_examples": 100,
750
  "per_subtask": {
751
  "MD": {
752
+ "accuracy": 0.995,
753
+ "count": 200
754
  },
755
  "MB": {
756
+ "accuracy": 0.98,
757
+ "count": 100
758
  },
759
  "UB": {
760
+ "accuracy": 0.44,
761
+ "count": 100
762
  },
763
  "UD": {
764
+ "accuracy": 0.32666666666666666,
765
+ "count": 300
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
+ "full_accuracy": 0.04,
771
+ "digit_accuracy": 0.48714285714285716,
772
+ "n_examples": 100,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
+ "count": 100
777
  },
778
  "MB": {
779
+ "accuracy": 1.0,
780
+ "count": 100
781
  },
782
  "UB": {
783
+ "accuracy": 0.48,
784
+ "count": 100
785
  },
786
  "UD": {
787
+ "accuracy": 0.2325,
788
+ "count": 400
789
  }
790
  }
791
  },
792
  "sub_random": {
793
+ "full_accuracy": 0.505,
794
+ "digit_accuracy": 0.9121428571428571,
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
+ "accuracy": 0.9466666666666667,
799
+ "count": 600
800
  },
801
  "MB": {
802
+ "accuracy": 0.9438202247191011,
803
+ "count": 267
804
  },
805
  "ME": {
806
  "accuracy": 0.9433962264150944,
807
  "count": 53
808
  },
809
  "UB": {
810
+ "accuracy": 0.8473804100227791,
811
+ "count": 439
812
  },
813
  "UD": {
814
+ "accuracy": 0.8536585365853658,
815
+ "count": 41
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
+ "full_accuracy": 0.23,
821
+ "digit_accuracy": 0.8114285714285714,
822
+ "n_examples": 100,
823
  "per_subtask": {
824
  "MD": {
825
+ "accuracy": 0.9733333333333334,
826
+ "count": 300
827
  },
828
  "MB": {
829
+ "accuracy": 0.95,
830
+ "count": 100
831
  },
832
  "UB": {
833
+ "accuracy": 0.6192893401015228,
834
+ "count": 197
835
  },
836
  "UD": {
837
+ "accuracy": 0.5728155339805825,
838
+ "count": 103
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
+ "full_accuracy": 0.08,
844
+ "digit_accuracy": 0.7328571428571429,
845
+ "n_examples": 100,
846
  "per_subtask": {
847
  "MD": {
848
+ "accuracy": 0.975,
849
+ "count": 200
850
  },
851
  "MB": {
852
+ "accuracy": 0.99,
853
+ "count": 100
854
  },
855
  "UB": {
856
+ "accuracy": 0.5506072874493927,
857
+ "count": 247
858
  },
859
  "UD": {
860
+ "accuracy": 0.5424836601307189,
861
+ "count": 153
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
+ "full_accuracy": 0.13,
867
+ "digit_accuracy": 0.6985714285714286,
868
+ "n_examples": 100,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
+ "count": 100
873
  },
874
  "MB": {
875
  "accuracy": 1.0,
876
+ "count": 100
877
  },
878
  "UB": {
879
+ "accuracy": 0.6308724832214765,
880
+ "count": 298
881
  },
882
  "UD": {
883
+ "accuracy": 0.5,
884
+ "count": 202
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
+ "overall_accuracy": 0.35846153846153844,
891
+ "digit_accuracy": 0.8091758241758241,
892
+ "total_examples": 2600,
893
  "n_splits": 24
894
  }
895
  }
add_sub_baseline_10K_1L3H510d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aeb08c950f588dd85bf054c11d3921a45bb77e8a8be21e8944ddd67270255396
3
  size 634642298
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5bbfcb52eff2217ebf7270baf0f88acdff3defcd30ae5cc7aa22315d0713c18
3
  size 634642298
add_sub_baseline_10K_1L3H510d/train_config.json CHANGED
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 158584246,
71
  "run_name": "add_sub_baseline_10K_1L3H510d",
72
- "git_commit": "17e935f460a7f9595b705c1d614101a6b0e520f7",
73
- "timestamp": "2026-04-14T05:05:40.288836+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "nneomu0d",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/nneomu0d",
81
- "final_accuracy": 0.33625,
82
- "sft_accuracy": 0.33625,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
69
  "no_wandb": false,
70
  "n_params": 158584246,
71
  "run_name": "add_sub_baseline_10K_1L3H510d",
72
+ "git_commit": "1d5a160e16a5070d61b881494e832aa88149b15c",
73
+ "timestamp": "2026-04-15T06:51:51.171053+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_10K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": "ncfggziw",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/ncfggziw",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.35846153846153844,
86
+ "sft_accuracy": 0.35846153846153844,
87
  "eval_method": "ArithmeticEvaluator"
88
  }