amirali1985 commited on
Commit
370ad5a
·
verified ·
1 Parent(s): 30603c6

Upload add_sub_baseline_10K_1L2H256d

Browse files
add_sub_baseline_10K_1L2H256d/metrics.json CHANGED
@@ -65,132 +65,132 @@
65
  3100
66
  ],
67
  "loss": [
68
- 11.231005668640137,
69
- 9.655884742736816,
70
- 9.104948997497559,
71
- 8.300870895385742,
72
- 7.815948486328125,
73
- 7.295914649963379,
74
- 6.637112140655518,
75
- 6.088160037994385,
76
- 5.606553554534912,
77
- 5.160516262054443,
78
- 4.693508625030518,
79
- 4.079601287841797,
80
- 3.7621495723724365,
81
- 3.3034651279449463,
82
- 2.915548324584961,
83
- 2.6438701152801514,
84
- 2.4529876708984375,
85
- 2.2501683235168457,
86
- 2.161332368850708,
87
- 2.0686194896698,
88
- 1.9981437921524048,
89
- 2.0475308895111084,
90
- 1.9610540866851807,
91
- 1.9816741943359375,
92
- 1.940521001815796,
93
- 1.947417974472046,
94
- 1.9467179775238037,
95
- 1.921958327293396,
96
- 1.8819931745529175,
97
- 1.8775389194488525,
98
- 1.8941129446029663,
99
- 1.8416680097579956,
100
- 1.8777066469192505,
101
- 1.8092600107192993,
102
- 1.8147611618041992,
103
- 1.8798166513442993,
104
- 1.7604089975357056,
105
- 1.8312472105026245,
106
- 1.8071128129959106,
107
- 1.80684494972229,
108
- 1.8678429126739502,
109
- 1.82756507396698,
110
- 1.7287473678588867,
111
- 1.8508025407791138,
112
- 1.8142287731170654,
113
- 1.7641321420669556,
114
- 1.8310877084732056,
115
- 1.7848055362701416,
116
- 1.7922300100326538,
117
- 1.8264408111572266,
118
- 1.8544548749923706,
119
- 1.8431966304779053,
120
- 1.841490626335144,
121
- 1.77731192111969,
122
- 1.8109773397445679,
123
- 1.8000199794769287,
124
- 1.801360845565796,
125
- 1.8505899906158447,
126
- 1.8166601657867432,
127
- 1.7919466495513916,
128
- 1.7338820695877075,
129
- 1.8010389804840088
130
  ],
131
  "base_loss": [
132
- 11.231005668640137,
133
- 9.655884742736816,
134
- 9.104948997497559,
135
- 8.300870895385742,
136
- 7.815948486328125,
137
- 7.295914649963379,
138
- 6.637112140655518,
139
- 6.088160037994385,
140
- 5.606553554534912,
141
- 5.160516262054443,
142
- 4.693508625030518,
143
- 4.079601287841797,
144
- 3.7621495723724365,
145
- 3.3034651279449463,
146
- 2.915548324584961,
147
- 2.6438701152801514,
148
- 2.4529876708984375,
149
- 2.2501683235168457,
150
- 2.161332368850708,
151
- 2.0686194896698,
152
- 1.9981437921524048,
153
- 2.0475308895111084,
154
- 1.9610540866851807,
155
- 1.9816741943359375,
156
- 1.940521001815796,
157
- 1.947417974472046,
158
- 1.9467179775238037,
159
- 1.921958327293396,
160
- 1.8819931745529175,
161
- 1.8775389194488525,
162
- 1.8941129446029663,
163
- 1.8416680097579956,
164
- 1.8777066469192505,
165
- 1.8092600107192993,
166
- 1.8147611618041992,
167
- 1.8798166513442993,
168
- 1.7604089975357056,
169
- 1.8312472105026245,
170
- 1.8071128129959106,
171
- 1.80684494972229,
172
- 1.8678429126739502,
173
- 1.82756507396698,
174
- 1.7287473678588867,
175
- 1.8508025407791138,
176
- 1.8142287731170654,
177
- 1.7641321420669556,
178
- 1.8310877084732056,
179
- 1.7848055362701416,
180
- 1.7922300100326538,
181
- 1.8264408111572266,
182
- 1.8544548749923706,
183
- 1.8431966304779053,
184
- 1.841490626335144,
185
- 1.77731192111969,
186
- 1.8109773397445679,
187
- 1.8000199794769287,
188
- 1.801360845565796,
189
- 1.8505899906158447,
190
- 1.8166601657867432,
191
- 1.7919466495513916,
192
- 1.7338820695877075,
193
- 1.8010389804840088
194
  ],
195
  "lr": [
196
  9.800000000000001e-06,
@@ -302,490 +302,490 @@
302
  ],
303
  "eval_accuracy": [
304
  0.0,
305
- 0.0011111111111111111,
306
- 0.0011111111111111111,
307
  0.0,
308
- 0.0011111111111111111,
309
- 0.005555555555555556,
310
- 0.01,
311
- 0.0033333333333333335,
312
- 0.0044444444444444444,
313
- 0.0011111111111111111,
314
- 0.0044444444444444444,
315
- 0.008888888888888889,
316
- 0.011111111111111112,
317
- 0.0011111111111111111,
318
- 0.005555555555555556,
319
- 0.0022222222222222222,
320
- 0.006666666666666667,
321
- 0.0022222222222222222,
322
- 0.0022222222222222222,
323
- 0.0033333333333333335
 
 
324
  ]
325
  },
326
- "final_accuracy": 0.0016666666666666668,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
- "n_per_split": 50
334
  },
335
  "splits": {
336
  "add_S0": {
337
  "full_accuracy": 0.0,
338
- "digit_accuracy": 0.3457142857142857,
339
- "n_examples": 50,
340
  "per_subtask": {
341
  "SA": {
342
- "accuracy": 0.22372881355932203,
343
- "count": 295
344
  },
345
  "SS": {
346
  "accuracy": 1.0,
347
- "count": 55
348
  }
349
  }
350
  },
351
  "add_S1": {
352
  "full_accuracy": 0.0,
353
- "digit_accuracy": 0.24571428571428572,
354
- "n_examples": 50,
355
  "per_subtask": {
356
  "SA": {
357
- "accuracy": 0.29365079365079366,
358
- "count": 126
359
  },
360
  "SC": {
361
- "accuracy": 0.08860759493670886,
362
- "count": 79
363
  },
364
  "SS": {
365
- "accuracy": 0.9047619047619048,
366
- "count": 21
367
  },
368
  "UC": {
369
- "accuracy": 0.18548387096774194,
370
- "count": 124
371
  }
372
  }
373
  },
374
  "add_S2": {
375
  "full_accuracy": 0.0,
376
- "digit_accuracy": 0.37142857142857144,
377
- "n_examples": 50,
378
  "per_subtask": {
379
  "SA": {
380
- "accuracy": 0.4,
381
- "count": 75
382
  },
383
  "SC": {
384
- "accuracy": 0.12903225806451613,
385
- "count": 62
386
  },
387
  "SS": {
388
- "accuracy": 0.5128205128205128,
389
- "count": 39
390
  },
391
  "UC": {
392
- "accuracy": 0.36936936936936937,
393
- "count": 111
394
  },
395
  "US": {
396
- "accuracy": 0.49206349206349204,
397
- "count": 63
398
  }
399
  }
400
  },
401
  "add_S3": {
402
  "full_accuracy": 0.0,
403
- "digit_accuracy": 0.37142857142857144,
404
- "n_examples": 50,
405
  "per_subtask": {
406
  "SA": {
407
- "accuracy": 0.5,
408
- "count": 60
409
  },
410
  "SC": {
411
- "accuracy": 0.10526315789473684,
412
- "count": 57
413
  },
414
  "SS": {
415
- "accuracy": 0.6842105263157895,
416
- "count": 19
417
  },
418
  "UC": {
419
- "accuracy": 0.3557692307692308,
420
- "count": 104
421
  },
422
  "US": {
423
- "accuracy": 0.4,
424
- "count": 110
425
  }
426
  }
427
  },
428
  "add_S4": {
429
  "full_accuracy": 0.0,
430
- "digit_accuracy": 0.32,
431
- "n_examples": 50,
432
  "per_subtask": {
433
  "SA": {
434
- "accuracy": 0.4791666666666667,
435
- "count": 48
436
  },
437
  "SC": {
438
- "accuracy": 0.057692307692307696,
439
- "count": 52
440
  },
441
  "SS": {
442
- "accuracy": 0.8571428571428571,
443
- "count": 7
444
  },
445
  "UC": {
446
- "accuracy": 0.3146067415730337,
447
- "count": 89
448
  },
449
  "US": {
450
- "accuracy": 0.33766233766233766,
451
- "count": 154
452
  }
453
  }
454
  },
455
  "add_S5": {
456
  "full_accuracy": 0.0,
457
- "digit_accuracy": 0.07714285714285714,
458
- "n_examples": 50,
459
  "per_subtask": {
460
  "SA": {
461
- "accuracy": 0.5,
462
- "count": 50
463
  },
464
  "SC": {
465
- "accuracy": 0.0,
466
- "count": 50
467
  },
468
  "UC": {
469
- "accuracy": 0.04,
470
- "count": 50
471
  },
472
  "US": {
473
- "accuracy": 0.0,
474
- "count": 200
475
  }
476
  }
477
  },
478
  "add_S6": {
479
- "full_accuracy": 0.12,
480
- "digit_accuracy": 0.3485714285714286,
481
- "n_examples": 50,
482
  "per_subtask": {
483
  "SC": {
484
- "accuracy": 0.12,
485
- "count": 50
486
  },
487
  "UC": {
488
- "accuracy": 0.52,
489
- "count": 50
490
  },
491
  "US": {
492
- "accuracy": 0.36,
493
- "count": 250
494
  }
495
  }
496
  },
497
  "add_random": {
498
  "full_accuracy": 0.0,
499
- "digit_accuracy": 0.24357142857142858,
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 0.2505800464037123,
504
- "count": 431
505
  },
506
  "SC": {
507
- "accuracy": 0.10126582278481013,
508
- "count": 316
509
  },
510
  "SS": {
511
- "accuracy": 0.8974358974358975,
512
- "count": 39
513
  },
514
  "UC": {
515
- "accuracy": 0.2571428571428571,
516
- "count": 560
517
  },
518
  "US": {
519
- "accuracy": 0.4074074074074074,
520
- "count": 54
521
  }
522
  }
523
  },
524
  "add_C1": {
525
  "full_accuracy": 0.0,
526
- "digit_accuracy": 0.15714285714285714,
527
- "n_examples": 50,
528
  "per_subtask": {
529
  "SA": {
530
- "accuracy": 0.184,
531
- "count": 250
532
  },
533
  "SC": {
534
- "accuracy": 0.06,
535
- "count": 50
536
  },
537
  "UC": {
538
- "accuracy": 0.12,
539
- "count": 50
540
  }
541
  }
542
  },
543
  "add_C2": {
544
  "full_accuracy": 0.0,
545
- "digit_accuracy": 0.16285714285714287,
546
- "n_examples": 50,
547
  "per_subtask": {
548
  "SA": {
549
- "accuracy": 0.21,
550
- "count": 200
551
  },
552
  "SC": {
553
  "accuracy": 0.02,
554
- "count": 50
555
  },
556
  "UC": {
557
- "accuracy": 0.13253012048192772,
558
- "count": 83
559
  },
560
  "US": {
561
- "accuracy": 0.17647058823529413,
562
- "count": 17
563
  }
564
  }
565
  },
566
  "add_C3": {
567
  "full_accuracy": 0.0,
568
- "digit_accuracy": 0.16857142857142857,
569
- "n_examples": 50,
570
  "per_subtask": {
571
  "SA": {
572
- "accuracy": 0.26,
573
- "count": 150
574
  },
575
  "SC": {
576
- "accuracy": 0.04,
577
- "count": 50
578
  },
579
  "UC": {
580
- "accuracy": 0.13,
581
- "count": 100
582
  },
583
  "US": {
584
- "accuracy": 0.1,
585
- "count": 50
586
  }
587
  }
588
  },
589
  "add_C4": {
590
  "full_accuracy": 0.0,
591
- "digit_accuracy": 0.18857142857142858,
592
- "n_examples": 50,
593
  "per_subtask": {
594
  "SA": {
595
- "accuracy": 0.39,
596
- "count": 100
597
  },
598
  "SC": {
599
- "accuracy": 0.0,
600
- "count": 50
601
  },
602
  "UC": {
603
- "accuracy": 0.09090909090909091,
604
- "count": 132
605
  },
606
  "US": {
607
- "accuracy": 0.22058823529411764,
608
- "count": 68
609
  }
610
  }
611
  },
612
  "add_C5": {
613
  "full_accuracy": 0.0,
614
- "digit_accuracy": 0.18857142857142858,
615
- "n_examples": 50,
616
  "per_subtask": {
617
  "SA": {
618
- "accuracy": 0.44,
619
- "count": 50
620
  },
621
  "SC": {
622
- "accuracy": 0.04,
623
- "count": 50
624
  },
625
  "UC": {
626
- "accuracy": 0.1506849315068493,
627
- "count": 146
628
  },
629
  "US": {
630
- "accuracy": 0.19230769230769232,
631
- "count": 104
632
  }
633
  }
634
  },
635
  "add_C6": {
636
  "full_accuracy": 0.0,
637
- "digit_accuracy": 0.33714285714285713,
638
- "n_examples": 50,
639
  "per_subtask": {
640
  "SC": {
641
- "accuracy": 0.06,
642
- "count": 50
643
  },
644
  "UC": {
645
- "accuracy": 0.2962962962962963,
646
- "count": 189
647
  },
648
  "US": {
649
- "accuracy": 0.5315315315315315,
650
- "count": 111
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
  "full_accuracy": 0.0,
656
- "digit_accuracy": 0.3057142857142857,
657
- "n_examples": 50,
658
  "per_subtask": {
659
  "MD": {
660
- "accuracy": 0.19801980198019803,
661
- "count": 303
662
  },
663
  "ME": {
664
  "accuracy": 1.0,
665
- "count": 47
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
  "full_accuracy": 0.0,
671
- "digit_accuracy": 0.26285714285714284,
672
- "n_examples": 50,
673
  "per_subtask": {
674
  "MD": {
675
- "accuracy": 0.3900709219858156,
676
- "count": 141
677
  },
678
  "MB": {
679
  "accuracy": 0.0,
680
- "count": 72
681
  },
682
  "ME": {
683
  "accuracy": 1.0,
684
- "count": 18
685
  },
686
  "UB": {
687
- "accuracy": 0.15966386554621848,
688
- "count": 119
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
  "full_accuracy": 0.0,
694
- "digit_accuracy": 0.38285714285714284,
695
- "n_examples": 50,
696
  "per_subtask": {
697
  "MD": {
698
- "accuracy": 0.6428571428571429,
699
- "count": 112
700
  },
701
  "MB": {
702
  "accuracy": 0.0,
703
- "count": 53
704
  },
705
  "ME": {
706
  "accuracy": 1.0,
707
- "count": 47
708
  },
709
  "UB": {
710
- "accuracy": 0.17647058823529413,
711
- "count": 85
712
  },
713
  "UD": {
714
  "accuracy": 0.0,
715
- "count": 53
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
  "full_accuracy": 0.0,
721
- "digit_accuracy": 0.28,
722
- "n_examples": 50,
723
  "per_subtask": {
724
  "MD": {
725
- "accuracy": 0.6494845360824743,
726
- "count": 97
727
  },
728
  "MB": {
729
  "accuracy": 0.0,
730
- "count": 51
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
- "count": 27
735
  },
736
  "UB": {
737
- "accuracy": 0.10810810810810811,
738
- "count": 74
739
  },
740
  "UD": {
741
  "accuracy": 0.0,
742
- "count": 101
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
  "full_accuracy": 0.0,
748
- "digit_accuracy": 0.21142857142857144,
749
- "n_examples": 50,
750
  "per_subtask": {
751
  "MD": {
752
  "accuracy": 0.5,
753
- "count": 100
754
  },
755
  "MB": {
756
  "accuracy": 0.0,
757
- "count": 50
758
  },
759
  "UB": {
760
- "accuracy": 0.48,
761
- "count": 50
762
  },
763
  "UD": {
764
  "accuracy": 0.0,
765
- "count": 150
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
  "full_accuracy": 0.0,
771
- "digit_accuracy": 0.18285714285714286,
772
- "n_examples": 50,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
- "count": 50
777
  },
778
  "MB": {
779
  "accuracy": 0.0,
780
- "count": 50
781
  },
782
  "UB": {
783
- "accuracy": 0.28,
784
- "count": 50
785
  },
786
  "UD": {
787
  "accuracy": 0.0,
788
- "count": 200
789
  }
790
  }
791
  },
@@ -795,101 +795,101 @@
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
- "accuracy": 0.37719298245614036,
799
- "count": 570
800
  },
801
  "MB": {
802
  "accuracy": 0.0,
803
- "count": 277
804
  },
805
  "ME": {
806
  "accuracy": 1.0,
807
  "count": 53
808
  },
809
  "UB": {
810
- "accuracy": 0.11677282377919321,
811
- "count": 471
812
  },
813
  "UD": {
814
  "accuracy": 0.0,
815
- "count": 29
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
  "full_accuracy": 0.0,
821
- "digit_accuracy": 0.18857142857142858,
822
- "n_examples": 50,
823
  "per_subtask": {
824
  "MD": {
825
  "accuracy": 0.3333333333333333,
826
- "count": 150
827
  },
828
  "MB": {
829
  "accuracy": 0.0,
830
- "count": 50
831
  },
832
  "UB": {
833
- "accuracy": 0.15841584158415842,
834
- "count": 101
835
  },
836
  "UD": {
837
  "accuracy": 0.0,
838
- "count": 49
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
  "full_accuracy": 0.0,
844
- "digit_accuracy": 0.17714285714285713,
845
- "n_examples": 50,
846
  "per_subtask": {
847
  "MD": {
848
  "accuracy": 0.5,
849
- "count": 100
850
  },
851
  "MB": {
852
  "accuracy": 0.0,
853
- "count": 50
854
  },
855
  "UB": {
856
- "accuracy": 0.09917355371900827,
857
- "count": 121
858
  },
859
  "UD": {
860
  "accuracy": 0.0,
861
- "count": 79
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
  "full_accuracy": 0.0,
867
- "digit_accuracy": 0.18,
868
- "n_examples": 50,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
- "count": 50
873
  },
874
  "MB": {
875
  "accuracy": 0.0,
876
- "count": 50
877
  },
878
  "UB": {
879
- "accuracy": 0.08552631578947369,
880
- "count": 152
881
  },
882
  "UD": {
883
  "accuracy": 0.0,
884
- "count": 98
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
- "overall_accuracy": 0.004,
891
- "digit_accuracy": 0.24485714285714286,
892
- "total_examples": 1500,
893
  "n_splits": 24
894
  }
895
  }
 
65
  3100
66
  ],
67
  "loss": [
68
+ 11.318906784057617,
69
+ 9.830679893493652,
70
+ 8.917726516723633,
71
+ 8.428927421569824,
72
+ 7.825829029083252,
73
+ 7.2453155517578125,
74
+ 6.758902549743652,
75
+ 6.228304386138916,
76
+ 5.600834369659424,
77
+ 5.050490379333496,
78
+ 4.656107425689697,
79
+ 4.113125801086426,
80
+ 3.6875858306884766,
81
+ 3.315659999847412,
82
+ 2.9331486225128174,
83
+ 2.6042838096618652,
84
+ 2.5160679817199707,
85
+ 2.234703779220581,
86
+ 2.1570537090301514,
87
+ 2.05960750579834,
88
+ 2.041534662246704,
89
+ 2.013371706008911,
90
+ 1.9541598558425903,
91
+ 1.84817636013031,
92
+ 1.91841721534729,
93
+ 1.9015415906906128,
94
+ 1.9575022459030151,
95
+ 1.8885481357574463,
96
+ 1.8128883838653564,
97
+ 1.9127439260482788,
98
+ 1.9104502201080322,
99
+ 1.8516839742660522,
100
+ 1.7954167127609253,
101
+ 1.8567707538604736,
102
+ 1.8164469003677368,
103
+ 1.8527641296386719,
104
+ 1.84950852394104,
105
+ 1.8511903285980225,
106
+ 1.813899040222168,
107
+ 1.8822053670883179,
108
+ 1.790955662727356,
109
+ 1.810065507888794,
110
+ 1.860735535621643,
111
+ 1.8452800512313843,
112
+ 1.7577123641967773,
113
+ 1.8937323093414307,
114
+ 1.8409382104873657,
115
+ 1.7796363830566406,
116
+ 1.8138635158538818,
117
+ 1.7971341609954834,
118
+ 1.7884553670883179,
119
+ 1.8205574750900269,
120
+ 1.8187378644943237,
121
+ 1.8169699907302856,
122
+ 1.8283138275146484,
123
+ 1.8101422786712646,
124
+ 1.8362064361572266,
125
+ 1.8084076642990112,
126
+ 1.8131023645401,
127
+ 1.784583330154419,
128
+ 1.8322092294692993,
129
+ 1.7847820520401
130
  ],
131
  "base_loss": [
132
+ 11.318906784057617,
133
+ 9.830679893493652,
134
+ 8.917726516723633,
135
+ 8.428927421569824,
136
+ 7.825829029083252,
137
+ 7.2453155517578125,
138
+ 6.758902549743652,
139
+ 6.228304386138916,
140
+ 5.600834369659424,
141
+ 5.050490379333496,
142
+ 4.656107425689697,
143
+ 4.113125801086426,
144
+ 3.6875858306884766,
145
+ 3.315659999847412,
146
+ 2.9331486225128174,
147
+ 2.6042838096618652,
148
+ 2.5160679817199707,
149
+ 2.234703779220581,
150
+ 2.1570537090301514,
151
+ 2.05960750579834,
152
+ 2.041534662246704,
153
+ 2.013371706008911,
154
+ 1.9541598558425903,
155
+ 1.84817636013031,
156
+ 1.91841721534729,
157
+ 1.9015415906906128,
158
+ 1.9575022459030151,
159
+ 1.8885481357574463,
160
+ 1.8128883838653564,
161
+ 1.9127439260482788,
162
+ 1.9104502201080322,
163
+ 1.8516839742660522,
164
+ 1.7954167127609253,
165
+ 1.8567707538604736,
166
+ 1.8164469003677368,
167
+ 1.8527641296386719,
168
+ 1.84950852394104,
169
+ 1.8511903285980225,
170
+ 1.813899040222168,
171
+ 1.8822053670883179,
172
+ 1.790955662727356,
173
+ 1.810065507888794,
174
+ 1.860735535621643,
175
+ 1.8452800512313843,
176
+ 1.7577123641967773,
177
+ 1.8937323093414307,
178
+ 1.8409382104873657,
179
+ 1.7796363830566406,
180
+ 1.8138635158538818,
181
+ 1.7971341609954834,
182
+ 1.7884553670883179,
183
+ 1.8205574750900269,
184
+ 1.8187378644943237,
185
+ 1.8169699907302856,
186
+ 1.8283138275146484,
187
+ 1.8101422786712646,
188
+ 1.8362064361572266,
189
+ 1.8084076642990112,
190
+ 1.8131023645401,
191
+ 1.784583330154419,
192
+ 1.8322092294692993,
193
+ 1.7847820520401
194
  ],
195
  "lr": [
196
  9.800000000000001e-06,
 
302
  ],
303
  "eval_accuracy": [
304
  0.0,
 
 
305
  0.0,
306
+ 0.002105263157894737,
307
+ 0.0,
308
+ 0.0010526315789473684,
309
+ 0.003157894736842105,
310
+ 0.005263157894736842,
311
+ 0.0010526315789473684,
312
+ 0.003157894736842105,
313
+ 0.00631578947368421,
314
+ 0.005263157894736842,
315
+ 0.00631578947368421,
316
+ 0.003157894736842105,
317
+ 0.007368421052631579,
318
+ 0.0010526315789473684,
319
+ 0.00631578947368421,
320
+ 0.00631578947368421,
321
+ 0.00631578947368421,
322
+ 0.007368421052631579,
323
+ 0.007368421052631579
324
  ]
325
  },
326
+ "final_accuracy": 0.0019230769230769232,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
+ "n_per_split": 100
334
  },
335
  "splits": {
336
  "add_S0": {
337
  "full_accuracy": 0.0,
338
+ "digit_accuracy": 0.3142857142857143,
339
+ "n_examples": 100,
340
  "per_subtask": {
341
  "SA": {
342
+ "accuracy": 0.2066115702479339,
343
+ "count": 605
344
  },
345
  "SS": {
346
  "accuracy": 1.0,
347
+ "count": 95
348
  }
349
  }
350
  },
351
  "add_S1": {
352
  "full_accuracy": 0.0,
353
+ "digit_accuracy": 0.2571428571428571,
354
+ "n_examples": 100,
355
  "per_subtask": {
356
  "SA": {
357
+ "accuracy": 0.2647058823529412,
358
+ "count": 204
359
  },
360
  "SC": {
361
+ "accuracy": 0.15976331360946747,
362
+ "count": 169
363
  },
364
  "SS": {
365
+ "accuracy": 0.7419354838709677,
366
+ "count": 31
367
  },
368
  "UC": {
369
+ "accuracy": 0.25675675675675674,
370
+ "count": 296
371
  }
372
  }
373
  },
374
  "add_S2": {
375
  "full_accuracy": 0.0,
376
+ "digit_accuracy": 0.36428571428571427,
377
+ "n_examples": 100,
378
  "per_subtask": {
379
  "SA": {
380
+ "accuracy": 0.38650306748466257,
381
+ "count": 163
382
  },
383
  "SC": {
384
+ "accuracy": 0.13076923076923078,
385
+ "count": 130
386
  },
387
  "SS": {
388
+ "accuracy": 0.4827586206896552,
389
+ "count": 87
390
  },
391
  "UC": {
392
+ "accuracy": 0.3694581280788177,
393
+ "count": 203
394
  },
395
  "US": {
396
+ "accuracy": 0.49572649572649574,
397
+ "count": 117
398
  }
399
  }
400
  },
401
  "add_S3": {
402
  "full_accuracy": 0.0,
403
+ "digit_accuracy": 0.3485714285714286,
404
+ "n_examples": 100,
405
  "per_subtask": {
406
  "SA": {
407
+ "accuracy": 0.4793388429752066,
408
+ "count": 121
409
  },
410
  "SC": {
411
+ "accuracy": 0.049586776859504134,
412
+ "count": 121
413
  },
414
  "SS": {
415
+ "accuracy": 0.7142857142857143,
416
+ "count": 49
417
  },
418
  "UC": {
419
+ "accuracy": 0.3387096774193548,
420
+ "count": 186
421
  },
422
  "US": {
423
+ "accuracy": 0.36771300448430494,
424
+ "count": 223
425
  }
426
  }
427
  },
428
  "add_S4": {
429
  "full_accuracy": 0.0,
430
+ "digit_accuracy": 0.35714285714285715,
431
+ "n_examples": 100,
432
  "per_subtask": {
433
  "SA": {
434
+ "accuracy": 0.4519230769230769,
435
+ "count": 104
436
  },
437
  "SC": {
438
+ "accuracy": 0.07547169811320754,
439
+ "count": 106
440
  },
441
  "SS": {
442
+ "accuracy": 0.6521739130434783,
443
+ "count": 23
444
  },
445
  "UC": {
446
+ "accuracy": 0.39375,
447
+ "count": 160
448
  },
449
  "US": {
450
+ "accuracy": 0.3811074918566775,
451
+ "count": 307
452
  }
453
  }
454
  },
455
  "add_S5": {
456
  "full_accuracy": 0.0,
457
+ "digit_accuracy": 0.11285714285714285,
458
+ "n_examples": 100,
459
  "per_subtask": {
460
  "SA": {
461
+ "accuracy": 0.46,
462
+ "count": 100
463
  },
464
  "SC": {
465
+ "accuracy": 0.02,
466
+ "count": 100
467
  },
468
  "UC": {
469
+ "accuracy": 0.14,
470
+ "count": 100
471
  },
472
  "US": {
473
+ "accuracy": 0.0425,
474
+ "count": 400
475
  }
476
  }
477
  },
478
  "add_S6": {
479
+ "full_accuracy": 0.05,
480
+ "digit_accuracy": 0.56,
481
+ "n_examples": 100,
482
  "per_subtask": {
483
  "SC": {
484
+ "accuracy": 0.07,
485
+ "count": 100
486
  },
487
  "UC": {
488
+ "accuracy": 0.68,
489
+ "count": 100
490
  },
491
  "US": {
492
+ "accuracy": 0.634,
493
+ "count": 500
494
  }
495
  }
496
  },
497
  "add_random": {
498
  "full_accuracy": 0.0,
499
+ "digit_accuracy": 0.2507142857142857,
500
  "n_examples": 200,
501
  "per_subtask": {
502
  "SA": {
503
+ "accuracy": 0.2684563758389262,
504
+ "count": 447
505
  },
506
  "SC": {
507
+ "accuracy": 0.096875,
508
+ "count": 320
509
  },
510
  "SS": {
511
+ "accuracy": 0.7142857142857143,
512
+ "count": 56
513
  },
514
  "UC": {
515
+ "accuracy": 0.2646502835538752,
516
+ "count": 529
517
  },
518
  "US": {
519
+ "accuracy": 0.4166666666666667,
520
+ "count": 48
521
  }
522
  }
523
  },
524
  "add_C1": {
525
  "full_accuracy": 0.0,
526
+ "digit_accuracy": 0.15142857142857144,
527
+ "n_examples": 100,
528
  "per_subtask": {
529
  "SA": {
530
+ "accuracy": 0.168,
531
+ "count": 500
532
  },
533
  "SC": {
534
+ "accuracy": 0.04,
535
+ "count": 100
536
  },
537
  "UC": {
538
+ "accuracy": 0.18,
539
+ "count": 100
540
  }
541
  }
542
  },
543
  "add_C2": {
544
  "full_accuracy": 0.0,
545
+ "digit_accuracy": 0.18142857142857144,
546
+ "n_examples": 100,
547
  "per_subtask": {
548
  "SA": {
549
+ "accuracy": 0.23,
550
+ "count": 400
551
  },
552
  "SC": {
553
  "accuracy": 0.02,
554
+ "count": 100
555
  },
556
  "UC": {
557
+ "accuracy": 0.14743589743589744,
558
+ "count": 156
559
  },
560
  "US": {
561
+ "accuracy": 0.22727272727272727,
562
+ "count": 44
563
  }
564
  }
565
  },
566
  "add_C3": {
567
  "full_accuracy": 0.0,
568
+ "digit_accuracy": 0.15571428571428572,
569
+ "n_examples": 100,
570
  "per_subtask": {
571
  "SA": {
572
+ "accuracy": 0.25333333333333335,
573
+ "count": 300
574
  },
575
  "SC": {
576
+ "accuracy": 0.02,
577
+ "count": 100
578
  },
579
  "UC": {
580
+ "accuracy": 0.12060301507537688,
581
+ "count": 199
582
  },
583
  "US": {
584
+ "accuracy": 0.06930693069306931,
585
+ "count": 101
586
  }
587
  }
588
  },
589
  "add_C4": {
590
  "full_accuracy": 0.0,
591
+ "digit_accuracy": 0.19,
592
+ "n_examples": 100,
593
  "per_subtask": {
594
  "SA": {
595
+ "accuracy": 0.36,
596
+ "count": 200
597
  },
598
  "SC": {
599
+ "accuracy": 0.05,
600
+ "count": 100
601
  },
602
  "UC": {
603
+ "accuracy": 0.12121212121212122,
604
+ "count": 264
605
  },
606
  "US": {
607
+ "accuracy": 0.17647058823529413,
608
+ "count": 136
609
  }
610
  }
611
  },
612
  "add_C5": {
613
  "full_accuracy": 0.0,
614
+ "digit_accuracy": 0.17714285714285713,
615
+ "n_examples": 100,
616
  "per_subtask": {
617
  "SA": {
618
+ "accuracy": 0.53,
619
+ "count": 100
620
  },
621
  "SC": {
622
+ "accuracy": 0.01,
623
+ "count": 100
624
  },
625
  "UC": {
626
+ "accuracy": 0.12580645161290321,
627
+ "count": 310
628
  },
629
  "US": {
630
+ "accuracy": 0.1631578947368421,
631
+ "count": 190
632
  }
633
  }
634
  },
635
  "add_C6": {
636
  "full_accuracy": 0.0,
637
+ "digit_accuracy": 0.37,
638
+ "n_examples": 100,
639
  "per_subtask": {
640
  "SC": {
641
+ "accuracy": 0.05,
642
+ "count": 100
643
  },
644
  "UC": {
645
+ "accuracy": 0.2972972972972973,
646
+ "count": 370
647
  },
648
  "US": {
649
+ "accuracy": 0.6260869565217392,
650
+ "count": 230
651
  }
652
  }
653
  },
654
  "sub_M0": {
655
  "full_accuracy": 0.0,
656
+ "digit_accuracy": 0.29285714285714287,
657
+ "n_examples": 100,
658
  "per_subtask": {
659
  "MD": {
660
+ "accuracy": 0.1951219512195122,
661
+ "count": 615
662
  },
663
  "ME": {
664
  "accuracy": 1.0,
665
+ "count": 85
666
  }
667
  }
668
  },
669
  "sub_M1": {
670
  "full_accuracy": 0.0,
671
+ "digit_accuracy": 0.22428571428571428,
672
+ "n_examples": 100,
673
  "per_subtask": {
674
  "MD": {
675
+ "accuracy": 0.3698630136986301,
676
+ "count": 292
677
  },
678
  "MB": {
679
  "accuracy": 0.0,
680
+ "count": 144
681
  },
682
  "ME": {
683
  "accuracy": 1.0,
684
+ "count": 25
685
  },
686
  "UB": {
687
+ "accuracy": 0.100418410041841,
688
+ "count": 239
689
  }
690
  }
691
  },
692
  "sub_M2": {
693
  "full_accuracy": 0.0,
694
+ "digit_accuracy": 0.35428571428571426,
695
+ "n_examples": 100,
696
  "per_subtask": {
697
  "MD": {
698
+ "accuracy": 0.6208530805687204,
699
+ "count": 211
700
  },
701
  "MB": {
702
  "accuracy": 0.0,
703
+ "count": 115
704
  },
705
  "ME": {
706
  "accuracy": 1.0,
707
+ "count": 85
708
  },
709
  "UB": {
710
+ "accuracy": 0.17679558011049723,
711
+ "count": 181
712
  },
713
  "UD": {
714
  "accuracy": 0.0,
715
+ "count": 108
716
  }
717
  }
718
  },
719
  "sub_M3": {
720
  "full_accuracy": 0.0,
721
+ "digit_accuracy": 0.3,
722
+ "n_examples": 100,
723
  "per_subtask": {
724
  "MD": {
725
+ "accuracy": 0.7597765363128491,
726
+ "count": 179
727
  },
728
  "MB": {
729
  "accuracy": 0.0,
730
+ "count": 103
731
  },
732
  "ME": {
733
  "accuracy": 1.0,
734
+ "count": 56
735
  },
736
  "UB": {
737
+ "accuracy": 0.12080536912751678,
738
+ "count": 149
739
  },
740
  "UD": {
741
  "accuracy": 0.0,
742
+ "count": 213
743
  }
744
  }
745
  },
746
  "sub_M4": {
747
  "full_accuracy": 0.0,
748
+ "digit_accuracy": 0.18571428571428572,
749
+ "n_examples": 100,
750
  "per_subtask": {
751
  "MD": {
752
  "accuracy": 0.5,
753
+ "count": 200
754
  },
755
  "MB": {
756
  "accuracy": 0.0,
757
+ "count": 100
758
  },
759
  "UB": {
760
+ "accuracy": 0.3,
761
+ "count": 100
762
  },
763
  "UD": {
764
  "accuracy": 0.0,
765
+ "count": 300
766
  }
767
  }
768
  },
769
  "sub_M5": {
770
  "full_accuracy": 0.0,
771
+ "digit_accuracy": 0.18714285714285714,
772
+ "n_examples": 100,
773
  "per_subtask": {
774
  "MD": {
775
  "accuracy": 1.0,
776
+ "count": 100
777
  },
778
  "MB": {
779
  "accuracy": 0.0,
780
+ "count": 100
781
  },
782
  "UB": {
783
+ "accuracy": 0.31,
784
+ "count": 100
785
  },
786
  "UD": {
787
  "accuracy": 0.0,
788
+ "count": 400
789
  }
790
  }
791
  },
 
795
  "n_examples": 200,
796
  "per_subtask": {
797
  "MD": {
798
+ "accuracy": 0.3616666666666667,
799
+ "count": 600
800
  },
801
  "MB": {
802
  "accuracy": 0.0,
803
+ "count": 267
804
  },
805
  "ME": {
806
  "accuracy": 1.0,
807
  "count": 53
808
  },
809
  "UB": {
810
+ "accuracy": 0.12072892938496584,
811
+ "count": 439
812
  },
813
  "UD": {
814
  "accuracy": 0.0,
815
+ "count": 41
816
  }
817
  }
818
  },
819
  "sub_B3": {
820
  "full_accuracy": 0.0,
821
+ "digit_accuracy": 0.19285714285714287,
822
+ "n_examples": 100,
823
  "per_subtask": {
824
  "MD": {
825
  "accuracy": 0.3333333333333333,
826
+ "count": 300
827
  },
828
  "MB": {
829
  "accuracy": 0.0,
830
+ "count": 100
831
  },
832
  "UB": {
833
+ "accuracy": 0.17766497461928935,
834
+ "count": 197
835
  },
836
  "UD": {
837
  "accuracy": 0.0,
838
+ "count": 103
839
  }
840
  }
841
  },
842
  "sub_B4": {
843
  "full_accuracy": 0.0,
844
+ "digit_accuracy": 0.19428571428571428,
845
+ "n_examples": 100,
846
  "per_subtask": {
847
  "MD": {
848
  "accuracy": 0.5,
849
+ "count": 200
850
  },
851
  "MB": {
852
  "accuracy": 0.0,
853
+ "count": 100
854
  },
855
  "UB": {
856
+ "accuracy": 0.145748987854251,
857
+ "count": 247
858
  },
859
  "UD": {
860
  "accuracy": 0.0,
861
+ "count": 153
862
  }
863
  }
864
  },
865
  "sub_B5": {
866
  "full_accuracy": 0.0,
867
+ "digit_accuracy": 0.19,
868
+ "n_examples": 100,
869
  "per_subtask": {
870
  "MD": {
871
  "accuracy": 1.0,
872
+ "count": 100
873
  },
874
  "MB": {
875
  "accuracy": 0.0,
876
+ "count": 100
877
  },
878
  "UB": {
879
+ "accuracy": 0.11073825503355705,
880
+ "count": 298
881
  },
882
  "UD": {
883
  "accuracy": 0.0,
884
+ "count": 202
885
  }
886
  }
887
  }
888
  },
889
  "summary": {
890
+ "overall_accuracy": 0.0019230769230769232,
891
+ "digit_accuracy": 0.25456043956043956,
892
+ "total_examples": 2600,
893
  "n_splits": 24
894
  }
895
  }
add_sub_baseline_10K_1L2H256d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53a01202265b8cb5dcaaf9bceb2f9af7c042a0935ffc28b3f5d14570b87d9f0d
3
  size 315072674
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa05a1cd94e5d85c81af8afec9355ccfeab6c774365e3be321eeb593ab3c4c12
3
  size 315072674
add_sub_baseline_10K_1L2H256d/train_config.json CHANGED
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 78691840,
71
  "run_name": "add_sub_baseline_10K_1L2H256d",
72
- "git_commit": "17e935f460a7f9595b705c1d614101a6b0e520f7",
73
- "timestamp": "2026-04-14T06:22:24.415740+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "2b4iz8fe",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/2b4iz8fe",
81
- "final_accuracy": 0.0016666666666666668,
82
- "sft_accuracy": 0.0016666666666666668,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
69
  "no_wandb": false,
70
  "n_params": 78691840,
71
  "run_name": "add_sub_baseline_10K_1L2H256d",
72
+ "git_commit": "1d5a160e16a5070d61b881494e832aa88149b15c",
73
+ "timestamp": "2026-04-15T05:21:44.700980+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_10K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": "s9ep067n",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/s9ep067n",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.0019230769230769232,
86
+ "sft_accuracy": 0.0019230769230769232,
87
  "eval_method": "ArithmeticEvaluator"
88
  }