amirali1985 commited on
Commit
238c67b
·
verified ·
1 Parent(s): 0305978

Upload add_sub_baseline_50K_1L3H510d

Browse files
add_sub_baseline_50K_1L3H510d/metrics.json CHANGED
@@ -315,632 +315,632 @@
315
  15600
316
  ],
317
  "loss": [
318
- 10.19568157196045,
319
- 7.192863464355469,
320
- 6.217032432556152,
321
- 4.67427921295166,
322
- 2.955170154571533,
323
- 2.0684783458709717,
324
- 1.879281759262085,
325
- 1.9268118143081665,
326
- 1.838002324104309,
327
- 1.6958969831466675,
328
- 1.6997877359390259,
329
- 1.5733033418655396,
330
- 1.5437490940093994,
331
- 1.147268295288086,
332
- 0.8210930228233337,
333
- 0.7633241415023804,
334
- 0.7406877279281616,
335
- 0.7202191352844238,
336
- 0.5617724061012268,
337
- 0.5543747544288635,
338
- 0.5196675658226013,
339
- 0.515282154083252,
340
- 0.541344404220581,
341
- 0.4584448039531708,
342
- 0.43351513147354126,
343
- 0.43640443682670593,
344
- 0.40899521112442017,
345
- 0.3735989034175873,
346
- 0.36980319023132324,
347
- 0.38705509901046753,
348
- 0.34830760955810547,
349
- 0.3112049400806427,
350
- 0.38858872652053833,
351
- 0.3346550166606903,
352
- 0.3554438054561615,
353
- 0.31374162435531616,
354
- 0.3362943232059479,
355
- 0.3253510296344757,
356
- 0.3735771179199219,
357
- 0.31208163499832153,
358
- 0.2946203649044037,
359
- 0.28192415833473206,
360
- 0.37054675817489624,
361
- 0.2701527178287506,
362
- 0.35655704140663147,
363
- 0.29881080985069275,
364
- 0.2859213054180145,
365
- 0.28949055075645447,
366
- 0.28196626901626587,
367
- 0.23745205998420715,
368
- 0.3055541217327118,
369
- 0.32069405913352966,
370
- 0.2612220048904419,
371
- 0.2794129252433777,
372
- 0.35670334100723267,
373
- 0.2686660587787628,
374
- 0.2621057331562042,
375
- 0.2519742548465729,
376
- 0.2593633234500885,
377
- 0.23556630313396454,
378
- 0.2755930423736572,
379
- 0.23948059976100922,
380
- 0.28940704464912415,
381
- 0.23039786517620087,
382
- 0.21981647610664368,
383
- 0.2917427718639374,
384
- 0.2732771635055542,
385
- 0.24072220921516418,
386
- 0.2149968147277832,
387
- 0.22879359126091003,
388
- 0.23625613749027252,
389
- 0.2409660965204239,
390
- 0.2549420893192291,
391
- 0.19799569249153137,
392
- 0.23658190667629242,
393
- 0.25520455837249756,
394
- 0.2438141405582428,
395
- 0.21446263790130615,
396
- 0.18694305419921875,
397
- 0.21734251081943512,
398
- 0.23308448493480682,
399
- 0.25597816705703735,
400
- 0.2303375005722046,
401
- 0.22810396552085876,
402
- 0.2421826273202896,
403
- 0.24269063770771027,
404
- 0.19383986294269562,
405
- 0.19238220155239105,
406
- 0.23637458682060242,
407
- 0.2113189697265625,
408
- 0.18594272434711456,
409
- 0.2047307789325714,
410
- 0.22051747143268585,
411
- 0.2338298112154007,
412
- 0.18495337665081024,
413
- 0.31769758462905884,
414
- 0.17948535084724426,
415
- 0.16956846415996552,
416
- 0.18370643258094788,
417
- 0.21728311479091644,
418
- 0.16529808938503265,
419
- 0.1880127638578415,
420
- 0.1927148550748825,
421
- 0.17977656424045563,
422
- 0.19032812118530273,
423
- 0.225495383143425,
424
- 0.18763747811317444,
425
- 0.15616856515407562,
426
- 0.2110285460948944,
427
- 0.1917344182729721,
428
- 0.19217932224273682,
429
- 0.18634091317653656,
430
- 0.17289797961711884,
431
- 0.18017618358135223,
432
- 0.16975148022174835,
433
- 0.20279523730278015,
434
- 0.18793119490146637,
435
- 0.18208105862140656,
436
- 0.2210826873779297,
437
- 0.1848507970571518,
438
- 0.175362229347229,
439
- 0.2338799387216568,
440
- 0.15665939450263977,
441
- 0.1710033118724823,
442
- 0.22897052764892578,
443
- 0.19976939260959625,
444
- 0.17123167216777802,
445
- 0.17207647860050201,
446
- 0.16104988753795624,
447
- 0.17151372134685516,
448
- 0.20540104806423187,
449
- 0.16327939927577972,
450
- 0.18736600875854492,
451
- 0.15778745710849762,
452
- 0.1724669635295868,
453
- 0.22603313624858856,
454
- 0.15962539613246918,
455
- 0.17842444777488708,
456
- 0.2059607207775116,
457
- 0.16246894001960754,
458
- 0.1688106805086136,
459
- 0.15051047503948212,
460
- 0.17147870361804962,
461
- 0.1596122831106186,
462
- 0.16684506833553314,
463
- 0.17243006825447083,
464
- 0.18031249940395355,
465
- 0.15410535037517548,
466
- 0.14030024409294128,
467
- 0.17812776565551758,
468
- 0.15372362732887268,
469
- 0.16936704516410828,
470
- 0.16118179261684418,
471
- 0.15867330133914948,
472
- 0.17409607768058777,
473
- 0.16555775701999664,
474
- 0.14979203045368195,
475
- 0.11939563602209091,
476
- 0.17306402325630188,
477
- 0.19467274844646454,
478
- 0.1610839068889618,
479
- 0.14897356927394867,
480
- 0.13533374667167664,
481
- 0.1751706600189209,
482
- 0.14803266525268555,
483
- 0.14225070178508759,
484
- 0.1285431832075119,
485
- 0.15326274931430817,
486
- 0.12212397158145905,
487
- 0.15423189103603363,
488
- 0.1676071733236313,
489
- 0.17005637288093567,
490
- 0.13811911642551422,
491
- 0.1578800231218338,
492
- 0.1245812326669693,
493
- 0.14160023629665375,
494
- 0.1362280547618866,
495
- 0.13950759172439575,
496
- 0.13266520202159882,
497
- 0.12251361459493637,
498
- 0.1468406617641449,
499
- 0.13180361688137054,
500
- 0.14409062266349792,
501
- 0.10714907199144363,
502
- 0.15004582703113556,
503
- 0.1443106085062027,
504
- 0.16568748652935028,
505
- 0.13185156881809235,
506
- 0.1419181525707245,
507
- 0.12822316586971283,
508
- 0.13271304965019226,
509
- 0.14806745946407318,
510
- 0.12749268114566803,
511
- 0.14777174592018127,
512
- 0.12936027348041534,
513
- 0.16480930149555206,
514
- 0.1283726692199707,
515
- 0.17376258969306946,
516
- 0.14537104964256287,
517
- 0.17024172842502594,
518
- 0.10525000840425491,
519
- 0.13460473716259003,
520
- 0.1136106625199318,
521
- 0.14334174990653992,
522
- 0.12381266057491302,
523
- 0.11635736376047134,
524
- 0.1619941145181656,
525
- 0.11771261692047119,
526
- 0.1627335399389267,
527
- 0.13971097767353058,
528
- 0.13618090748786926,
529
- 0.13798284530639648,
530
- 0.1138051375746727,
531
- 0.10376495867967606,
532
- 0.1188473179936409,
533
- 0.14848637580871582,
534
- 0.12351793795824051,
535
- 0.1291089653968811,
536
- 0.14152243733406067,
537
- 0.120426245033741,
538
- 0.12951800227165222,
539
- 0.1462240368127823,
540
- 0.12321526557207108,
541
- 0.11646594107151031,
542
- 0.16523265838623047,
543
- 0.09980458766222,
544
- 0.11321472376585007,
545
- 0.11211931705474854,
546
- 0.11767058819532394,
547
- 0.12701120972633362,
548
- 0.13879325985908508,
549
- 0.15463045239448547,
550
- 0.0981360599398613,
551
- 0.0952308177947998,
552
- 0.11886616051197052,
553
- 0.11998490244150162,
554
- 0.1317901909351349,
555
- 0.12052290141582489,
556
- 0.11561640352010727,
557
- 0.1148061528801918,
558
- 0.11529232561588287,
559
- 0.1109485775232315,
560
- 0.08685992658138275,
561
- 0.10863905400037766,
562
- 0.10214114189147949,
563
- 0.11502943933010101,
564
- 0.10768144577741623,
565
- 0.12711675465106964,
566
- 0.08342073112726212,
567
- 0.11724580824375153,
568
- 0.11047422140836716,
569
- 0.1387486755847931,
570
- 0.11161993443965912,
571
- 0.10834724456071854,
572
- 0.10992390662431717,
573
- 0.09308646619319916,
574
- 0.09626653045415878,
575
- 0.0924919992685318,
576
- 0.11180823296308517,
577
- 0.10675805807113647,
578
- 0.1210842952132225,
579
- 0.11283500492572784,
580
- 0.10502934455871582,
581
- 0.09572649002075195,
582
- 0.10792434215545654,
583
- 0.10839086771011353,
584
- 0.13045024871826172,
585
- 0.1423415094614029,
586
- 0.1113070398569107,
587
- 0.11313152313232422,
588
- 0.12788374722003937,
589
- 0.10165081918239594,
590
- 0.08276218920946121,
591
- 0.10745533555746078,
592
- 0.10156644880771637,
593
- 0.09847947210073471,
594
- 0.13116250932216644,
595
- 0.11027399450540543,
596
- 0.10681574791669846,
597
- 0.11552350223064423,
598
- 0.08603107184171677,
599
- 0.10476719588041306,
600
- 0.09377549588680267,
601
- 0.11473890393972397,
602
- 0.12605629861354828,
603
- 0.1027151346206665,
604
- 0.10792066156864166,
605
- 0.08990789949893951,
606
- 0.09799478948116302,
607
- 0.07296454906463623,
608
- 0.12055801600217819,
609
- 0.12072388827800751,
610
- 0.09598750621080399,
611
- 0.10575366765260696,
612
- 0.08751322329044342,
613
- 0.09683708101511002,
614
- 0.10606545954942703,
615
- 0.09735989570617676,
616
- 0.08438508957624435,
617
- 0.12220897525548935,
618
- 0.10032238811254501,
619
- 0.08653612434864044,
620
- 0.09255076199769974,
621
- 0.12790820002555847,
622
- 0.0972944125533104,
623
- 0.09457828849554062,
624
- 0.08459220826625824,
625
- 0.12037122994661331,
626
- 0.10575146228075027,
627
- 0.09452326595783234,
628
- 0.09958111494779587,
629
- 0.11665801703929901
630
  ],
631
  "base_loss": [
632
- 10.19568157196045,
633
- 7.192863464355469,
634
- 6.217032432556152,
635
- 4.67427921295166,
636
- 2.955170154571533,
637
- 2.0684783458709717,
638
- 1.879281759262085,
639
- 1.9268118143081665,
640
- 1.838002324104309,
641
- 1.6958969831466675,
642
- 1.6997877359390259,
643
- 1.5733033418655396,
644
- 1.5437490940093994,
645
- 1.147268295288086,
646
- 0.8210930228233337,
647
- 0.7633241415023804,
648
- 0.7406877279281616,
649
- 0.7202191352844238,
650
- 0.5617724061012268,
651
- 0.5543747544288635,
652
- 0.5196675658226013,
653
- 0.515282154083252,
654
- 0.541344404220581,
655
- 0.4584448039531708,
656
- 0.43351513147354126,
657
- 0.43640443682670593,
658
- 0.40899521112442017,
659
- 0.3735989034175873,
660
- 0.36980319023132324,
661
- 0.38705509901046753,
662
- 0.34830760955810547,
663
- 0.3112049400806427,
664
- 0.38858872652053833,
665
- 0.3346550166606903,
666
- 0.3554438054561615,
667
- 0.31374162435531616,
668
- 0.3362943232059479,
669
- 0.3253510296344757,
670
- 0.3735771179199219,
671
- 0.31208163499832153,
672
- 0.2946203649044037,
673
- 0.28192415833473206,
674
- 0.37054675817489624,
675
- 0.2701527178287506,
676
- 0.35655704140663147,
677
- 0.29881080985069275,
678
- 0.2859213054180145,
679
- 0.28949055075645447,
680
- 0.28196626901626587,
681
- 0.23745205998420715,
682
- 0.3055541217327118,
683
- 0.32069405913352966,
684
- 0.2612220048904419,
685
- 0.2794129252433777,
686
- 0.35670334100723267,
687
- 0.2686660587787628,
688
- 0.2621057331562042,
689
- 0.2519742548465729,
690
- 0.2593633234500885,
691
- 0.23556630313396454,
692
- 0.2755930423736572,
693
- 0.23948059976100922,
694
- 0.28940704464912415,
695
- 0.23039786517620087,
696
- 0.21981647610664368,
697
- 0.2917427718639374,
698
- 0.2732771635055542,
699
- 0.24072220921516418,
700
- 0.2149968147277832,
701
- 0.22879359126091003,
702
- 0.23625613749027252,
703
- 0.2409660965204239,
704
- 0.2549420893192291,
705
- 0.19799569249153137,
706
- 0.23658190667629242,
707
- 0.25520455837249756,
708
- 0.2438141405582428,
709
- 0.21446263790130615,
710
- 0.18694305419921875,
711
- 0.21734251081943512,
712
- 0.23308448493480682,
713
- 0.25597816705703735,
714
- 0.2303375005722046,
715
- 0.22810396552085876,
716
- 0.2421826273202896,
717
- 0.24269063770771027,
718
- 0.19383986294269562,
719
- 0.19238220155239105,
720
- 0.23637458682060242,
721
- 0.2113189697265625,
722
- 0.18594272434711456,
723
- 0.2047307789325714,
724
- 0.22051747143268585,
725
- 0.2338298112154007,
726
- 0.18495337665081024,
727
- 0.31769758462905884,
728
- 0.17948535084724426,
729
- 0.16956846415996552,
730
- 0.18370643258094788,
731
- 0.21728311479091644,
732
- 0.16529808938503265,
733
- 0.1880127638578415,
734
- 0.1927148550748825,
735
- 0.17977656424045563,
736
- 0.19032812118530273,
737
- 0.225495383143425,
738
- 0.18763747811317444,
739
- 0.15616856515407562,
740
- 0.2110285460948944,
741
- 0.1917344182729721,
742
- 0.19217932224273682,
743
- 0.18634091317653656,
744
- 0.17289797961711884,
745
- 0.18017618358135223,
746
- 0.16975148022174835,
747
- 0.20279523730278015,
748
- 0.18793119490146637,
749
- 0.18208105862140656,
750
- 0.2210826873779297,
751
- 0.1848507970571518,
752
- 0.175362229347229,
753
- 0.2338799387216568,
754
- 0.15665939450263977,
755
- 0.1710033118724823,
756
- 0.22897052764892578,
757
- 0.19976939260959625,
758
- 0.17123167216777802,
759
- 0.17207647860050201,
760
- 0.16104988753795624,
761
- 0.17151372134685516,
762
- 0.20540104806423187,
763
- 0.16327939927577972,
764
- 0.18736600875854492,
765
- 0.15778745710849762,
766
- 0.1724669635295868,
767
- 0.22603313624858856,
768
- 0.15962539613246918,
769
- 0.17842444777488708,
770
- 0.2059607207775116,
771
- 0.16246894001960754,
772
- 0.1688106805086136,
773
- 0.15051047503948212,
774
- 0.17147870361804962,
775
- 0.1596122831106186,
776
- 0.16684506833553314,
777
- 0.17243006825447083,
778
- 0.18031249940395355,
779
- 0.15410535037517548,
780
- 0.14030024409294128,
781
- 0.17812776565551758,
782
- 0.15372362732887268,
783
- 0.16936704516410828,
784
- 0.16118179261684418,
785
- 0.15867330133914948,
786
- 0.17409607768058777,
787
- 0.16555775701999664,
788
- 0.14979203045368195,
789
- 0.11939563602209091,
790
- 0.17306402325630188,
791
- 0.19467274844646454,
792
- 0.1610839068889618,
793
- 0.14897356927394867,
794
- 0.13533374667167664,
795
- 0.1751706600189209,
796
- 0.14803266525268555,
797
- 0.14225070178508759,
798
- 0.1285431832075119,
799
- 0.15326274931430817,
800
- 0.12212397158145905,
801
- 0.15423189103603363,
802
- 0.1676071733236313,
803
- 0.17005637288093567,
804
- 0.13811911642551422,
805
- 0.1578800231218338,
806
- 0.1245812326669693,
807
- 0.14160023629665375,
808
- 0.1362280547618866,
809
- 0.13950759172439575,
810
- 0.13266520202159882,
811
- 0.12251361459493637,
812
- 0.1468406617641449,
813
- 0.13180361688137054,
814
- 0.14409062266349792,
815
- 0.10714907199144363,
816
- 0.15004582703113556,
817
- 0.1443106085062027,
818
- 0.16568748652935028,
819
- 0.13185156881809235,
820
- 0.1419181525707245,
821
- 0.12822316586971283,
822
- 0.13271304965019226,
823
- 0.14806745946407318,
824
- 0.12749268114566803,
825
- 0.14777174592018127,
826
- 0.12936027348041534,
827
- 0.16480930149555206,
828
- 0.1283726692199707,
829
- 0.17376258969306946,
830
- 0.14537104964256287,
831
- 0.17024172842502594,
832
- 0.10525000840425491,
833
- 0.13460473716259003,
834
- 0.1136106625199318,
835
- 0.14334174990653992,
836
- 0.12381266057491302,
837
- 0.11635736376047134,
838
- 0.1619941145181656,
839
- 0.11771261692047119,
840
- 0.1627335399389267,
841
- 0.13971097767353058,
842
- 0.13618090748786926,
843
- 0.13798284530639648,
844
- 0.1138051375746727,
845
- 0.10376495867967606,
846
- 0.1188473179936409,
847
- 0.14848637580871582,
848
- 0.12351793795824051,
849
- 0.1291089653968811,
850
- 0.14152243733406067,
851
- 0.120426245033741,
852
- 0.12951800227165222,
853
- 0.1462240368127823,
854
- 0.12321526557207108,
855
- 0.11646594107151031,
856
- 0.16523265838623047,
857
- 0.09980458766222,
858
- 0.11321472376585007,
859
- 0.11211931705474854,
860
- 0.11767058819532394,
861
- 0.12701120972633362,
862
- 0.13879325985908508,
863
- 0.15463045239448547,
864
- 0.0981360599398613,
865
- 0.0952308177947998,
866
- 0.11886616051197052,
867
- 0.11998490244150162,
868
- 0.1317901909351349,
869
- 0.12052290141582489,
870
- 0.11561640352010727,
871
- 0.1148061528801918,
872
- 0.11529232561588287,
873
- 0.1109485775232315,
874
- 0.08685992658138275,
875
- 0.10863905400037766,
876
- 0.10214114189147949,
877
- 0.11502943933010101,
878
- 0.10768144577741623,
879
- 0.12711675465106964,
880
- 0.08342073112726212,
881
- 0.11724580824375153,
882
- 0.11047422140836716,
883
- 0.1387486755847931,
884
- 0.11161993443965912,
885
- 0.10834724456071854,
886
- 0.10992390662431717,
887
- 0.09308646619319916,
888
- 0.09626653045415878,
889
- 0.0924919992685318,
890
- 0.11180823296308517,
891
- 0.10675805807113647,
892
- 0.1210842952132225,
893
- 0.11283500492572784,
894
- 0.10502934455871582,
895
- 0.09572649002075195,
896
- 0.10792434215545654,
897
- 0.10839086771011353,
898
- 0.13045024871826172,
899
- 0.1423415094614029,
900
- 0.1113070398569107,
901
- 0.11313152313232422,
902
- 0.12788374722003937,
903
- 0.10165081918239594,
904
- 0.08276218920946121,
905
- 0.10745533555746078,
906
- 0.10156644880771637,
907
- 0.09847947210073471,
908
- 0.13116250932216644,
909
- 0.11027399450540543,
910
- 0.10681574791669846,
911
- 0.11552350223064423,
912
- 0.08603107184171677,
913
- 0.10476719588041306,
914
- 0.09377549588680267,
915
- 0.11473890393972397,
916
- 0.12605629861354828,
917
- 0.1027151346206665,
918
- 0.10792066156864166,
919
- 0.08990789949893951,
920
- 0.09799478948116302,
921
- 0.07296454906463623,
922
- 0.12055801600217819,
923
- 0.12072388827800751,
924
- 0.09598750621080399,
925
- 0.10575366765260696,
926
- 0.08751322329044342,
927
- 0.09683708101511002,
928
- 0.10606545954942703,
929
- 0.09735989570617676,
930
- 0.08438508957624435,
931
- 0.12220897525548935,
932
- 0.10032238811254501,
933
- 0.08653612434864044,
934
- 0.09255076199769974,
935
- 0.12790820002555847,
936
- 0.0972944125533104,
937
- 0.09457828849554062,
938
- 0.08459220826625824,
939
- 0.12037122994661331,
940
- 0.10575146228075027,
941
- 0.09452326595783234,
942
- 0.09958111494779587,
943
- 0.11665801703929901
944
  ],
945
  "lr": [
946
  8.376068376068378e-06,
@@ -1301,595 +1301,595 @@
1301
  20
1302
  ],
1303
  "eval_accuracy": [
1304
- 0.021111111111111112,
1305
- 0.24,
1306
- 0.3611111111111111,
1307
- 0.44666666666666666,
1308
- 0.4588888888888889,
1309
- 0.5144444444444445,
1310
- 0.5033333333333333,
1311
- 0.5022222222222222,
1312
- 0.5611111111111111,
1313
- 0.57,
1314
- 0.5722222222222222,
1315
- 0.6244444444444445,
1316
- 0.6166666666666667,
1317
- 0.6211111111111111,
1318
- 0.6455555555555555,
1319
- 0.6711111111111111,
1320
- 0.6311111111111111,
1321
- 0.6322222222222222,
1322
- 0.6533333333333333,
1323
- 0.6666666666666666
1324
  ]
1325
  },
1326
- "final_accuracy": 0.54125,
1327
  "sft_eval": {
1328
  "config": {
1329
  "ops": "add_sub",
1330
  "K": null,
1331
  "mode": "sft",
1332
  "n_digits": 6,
1333
- "n_per_split": 50
1334
  },
1335
  "splits": {
1336
  "add_S0": {
1337
- "full_accuracy": 0.92,
1338
- "digit_accuracy": 0.9885714285714285,
1339
- "n_examples": 50,
1340
  "per_subtask": {
1341
  "SA": {
1342
- "accuracy": 0.9864406779661017,
1343
- "count": 295
1344
  },
1345
  "SS": {
1346
- "accuracy": 1.0,
1347
- "count": 55
1348
  }
1349
  }
1350
  },
1351
  "add_S1": {
1352
- "full_accuracy": 0.88,
1353
- "digit_accuracy": 0.9828571428571429,
1354
- "n_examples": 50,
1355
  "per_subtask": {
1356
  "SA": {
1357
- "accuracy": 1.0,
1358
- "count": 126
1359
  },
1360
  "SC": {
1361
- "accuracy": 0.9746835443037974,
1362
- "count": 79
1363
  },
1364
  "SS": {
1365
- "accuracy": 0.9523809523809523,
1366
- "count": 21
1367
  },
1368
  "UC": {
1369
- "accuracy": 0.9758064516129032,
1370
- "count": 124
1371
  }
1372
  }
1373
  },
1374
  "add_S2": {
1375
- "full_accuracy": 0.64,
1376
- "digit_accuracy": 0.9428571428571428,
1377
- "n_examples": 50,
1378
  "per_subtask": {
1379
  "SA": {
1380
- "accuracy": 0.9866666666666667,
1381
- "count": 75
1382
  },
1383
  "SC": {
1384
- "accuracy": 0.9516129032258065,
1385
- "count": 62
1386
  },
1387
  "SS": {
1388
- "accuracy": 0.9743589743589743,
1389
- "count": 39
1390
  },
1391
  "UC": {
1392
- "accuracy": 0.8648648648648649,
1393
- "count": 111
1394
  },
1395
  "US": {
1396
  "accuracy": 1.0,
1397
- "count": 63
1398
  }
1399
  }
1400
  },
1401
  "add_S3": {
1402
- "full_accuracy": 0.44,
1403
- "digit_accuracy": 0.8771428571428571,
1404
- "n_examples": 50,
1405
  "per_subtask": {
1406
  "SA": {
1407
- "accuracy": 1.0,
1408
- "count": 60
1409
  },
1410
  "SC": {
1411
- "accuracy": 1.0,
1412
- "count": 57
1413
  },
1414
  "SS": {
1415
- "accuracy": 1.0,
1416
- "count": 19
1417
  },
1418
  "UC": {
1419
- "accuracy": 0.7692307692307693,
1420
- "count": 104
1421
  },
1422
  "US": {
1423
- "accuracy": 0.8272727272727273,
1424
- "count": 110
1425
  }
1426
  }
1427
  },
1428
  "add_S4": {
1429
- "full_accuracy": 0.22,
1430
- "digit_accuracy": 0.74,
1431
- "n_examples": 50,
1432
  "per_subtask": {
1433
  "SA": {
1434
  "accuracy": 1.0,
1435
- "count": 48
1436
  },
1437
  "SC": {
1438
- "accuracy": 0.9807692307692307,
1439
- "count": 52
1440
  },
1441
  "SS": {
1442
  "accuracy": 1.0,
1443
- "count": 7
1444
  },
1445
  "UC": {
1446
- "accuracy": 0.6404494382022472,
1447
- "count": 89
1448
  },
1449
  "US": {
1450
- "accuracy": 0.6233766233766234,
1451
- "count": 154
1452
  }
1453
  }
1454
  },
1455
  "add_S5": {
1456
- "full_accuracy": 0.18,
1457
  "digit_accuracy": 0.5971428571428572,
1458
- "n_examples": 50,
1459
  "per_subtask": {
1460
  "SA": {
1461
  "accuracy": 1.0,
1462
- "count": 50
1463
  },
1464
  "SC": {
1465
  "accuracy": 1.0,
1466
- "count": 50
1467
  },
1468
  "UC": {
1469
- "accuracy": 0.42,
1470
- "count": 50
1471
  },
1472
  "US": {
1473
- "accuracy": 0.44,
1474
- "count": 200
1475
  }
1476
  }
1477
  },
1478
  "add_S6": {
1479
- "full_accuracy": 0.5,
1480
- "digit_accuracy": 0.6485714285714286,
1481
- "n_examples": 50,
1482
  "per_subtask": {
1483
  "SC": {
1484
  "accuracy": 1.0,
1485
- "count": 50
1486
  },
1487
  "UC": {
1488
- "accuracy": 0.54,
1489
- "count": 50
1490
  },
1491
  "US": {
1492
- "accuracy": 0.6,
1493
- "count": 250
1494
  }
1495
  }
1496
  },
1497
  "add_random": {
1498
- "full_accuracy": 0.89,
1499
- "digit_accuracy": 0.9814285714285714,
1500
  "n_examples": 200,
1501
  "per_subtask": {
1502
  "SA": {
1503
- "accuracy": 1.0,
1504
- "count": 431
1505
  },
1506
  "SC": {
1507
- "accuracy": 1.0,
1508
- "count": 316
1509
  },
1510
  "SS": {
1511
- "accuracy": 1.0,
1512
- "count": 39
1513
  },
1514
  "UC": {
1515
- "accuracy": 0.9607142857142857,
1516
- "count": 560
1517
  },
1518
  "US": {
1519
- "accuracy": 0.9259259259259259,
1520
- "count": 54
1521
  }
1522
  }
1523
  },
1524
  "add_C1": {
1525
- "full_accuracy": 0.94,
1526
- "digit_accuracy": 0.9914285714285714,
1527
- "n_examples": 50,
1528
  "per_subtask": {
1529
  "SA": {
1530
- "accuracy": 0.996,
1531
- "count": 250
1532
  },
1533
  "SC": {
1534
  "accuracy": 1.0,
1535
- "count": 50
1536
  },
1537
  "UC": {
1538
- "accuracy": 0.96,
1539
- "count": 50
1540
  }
1541
  }
1542
  },
1543
  "add_C2": {
1544
- "full_accuracy": 0.84,
1545
- "digit_accuracy": 0.9742857142857143,
1546
- "n_examples": 50,
1547
  "per_subtask": {
1548
  "SA": {
1549
- "accuracy": 1.0,
1550
- "count": 200
1551
  },
1552
  "SC": {
1553
  "accuracy": 1.0,
1554
- "count": 50
1555
  },
1556
  "UC": {
1557
- "accuracy": 0.9036144578313253,
1558
- "count": 83
1559
  },
1560
  "US": {
1561
- "accuracy": 0.9411764705882353,
1562
- "count": 17
1563
  }
1564
  }
1565
  },
1566
  "add_C3": {
1567
- "full_accuracy": 0.62,
1568
- "digit_accuracy": 0.9228571428571428,
1569
- "n_examples": 50,
1570
  "per_subtask": {
1571
  "SA": {
1572
- "accuracy": 0.9933333333333333,
1573
- "count": 150
1574
  },
1575
  "SC": {
1576
  "accuracy": 1.0,
1577
- "count": 50
1578
  },
1579
  "UC": {
1580
- "accuracy": 0.85,
1581
- "count": 100
1582
  },
1583
  "US": {
1584
- "accuracy": 0.78,
1585
- "count": 50
1586
  }
1587
  }
1588
  },
1589
  "add_C4": {
1590
- "full_accuracy": 0.66,
1591
- "digit_accuracy": 0.9257142857142857,
1592
- "n_examples": 50,
1593
  "per_subtask": {
1594
  "SA": {
1595
- "accuracy": 0.99,
1596
- "count": 100
1597
  },
1598
  "SC": {
1599
  "accuracy": 1.0,
1600
- "count": 50
1601
  },
1602
  "UC": {
1603
- "accuracy": 0.8787878787878788,
1604
- "count": 132
1605
  },
1606
  "US": {
1607
- "accuracy": 0.8676470588235294,
1608
- "count": 68
1609
  }
1610
  }
1611
  },
1612
  "add_C5": {
1613
- "full_accuracy": 0.54,
1614
- "digit_accuracy": 0.8742857142857143,
1615
- "n_examples": 50,
1616
  "per_subtask": {
1617
  "SA": {
1618
  "accuracy": 1.0,
1619
- "count": 50
1620
  },
1621
  "SC": {
1622
  "accuracy": 1.0,
1623
- "count": 50
1624
  },
1625
  "UC": {
1626
- "accuracy": 0.8493150684931506,
1627
- "count": 146
1628
  },
1629
  "US": {
1630
- "accuracy": 0.7884615384615384,
1631
- "count": 104
1632
  }
1633
  }
1634
  },
1635
  "add_C6": {
1636
- "full_accuracy": 0.5,
1637
- "digit_accuracy": 0.8771428571428571,
1638
- "n_examples": 50,
1639
  "per_subtask": {
1640
  "SC": {
1641
  "accuracy": 1.0,
1642
- "count": 50
1643
  },
1644
  "UC": {
1645
- "accuracy": 0.8571428571428571,
1646
- "count": 189
1647
  },
1648
  "US": {
1649
- "accuracy": 0.8558558558558559,
1650
- "count": 111
1651
  }
1652
  }
1653
  },
1654
  "sub_M0": {
1655
  "full_accuracy": 0.9,
1656
  "digit_accuracy": 0.9857142857142858,
1657
- "n_examples": 50,
1658
  "per_subtask": {
1659
  "MD": {
1660
- "accuracy": 0.9834983498349835,
1661
- "count": 303
1662
  },
1663
  "ME": {
1664
  "accuracy": 1.0,
1665
- "count": 47
1666
  }
1667
  }
1668
  },
1669
  "sub_M1": {
1670
- "full_accuracy": 0.86,
1671
- "digit_accuracy": 0.98,
1672
- "n_examples": 50,
1673
  "per_subtask": {
1674
  "MD": {
1675
- "accuracy": 0.9858156028368794,
1676
- "count": 141
1677
  },
1678
  "MB": {
1679
  "accuracy": 0.9861111111111112,
1680
- "count": 72
1681
  },
1682
  "ME": {
1683
- "accuracy": 1.0,
1684
- "count": 18
1685
  },
1686
  "UB": {
1687
- "accuracy": 0.9663865546218487,
1688
- "count": 119
1689
  }
1690
  }
1691
  },
1692
  "sub_M2": {
1693
- "full_accuracy": 0.56,
1694
- "digit_accuracy": 0.9257142857142857,
1695
- "n_examples": 50,
1696
  "per_subtask": {
1697
  "MD": {
1698
- "accuracy": 0.9732142857142857,
1699
- "count": 112
1700
  },
1701
  "MB": {
1702
- "accuracy": 0.9622641509433962,
1703
- "count": 53
1704
  },
1705
  "ME": {
1706
- "accuracy": 1.0,
1707
- "count": 47
1708
  },
1709
  "UB": {
1710
- "accuracy": 0.7647058823529411,
1711
- "count": 85
1712
  },
1713
  "UD": {
1714
- "accuracy": 0.9811320754716981,
1715
- "count": 53
1716
  }
1717
  }
1718
  },
1719
  "sub_M3": {
1720
- "full_accuracy": 0.22,
1721
- "digit_accuracy": 0.8457142857142858,
1722
- "n_examples": 50,
1723
  "per_subtask": {
1724
  "MD": {
1725
- "accuracy": 0.979381443298969,
1726
- "count": 97
1727
  },
1728
  "MB": {
1729
- "accuracy": 0.9803921568627451,
1730
- "count": 51
1731
  },
1732
  "ME": {
1733
  "accuracy": 1.0,
1734
- "count": 27
1735
  },
1736
  "UB": {
1737
- "accuracy": 0.5945945945945946,
1738
- "count": 74
1739
  },
1740
  "UD": {
1741
- "accuracy": 0.7920792079207921,
1742
- "count": 101
1743
  }
1744
  }
1745
  },
1746
  "sub_M4": {
1747
- "full_accuracy": 0.04,
1748
- "digit_accuracy": 0.6771428571428572,
1749
- "n_examples": 50,
1750
  "per_subtask": {
1751
  "MD": {
1752
  "accuracy": 1.0,
1753
- "count": 100
1754
  },
1755
  "MB": {
1756
  "accuracy": 1.0,
1757
- "count": 50
1758
  },
1759
  "UB": {
1760
- "accuracy": 0.4,
1761
- "count": 50
1762
  },
1763
  "UD": {
1764
- "accuracy": 0.44666666666666666,
1765
- "count": 150
1766
  }
1767
  }
1768
  },
1769
  "sub_M5": {
1770
- "full_accuracy": 0.1,
1771
- "digit_accuracy": 0.5771428571428572,
1772
- "n_examples": 50,
1773
  "per_subtask": {
1774
  "MD": {
1775
  "accuracy": 1.0,
1776
- "count": 50
1777
  },
1778
  "MB": {
1779
  "accuracy": 1.0,
1780
- "count": 50
1781
  },
1782
  "UB": {
1783
- "accuracy": 0.58,
1784
- "count": 50
1785
  },
1786
  "UD": {
1787
- "accuracy": 0.365,
1788
- "count": 200
1789
  }
1790
  }
1791
  },
1792
  "sub_random": {
1793
- "full_accuracy": 0.87,
1794
- "digit_accuracy": 0.98,
1795
  "n_examples": 200,
1796
  "per_subtask": {
1797
  "MD": {
1798
- "accuracy": 0.9982456140350877,
1799
- "count": 570
1800
  },
1801
  "MB": {
1802
- "accuracy": 0.9819494584837545,
1803
- "count": 277
1804
  },
1805
  "ME": {
1806
  "accuracy": 1.0,
1807
  "count": 53
1808
  },
1809
  "UB": {
1810
- "accuracy": 0.9554140127388535,
1811
- "count": 471
1812
  },
1813
  "UD": {
1814
- "accuracy": 0.9655172413793104,
1815
- "count": 29
1816
  }
1817
  }
1818
  },
1819
  "sub_B3": {
1820
- "full_accuracy": 0.52,
1821
- "digit_accuracy": 0.9142857142857143,
1822
- "n_examples": 50,
1823
  "per_subtask": {
1824
  "MD": {
1825
- "accuracy": 1.0,
1826
- "count": 150
1827
  },
1828
  "MB": {
1829
  "accuracy": 1.0,
1830
- "count": 50
1831
  },
1832
  "UB": {
1833
- "accuracy": 0.801980198019802,
1834
- "count": 101
1835
  },
1836
  "UD": {
1837
- "accuracy": 0.7959183673469388,
1838
- "count": 49
1839
  }
1840
  }
1841
  },
1842
  "sub_B4": {
1843
- "full_accuracy": 0.34,
1844
- "digit_accuracy": 0.8485714285714285,
1845
- "n_examples": 50,
1846
  "per_subtask": {
1847
  "MD": {
1848
  "accuracy": 1.0,
1849
- "count": 100
1850
  },
1851
  "MB": {
1852
  "accuracy": 1.0,
1853
- "count": 50
1854
  },
1855
  "UB": {
1856
- "accuracy": 0.7603305785123967,
1857
- "count": 121
1858
  },
1859
  "UD": {
1860
- "accuracy": 0.6962025316455697,
1861
- "count": 79
1862
  }
1863
  }
1864
  },
1865
  "sub_B5": {
1866
  "full_accuracy": 0.22,
1867
- "digit_accuracy": 0.8114285714285714,
1868
- "n_examples": 50,
1869
  "per_subtask": {
1870
  "MD": {
1871
  "accuracy": 1.0,
1872
- "count": 50
1873
  },
1874
  "MB": {
1875
  "accuracy": 1.0,
1876
- "count": 50
1877
  },
1878
  "UB": {
1879
- "accuracy": 0.75,
1880
- "count": 152
1881
  },
1882
  "UD": {
1883
- "accuracy": 0.7142857142857143,
1884
- "count": 98
1885
  }
1886
  }
1887
  }
1888
  },
1889
  "summary": {
1890
- "overall_accuracy": 0.6226666666666667,
1891
- "digit_accuracy": 0.8917142857142857,
1892
- "total_examples": 1500,
1893
  "n_splits": 24
1894
  }
1895
  }
 
315
  15600
316
  ],
317
  "loss": [
318
+ 10.278651237487793,
319
+ 7.389064311981201,
320
+ 6.051224231719971,
321
+ 4.550988674163818,
322
+ 2.852480173110962,
323
+ 2.1066410541534424,
324
+ 1.9148751497268677,
325
+ 1.843735933303833,
326
+ 1.8798506259918213,
327
+ 1.8511418104171753,
328
+ 1.7925702333450317,
329
+ 1.6850757598876953,
330
+ 1.6945250034332275,
331
+ 1.5783421993255615,
332
+ 1.089109182357788,
333
+ 0.7692533135414124,
334
+ 0.7096071243286133,
335
+ 0.6046804785728455,
336
+ 0.6018385291099548,
337
+ 0.5790620446205139,
338
+ 0.5549424290657043,
339
+ 0.5272607803344727,
340
+ 0.46060559153556824,
341
+ 0.4667455852031708,
342
+ 0.47789812088012695,
343
+ 0.4029836058616638,
344
+ 0.47773483395576477,
345
+ 0.440927654504776,
346
+ 0.39726462960243225,
347
+ 0.383687287569046,
348
+ 0.4488619863986969,
349
+ 0.3516424298286438,
350
+ 0.32863950729370117,
351
+ 0.327939510345459,
352
+ 0.31499478220939636,
353
+ 0.3476874530315399,
354
+ 0.35759884119033813,
355
+ 0.29587382078170776,
356
+ 0.3376016914844513,
357
+ 0.2916772663593292,
358
+ 0.2944425642490387,
359
+ 0.2793613076210022,
360
+ 0.30821874737739563,
361
+ 0.30350440740585327,
362
+ 0.30654028058052063,
363
+ 0.25445234775543213,
364
+ 0.293817937374115,
365
+ 0.2751149535179138,
366
+ 0.2549825608730316,
367
+ 0.27168112993240356,
368
+ 0.29966264963150024,
369
+ 0.2999582886695862,
370
+ 0.2856738567352295,
371
+ 0.2571064531803131,
372
+ 0.2718660533428192,
373
+ 0.26083821058273315,
374
+ 0.2872292399406433,
375
+ 0.23113496601581573,
376
+ 0.24555249512195587,
377
+ 0.2553543150424957,
378
+ 0.22126071155071259,
379
+ 0.24801431596279144,
380
+ 0.22513781487941742,
381
+ 0.21824850142002106,
382
+ 0.32301944494247437,
383
+ 0.2092411071062088,
384
+ 0.23654519021511078,
385
+ 0.22970826923847198,
386
+ 0.196001797914505,
387
+ 0.19658486545085907,
388
+ 0.19009140133857727,
389
+ 0.254011332988739,
390
+ 0.20502305030822754,
391
+ 0.23104675114154816,
392
+ 0.21880166232585907,
393
+ 0.24527417123317719,
394
+ 0.2686038315296173,
395
+ 0.2016792744398117,
396
+ 0.17031656205654144,
397
+ 0.20333679020404816,
398
+ 0.21816182136535645,
399
+ 0.21064119040966034,
400
+ 0.19884170591831207,
401
+ 0.19956159591674805,
402
+ 0.18471239507198334,
403
+ 0.15811093151569366,
404
+ 0.21883070468902588,
405
+ 0.1914089471101761,
406
+ 0.1655077189207077,
407
+ 0.22221115231513977,
408
+ 0.14605960249900818,
409
+ 0.21837235987186432,
410
+ 0.1704948991537094,
411
+ 0.1736709624528885,
412
+ 0.2253033071756363,
413
+ 0.18853184580802917,
414
+ 0.20015767216682434,
415
+ 0.19018162786960602,
416
+ 0.1817663013935089,
417
+ 0.21601822972297668,
418
+ 0.1782209873199463,
419
+ 0.1798601895570755,
420
+ 0.17680145800113678,
421
+ 0.16178666055202484,
422
+ 0.15822872519493103,
423
+ 0.21383854746818542,
424
+ 0.17432792484760284,
425
+ 0.15497128665447235,
426
+ 0.14949870109558105,
427
+ 0.1623925417661667,
428
+ 0.178536057472229,
429
+ 0.1496189385652542,
430
+ 0.1535705029964447,
431
+ 0.15259061753749847,
432
+ 0.20757174491882324,
433
+ 0.189642533659935,
434
+ 0.16320088505744934,
435
+ 0.19857345521450043,
436
+ 0.17532989382743835,
437
+ 0.18625964224338531,
438
+ 0.1518552303314209,
439
+ 0.15830935537815094,
440
+ 0.16554827988147736,
441
+ 0.1885346621274948,
442
+ 0.19510552287101746,
443
+ 0.1609521359205246,
444
+ 0.22554394602775574,
445
+ 0.17842698097229004,
446
+ 0.16276274621486664,
447
+ 0.1613040417432785,
448
+ 0.13027097284793854,
449
+ 0.16238419711589813,
450
+ 0.14015449583530426,
451
+ 0.17456690967082977,
452
+ 0.1481691151857376,
453
+ 0.1879999339580536,
454
+ 0.17365124821662903,
455
+ 0.2072172909975052,
456
+ 0.13534089922904968,
457
+ 0.12938502430915833,
458
+ 0.16615700721740723,
459
+ 0.1711844503879547,
460
+ 0.1538037359714508,
461
+ 0.17003300786018372,
462
+ 0.1528840959072113,
463
+ 0.14285284280776978,
464
+ 0.1634855419397354,
465
+ 0.13300959765911102,
466
+ 0.16810348629951477,
467
+ 0.13577891886234283,
468
+ 0.1253504455089569,
469
+ 0.13998855650424957,
470
+ 0.1808578222990036,
471
+ 0.14750854671001434,
472
+ 0.15061943233013153,
473
+ 0.15485039353370667,
474
+ 0.15015456080436707,
475
+ 0.12394087016582489,
476
+ 0.13308438658714294,
477
+ 0.12737306952476501,
478
+ 0.10817860811948776,
479
+ 0.1835630238056183,
480
+ 0.1271446943283081,
481
+ 0.17026567459106445,
482
+ 0.14795377850532532,
483
+ 0.15456995368003845,
484
+ 0.12015783041715622,
485
+ 0.14983054995536804,
486
+ 0.15613238513469696,
487
+ 0.1362493485212326,
488
+ 0.11746997386217117,
489
+ 0.12015391886234283,
490
+ 0.14974607527256012,
491
+ 0.12486754357814789,
492
+ 0.1347617208957672,
493
+ 0.13983528316020966,
494
+ 0.14421994984149933,
495
+ 0.15558600425720215,
496
+ 0.1395520716905594,
497
+ 0.12529298663139343,
498
+ 0.14551745355129242,
499
+ 0.16064704954624176,
500
+ 0.123051218688488,
501
+ 0.14231589436531067,
502
+ 0.15310995280742645,
503
+ 0.12945358455181122,
504
+ 0.15963591635227203,
505
+ 0.17953287065029144,
506
+ 0.09377244859933853,
507
+ 0.1109309270977974,
508
+ 0.10077396780252457,
509
+ 0.11636873334646225,
510
+ 0.14231428503990173,
511
+ 0.13868117332458496,
512
+ 0.1199658066034317,
513
+ 0.10137726366519928,
514
+ 0.12408987432718277,
515
+ 0.12158029526472092,
516
+ 0.15156243741512299,
517
+ 0.1056785136461258,
518
+ 0.11238741874694824,
519
+ 0.13356435298919678,
520
+ 0.1199871078133583,
521
+ 0.11451991647481918,
522
+ 0.11946208029985428,
523
+ 0.136271134018898,
524
+ 0.11361639946699142,
525
+ 0.10386927425861359,
526
+ 0.10679834336042404,
527
+ 0.09268485009670258,
528
+ 0.13429896533489227,
529
+ 0.12343748658895493,
530
+ 0.1442834585905075,
531
+ 0.12418247759342194,
532
+ 0.13955631852149963,
533
+ 0.09388206154108047,
534
+ 0.11501727253198624,
535
+ 0.1351633220911026,
536
+ 0.13560284674167633,
537
+ 0.12844859063625336,
538
+ 0.09430808573961258,
539
+ 0.11472094804048538,
540
+ 0.11257084459066391,
541
+ 0.11793804168701172,
542
+ 0.10530801862478256,
543
+ 0.13184157013893127,
544
+ 0.08044975250959396,
545
+ 0.10456537455320358,
546
+ 0.15832646191120148,
547
+ 0.10107123851776123,
548
+ 0.13520397245883942,
549
+ 0.11445610970258713,
550
+ 0.11855047196149826,
551
+ 0.13093633949756622,
552
+ 0.10899967700242996,
553
+ 0.10174088180065155,
554
+ 0.12345916777849197,
555
+ 0.1107829362154007,
556
+ 0.1003185361623764,
557
+ 0.1176886186003685,
558
+ 0.09605075418949127,
559
+ 0.12465865164995193,
560
+ 0.09835167974233627,
561
+ 0.1037013903260231,
562
+ 0.11466304212808609,
563
+ 0.11380521208047867,
564
+ 0.13064700365066528,
565
+ 0.13941724598407745,
566
+ 0.08339516073465347,
567
+ 0.11133090406656265,
568
+ 0.10470746457576752,
569
+ 0.11295027285814285,
570
+ 0.11331796646118164,
571
+ 0.09551217406988144,
572
+ 0.11493296921253204,
573
+ 0.09893112629652023,
574
+ 0.11825107038021088,
575
+ 0.12228565663099289,
576
+ 0.10353125631809235,
577
+ 0.10824307054281235,
578
+ 0.10025204718112946,
579
+ 0.07499944418668747,
580
+ 0.09401531517505646,
581
+ 0.0934530571103096,
582
+ 0.11390485614538193,
583
+ 0.10727037489414215,
584
+ 0.09588678926229477,
585
+ 0.0825532078742981,
586
+ 0.10723954439163208,
587
+ 0.1088128611445427,
588
+ 0.11156749725341797,
589
+ 0.09598823636770248,
590
+ 0.08453374356031418,
591
+ 0.10130847245454788,
592
+ 0.09620478004217148,
593
+ 0.11045535653829575,
594
+ 0.10323013365268707,
595
+ 0.13056471943855286,
596
+ 0.08738001435995102,
597
+ 0.11891673505306244,
598
+ 0.09096094220876694,
599
+ 0.08709857612848282,
600
+ 0.09677024930715561,
601
+ 0.08751770108938217,
602
+ 0.10058047622442245,
603
+ 0.09135355800390244,
604
+ 0.1054212898015976,
605
+ 0.09947032481431961,
606
+ 0.10693687945604324,
607
+ 0.0917934849858284,
608
+ 0.10931998491287231,
609
+ 0.09929491579532623,
610
+ 0.11224542558193207,
611
+ 0.11092674732208252,
612
+ 0.09260962903499603,
613
+ 0.12740318477153778,
614
+ 0.11285065114498138,
615
+ 0.10502947121858597,
616
+ 0.07829917967319489,
617
+ 0.1043354719877243,
618
+ 0.10123976320028305,
619
+ 0.08939259499311447,
620
+ 0.11151234060525894,
621
+ 0.13105222582817078,
622
+ 0.10120860487222672,
623
+ 0.06924744695425034,
624
+ 0.10668231546878815,
625
+ 0.09256572276353836,
626
+ 0.08223498612642288,
627
+ 0.09994925558567047,
628
+ 0.08328042924404144,
629
+ 0.0906481072306633
630
  ],
631
  "base_loss": [
632
+ 10.278651237487793,
633
+ 7.389064311981201,
634
+ 6.051224231719971,
635
+ 4.550988674163818,
636
+ 2.852480173110962,
637
+ 2.1066410541534424,
638
+ 1.9148751497268677,
639
+ 1.843735933303833,
640
+ 1.8798506259918213,
641
+ 1.8511418104171753,
642
+ 1.7925702333450317,
643
+ 1.6850757598876953,
644
+ 1.6945250034332275,
645
+ 1.5783421993255615,
646
+ 1.089109182357788,
647
+ 0.7692533135414124,
648
+ 0.7096071243286133,
649
+ 0.6046804785728455,
650
+ 0.6018385291099548,
651
+ 0.5790620446205139,
652
+ 0.5549424290657043,
653
+ 0.5272607803344727,
654
+ 0.46060559153556824,
655
+ 0.4667455852031708,
656
+ 0.47789812088012695,
657
+ 0.4029836058616638,
658
+ 0.47773483395576477,
659
+ 0.440927654504776,
660
+ 0.39726462960243225,
661
+ 0.383687287569046,
662
+ 0.4488619863986969,
663
+ 0.3516424298286438,
664
+ 0.32863950729370117,
665
+ 0.327939510345459,
666
+ 0.31499478220939636,
667
+ 0.3476874530315399,
668
+ 0.35759884119033813,
669
+ 0.29587382078170776,
670
+ 0.3376016914844513,
671
+ 0.2916772663593292,
672
+ 0.2944425642490387,
673
+ 0.2793613076210022,
674
+ 0.30821874737739563,
675
+ 0.30350440740585327,
676
+ 0.30654028058052063,
677
+ 0.25445234775543213,
678
+ 0.293817937374115,
679
+ 0.2751149535179138,
680
+ 0.2549825608730316,
681
+ 0.27168112993240356,
682
+ 0.29966264963150024,
683
+ 0.2999582886695862,
684
+ 0.2856738567352295,
685
+ 0.2571064531803131,
686
+ 0.2718660533428192,
687
+ 0.26083821058273315,
688
+ 0.2872292399406433,
689
+ 0.23113496601581573,
690
+ 0.24555249512195587,
691
+ 0.2553543150424957,
692
+ 0.22126071155071259,
693
+ 0.24801431596279144,
694
+ 0.22513781487941742,
695
+ 0.21824850142002106,
696
+ 0.32301944494247437,
697
+ 0.2092411071062088,
698
+ 0.23654519021511078,
699
+ 0.22970826923847198,
700
+ 0.196001797914505,
701
+ 0.19658486545085907,
702
+ 0.19009140133857727,
703
+ 0.254011332988739,
704
+ 0.20502305030822754,
705
+ 0.23104675114154816,
706
+ 0.21880166232585907,
707
+ 0.24527417123317719,
708
+ 0.2686038315296173,
709
+ 0.2016792744398117,
710
+ 0.17031656205654144,
711
+ 0.20333679020404816,
712
+ 0.21816182136535645,
713
+ 0.21064119040966034,
714
+ 0.19884170591831207,
715
+ 0.19956159591674805,
716
+ 0.18471239507198334,
717
+ 0.15811093151569366,
718
+ 0.21883070468902588,
719
+ 0.1914089471101761,
720
+ 0.1655077189207077,
721
+ 0.22221115231513977,
722
+ 0.14605960249900818,
723
+ 0.21837235987186432,
724
+ 0.1704948991537094,
725
+ 0.1736709624528885,
726
+ 0.2253033071756363,
727
+ 0.18853184580802917,
728
+ 0.20015767216682434,
729
+ 0.19018162786960602,
730
+ 0.1817663013935089,
731
+ 0.21601822972297668,
732
+ 0.1782209873199463,
733
+ 0.1798601895570755,
734
+ 0.17680145800113678,
735
+ 0.16178666055202484,
736
+ 0.15822872519493103,
737
+ 0.21383854746818542,
738
+ 0.17432792484760284,
739
+ 0.15497128665447235,
740
+ 0.14949870109558105,
741
+ 0.1623925417661667,
742
+ 0.178536057472229,
743
+ 0.1496189385652542,
744
+ 0.1535705029964447,
745
+ 0.15259061753749847,
746
+ 0.20757174491882324,
747
+ 0.189642533659935,
748
+ 0.16320088505744934,
749
+ 0.19857345521450043,
750
+ 0.17532989382743835,
751
+ 0.18625964224338531,
752
+ 0.1518552303314209,
753
+ 0.15830935537815094,
754
+ 0.16554827988147736,
755
+ 0.1885346621274948,
756
+ 0.19510552287101746,
757
+ 0.1609521359205246,
758
+ 0.22554394602775574,
759
+ 0.17842698097229004,
760
+ 0.16276274621486664,
761
+ 0.1613040417432785,
762
+ 0.13027097284793854,
763
+ 0.16238419711589813,
764
+ 0.14015449583530426,
765
+ 0.17456690967082977,
766
+ 0.1481691151857376,
767
+ 0.1879999339580536,
768
+ 0.17365124821662903,
769
+ 0.2072172909975052,
770
+ 0.13534089922904968,
771
+ 0.12938502430915833,
772
+ 0.16615700721740723,
773
+ 0.1711844503879547,
774
+ 0.1538037359714508,
775
+ 0.17003300786018372,
776
+ 0.1528840959072113,
777
+ 0.14285284280776978,
778
+ 0.1634855419397354,
779
+ 0.13300959765911102,
780
+ 0.16810348629951477,
781
+ 0.13577891886234283,
782
+ 0.1253504455089569,
783
+ 0.13998855650424957,
784
+ 0.1808578222990036,
785
+ 0.14750854671001434,
786
+ 0.15061943233013153,
787
+ 0.15485039353370667,
788
+ 0.15015456080436707,
789
+ 0.12394087016582489,
790
+ 0.13308438658714294,
791
+ 0.12737306952476501,
792
+ 0.10817860811948776,
793
+ 0.1835630238056183,
794
+ 0.1271446943283081,
795
+ 0.17026567459106445,
796
+ 0.14795377850532532,
797
+ 0.15456995368003845,
798
+ 0.12015783041715622,
799
+ 0.14983054995536804,
800
+ 0.15613238513469696,
801
+ 0.1362493485212326,
802
+ 0.11746997386217117,
803
+ 0.12015391886234283,
804
+ 0.14974607527256012,
805
+ 0.12486754357814789,
806
+ 0.1347617208957672,
807
+ 0.13983528316020966,
808
+ 0.14421994984149933,
809
+ 0.15558600425720215,
810
+ 0.1395520716905594,
811
+ 0.12529298663139343,
812
+ 0.14551745355129242,
813
+ 0.16064704954624176,
814
+ 0.123051218688488,
815
+ 0.14231589436531067,
816
+ 0.15310995280742645,
817
+ 0.12945358455181122,
818
+ 0.15963591635227203,
819
+ 0.17953287065029144,
820
+ 0.09377244859933853,
821
+ 0.1109309270977974,
822
+ 0.10077396780252457,
823
+ 0.11636873334646225,
824
+ 0.14231428503990173,
825
+ 0.13868117332458496,
826
+ 0.1199658066034317,
827
+ 0.10137726366519928,
828
+ 0.12408987432718277,
829
+ 0.12158029526472092,
830
+ 0.15156243741512299,
831
+ 0.1056785136461258,
832
+ 0.11238741874694824,
833
+ 0.13356435298919678,
834
+ 0.1199871078133583,
835
+ 0.11451991647481918,
836
+ 0.11946208029985428,
837
+ 0.136271134018898,
838
+ 0.11361639946699142,
839
+ 0.10386927425861359,
840
+ 0.10679834336042404,
841
+ 0.09268485009670258,
842
+ 0.13429896533489227,
843
+ 0.12343748658895493,
844
+ 0.1442834585905075,
845
+ 0.12418247759342194,
846
+ 0.13955631852149963,
847
+ 0.09388206154108047,
848
+ 0.11501727253198624,
849
+ 0.1351633220911026,
850
+ 0.13560284674167633,
851
+ 0.12844859063625336,
852
+ 0.09430808573961258,
853
+ 0.11472094804048538,
854
+ 0.11257084459066391,
855
+ 0.11793804168701172,
856
+ 0.10530801862478256,
857
+ 0.13184157013893127,
858
+ 0.08044975250959396,
859
+ 0.10456537455320358,
860
+ 0.15832646191120148,
861
+ 0.10107123851776123,
862
+ 0.13520397245883942,
863
+ 0.11445610970258713,
864
+ 0.11855047196149826,
865
+ 0.13093633949756622,
866
+ 0.10899967700242996,
867
+ 0.10174088180065155,
868
+ 0.12345916777849197,
869
+ 0.1107829362154007,
870
+ 0.1003185361623764,
871
+ 0.1176886186003685,
872
+ 0.09605075418949127,
873
+ 0.12465865164995193,
874
+ 0.09835167974233627,
875
+ 0.1037013903260231,
876
+ 0.11466304212808609,
877
+ 0.11380521208047867,
878
+ 0.13064700365066528,
879
+ 0.13941724598407745,
880
+ 0.08339516073465347,
881
+ 0.11133090406656265,
882
+ 0.10470746457576752,
883
+ 0.11295027285814285,
884
+ 0.11331796646118164,
885
+ 0.09551217406988144,
886
+ 0.11493296921253204,
887
+ 0.09893112629652023,
888
+ 0.11825107038021088,
889
+ 0.12228565663099289,
890
+ 0.10353125631809235,
891
+ 0.10824307054281235,
892
+ 0.10025204718112946,
893
+ 0.07499944418668747,
894
+ 0.09401531517505646,
895
+ 0.0934530571103096,
896
+ 0.11390485614538193,
897
+ 0.10727037489414215,
898
+ 0.09588678926229477,
899
+ 0.0825532078742981,
900
+ 0.10723954439163208,
901
+ 0.1088128611445427,
902
+ 0.11156749725341797,
903
+ 0.09598823636770248,
904
+ 0.08453374356031418,
905
+ 0.10130847245454788,
906
+ 0.09620478004217148,
907
+ 0.11045535653829575,
908
+ 0.10323013365268707,
909
+ 0.13056471943855286,
910
+ 0.08738001435995102,
911
+ 0.11891673505306244,
912
+ 0.09096094220876694,
913
+ 0.08709857612848282,
914
+ 0.09677024930715561,
915
+ 0.08751770108938217,
916
+ 0.10058047622442245,
917
+ 0.09135355800390244,
918
+ 0.1054212898015976,
919
+ 0.09947032481431961,
920
+ 0.10693687945604324,
921
+ 0.0917934849858284,
922
+ 0.10931998491287231,
923
+ 0.09929491579532623,
924
+ 0.11224542558193207,
925
+ 0.11092674732208252,
926
+ 0.09260962903499603,
927
+ 0.12740318477153778,
928
+ 0.11285065114498138,
929
+ 0.10502947121858597,
930
+ 0.07829917967319489,
931
+ 0.1043354719877243,
932
+ 0.10123976320028305,
933
+ 0.08939259499311447,
934
+ 0.11151234060525894,
935
+ 0.13105222582817078,
936
+ 0.10120860487222672,
937
+ 0.06924744695425034,
938
+ 0.10668231546878815,
939
+ 0.09256572276353836,
940
+ 0.08223498612642288,
941
+ 0.09994925558567047,
942
+ 0.08328042924404144,
943
+ 0.0906481072306633
944
  ],
945
  "lr": [
946
  8.376068376068378e-06,
 
1301
  20
1302
  ],
1303
  "eval_accuracy": [
1304
+ 0.01263157894736842,
1305
+ 0.2168421052631579,
1306
+ 0.3736842105263158,
1307
+ 0.4473684210526316,
1308
+ 0.4936842105263158,
1309
+ 0.5494736842105263,
1310
+ 0.5178947368421053,
1311
+ 0.52,
1312
+ 0.5873684210526315,
1313
+ 0.5136842105263157,
1314
+ 0.6073684210526316,
1315
+ 0.6021052631578947,
1316
+ 0.6431578947368422,
1317
+ 0.6621052631578948,
1318
+ 0.6168421052631579,
1319
+ 0.6536842105263158,
1320
+ 0.631578947368421,
1321
+ 0.6505263157894737,
1322
+ 0.6705263157894736,
1323
+ 0.6631578947368421
1324
  ]
1325
  },
1326
+ "final_accuracy": 0.5342307692307692,
1327
  "sft_eval": {
1328
  "config": {
1329
  "ops": "add_sub",
1330
  "K": null,
1331
  "mode": "sft",
1332
  "n_digits": 6,
1333
+ "n_per_split": 100
1334
  },
1335
  "splits": {
1336
  "add_S0": {
1337
+ "full_accuracy": 0.86,
1338
+ "digit_accuracy": 0.9771428571428571,
1339
+ "n_examples": 100,
1340
  "per_subtask": {
1341
  "SA": {
1342
+ "accuracy": 0.9768595041322314,
1343
+ "count": 605
1344
  },
1345
  "SS": {
1346
+ "accuracy": 0.9789473684210527,
1347
+ "count": 95
1348
  }
1349
  }
1350
  },
1351
  "add_S1": {
1352
+ "full_accuracy": 0.93,
1353
+ "digit_accuracy": 0.99,
1354
+ "n_examples": 100,
1355
  "per_subtask": {
1356
  "SA": {
1357
+ "accuracy": 0.9803921568627451,
1358
+ "count": 204
1359
  },
1360
  "SC": {
1361
+ "accuracy": 0.9822485207100592,
1362
+ "count": 169
1363
  },
1364
  "SS": {
1365
+ "accuracy": 1.0,
1366
+ "count": 31
1367
  },
1368
  "UC": {
1369
+ "accuracy": 1.0,
1370
+ "count": 296
1371
  }
1372
  }
1373
  },
1374
  "add_S2": {
1375
+ "full_accuracy": 0.57,
1376
+ "digit_accuracy": 0.9185714285714286,
1377
+ "n_examples": 100,
1378
  "per_subtask": {
1379
  "SA": {
1380
+ "accuracy": 0.9754601226993865,
1381
+ "count": 163
1382
  },
1383
  "SC": {
1384
+ "accuracy": 0.9461538461538461,
1385
+ "count": 130
1386
  },
1387
  "SS": {
1388
+ "accuracy": 0.9080459770114943,
1389
+ "count": 87
1390
  },
1391
  "UC": {
1392
+ "accuracy": 0.812807881773399,
1393
+ "count": 203
1394
  },
1395
  "US": {
1396
  "accuracy": 1.0,
1397
+ "count": 117
1398
  }
1399
  }
1400
  },
1401
  "add_S3": {
1402
+ "full_accuracy": 0.23,
1403
+ "digit_accuracy": 0.8114285714285714,
1404
+ "n_examples": 100,
1405
  "per_subtask": {
1406
  "SA": {
1407
+ "accuracy": 0.9669421487603306,
1408
+ "count": 121
1409
  },
1410
  "SC": {
1411
+ "accuracy": 0.9504132231404959,
1412
+ "count": 121
1413
  },
1414
  "SS": {
1415
+ "accuracy": 0.9591836734693877,
1416
+ "count": 49
1417
  },
1418
  "UC": {
1419
+ "accuracy": 0.6505376344086021,
1420
+ "count": 186
1421
  },
1422
  "US": {
1423
+ "accuracy": 0.7533632286995515,
1424
+ "count": 223
1425
  }
1426
  }
1427
  },
1428
  "add_S4": {
1429
+ "full_accuracy": 0.33,
1430
+ "digit_accuracy": 0.75,
1431
+ "n_examples": 100,
1432
  "per_subtask": {
1433
  "SA": {
1434
  "accuracy": 1.0,
1435
+ "count": 104
1436
  },
1437
  "SC": {
1438
+ "accuracy": 0.9905660377358491,
1439
+ "count": 106
1440
  },
1441
  "SS": {
1442
  "accuracy": 1.0,
1443
+ "count": 23
1444
  },
1445
  "UC": {
1446
+ "accuracy": 0.6625,
1447
+ "count": 160
1448
  },
1449
  "US": {
1450
+ "accuracy": 0.6091205211726385,
1451
+ "count": 307
1452
  }
1453
  }
1454
  },
1455
  "add_S5": {
1456
+ "full_accuracy": 0.19,
1457
  "digit_accuracy": 0.5971428571428572,
1458
+ "n_examples": 100,
1459
  "per_subtask": {
1460
  "SA": {
1461
  "accuracy": 1.0,
1462
+ "count": 100
1463
  },
1464
  "SC": {
1465
  "accuracy": 1.0,
1466
+ "count": 100
1467
  },
1468
  "UC": {
1469
+ "accuracy": 0.34,
1470
+ "count": 100
1471
  },
1472
  "US": {
1473
+ "accuracy": 0.46,
1474
+ "count": 400
1475
  }
1476
  }
1477
  },
1478
  "add_S6": {
1479
+ "full_accuracy": 0.52,
1480
+ "digit_accuracy": 0.6642857142857143,
1481
+ "n_examples": 100,
1482
  "per_subtask": {
1483
  "SC": {
1484
  "accuracy": 1.0,
1485
+ "count": 100
1486
  },
1487
  "UC": {
1488
+ "accuracy": 0.56,
1489
+ "count": 100
1490
  },
1491
  "US": {
1492
+ "accuracy": 0.618,
1493
+ "count": 500
1494
  }
1495
  }
1496
  },
1497
  "add_random": {
1498
+ "full_accuracy": 0.845,
1499
+ "digit_accuracy": 0.9742857142857143,
1500
  "n_examples": 200,
1501
  "per_subtask": {
1502
  "SA": {
1503
+ "accuracy": 0.9843400447427293,
1504
+ "count": 447
1505
  },
1506
  "SC": {
1507
+ "accuracy": 0.990625,
1508
+ "count": 320
1509
  },
1510
  "SS": {
1511
+ "accuracy": 0.9642857142857143,
1512
+ "count": 56
1513
  },
1514
  "UC": {
1515
+ "accuracy": 0.9584120982986768,
1516
+ "count": 529
1517
  },
1518
  "US": {
1519
+ "accuracy": 0.9583333333333334,
1520
+ "count": 48
1521
  }
1522
  }
1523
  },
1524
  "add_C1": {
1525
+ "full_accuracy": 0.97,
1526
+ "digit_accuracy": 0.9957142857142857,
1527
+ "n_examples": 100,
1528
  "per_subtask": {
1529
  "SA": {
1530
+ "accuracy": 0.994,
1531
+ "count": 500
1532
  },
1533
  "SC": {
1534
  "accuracy": 1.0,
1535
+ "count": 100
1536
  },
1537
  "UC": {
1538
+ "accuracy": 1.0,
1539
+ "count": 100
1540
  }
1541
  }
1542
  },
1543
  "add_C2": {
1544
+ "full_accuracy": 0.8,
1545
+ "digit_accuracy": 0.9714285714285714,
1546
+ "n_examples": 100,
1547
  "per_subtask": {
1548
  "SA": {
1549
+ "accuracy": 0.985,
1550
+ "count": 400
1551
  },
1552
  "SC": {
1553
  "accuracy": 1.0,
1554
+ "count": 100
1555
  },
1556
  "UC": {
1557
+ "accuracy": 0.9102564102564102,
1558
+ "count": 156
1559
  },
1560
  "US": {
1561
+ "accuracy": 1.0,
1562
+ "count": 44
1563
  }
1564
  }
1565
  },
1566
  "add_C3": {
1567
+ "full_accuracy": 0.57,
1568
+ "digit_accuracy": 0.9242857142857143,
1569
+ "n_examples": 100,
1570
  "per_subtask": {
1571
  "SA": {
1572
+ "accuracy": 0.9966666666666667,
1573
+ "count": 300
1574
  },
1575
  "SC": {
1576
  "accuracy": 1.0,
1577
+ "count": 100
1578
  },
1579
  "UC": {
1580
+ "accuracy": 0.8040201005025126,
1581
+ "count": 199
1582
  },
1583
  "US": {
1584
+ "accuracy": 0.8712871287128713,
1585
+ "count": 101
1586
  }
1587
  }
1588
  },
1589
  "add_C4": {
1590
+ "full_accuracy": 0.49,
1591
+ "digit_accuracy": 0.8914285714285715,
1592
+ "n_examples": 100,
1593
  "per_subtask": {
1594
  "SA": {
1595
+ "accuracy": 0.995,
1596
+ "count": 200
1597
  },
1598
  "SC": {
1599
  "accuracy": 1.0,
1600
+ "count": 100
1601
  },
1602
  "UC": {
1603
+ "accuracy": 0.8143939393939394,
1604
+ "count": 264
1605
  },
1606
  "US": {
1607
+ "accuracy": 0.8088235294117647,
1608
+ "count": 136
1609
  }
1610
  }
1611
  },
1612
  "add_C5": {
1613
+ "full_accuracy": 0.37,
1614
+ "digit_accuracy": 0.8571428571428571,
1615
+ "n_examples": 100,
1616
  "per_subtask": {
1617
  "SA": {
1618
  "accuracy": 1.0,
1619
+ "count": 100
1620
  },
1621
  "SC": {
1622
  "accuracy": 1.0,
1623
+ "count": 100
1624
  },
1625
  "UC": {
1626
+ "accuracy": 0.8096774193548387,
1627
+ "count": 310
1628
  },
1629
  "US": {
1630
+ "accuracy": 0.7842105263157895,
1631
+ "count": 190
1632
  }
1633
  }
1634
  },
1635
  "add_C6": {
1636
+ "full_accuracy": 0.42,
1637
+ "digit_accuracy": 0.8442857142857143,
1638
+ "n_examples": 100,
1639
  "per_subtask": {
1640
  "SC": {
1641
  "accuracy": 1.0,
1642
+ "count": 100
1643
  },
1644
  "UC": {
1645
+ "accuracy": 0.8405405405405405,
1646
+ "count": 370
1647
  },
1648
  "US": {
1649
+ "accuracy": 0.782608695652174,
1650
+ "count": 230
1651
  }
1652
  }
1653
  },
1654
  "sub_M0": {
1655
  "full_accuracy": 0.9,
1656
  "digit_accuracy": 0.9857142857142858,
1657
+ "n_examples": 100,
1658
  "per_subtask": {
1659
  "MD": {
1660
+ "accuracy": 0.983739837398374,
1661
+ "count": 615
1662
  },
1663
  "ME": {
1664
  "accuracy": 1.0,
1665
+ "count": 85
1666
  }
1667
  }
1668
  },
1669
  "sub_M1": {
1670
+ "full_accuracy": 0.94,
1671
+ "digit_accuracy": 0.9914285714285714,
1672
+ "n_examples": 100,
1673
  "per_subtask": {
1674
  "MD": {
1675
+ "accuracy": 0.9965753424657534,
1676
+ "count": 292
1677
  },
1678
  "MB": {
1679
  "accuracy": 0.9861111111111112,
1680
+ "count": 144
1681
  },
1682
  "ME": {
1683
+ "accuracy": 0.96,
1684
+ "count": 25
1685
  },
1686
  "UB": {
1687
+ "accuracy": 0.9916317991631799,
1688
+ "count": 239
1689
  }
1690
  }
1691
  },
1692
  "sub_M2": {
1693
+ "full_accuracy": 0.38,
1694
+ "digit_accuracy": 0.8985714285714286,
1695
+ "n_examples": 100,
1696
  "per_subtask": {
1697
  "MD": {
1698
+ "accuracy": 0.985781990521327,
1699
+ "count": 211
1700
  },
1701
  "MB": {
1702
+ "accuracy": 0.9739130434782609,
1703
+ "count": 115
1704
  },
1705
  "ME": {
1706
+ "accuracy": 0.9764705882352941,
1707
+ "count": 85
1708
  },
1709
  "UB": {
1710
+ "accuracy": 0.6519337016574586,
1711
+ "count": 181
1712
  },
1713
  "UD": {
1714
+ "accuracy": 1.0,
1715
+ "count": 108
1716
  }
1717
  }
1718
  },
1719
  "sub_M3": {
1720
+ "full_accuracy": 0.1,
1721
+ "digit_accuracy": 0.8071428571428572,
1722
+ "n_examples": 100,
1723
  "per_subtask": {
1724
  "MD": {
1725
+ "accuracy": 1.0,
1726
+ "count": 179
1727
  },
1728
  "MB": {
1729
+ "accuracy": 0.9805825242718447,
1730
+ "count": 103
1731
  },
1732
  "ME": {
1733
  "accuracy": 1.0,
1734
+ "count": 56
1735
  },
1736
  "UB": {
1737
+ "accuracy": 0.4899328859060403,
1738
+ "count": 149
1739
  },
1740
  "UD": {
1741
+ "accuracy": 0.7323943661971831,
1742
+ "count": 213
1743
  }
1744
  }
1745
  },
1746
  "sub_M4": {
1747
+ "full_accuracy": 0.06,
1748
+ "digit_accuracy": 0.6885714285714286,
1749
+ "n_examples": 100,
1750
  "per_subtask": {
1751
  "MD": {
1752
  "accuracy": 1.0,
1753
+ "count": 200
1754
  },
1755
  "MB": {
1756
  "accuracy": 1.0,
1757
+ "count": 100
1758
  },
1759
  "UB": {
1760
+ "accuracy": 0.36,
1761
+ "count": 100
1762
  },
1763
  "UD": {
1764
+ "accuracy": 0.4866666666666667,
1765
+ "count": 300
1766
  }
1767
  }
1768
  },
1769
  "sub_M5": {
1770
+ "full_accuracy": 0.06,
1771
+ "digit_accuracy": 0.5571428571428572,
1772
+ "n_examples": 100,
1773
  "per_subtask": {
1774
  "MD": {
1775
  "accuracy": 1.0,
1776
+ "count": 100
1777
  },
1778
  "MB": {
1779
  "accuracy": 1.0,
1780
+ "count": 100
1781
  },
1782
  "UB": {
1783
+ "accuracy": 0.49,
1784
+ "count": 100
1785
  },
1786
  "UD": {
1787
+ "accuracy": 0.3525,
1788
+ "count": 400
1789
  }
1790
  }
1791
  },
1792
  "sub_random": {
1793
+ "full_accuracy": 0.845,
1794
+ "digit_accuracy": 0.9771428571428571,
1795
  "n_examples": 200,
1796
  "per_subtask": {
1797
  "MD": {
1798
+ "accuracy": 0.9866666666666667,
1799
+ "count": 600
1800
  },
1801
  "MB": {
1802
+ "accuracy": 0.9887640449438202,
1803
+ "count": 267
1804
  },
1805
  "ME": {
1806
  "accuracy": 1.0,
1807
  "count": 53
1808
  },
1809
  "UB": {
1810
+ "accuracy": 0.9521640091116174,
1811
+ "count": 439
1812
  },
1813
  "UD": {
1814
+ "accuracy": 1.0,
1815
+ "count": 41
1816
  }
1817
  }
1818
  },
1819
  "sub_B3": {
1820
+ "full_accuracy": 0.36,
1821
+ "digit_accuracy": 0.8857142857142857,
1822
+ "n_examples": 100,
1823
  "per_subtask": {
1824
  "MD": {
1825
+ "accuracy": 0.9966666666666667,
1826
+ "count": 300
1827
  },
1828
  "MB": {
1829
  "accuracy": 1.0,
1830
+ "count": 100
1831
  },
1832
  "UB": {
1833
+ "accuracy": 0.7411167512690355,
1834
+ "count": 197
1835
  },
1836
  "UD": {
1837
+ "accuracy": 0.7281553398058253,
1838
+ "count": 103
1839
  }
1840
  }
1841
  },
1842
  "sub_B4": {
1843
+ "full_accuracy": 0.26,
1844
+ "digit_accuracy": 0.84,
1845
+ "n_examples": 100,
1846
  "per_subtask": {
1847
  "MD": {
1848
  "accuracy": 1.0,
1849
+ "count": 200
1850
  },
1851
  "MB": {
1852
  "accuracy": 1.0,
1853
+ "count": 100
1854
  },
1855
  "UB": {
1856
+ "accuracy": 0.7125506072874493,
1857
+ "count": 247
1858
  },
1859
  "UD": {
1860
+ "accuracy": 0.7320261437908496,
1861
+ "count": 153
1862
  }
1863
  }
1864
  },
1865
  "sub_B5": {
1866
  "full_accuracy": 0.22,
1867
+ "digit_accuracy": 0.7957142857142857,
1868
+ "n_examples": 100,
1869
  "per_subtask": {
1870
  "MD": {
1871
  "accuracy": 1.0,
1872
+ "count": 100
1873
  },
1874
  "MB": {
1875
  "accuracy": 1.0,
1876
+ "count": 100
1877
  },
1878
  "UB": {
1879
+ "accuracy": 0.7651006711409396,
1880
+ "count": 298
1881
  },
1882
  "UD": {
1883
+ "accuracy": 0.6386138613861386,
1884
+ "count": 202
1885
  }
1886
  }
1887
  }
1888
  },
1889
  "summary": {
1890
+ "overall_accuracy": 0.5342307692307692,
1891
+ "digit_accuracy": 0.8671428571428571,
1892
+ "total_examples": 2600,
1893
  "n_splits": 24
1894
  }
1895
  }
add_sub_baseline_50K_1L3H510d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bfc5309917d4aa9bff45c52de66d373b6b27f50b2a5e8b3d16a116f52ff3678
3
  size 634642298
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52d5fc0719081cdcc41cebb47bfb62d7ccd33018f75f5738e461989b9f5e9219
3
  size 634642298
add_sub_baseline_50K_1L3H510d/train_config.json CHANGED
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 158584246,
71
  "run_name": "add_sub_baseline_50K_1L3H510d",
72
- "git_commit": "17e935f460a7f9595b705c1d614101a6b0e520f7",
73
- "timestamp": "2026-04-14T06:42:54.584652+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "ql0jdky0",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/ql0jdky0",
81
- "final_accuracy": 0.54125,
82
- "sft_accuracy": 0.54125,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
69
  "no_wandb": false,
70
  "n_params": 158584246,
71
  "run_name": "add_sub_baseline_50K_1L3H510d",
72
+ "git_commit": "1d5a160e16a5070d61b881494e832aa88149b15c",
73
+ "timestamp": "2026-04-15T03:54:44.956075+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_50K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": "ougociky",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/ougociky",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.5342307692307692,
86
+ "sft_accuracy": 0.5342307692307692,
87
  "eval_method": "ArithmeticEvaluator"
88
  }