amirali1985 commited on
Commit
bc45de3
·
verified ·
1 Parent(s): af5f066

Upload add_sub_baseline_50K_2L1H128d

Browse files
add_sub_baseline_50K_2L1H128d/metrics.json CHANGED
@@ -315,632 +315,632 @@
315
  15600
316
  ],
317
  "loss": [
318
- 11.93499755859375,
319
- 11.65730094909668,
320
- 11.293171882629395,
321
- 10.867568969726562,
322
- 10.623129844665527,
323
- 10.355392456054688,
324
- 10.099054336547852,
325
- 9.930326461791992,
326
- 9.612565994262695,
327
- 9.24365234375,
328
- 9.00301456451416,
329
- 8.693142890930176,
330
- 8.47545051574707,
331
- 8.173990249633789,
332
- 7.881795406341553,
333
- 7.611324310302734,
334
- 7.3448920249938965,
335
- 7.075301170349121,
336
- 6.706723213195801,
337
- 6.513725280761719,
338
- 6.21718168258667,
339
- 5.842379570007324,
340
- 5.690333366394043,
341
- 5.309576511383057,
342
- 5.099353313446045,
343
- 4.7732625007629395,
344
- 4.531905651092529,
345
- 4.2372260093688965,
346
- 4.069974899291992,
347
- 3.8053746223449707,
348
- 3.5196328163146973,
349
- 3.335747241973877,
350
- 3.109636068344116,
351
- 3.0120599269866943,
352
- 2.8363168239593506,
353
- 2.676023483276367,
354
- 2.6105294227600098,
355
- 2.5148777961730957,
356
- 2.3828721046447754,
357
- 2.345553159713745,
358
- 2.2795569896698,
359
- 2.2441999912261963,
360
- 2.212557792663574,
361
- 2.105844497680664,
362
- 2.151437520980835,
363
- 2.183159351348877,
364
- 2.157996416091919,
365
- 2.0764122009277344,
366
- 2.0949976444244385,
367
- 2.0614359378814697,
368
- 2.0152029991149902,
369
- 2.042832374572754,
370
- 1.985764741897583,
371
- 2.0169003009796143,
372
- 1.9234191179275513,
373
- 1.98187255859375,
374
- 1.9020922183990479,
375
- 1.947366714477539,
376
- 1.912832498550415,
377
- 1.9788919687271118,
378
- 1.8701711893081665,
379
- 1.9873000383377075,
380
- 1.8384500741958618,
381
- 1.864878535270691,
382
- 1.8773438930511475,
383
- 1.814258337020874,
384
- 1.8422654867172241,
385
- 1.7905521392822266,
386
- 1.9188811779022217,
387
- 1.9195749759674072,
388
- 1.840183973312378,
389
- 1.9478784799575806,
390
- 1.7651938199996948,
391
- 1.8026297092437744,
392
- 1.8211473226547241,
393
- 1.6967977285385132,
394
- 1.774262547492981,
395
- 1.8092931509017944,
396
- 1.7626827955245972,
397
- 1.7678852081298828,
398
- 1.7257096767425537,
399
- 1.6912577152252197,
400
- 1.705618143081665,
401
- 1.7209644317626953,
402
- 1.6141128540039062,
403
- 1.5833446979522705,
404
- 1.6016104221343994,
405
- 1.6115186214447021,
406
- 1.4938329458236694,
407
- 1.5911413431167603,
408
- 1.4935094118118286,
409
- 1.4679266214370728,
410
- 1.4957019090652466,
411
- 1.446588158607483,
412
- 1.3904730081558228,
413
- 1.4148086309432983,
414
- 1.4284785985946655,
415
- 1.2719846963882446,
416
- 1.2865592241287231,
417
- 1.2901942729949951,
418
- 1.2603648900985718,
419
- 1.305459976196289,
420
- 1.1517776250839233,
421
- 1.1661006212234497,
422
- 1.1481062173843384,
423
- 1.1654253005981445,
424
- 1.143684983253479,
425
- 1.1574296951293945,
426
- 1.104051113128662,
427
- 1.1066854000091553,
428
- 1.0712209939956665,
429
- 1.0240689516067505,
430
- 1.0762532949447632,
431
- 0.9777925610542297,
432
- 1.0075340270996094,
433
- 1.0130956172943115,
434
- 0.9441969990730286,
435
- 0.9728404879570007,
436
- 0.9550210237503052,
437
- 0.9782089591026306,
438
- 0.9868655204772949,
439
- 0.9737717509269714,
440
- 0.9871805906295776,
441
- 0.8989226222038269,
442
- 0.9575096368789673,
443
- 0.9029489159584045,
444
- 0.8649100065231323,
445
- 0.8886522650718689,
446
- 0.8820281028747559,
447
- 0.8742324113845825,
448
- 0.88325035572052,
449
- 0.8407949805259705,
450
- 0.8328596353530884,
451
- 0.8495661616325378,
452
- 0.7535262107849121,
453
- 0.7798505425453186,
454
- 0.7734178900718689,
455
- 0.8000691533088684,
456
- 0.7600011825561523,
457
- 0.7487917542457581,
458
- 0.7534374594688416,
459
- 0.7358126044273376,
460
- 0.7776135206222534,
461
- 0.7900133728981018,
462
- 0.726458728313446,
463
- 0.7606948018074036,
464
- 0.7421143651008606,
465
- 0.7261181473731995,
466
- 0.7051188349723816,
467
- 0.7383636236190796,
468
- 0.6680508852005005,
469
- 0.680087685585022,
470
- 0.7639477849006653,
471
- 0.6837146878242493,
472
- 0.6702830195426941,
473
- 0.667018473148346,
474
- 0.6458178758621216,
475
- 0.626135528087616,
476
- 0.6087480187416077,
477
- 0.6776902079582214,
478
- 0.6297626495361328,
479
- 0.6746311187744141,
480
- 0.6392488479614258,
481
- 0.6054442524909973,
482
- 0.6305686235427856,
483
- 0.65091472864151,
484
- 0.6684236526489258,
485
- 0.630896270275116,
486
- 0.5898188352584839,
487
- 0.5816236138343811,
488
- 0.6031948924064636,
489
- 0.5798562169075012,
490
- 0.5800939202308655,
491
- 0.6128515601158142,
492
- 0.5971966981887817,
493
- 0.5636686086654663,
494
- 0.6101219058036804,
495
- 0.5626725554466248,
496
- 0.5759359002113342,
497
- 0.5579383969306946,
498
- 0.5766159892082214,
499
- 0.563798725605011,
500
- 0.5380284190177917,
501
- 0.5644034147262573,
502
- 0.5314499735832214,
503
- 0.549543559551239,
504
- 0.5258645415306091,
505
- 0.5414046049118042,
506
- 0.5449110269546509,
507
- 0.5177078247070312,
508
- 0.5357903242111206,
509
- 0.5383039116859436,
510
- 0.5342908501625061,
511
- 0.5052288770675659,
512
- 0.4803391993045807,
513
- 0.5331531167030334,
514
- 0.5584177374839783,
515
- 0.5099272131919861,
516
- 0.5030084252357483,
517
- 0.5107883810997009,
518
- 0.5074604749679565,
519
- 0.474060595035553,
520
- 0.5217799544334412,
521
- 0.5024420022964478,
522
- 0.5018254518508911,
523
- 0.48899394273757935,
524
- 0.5179036855697632,
525
- 0.4651453495025635,
526
- 0.4600445628166199,
527
- 0.46727508306503296,
528
- 0.4327971637248993,
529
- 0.4529948830604553,
530
- 0.46315622329711914,
531
- 0.48899713158607483,
532
- 0.4627113342285156,
533
- 0.45527729392051697,
534
- 0.47023242712020874,
535
- 0.4694068729877472,
536
- 0.4536097049713135,
537
- 0.46996352076530457,
538
- 0.4478769302368164,
539
- 0.5023384690284729,
540
- 0.4848630130290985,
541
- 0.44978800415992737,
542
- 0.4636957347393036,
543
- 0.46273866295814514,
544
- 0.4170573651790619,
545
- 0.45214805006980896,
546
- 0.4561144709587097,
547
- 0.4628432095050812,
548
- 0.46510520577430725,
549
- 0.5094900131225586,
550
- 0.4306984841823578,
551
- 0.4236433207988739,
552
- 0.41763633489608765,
553
- 0.4181102216243744,
554
- 0.41568782925605774,
555
- 0.43321552872657776,
556
- 0.42102813720703125,
557
- 0.44086867570877075,
558
- 0.4437388777732849,
559
- 0.42119714617729187,
560
- 0.4386158585548401,
561
- 0.4230058491230011,
562
- 0.42731285095214844,
563
- 0.43081793189048767,
564
- 0.40430232882499695,
565
- 0.39149078726768494,
566
- 0.4208660125732422,
567
- 0.41456151008605957,
568
- 0.4628043472766876,
569
- 0.45343923568725586,
570
- 0.4034596383571625,
571
- 0.43333905935287476,
572
- 0.4089009463787079,
573
- 0.428451269865036,
574
- 0.44539371132850647,
575
- 0.3964701294898987,
576
- 0.363548219203949,
577
- 0.41381707787513733,
578
- 0.41168859601020813,
579
- 0.3732242286205292,
580
- 0.4289732575416565,
581
- 0.4374411404132843,
582
- 0.4020676016807556,
583
- 0.39040374755859375,
584
- 0.435672789812088,
585
- 0.4191468358039856,
586
- 0.43827033042907715,
587
- 0.42149630188941956,
588
- 0.4028854966163635,
589
- 0.42512935400009155,
590
- 0.3935700058937073,
591
- 0.4023810029029846,
592
- 0.4407925307750702,
593
- 0.4217481017112732,
594
- 0.4327377676963806,
595
- 0.3848411738872528,
596
- 0.38072606921195984,
597
- 0.42019957304000854,
598
- 0.4077237546443939,
599
- 0.4318618178367615,
600
- 0.41669175028800964,
601
- 0.42001357674598694,
602
- 0.41785627603530884,
603
- 0.43324536085128784,
604
- 0.44770950078964233,
605
- 0.40982261300086975,
606
- 0.41195228695869446,
607
- 0.4122852385044098,
608
- 0.4267185628414154,
609
- 0.3932749927043915,
610
- 0.42152076959609985,
611
- 0.40566104650497437,
612
- 0.3950228691101074,
613
- 0.41320866346359253,
614
- 0.3923717141151428,
615
- 0.4263770282268524,
616
- 0.399299293756485,
617
- 0.37513676285743713,
618
- 0.3975963592529297,
619
- 0.3869399428367615,
620
- 0.3949268162250519,
621
- 0.43800851702690125,
622
- 0.38830724358558655,
623
- 0.4024786055088043,
624
- 0.3731174170970917,
625
- 0.3714914321899414,
626
- 0.39834064245224,
627
- 0.39818379282951355,
628
- 0.39613842964172363,
629
- 0.3969874978065491
630
  ],
631
  "base_loss": [
632
- 11.93499755859375,
633
- 11.65730094909668,
634
- 11.293171882629395,
635
- 10.867568969726562,
636
- 10.623129844665527,
637
- 10.355392456054688,
638
- 10.099054336547852,
639
- 9.930326461791992,
640
- 9.612565994262695,
641
- 9.24365234375,
642
- 9.00301456451416,
643
- 8.693142890930176,
644
- 8.47545051574707,
645
- 8.173990249633789,
646
- 7.881795406341553,
647
- 7.611324310302734,
648
- 7.3448920249938965,
649
- 7.075301170349121,
650
- 6.706723213195801,
651
- 6.513725280761719,
652
- 6.21718168258667,
653
- 5.842379570007324,
654
- 5.690333366394043,
655
- 5.309576511383057,
656
- 5.099353313446045,
657
- 4.7732625007629395,
658
- 4.531905651092529,
659
- 4.2372260093688965,
660
- 4.069974899291992,
661
- 3.8053746223449707,
662
- 3.5196328163146973,
663
- 3.335747241973877,
664
- 3.109636068344116,
665
- 3.0120599269866943,
666
- 2.8363168239593506,
667
- 2.676023483276367,
668
- 2.6105294227600098,
669
- 2.5148777961730957,
670
- 2.3828721046447754,
671
- 2.345553159713745,
672
- 2.2795569896698,
673
- 2.2441999912261963,
674
- 2.212557792663574,
675
- 2.105844497680664,
676
- 2.151437520980835,
677
- 2.183159351348877,
678
- 2.157996416091919,
679
- 2.0764122009277344,
680
- 2.0949976444244385,
681
- 2.0614359378814697,
682
- 2.0152029991149902,
683
- 2.042832374572754,
684
- 1.985764741897583,
685
- 2.0169003009796143,
686
- 1.9234191179275513,
687
- 1.98187255859375,
688
- 1.9020922183990479,
689
- 1.947366714477539,
690
- 1.912832498550415,
691
- 1.9788919687271118,
692
- 1.8701711893081665,
693
- 1.9873000383377075,
694
- 1.8384500741958618,
695
- 1.864878535270691,
696
- 1.8773438930511475,
697
- 1.814258337020874,
698
- 1.8422654867172241,
699
- 1.7905521392822266,
700
- 1.9188811779022217,
701
- 1.9195749759674072,
702
- 1.840183973312378,
703
- 1.9478784799575806,
704
- 1.7651938199996948,
705
- 1.8026297092437744,
706
- 1.8211473226547241,
707
- 1.6967977285385132,
708
- 1.774262547492981,
709
- 1.8092931509017944,
710
- 1.7626827955245972,
711
- 1.7678852081298828,
712
- 1.7257096767425537,
713
- 1.6912577152252197,
714
- 1.705618143081665,
715
- 1.7209644317626953,
716
- 1.6141128540039062,
717
- 1.5833446979522705,
718
- 1.6016104221343994,
719
- 1.6115186214447021,
720
- 1.4938329458236694,
721
- 1.5911413431167603,
722
- 1.4935094118118286,
723
- 1.4679266214370728,
724
- 1.4957019090652466,
725
- 1.446588158607483,
726
- 1.3904730081558228,
727
- 1.4148086309432983,
728
- 1.4284785985946655,
729
- 1.2719846963882446,
730
- 1.2865592241287231,
731
- 1.2901942729949951,
732
- 1.2603648900985718,
733
- 1.305459976196289,
734
- 1.1517776250839233,
735
- 1.1661006212234497,
736
- 1.1481062173843384,
737
- 1.1654253005981445,
738
- 1.143684983253479,
739
- 1.1574296951293945,
740
- 1.104051113128662,
741
- 1.1066854000091553,
742
- 1.0712209939956665,
743
- 1.0240689516067505,
744
- 1.0762532949447632,
745
- 0.9777925610542297,
746
- 1.0075340270996094,
747
- 1.0130956172943115,
748
- 0.9441969990730286,
749
- 0.9728404879570007,
750
- 0.9550210237503052,
751
- 0.9782089591026306,
752
- 0.9868655204772949,
753
- 0.9737717509269714,
754
- 0.9871805906295776,
755
- 0.8989226222038269,
756
- 0.9575096368789673,
757
- 0.9029489159584045,
758
- 0.8649100065231323,
759
- 0.8886522650718689,
760
- 0.8820281028747559,
761
- 0.8742324113845825,
762
- 0.88325035572052,
763
- 0.8407949805259705,
764
- 0.8328596353530884,
765
- 0.8495661616325378,
766
- 0.7535262107849121,
767
- 0.7798505425453186,
768
- 0.7734178900718689,
769
- 0.8000691533088684,
770
- 0.7600011825561523,
771
- 0.7487917542457581,
772
- 0.7534374594688416,
773
- 0.7358126044273376,
774
- 0.7776135206222534,
775
- 0.7900133728981018,
776
- 0.726458728313446,
777
- 0.7606948018074036,
778
- 0.7421143651008606,
779
- 0.7261181473731995,
780
- 0.7051188349723816,
781
- 0.7383636236190796,
782
- 0.6680508852005005,
783
- 0.680087685585022,
784
- 0.7639477849006653,
785
- 0.6837146878242493,
786
- 0.6702830195426941,
787
- 0.667018473148346,
788
- 0.6458178758621216,
789
- 0.626135528087616,
790
- 0.6087480187416077,
791
- 0.6776902079582214,
792
- 0.6297626495361328,
793
- 0.6746311187744141,
794
- 0.6392488479614258,
795
- 0.6054442524909973,
796
- 0.6305686235427856,
797
- 0.65091472864151,
798
- 0.6684236526489258,
799
- 0.630896270275116,
800
- 0.5898188352584839,
801
- 0.5816236138343811,
802
- 0.6031948924064636,
803
- 0.5798562169075012,
804
- 0.5800939202308655,
805
- 0.6128515601158142,
806
- 0.5971966981887817,
807
- 0.5636686086654663,
808
- 0.6101219058036804,
809
- 0.5626725554466248,
810
- 0.5759359002113342,
811
- 0.5579383969306946,
812
- 0.5766159892082214,
813
- 0.563798725605011,
814
- 0.5380284190177917,
815
- 0.5644034147262573,
816
- 0.5314499735832214,
817
- 0.549543559551239,
818
- 0.5258645415306091,
819
- 0.5414046049118042,
820
- 0.5449110269546509,
821
- 0.5177078247070312,
822
- 0.5357903242111206,
823
- 0.5383039116859436,
824
- 0.5342908501625061,
825
- 0.5052288770675659,
826
- 0.4803391993045807,
827
- 0.5331531167030334,
828
- 0.5584177374839783,
829
- 0.5099272131919861,
830
- 0.5030084252357483,
831
- 0.5107883810997009,
832
- 0.5074604749679565,
833
- 0.474060595035553,
834
- 0.5217799544334412,
835
- 0.5024420022964478,
836
- 0.5018254518508911,
837
- 0.48899394273757935,
838
- 0.5179036855697632,
839
- 0.4651453495025635,
840
- 0.4600445628166199,
841
- 0.46727508306503296,
842
- 0.4327971637248993,
843
- 0.4529948830604553,
844
- 0.46315622329711914,
845
- 0.48899713158607483,
846
- 0.4627113342285156,
847
- 0.45527729392051697,
848
- 0.47023242712020874,
849
- 0.4694068729877472,
850
- 0.4536097049713135,
851
- 0.46996352076530457,
852
- 0.4478769302368164,
853
- 0.5023384690284729,
854
- 0.4848630130290985,
855
- 0.44978800415992737,
856
- 0.4636957347393036,
857
- 0.46273866295814514,
858
- 0.4170573651790619,
859
- 0.45214805006980896,
860
- 0.4561144709587097,
861
- 0.4628432095050812,
862
- 0.46510520577430725,
863
- 0.5094900131225586,
864
- 0.4306984841823578,
865
- 0.4236433207988739,
866
- 0.41763633489608765,
867
- 0.4181102216243744,
868
- 0.41568782925605774,
869
- 0.43321552872657776,
870
- 0.42102813720703125,
871
- 0.44086867570877075,
872
- 0.4437388777732849,
873
- 0.42119714617729187,
874
- 0.4386158585548401,
875
- 0.4230058491230011,
876
- 0.42731285095214844,
877
- 0.43081793189048767,
878
- 0.40430232882499695,
879
- 0.39149078726768494,
880
- 0.4208660125732422,
881
- 0.41456151008605957,
882
- 0.4628043472766876,
883
- 0.45343923568725586,
884
- 0.4034596383571625,
885
- 0.43333905935287476,
886
- 0.4089009463787079,
887
- 0.428451269865036,
888
- 0.44539371132850647,
889
- 0.3964701294898987,
890
- 0.363548219203949,
891
- 0.41381707787513733,
892
- 0.41168859601020813,
893
- 0.3732242286205292,
894
- 0.4289732575416565,
895
- 0.4374411404132843,
896
- 0.4020676016807556,
897
- 0.39040374755859375,
898
- 0.435672789812088,
899
- 0.4191468358039856,
900
- 0.43827033042907715,
901
- 0.42149630188941956,
902
- 0.4028854966163635,
903
- 0.42512935400009155,
904
- 0.3935700058937073,
905
- 0.4023810029029846,
906
- 0.4407925307750702,
907
- 0.4217481017112732,
908
- 0.4327377676963806,
909
- 0.3848411738872528,
910
- 0.38072606921195984,
911
- 0.42019957304000854,
912
- 0.4077237546443939,
913
- 0.4318618178367615,
914
- 0.41669175028800964,
915
- 0.42001357674598694,
916
- 0.41785627603530884,
917
- 0.43324536085128784,
918
- 0.44770950078964233,
919
- 0.40982261300086975,
920
- 0.41195228695869446,
921
- 0.4122852385044098,
922
- 0.4267185628414154,
923
- 0.3932749927043915,
924
- 0.42152076959609985,
925
- 0.40566104650497437,
926
- 0.3950228691101074,
927
- 0.41320866346359253,
928
- 0.3923717141151428,
929
- 0.4263770282268524,
930
- 0.399299293756485,
931
- 0.37513676285743713,
932
- 0.3975963592529297,
933
- 0.3869399428367615,
934
- 0.3949268162250519,
935
- 0.43800851702690125,
936
- 0.38830724358558655,
937
- 0.4024786055088043,
938
- 0.3731174170970917,
939
- 0.3714914321899414,
940
- 0.39834064245224,
941
- 0.39818379282951355,
942
- 0.39613842964172363,
943
- 0.3969874978065491
944
  ],
945
  "lr": [
946
  2.0940170940170946e-06,
@@ -1304,592 +1304,592 @@
1304
  0.0,
1305
  0.0,
1306
  0.0,
1307
- 0.0011111111111111111,
1308
- 0.0022222222222222222,
1309
- 0.014444444444444444,
1310
- 0.052222222222222225,
1311
- 0.051111111111111114,
1312
- 0.08333333333333333,
1313
- 0.10888888888888888,
1314
- 0.11222222222222222,
1315
- 0.1288888888888889,
1316
- 0.13777777777777778,
1317
- 0.17,
1318
- 0.15555555555555556,
1319
- 0.18444444444444444,
1320
- 0.2088888888888889,
1321
- 0.20222222222222222,
1322
- 0.21777777777777776,
1323
- 0.21444444444444444
1324
  ]
1325
  },
1326
- "final_accuracy": 0.1925,
1327
  "sft_eval": {
1328
  "config": {
1329
  "ops": "add_sub",
1330
  "K": null,
1331
  "mode": "sft",
1332
  "n_digits": 6,
1333
- "n_per_split": 50
1334
  },
1335
  "splits": {
1336
  "add_S0": {
1337
- "full_accuracy": 0.48,
1338
- "digit_accuracy": 0.9114285714285715,
1339
- "n_examples": 50,
1340
  "per_subtask": {
1341
  "SA": {
1342
- "accuracy": 0.8983050847457628,
1343
- "count": 295
1344
  },
1345
  "SS": {
1346
- "accuracy": 0.9818181818181818,
1347
- "count": 55
1348
  }
1349
  }
1350
  },
1351
  "add_S1": {
1352
- "full_accuracy": 0.3,
1353
- "digit_accuracy": 0.8371428571428572,
1354
- "n_examples": 50,
1355
  "per_subtask": {
1356
  "SA": {
1357
- "accuracy": 0.9047619047619048,
1358
- "count": 126
1359
  },
1360
  "SC": {
1361
- "accuracy": 0.8987341772151899,
1362
- "count": 79
1363
  },
1364
  "SS": {
1365
- "accuracy": 0.9523809523809523,
1366
- "count": 21
1367
  },
1368
  "UC": {
1369
- "accuracy": 0.7096774193548387,
1370
- "count": 124
1371
  }
1372
  }
1373
  },
1374
  "add_S2": {
1375
- "full_accuracy": 0.2,
1376
- "digit_accuracy": 0.8114285714285714,
1377
- "n_examples": 50,
1378
  "per_subtask": {
1379
  "SA": {
1380
- "accuracy": 0.8266666666666667,
1381
- "count": 75
1382
  },
1383
  "SC": {
1384
- "accuracy": 0.8870967741935484,
1385
- "count": 62
1386
  },
1387
  "SS": {
1388
- "accuracy": 0.8461538461538461,
1389
- "count": 39
1390
  },
1391
  "UC": {
1392
- "accuracy": 0.6396396396396397,
1393
- "count": 111
1394
  },
1395
  "US": {
1396
- "accuracy": 1.0,
1397
- "count": 63
1398
  }
1399
  }
1400
  },
1401
  "add_S3": {
1402
- "full_accuracy": 0.18,
1403
- "digit_accuracy": 0.8228571428571428,
1404
- "n_examples": 50,
1405
  "per_subtask": {
1406
  "SA": {
1407
- "accuracy": 0.9666666666666667,
1408
- "count": 60
1409
  },
1410
  "SC": {
1411
- "accuracy": 0.8947368421052632,
1412
- "count": 57
1413
  },
1414
  "SS": {
1415
- "accuracy": 0.8421052631578947,
1416
- "count": 19
1417
  },
1418
  "UC": {
1419
- "accuracy": 0.7115384615384616,
1420
- "count": 104
1421
  },
1422
  "US": {
1423
- "accuracy": 0.8090909090909091,
1424
- "count": 110
1425
  }
1426
  }
1427
  },
1428
  "add_S4": {
1429
  "full_accuracy": 0.24,
1430
- "digit_accuracy": 0.6914285714285714,
1431
- "n_examples": 50,
1432
  "per_subtask": {
1433
  "SA": {
1434
- "accuracy": 0.9375,
1435
- "count": 48
1436
  },
1437
  "SC": {
1438
- "accuracy": 0.9423076923076923,
1439
- "count": 52
1440
  },
1441
  "SS": {
1442
- "accuracy": 1.0,
1443
- "count": 7
1444
  },
1445
  "UC": {
1446
- "accuracy": 0.6067415730337079,
1447
- "count": 89
1448
  },
1449
  "US": {
1450
- "accuracy": 0.564935064935065,
1451
- "count": 154
1452
  }
1453
  }
1454
  },
1455
  "add_S5": {
1456
- "full_accuracy": 0.1,
1457
- "digit_accuracy": 0.5257142857142857,
1458
- "n_examples": 50,
1459
  "per_subtask": {
1460
  "SA": {
1461
- "accuracy": 1.0,
1462
- "count": 50
1463
  },
1464
  "SC": {
1465
- "accuracy": 1.0,
1466
- "count": 50
1467
  },
1468
  "UC": {
1469
- "accuracy": 0.42,
1470
- "count": 50
1471
  },
1472
  "US": {
1473
- "accuracy": 0.315,
1474
- "count": 200
1475
  }
1476
  }
1477
  },
1478
  "add_S6": {
1479
- "full_accuracy": 0.34,
1480
- "digit_accuracy": 0.5742857142857143,
1481
- "n_examples": 50,
1482
  "per_subtask": {
1483
  "SC": {
1484
- "accuracy": 1.0,
1485
- "count": 50
1486
  },
1487
  "UC": {
1488
- "accuracy": 0.46,
1489
- "count": 50
1490
  },
1491
  "US": {
1492
- "accuracy": 0.512,
1493
- "count": 250
1494
  }
1495
  }
1496
  },
1497
  "add_random": {
1498
- "full_accuracy": 0.25,
1499
- "digit_accuracy": 0.8514285714285714,
1500
  "n_examples": 200,
1501
  "per_subtask": {
1502
  "SA": {
1503
- "accuracy": 0.91415313225058,
1504
- "count": 431
1505
  },
1506
  "SC": {
1507
- "accuracy": 0.9493670886075949,
1508
- "count": 316
1509
  },
1510
  "SS": {
1511
- "accuracy": 1.0,
1512
- "count": 39
1513
  },
1514
  "UC": {
1515
- "accuracy": 0.7303571428571428,
1516
- "count": 560
1517
  },
1518
  "US": {
1519
- "accuracy": 0.9259259259259259,
1520
- "count": 54
1521
  }
1522
  }
1523
  },
1524
  "add_C1": {
1525
- "full_accuracy": 0.26,
1526
- "digit_accuracy": 0.8657142857142858,
1527
- "n_examples": 50,
1528
  "per_subtask": {
1529
  "SA": {
1530
- "accuracy": 0.928,
1531
- "count": 250
1532
  },
1533
  "SC": {
1534
- "accuracy": 0.96,
1535
- "count": 50
1536
  },
1537
  "UC": {
1538
- "accuracy": 0.46,
1539
- "count": 50
1540
  }
1541
  }
1542
  },
1543
  "add_C2": {
1544
- "full_accuracy": 0.24,
1545
- "digit_accuracy": 0.8371428571428572,
1546
- "n_examples": 50,
1547
  "per_subtask": {
1548
  "SA": {
1549
- "accuracy": 0.945,
1550
- "count": 200
1551
  },
1552
  "SC": {
1553
- "accuracy": 0.94,
1554
- "count": 50
1555
  },
1556
  "UC": {
1557
- "accuracy": 0.5180722891566265,
1558
- "count": 83
1559
  },
1560
  "US": {
1561
- "accuracy": 0.8235294117647058,
1562
- "count": 17
1563
  }
1564
  }
1565
  },
1566
  "add_C3": {
1567
- "full_accuracy": 0.14,
1568
  "digit_accuracy": 0.8057142857142857,
1569
- "n_examples": 50,
1570
  "per_subtask": {
1571
  "SA": {
1572
- "accuracy": 0.9533333333333334,
1573
- "count": 150
1574
  },
1575
  "SC": {
1576
- "accuracy": 1.0,
1577
- "count": 50
1578
  },
1579
  "UC": {
1580
- "accuracy": 0.5,
1581
- "count": 100
1582
  },
1583
  "US": {
1584
- "accuracy": 0.78,
1585
- "count": 50
1586
  }
1587
  }
1588
  },
1589
  "add_C4": {
1590
- "full_accuracy": 0.18,
1591
- "digit_accuracy": 0.82,
1592
- "n_examples": 50,
1593
  "per_subtask": {
1594
  "SA": {
1595
- "accuracy": 0.98,
1596
- "count": 100
1597
  },
1598
  "SC": {
1599
- "accuracy": 0.98,
1600
- "count": 50
1601
  },
1602
  "UC": {
1603
- "accuracy": 0.6060606060606061,
1604
- "count": 132
1605
  },
1606
  "US": {
1607
- "accuracy": 0.8823529411764706,
1608
- "count": 68
1609
  }
1610
  }
1611
  },
1612
  "add_C5": {
1613
- "full_accuracy": 0.16,
1614
  "digit_accuracy": 0.7942857142857143,
1615
- "n_examples": 50,
1616
  "per_subtask": {
1617
  "SA": {
1618
- "accuracy": 0.98,
1619
- "count": 50
1620
  },
1621
  "SC": {
1622
- "accuracy": 1.0,
1623
- "count": 50
1624
  },
1625
  "UC": {
1626
- "accuracy": 0.6438356164383562,
1627
- "count": 146
1628
  },
1629
  "US": {
1630
- "accuracy": 0.8173076923076923,
1631
- "count": 104
1632
  }
1633
  }
1634
  },
1635
  "add_C6": {
1636
- "full_accuracy": 0.14,
1637
- "digit_accuracy": 0.7628571428571429,
1638
- "n_examples": 50,
1639
  "per_subtask": {
1640
  "SC": {
1641
- "accuracy": 0.98,
1642
- "count": 50
1643
  },
1644
  "UC": {
1645
- "accuracy": 0.6931216931216931,
1646
- "count": 189
1647
  },
1648
  "US": {
1649
- "accuracy": 0.7837837837837838,
1650
- "count": 111
1651
  }
1652
  }
1653
  },
1654
  "sub_M0": {
1655
- "full_accuracy": 0.48,
1656
- "digit_accuracy": 0.8971428571428571,
1657
- "n_examples": 50,
1658
  "per_subtask": {
1659
  "MD": {
1660
- "accuracy": 0.8811881188118812,
1661
- "count": 303
1662
  },
1663
  "ME": {
1664
  "accuracy": 1.0,
1665
- "count": 47
1666
  }
1667
  }
1668
  },
1669
  "sub_M1": {
1670
- "full_accuracy": 0.22,
1671
- "digit_accuracy": 0.8314285714285714,
1672
- "n_examples": 50,
1673
  "per_subtask": {
1674
  "MD": {
1675
- "accuracy": 0.9361702127659575,
1676
- "count": 141
1677
  },
1678
  "MB": {
1679
- "accuracy": 0.9583333333333334,
1680
- "count": 72
1681
  },
1682
  "ME": {
1683
- "accuracy": 0.8888888888888888,
1684
- "count": 18
1685
  },
1686
  "UB": {
1687
- "accuracy": 0.6218487394957983,
1688
- "count": 119
1689
  }
1690
  }
1691
  },
1692
  "sub_M2": {
1693
- "full_accuracy": 0.14,
1694
- "digit_accuracy": 0.8171428571428572,
1695
- "n_examples": 50,
1696
  "per_subtask": {
1697
  "MD": {
1698
- "accuracy": 0.9464285714285714,
1699
- "count": 112
1700
  },
1701
  "MB": {
1702
- "accuracy": 0.9245283018867925,
1703
- "count": 53
1704
  },
1705
  "ME": {
1706
- "accuracy": 0.9148936170212766,
1707
- "count": 47
1708
  },
1709
  "UB": {
1710
- "accuracy": 0.4588235294117647,
1711
- "count": 85
1712
  },
1713
  "UD": {
1714
- "accuracy": 0.9245283018867925,
1715
- "count": 53
1716
  }
1717
  }
1718
  },
1719
  "sub_M3": {
1720
- "full_accuracy": 0.02,
1721
- "digit_accuracy": 0.7142857142857143,
1722
- "n_examples": 50,
1723
  "per_subtask": {
1724
  "MD": {
1725
- "accuracy": 0.9278350515463918,
1726
- "count": 97
1727
  },
1728
  "MB": {
1729
- "accuracy": 0.9607843137254902,
1730
- "count": 51
1731
  },
1732
  "ME": {
1733
- "accuracy": 0.9629629629629629,
1734
- "count": 27
1735
  },
1736
  "UB": {
1737
- "accuracy": 0.4594594594594595,
1738
- "count": 74
1739
  },
1740
  "UD": {
1741
- "accuracy": 0.504950495049505,
1742
- "count": 101
1743
  }
1744
  }
1745
  },
1746
  "sub_M4": {
1747
- "full_accuracy": 0.08,
1748
- "digit_accuracy": 0.6542857142857142,
1749
- "n_examples": 50,
1750
  "per_subtask": {
1751
  "MD": {
1752
- "accuracy": 0.96,
1753
- "count": 100
1754
  },
1755
  "MB": {
1756
- "accuracy": 0.94,
1757
- "count": 50
1758
  },
1759
  "UB": {
1760
- "accuracy": 0.56,
1761
- "count": 50
1762
  },
1763
  "UD": {
1764
- "accuracy": 0.38666666666666666,
1765
- "count": 150
1766
  }
1767
  }
1768
  },
1769
  "sub_M5": {
1770
  "full_accuracy": 0.04,
1771
- "digit_accuracy": 0.5314285714285715,
1772
- "n_examples": 50,
1773
  "per_subtask": {
1774
  "MD": {
1775
  "accuracy": 1.0,
1776
- "count": 50
1777
  },
1778
  "MB": {
1779
- "accuracy": 1.0,
1780
- "count": 50
1781
  },
1782
  "UB": {
1783
  "accuracy": 0.56,
1784
- "count": 50
1785
  },
1786
  "UD": {
1787
- "accuracy": 0.29,
1788
- "count": 200
1789
  }
1790
  }
1791
  },
1792
  "sub_random": {
1793
- "full_accuracy": 0.31,
1794
- "digit_accuracy": 0.8514285714285714,
1795
  "n_examples": 200,
1796
  "per_subtask": {
1797
  "MD": {
1798
- "accuracy": 0.9526315789473684,
1799
- "count": 570
1800
  },
1801
  "MB": {
1802
- "accuracy": 0.9494584837545126,
1803
- "count": 277
1804
  },
1805
  "ME": {
1806
  "accuracy": 0.9811320754716981,
1807
  "count": 53
1808
  },
1809
  "UB": {
1810
- "accuracy": 0.6475583864118896,
1811
- "count": 471
1812
  },
1813
  "UD": {
1814
- "accuracy": 1.0,
1815
- "count": 29
1816
  }
1817
  }
1818
  },
1819
  "sub_B3": {
1820
- "full_accuracy": 0.06,
1821
- "digit_accuracy": 0.7942857142857143,
1822
- "n_examples": 50,
1823
  "per_subtask": {
1824
  "MD": {
1825
- "accuracy": 0.9666666666666667,
1826
- "count": 150
1827
  },
1828
  "MB": {
1829
- "accuracy": 0.98,
1830
- "count": 50
1831
  },
1832
  "UB": {
1833
- "accuracy": 0.48514851485148514,
1834
- "count": 101
1835
  },
1836
  "UD": {
1837
- "accuracy": 0.7142857142857143,
1838
- "count": 49
1839
  }
1840
  }
1841
  },
1842
  "sub_B4": {
1843
- "full_accuracy": 0.08,
1844
- "digit_accuracy": 0.7171428571428572,
1845
- "n_examples": 50,
1846
  "per_subtask": {
1847
  "MD": {
1848
- "accuracy": 1.0,
1849
- "count": 100
1850
  },
1851
  "MB": {
1852
- "accuracy": 0.98,
1853
- "count": 50
1854
  },
1855
  "UB": {
1856
- "accuracy": 0.45454545454545453,
1857
- "count": 121
1858
  },
1859
  "UD": {
1860
- "accuracy": 0.5949367088607594,
1861
- "count": 79
1862
  }
1863
  }
1864
  },
1865
  "sub_B5": {
1866
- "full_accuracy": 0.06,
1867
- "digit_accuracy": 0.7057142857142857,
1868
- "n_examples": 50,
1869
  "per_subtask": {
1870
  "MD": {
1871
  "accuracy": 1.0,
1872
- "count": 50
1873
  },
1874
  "MB": {
1875
  "accuracy": 1.0,
1876
- "count": 50
1877
  },
1878
  "UB": {
1879
- "accuracy": 0.4934210526315789,
1880
- "count": 152
1881
  },
1882
  "UD": {
1883
- "accuracy": 0.7346938775510204,
1884
- "count": 98
1885
  }
1886
  }
1887
  }
1888
  },
1889
  "summary": {
1890
- "overall_accuracy": 0.21266666666666667,
1891
- "digit_accuracy": 0.7844761904761904,
1892
- "total_examples": 1500,
1893
  "n_splits": 24
1894
  }
1895
  }
 
315
  15600
316
  ],
317
  "loss": [
318
+ 11.902360916137695,
319
+ 11.666987419128418,
320
+ 11.278733253479004,
321
+ 10.870060920715332,
322
+ 10.583491325378418,
323
+ 10.357315063476562,
324
+ 10.140096664428711,
325
+ 9.8663911819458,
326
+ 9.547391891479492,
327
+ 9.399455070495605,
328
+ 9.050603866577148,
329
+ 8.765486717224121,
330
+ 8.510666847229004,
331
+ 8.226161003112793,
332
+ 7.946071147918701,
333
+ 7.7140021324157715,
334
+ 7.339910984039307,
335
+ 7.013484477996826,
336
+ 6.772124767303467,
337
+ 6.514701843261719,
338
+ 6.206382751464844,
339
+ 5.888983726501465,
340
+ 5.631106853485107,
341
+ 5.346935272216797,
342
+ 5.030400276184082,
343
+ 4.801684379577637,
344
+ 4.544530868530273,
345
+ 4.25732946395874,
346
+ 4.024832725524902,
347
+ 3.7788538932800293,
348
+ 3.5896661281585693,
349
+ 3.4343185424804688,
350
+ 3.1499485969543457,
351
+ 2.975759983062744,
352
+ 2.84167218208313,
353
+ 2.7719228267669678,
354
+ 2.685962677001953,
355
+ 2.4699745178222656,
356
+ 2.415933609008789,
357
+ 2.3947372436523438,
358
+ 2.274650812149048,
359
+ 2.2390007972717285,
360
+ 2.2176783084869385,
361
+ 2.1771786212921143,
362
+ 2.1928083896636963,
363
+ 2.131913661956787,
364
+ 2.166004180908203,
365
+ 2.091320037841797,
366
+ 2.138065814971924,
367
+ 2.059041738510132,
368
+ 2.0520567893981934,
369
+ 2.069549322128296,
370
+ 2.1000163555145264,
371
+ 2.0322933197021484,
372
+ 2.002192497253418,
373
+ 1.993998646736145,
374
+ 1.9147483110427856,
375
+ 2.025142192840576,
376
+ 1.9972620010375977,
377
+ 1.9602936506271362,
378
+ 1.9629830121994019,
379
+ 1.9658845663070679,
380
+ 1.908106803894043,
381
+ 1.8817909955978394,
382
+ 1.9339027404785156,
383
+ 1.9608190059661865,
384
+ 1.8899195194244385,
385
+ 1.8446751832962036,
386
+ 1.8116694688796997,
387
+ 1.8471367359161377,
388
+ 1.8834497928619385,
389
+ 1.8279780149459839,
390
+ 1.7376059293746948,
391
+ 1.8068692684173584,
392
+ 1.8083994388580322,
393
+ 1.7907131910324097,
394
+ 1.7131086587905884,
395
+ 1.6682668924331665,
396
+ 1.7329654693603516,
397
+ 1.612639307975769,
398
+ 1.6369705200195312,
399
+ 1.5567694902420044,
400
+ 1.6371721029281616,
401
+ 1.5882548093795776,
402
+ 1.5648012161254883,
403
+ 1.4990679025650024,
404
+ 1.486085057258606,
405
+ 1.4619702100753784,
406
+ 1.492026686668396,
407
+ 1.431452989578247,
408
+ 1.4061998128890991,
409
+ 1.4255949258804321,
410
+ 1.369724988937378,
411
+ 1.3176831007003784,
412
+ 1.353609323501587,
413
+ 1.287545919418335,
414
+ 1.2904835939407349,
415
+ 1.237708330154419,
416
+ 1.2732247114181519,
417
+ 1.2794243097305298,
418
+ 1.232486605644226,
419
+ 1.2047475576400757,
420
+ 1.1985729932785034,
421
+ 1.175077199935913,
422
+ 1.152284026145935,
423
+ 1.1384241580963135,
424
+ 1.134267807006836,
425
+ 1.157617211341858,
426
+ 1.0927313566207886,
427
+ 1.1036320924758911,
428
+ 1.036346197128296,
429
+ 1.1012026071548462,
430
+ 1.0513644218444824,
431
+ 1.082848310470581,
432
+ 1.0133121013641357,
433
+ 1.0115025043487549,
434
+ 0.9968132376670837,
435
+ 1.0016825199127197,
436
+ 0.9806528687477112,
437
+ 1.0336967706680298,
438
+ 0.9553194642066956,
439
+ 0.9541217684745789,
440
+ 0.9729284644126892,
441
+ 0.9127997159957886,
442
+ 0.8899979591369629,
443
+ 0.9080758094787598,
444
+ 0.9263311624526978,
445
+ 0.8968016505241394,
446
+ 0.9046673774719238,
447
+ 0.9198164343833923,
448
+ 0.880632758140564,
449
+ 0.858092188835144,
450
+ 0.8669767379760742,
451
+ 0.8270903825759888,
452
+ 0.8441628813743591,
453
+ 0.8286030888557434,
454
+ 0.8017106056213379,
455
+ 0.8334652185440063,
456
+ 0.7973408102989197,
457
+ 0.7861750721931458,
458
+ 0.7986087799072266,
459
+ 0.7349762320518494,
460
+ 0.7474499940872192,
461
+ 0.7652159333229065,
462
+ 0.7694664001464844,
463
+ 0.7698801159858704,
464
+ 0.7689107060432434,
465
+ 0.7688385844230652,
466
+ 0.7331604361534119,
467
+ 0.739378035068512,
468
+ 0.7268163561820984,
469
+ 0.6888265013694763,
470
+ 0.7279484868049622,
471
+ 0.6972444653511047,
472
+ 0.6783798933029175,
473
+ 0.7007697224617004,
474
+ 0.6984170079231262,
475
+ 0.7051367163658142,
476
+ 0.6599271893501282,
477
+ 0.6801080703735352,
478
+ 0.6596269607543945,
479
+ 0.6970394849777222,
480
+ 0.6735067367553711,
481
+ 0.6091650724411011,
482
+ 0.5997785329818726,
483
+ 0.6572319865226746,
484
+ 0.6416034698486328,
485
+ 0.6416297554969788,
486
+ 0.6632289290428162,
487
+ 0.6143150329589844,
488
+ 0.6194843053817749,
489
+ 0.5911574959754944,
490
+ 0.5913539528846741,
491
+ 0.6205930709838867,
492
+ 0.6280899047851562,
493
+ 0.5930410027503967,
494
+ 0.5673736929893494,
495
+ 0.5849844813346863,
496
+ 0.573549747467041,
497
+ 0.5737026333808899,
498
+ 0.5761013031005859,
499
+ 0.5887518525123596,
500
+ 0.5787922739982605,
501
+ 0.5531727075576782,
502
+ 0.5395235419273376,
503
+ 0.5347995162010193,
504
+ 0.5664406418800354,
505
+ 0.535719633102417,
506
+ 0.5654853582382202,
507
+ 0.5577755570411682,
508
+ 0.5406196713447571,
509
+ 0.5438421368598938,
510
+ 0.5191155076026917,
511
+ 0.5238958597183228,
512
+ 0.5294751524925232,
513
+ 0.5278587341308594,
514
+ 0.5264574289321899,
515
+ 0.526719868183136,
516
+ 0.5218141674995422,
517
+ 0.4832751452922821,
518
+ 0.5143148303031921,
519
+ 0.5116984248161316,
520
+ 0.5009939670562744,
521
+ 0.5164638161659241,
522
+ 0.4878538250923157,
523
+ 0.49269673228263855,
524
+ 0.5037528276443481,
525
+ 0.4777663052082062,
526
+ 0.487152099609375,
527
+ 0.46394041180610657,
528
+ 0.4958875775337219,
529
+ 0.5004867911338806,
530
+ 0.4617388844490051,
531
+ 0.4889770448207855,
532
+ 0.46261903643608093,
533
+ 0.44640398025512695,
534
+ 0.4764914810657501,
535
+ 0.49255529046058655,
536
+ 0.4767415225505829,
537
+ 0.4745542109012604,
538
+ 0.44122710824012756,
539
+ 0.44721779227256775,
540
+ 0.4414359927177429,
541
+ 0.4407655894756317,
542
+ 0.4724572002887726,
543
+ 0.46291184425354004,
544
+ 0.4877897799015045,
545
+ 0.46290403604507446,
546
+ 0.4350273907184601,
547
+ 0.4549728035926819,
548
+ 0.4433819353580475,
549
+ 0.459841787815094,
550
+ 0.4417283535003662,
551
+ 0.4167630970478058,
552
+ 0.4117043912410736,
553
+ 0.4316425919532776,
554
+ 0.43041709065437317,
555
+ 0.45717042684555054,
556
+ 0.4303068518638611,
557
+ 0.40953192114830017,
558
+ 0.425926148891449,
559
+ 0.3955731987953186,
560
+ 0.42986229062080383,
561
+ 0.4956095814704895,
562
+ 0.4266194701194763,
563
+ 0.4557635188102722,
564
+ 0.44420650601387024,
565
+ 0.41897672414779663,
566
+ 0.43166255950927734,
567
+ 0.4073420464992523,
568
+ 0.4340267777442932,
569
+ 0.43941253423690796,
570
+ 0.41659075021743774,
571
+ 0.4046151340007782,
572
+ 0.4280521869659424,
573
+ 0.42428717017173767,
574
+ 0.4402787685394287,
575
+ 0.4167115390300751,
576
+ 0.4264659285545349,
577
+ 0.4104572832584381,
578
+ 0.41204768419265747,
579
+ 0.44351884722709656,
580
+ 0.4115920066833496,
581
+ 0.41794463992118835,
582
+ 0.39936190843582153,
583
+ 0.4187738001346588,
584
+ 0.40758633613586426,
585
+ 0.40320804715156555,
586
+ 0.4426390826702118,
587
+ 0.42466142773628235,
588
+ 0.4063023626804352,
589
+ 0.4364185929298401,
590
+ 0.3955465853214264,
591
+ 0.39736407995224,
592
+ 0.4157145917415619,
593
+ 0.3955707252025604,
594
+ 0.40614697337150574,
595
+ 0.4296145737171173,
596
+ 0.4289790689945221,
597
+ 0.4248588979244232,
598
+ 0.40194225311279297,
599
+ 0.3777453601360321,
600
+ 0.41459909081459045,
601
+ 0.4123244881629944,
602
+ 0.39063140749931335,
603
+ 0.3955395221710205,
604
+ 0.4094971716403961,
605
+ 0.43402963876724243,
606
+ 0.39580103754997253,
607
+ 0.41646233201026917,
608
+ 0.397787481546402,
609
+ 0.39125150442123413,
610
+ 0.4029674530029297,
611
+ 0.4123210310935974,
612
+ 0.3969133794307709,
613
+ 0.41159409284591675,
614
+ 0.36812132596969604,
615
+ 0.40473809838294983,
616
+ 0.4111975133419037,
617
+ 0.420559823513031,
618
+ 0.39803218841552734,
619
+ 0.38535213470458984,
620
+ 0.4098115563392639,
621
+ 0.40594691038131714,
622
+ 0.41488656401634216,
623
+ 0.3916519582271576,
624
+ 0.4265446066856384,
625
+ 0.39961308240890503,
626
+ 0.3947022259235382,
627
+ 0.4145981967449188,
628
+ 0.3922405540943146,
629
+ 0.3995726406574249
630
  ],
631
  "base_loss": [
632
+ 11.902360916137695,
633
+ 11.666987419128418,
634
+ 11.278733253479004,
635
+ 10.870060920715332,
636
+ 10.583491325378418,
637
+ 10.357315063476562,
638
+ 10.140096664428711,
639
+ 9.8663911819458,
640
+ 9.547391891479492,
641
+ 9.399455070495605,
642
+ 9.050603866577148,
643
+ 8.765486717224121,
644
+ 8.510666847229004,
645
+ 8.226161003112793,
646
+ 7.946071147918701,
647
+ 7.7140021324157715,
648
+ 7.339910984039307,
649
+ 7.013484477996826,
650
+ 6.772124767303467,
651
+ 6.514701843261719,
652
+ 6.206382751464844,
653
+ 5.888983726501465,
654
+ 5.631106853485107,
655
+ 5.346935272216797,
656
+ 5.030400276184082,
657
+ 4.801684379577637,
658
+ 4.544530868530273,
659
+ 4.25732946395874,
660
+ 4.024832725524902,
661
+ 3.7788538932800293,
662
+ 3.5896661281585693,
663
+ 3.4343185424804688,
664
+ 3.1499485969543457,
665
+ 2.975759983062744,
666
+ 2.84167218208313,
667
+ 2.7719228267669678,
668
+ 2.685962677001953,
669
+ 2.4699745178222656,
670
+ 2.415933609008789,
671
+ 2.3947372436523438,
672
+ 2.274650812149048,
673
+ 2.2390007972717285,
674
+ 2.2176783084869385,
675
+ 2.1771786212921143,
676
+ 2.1928083896636963,
677
+ 2.131913661956787,
678
+ 2.166004180908203,
679
+ 2.091320037841797,
680
+ 2.138065814971924,
681
+ 2.059041738510132,
682
+ 2.0520567893981934,
683
+ 2.069549322128296,
684
+ 2.1000163555145264,
685
+ 2.0322933197021484,
686
+ 2.002192497253418,
687
+ 1.993998646736145,
688
+ 1.9147483110427856,
689
+ 2.025142192840576,
690
+ 1.9972620010375977,
691
+ 1.9602936506271362,
692
+ 1.9629830121994019,
693
+ 1.9658845663070679,
694
+ 1.908106803894043,
695
+ 1.8817909955978394,
696
+ 1.9339027404785156,
697
+ 1.9608190059661865,
698
+ 1.8899195194244385,
699
+ 1.8446751832962036,
700
+ 1.8116694688796997,
701
+ 1.8471367359161377,
702
+ 1.8834497928619385,
703
+ 1.8279780149459839,
704
+ 1.7376059293746948,
705
+ 1.8068692684173584,
706
+ 1.8083994388580322,
707
+ 1.7907131910324097,
708
+ 1.7131086587905884,
709
+ 1.6682668924331665,
710
+ 1.7329654693603516,
711
+ 1.612639307975769,
712
+ 1.6369705200195312,
713
+ 1.5567694902420044,
714
+ 1.6371721029281616,
715
+ 1.5882548093795776,
716
+ 1.5648012161254883,
717
+ 1.4990679025650024,
718
+ 1.486085057258606,
719
+ 1.4619702100753784,
720
+ 1.492026686668396,
721
+ 1.431452989578247,
722
+ 1.4061998128890991,
723
+ 1.4255949258804321,
724
+ 1.369724988937378,
725
+ 1.3176831007003784,
726
+ 1.353609323501587,
727
+ 1.287545919418335,
728
+ 1.2904835939407349,
729
+ 1.237708330154419,
730
+ 1.2732247114181519,
731
+ 1.2794243097305298,
732
+ 1.232486605644226,
733
+ 1.2047475576400757,
734
+ 1.1985729932785034,
735
+ 1.175077199935913,
736
+ 1.152284026145935,
737
+ 1.1384241580963135,
738
+ 1.134267807006836,
739
+ 1.157617211341858,
740
+ 1.0927313566207886,
741
+ 1.1036320924758911,
742
+ 1.036346197128296,
743
+ 1.1012026071548462,
744
+ 1.0513644218444824,
745
+ 1.082848310470581,
746
+ 1.0133121013641357,
747
+ 1.0115025043487549,
748
+ 0.9968132376670837,
749
+ 1.0016825199127197,
750
+ 0.9806528687477112,
751
+ 1.0336967706680298,
752
+ 0.9553194642066956,
753
+ 0.9541217684745789,
754
+ 0.9729284644126892,
755
+ 0.9127997159957886,
756
+ 0.8899979591369629,
757
+ 0.9080758094787598,
758
+ 0.9263311624526978,
759
+ 0.8968016505241394,
760
+ 0.9046673774719238,
761
+ 0.9198164343833923,
762
+ 0.880632758140564,
763
+ 0.858092188835144,
764
+ 0.8669767379760742,
765
+ 0.8270903825759888,
766
+ 0.8441628813743591,
767
+ 0.8286030888557434,
768
+ 0.8017106056213379,
769
+ 0.8334652185440063,
770
+ 0.7973408102989197,
771
+ 0.7861750721931458,
772
+ 0.7986087799072266,
773
+ 0.7349762320518494,
774
+ 0.7474499940872192,
775
+ 0.7652159333229065,
776
+ 0.7694664001464844,
777
+ 0.7698801159858704,
778
+ 0.7689107060432434,
779
+ 0.7688385844230652,
780
+ 0.7331604361534119,
781
+ 0.739378035068512,
782
+ 0.7268163561820984,
783
+ 0.6888265013694763,
784
+ 0.7279484868049622,
785
+ 0.6972444653511047,
786
+ 0.6783798933029175,
787
+ 0.7007697224617004,
788
+ 0.6984170079231262,
789
+ 0.7051367163658142,
790
+ 0.6599271893501282,
791
+ 0.6801080703735352,
792
+ 0.6596269607543945,
793
+ 0.6970394849777222,
794
+ 0.6735067367553711,
795
+ 0.6091650724411011,
796
+ 0.5997785329818726,
797
+ 0.6572319865226746,
798
+ 0.6416034698486328,
799
+ 0.6416297554969788,
800
+ 0.6632289290428162,
801
+ 0.6143150329589844,
802
+ 0.6194843053817749,
803
+ 0.5911574959754944,
804
+ 0.5913539528846741,
805
+ 0.6205930709838867,
806
+ 0.6280899047851562,
807
+ 0.5930410027503967,
808
+ 0.5673736929893494,
809
+ 0.5849844813346863,
810
+ 0.573549747467041,
811
+ 0.5737026333808899,
812
+ 0.5761013031005859,
813
+ 0.5887518525123596,
814
+ 0.5787922739982605,
815
+ 0.5531727075576782,
816
+ 0.5395235419273376,
817
+ 0.5347995162010193,
818
+ 0.5664406418800354,
819
+ 0.535719633102417,
820
+ 0.5654853582382202,
821
+ 0.5577755570411682,
822
+ 0.5406196713447571,
823
+ 0.5438421368598938,
824
+ 0.5191155076026917,
825
+ 0.5238958597183228,
826
+ 0.5294751524925232,
827
+ 0.5278587341308594,
828
+ 0.5264574289321899,
829
+ 0.526719868183136,
830
+ 0.5218141674995422,
831
+ 0.4832751452922821,
832
+ 0.5143148303031921,
833
+ 0.5116984248161316,
834
+ 0.5009939670562744,
835
+ 0.5164638161659241,
836
+ 0.4878538250923157,
837
+ 0.49269673228263855,
838
+ 0.5037528276443481,
839
+ 0.4777663052082062,
840
+ 0.487152099609375,
841
+ 0.46394041180610657,
842
+ 0.4958875775337219,
843
+ 0.5004867911338806,
844
+ 0.4617388844490051,
845
+ 0.4889770448207855,
846
+ 0.46261903643608093,
847
+ 0.44640398025512695,
848
+ 0.4764914810657501,
849
+ 0.49255529046058655,
850
+ 0.4767415225505829,
851
+ 0.4745542109012604,
852
+ 0.44122710824012756,
853
+ 0.44721779227256775,
854
+ 0.4414359927177429,
855
+ 0.4407655894756317,
856
+ 0.4724572002887726,
857
+ 0.46291184425354004,
858
+ 0.4877897799015045,
859
+ 0.46290403604507446,
860
+ 0.4350273907184601,
861
+ 0.4549728035926819,
862
+ 0.4433819353580475,
863
+ 0.459841787815094,
864
+ 0.4417283535003662,
865
+ 0.4167630970478058,
866
+ 0.4117043912410736,
867
+ 0.4316425919532776,
868
+ 0.43041709065437317,
869
+ 0.45717042684555054,
870
+ 0.4303068518638611,
871
+ 0.40953192114830017,
872
+ 0.425926148891449,
873
+ 0.3955731987953186,
874
+ 0.42986229062080383,
875
+ 0.4956095814704895,
876
+ 0.4266194701194763,
877
+ 0.4557635188102722,
878
+ 0.44420650601387024,
879
+ 0.41897672414779663,
880
+ 0.43166255950927734,
881
+ 0.4073420464992523,
882
+ 0.4340267777442932,
883
+ 0.43941253423690796,
884
+ 0.41659075021743774,
885
+ 0.4046151340007782,
886
+ 0.4280521869659424,
887
+ 0.42428717017173767,
888
+ 0.4402787685394287,
889
+ 0.4167115390300751,
890
+ 0.4264659285545349,
891
+ 0.4104572832584381,
892
+ 0.41204768419265747,
893
+ 0.44351884722709656,
894
+ 0.4115920066833496,
895
+ 0.41794463992118835,
896
+ 0.39936190843582153,
897
+ 0.4187738001346588,
898
+ 0.40758633613586426,
899
+ 0.40320804715156555,
900
+ 0.4426390826702118,
901
+ 0.42466142773628235,
902
+ 0.4063023626804352,
903
+ 0.4364185929298401,
904
+ 0.3955465853214264,
905
+ 0.39736407995224,
906
+ 0.4157145917415619,
907
+ 0.3955707252025604,
908
+ 0.40614697337150574,
909
+ 0.4296145737171173,
910
+ 0.4289790689945221,
911
+ 0.4248588979244232,
912
+ 0.40194225311279297,
913
+ 0.3777453601360321,
914
+ 0.41459909081459045,
915
+ 0.4123244881629944,
916
+ 0.39063140749931335,
917
+ 0.3955395221710205,
918
+ 0.4094971716403961,
919
+ 0.43402963876724243,
920
+ 0.39580103754997253,
921
+ 0.41646233201026917,
922
+ 0.397787481546402,
923
+ 0.39125150442123413,
924
+ 0.4029674530029297,
925
+ 0.4123210310935974,
926
+ 0.3969133794307709,
927
+ 0.41159409284591675,
928
+ 0.36812132596969604,
929
+ 0.40473809838294983,
930
+ 0.4111975133419037,
931
+ 0.420559823513031,
932
+ 0.39803218841552734,
933
+ 0.38535213470458984,
934
+ 0.4098115563392639,
935
+ 0.40594691038131714,
936
+ 0.41488656401634216,
937
+ 0.3916519582271576,
938
+ 0.4265446066856384,
939
+ 0.39961308240890503,
940
+ 0.3947022259235382,
941
+ 0.4145981967449188,
942
+ 0.3922405540943146,
943
+ 0.3995726406574249
944
  ],
945
  "lr": [
946
  2.0940170940170946e-06,
 
1304
  0.0,
1305
  0.0,
1306
  0.0,
1307
+ 0.0,
1308
+ 0.002105263157894737,
1309
+ 0.04,
1310
+ 0.06631578947368422,
1311
+ 0.08421052631578947,
1312
+ 0.06736842105263158,
1313
+ 0.1,
1314
+ 0.12842105263157894,
1315
+ 0.16210526315789472,
1316
+ 0.2168421052631579,
1317
+ 0.19157894736842104,
1318
+ 0.2305263157894737,
1319
+ 0.2326315789473684,
1320
+ 0.24421052631578946,
1321
+ 0.24947368421052632,
1322
+ 0.26210526315789473,
1323
+ 0.26
1324
  ]
1325
  },
1326
+ "final_accuracy": 0.22615384615384615,
1327
  "sft_eval": {
1328
  "config": {
1329
  "ops": "add_sub",
1330
  "K": null,
1331
  "mode": "sft",
1332
  "n_digits": 6,
1333
+ "n_per_split": 100
1334
  },
1335
  "splits": {
1336
  "add_S0": {
1337
+ "full_accuracy": 0.3,
1338
+ "digit_accuracy": 0.8757142857142857,
1339
+ "n_examples": 100,
1340
  "per_subtask": {
1341
  "SA": {
1342
+ "accuracy": 0.8611570247933884,
1343
+ "count": 605
1344
  },
1345
  "SS": {
1346
+ "accuracy": 0.968421052631579,
1347
+ "count": 95
1348
  }
1349
  }
1350
  },
1351
  "add_S1": {
1352
+ "full_accuracy": 0.41,
1353
+ "digit_accuracy": 0.8757142857142857,
1354
+ "n_examples": 100,
1355
  "per_subtask": {
1356
  "SA": {
1357
+ "accuracy": 0.9264705882352942,
1358
+ "count": 204
1359
  },
1360
  "SC": {
1361
+ "accuracy": 0.9230769230769231,
1362
+ "count": 169
1363
  },
1364
  "SS": {
1365
+ "accuracy": 1.0,
1366
+ "count": 31
1367
  },
1368
  "UC": {
1369
+ "accuracy": 0.8006756756756757,
1370
+ "count": 296
1371
  }
1372
  }
1373
  },
1374
  "add_S2": {
1375
+ "full_accuracy": 0.23,
1376
+ "digit_accuracy": 0.8414285714285714,
1377
+ "n_examples": 100,
1378
  "per_subtask": {
1379
  "SA": {
1380
+ "accuracy": 0.9079754601226994,
1381
+ "count": 163
1382
  },
1383
  "SC": {
1384
+ "accuracy": 0.8846153846153846,
1385
+ "count": 130
1386
  },
1387
  "SS": {
1388
+ "accuracy": 0.8390804597701149,
1389
+ "count": 87
1390
  },
1391
  "UC": {
1392
+ "accuracy": 0.6896551724137931,
1393
+ "count": 203
1394
  },
1395
  "US": {
1396
+ "accuracy": 0.9658119658119658,
1397
+ "count": 117
1398
  }
1399
  }
1400
  },
1401
  "add_S3": {
1402
+ "full_accuracy": 0.21,
1403
+ "digit_accuracy": 0.8185714285714286,
1404
+ "n_examples": 100,
1405
  "per_subtask": {
1406
  "SA": {
1407
+ "accuracy": 0.9586776859504132,
1408
+ "count": 121
1409
  },
1410
  "SC": {
1411
+ "accuracy": 0.8677685950413223,
1412
+ "count": 121
1413
  },
1414
  "SS": {
1415
+ "accuracy": 0.8979591836734694,
1416
+ "count": 49
1417
  },
1418
  "UC": {
1419
+ "accuracy": 0.7204301075268817,
1420
+ "count": 186
1421
  },
1422
  "US": {
1423
+ "accuracy": 0.7802690582959642,
1424
+ "count": 223
1425
  }
1426
  }
1427
  },
1428
  "add_S4": {
1429
  "full_accuracy": 0.24,
1430
+ "digit_accuracy": 0.7185714285714285,
1431
+ "n_examples": 100,
1432
  "per_subtask": {
1433
  "SA": {
1434
+ "accuracy": 0.9615384615384616,
1435
+ "count": 104
1436
  },
1437
  "SC": {
1438
+ "accuracy": 0.9056603773584906,
1439
+ "count": 106
1440
  },
1441
  "SS": {
1442
+ "accuracy": 0.8695652173913043,
1443
+ "count": 23
1444
  },
1445
  "UC": {
1446
+ "accuracy": 0.6625,
1447
+ "count": 160
1448
  },
1449
  "US": {
1450
+ "accuracy": 0.5895765472312704,
1451
+ "count": 307
1452
  }
1453
  }
1454
  },
1455
  "add_S5": {
1456
+ "full_accuracy": 0.13,
1457
+ "digit_accuracy": 0.5528571428571428,
1458
+ "n_examples": 100,
1459
  "per_subtask": {
1460
  "SA": {
1461
+ "accuracy": 0.96,
1462
+ "count": 100
1463
  },
1464
  "SC": {
1465
+ "accuracy": 0.95,
1466
+ "count": 100
1467
  },
1468
  "UC": {
1469
+ "accuracy": 0.41,
1470
+ "count": 100
1471
  },
1472
  "US": {
1473
+ "accuracy": 0.3875,
1474
+ "count": 400
1475
  }
1476
  }
1477
  },
1478
  "add_S6": {
1479
+ "full_accuracy": 0.49,
1480
+ "digit_accuracy": 0.6614285714285715,
1481
+ "n_examples": 100,
1482
  "per_subtask": {
1483
  "SC": {
1484
+ "accuracy": 0.99,
1485
+ "count": 100
1486
  },
1487
  "UC": {
1488
+ "accuracy": 0.58,
1489
+ "count": 100
1490
  },
1491
  "US": {
1492
+ "accuracy": 0.612,
1493
+ "count": 500
1494
  }
1495
  }
1496
  },
1497
  "add_random": {
1498
+ "full_accuracy": 0.33,
1499
+ "digit_accuracy": 0.8578571428571429,
1500
  "n_examples": 200,
1501
  "per_subtask": {
1502
  "SA": {
1503
+ "accuracy": 0.8859060402684564,
1504
+ "count": 447
1505
  },
1506
  "SC": {
1507
+ "accuracy": 0.928125,
1508
+ "count": 320
1509
  },
1510
  "SS": {
1511
+ "accuracy": 0.9464285714285714,
1512
+ "count": 56
1513
  },
1514
  "UC": {
1515
+ "accuracy": 0.7863894139886578,
1516
+ "count": 529
1517
  },
1518
  "US": {
1519
+ "accuracy": 0.8125,
1520
+ "count": 48
1521
  }
1522
  }
1523
  },
1524
  "add_C1": {
1525
+ "full_accuracy": 0.33,
1526
+ "digit_accuracy": 0.8542857142857143,
1527
+ "n_examples": 100,
1528
  "per_subtask": {
1529
  "SA": {
1530
+ "accuracy": 0.886,
1531
+ "count": 500
1532
  },
1533
  "SC": {
1534
+ "accuracy": 0.95,
1535
+ "count": 100
1536
  },
1537
  "UC": {
1538
+ "accuracy": 0.6,
1539
+ "count": 100
1540
  }
1541
  }
1542
  },
1543
  "add_C2": {
1544
+ "full_accuracy": 0.3,
1545
+ "digit_accuracy": 0.8442857142857143,
1546
+ "n_examples": 100,
1547
  "per_subtask": {
1548
  "SA": {
1549
+ "accuracy": 0.905,
1550
+ "count": 400
1551
  },
1552
  "SC": {
1553
+ "accuracy": 0.97,
1554
+ "count": 100
1555
  },
1556
  "UC": {
1557
+ "accuracy": 0.6089743589743589,
1558
+ "count": 156
1559
  },
1560
  "US": {
1561
+ "accuracy": 0.8409090909090909,
1562
+ "count": 44
1563
  }
1564
  }
1565
  },
1566
  "add_C3": {
1567
+ "full_accuracy": 0.23,
1568
  "digit_accuracy": 0.8057142857142857,
1569
+ "n_examples": 100,
1570
  "per_subtask": {
1571
  "SA": {
1572
+ "accuracy": 0.8866666666666667,
1573
+ "count": 300
1574
  },
1575
  "SC": {
1576
+ "accuracy": 0.92,
1577
+ "count": 100
1578
  },
1579
  "UC": {
1580
+ "accuracy": 0.6180904522613065,
1581
+ "count": 199
1582
  },
1583
  "US": {
1584
+ "accuracy": 0.8217821782178217,
1585
+ "count": 101
1586
  }
1587
  }
1588
  },
1589
  "add_C4": {
1590
+ "full_accuracy": 0.19,
1591
+ "digit_accuracy": 0.79,
1592
+ "n_examples": 100,
1593
  "per_subtask": {
1594
  "SA": {
1595
+ "accuracy": 0.91,
1596
+ "count": 200
1597
  },
1598
  "SC": {
1599
+ "accuracy": 0.94,
1600
+ "count": 100
1601
  },
1602
  "UC": {
1603
+ "accuracy": 0.6477272727272727,
1604
+ "count": 264
1605
  },
1606
  "US": {
1607
+ "accuracy": 0.7794117647058824,
1608
+ "count": 136
1609
  }
1610
  }
1611
  },
1612
  "add_C5": {
1613
+ "full_accuracy": 0.17,
1614
  "digit_accuracy": 0.7942857142857143,
1615
+ "n_examples": 100,
1616
  "per_subtask": {
1617
  "SA": {
1618
+ "accuracy": 0.97,
1619
+ "count": 100
1620
  },
1621
  "SC": {
1622
+ "accuracy": 0.96,
1623
+ "count": 100
1624
  },
1625
  "UC": {
1626
+ "accuracy": 0.667741935483871,
1627
+ "count": 310
1628
  },
1629
  "US": {
1630
+ "accuracy": 0.8210526315789474,
1631
+ "count": 190
1632
  }
1633
  }
1634
  },
1635
  "add_C6": {
1636
+ "full_accuracy": 0.23,
1637
+ "digit_accuracy": 0.8228571428571428,
1638
+ "n_examples": 100,
1639
  "per_subtask": {
1640
  "SC": {
1641
+ "accuracy": 1.0,
1642
+ "count": 100
1643
  },
1644
  "UC": {
1645
+ "accuracy": 0.7594594594594595,
1646
+ "count": 370
1647
  },
1648
  "US": {
1649
+ "accuracy": 0.8478260869565217,
1650
+ "count": 230
1651
  }
1652
  }
1653
  },
1654
  "sub_M0": {
1655
+ "full_accuracy": 0.45,
1656
+ "digit_accuracy": 0.9028571428571428,
1657
+ "n_examples": 100,
1658
  "per_subtask": {
1659
  "MD": {
1660
+ "accuracy": 0.8894308943089431,
1661
+ "count": 615
1662
  },
1663
  "ME": {
1664
  "accuracy": 1.0,
1665
+ "count": 85
1666
  }
1667
  }
1668
  },
1669
  "sub_M1": {
1670
+ "full_accuracy": 0.26,
1671
+ "digit_accuracy": 0.8457142857142858,
1672
+ "n_examples": 100,
1673
  "per_subtask": {
1674
  "MD": {
1675
+ "accuracy": 0.9143835616438356,
1676
+ "count": 292
1677
  },
1678
  "MB": {
1679
+ "accuracy": 0.9166666666666666,
1680
+ "count": 144
1681
  },
1682
  "ME": {
1683
+ "accuracy": 1.0,
1684
+ "count": 25
1685
  },
1686
  "UB": {
1687
+ "accuracy": 0.702928870292887,
1688
+ "count": 239
1689
  }
1690
  }
1691
  },
1692
  "sub_M2": {
1693
+ "full_accuracy": 0.2,
1694
+ "digit_accuracy": 0.8342857142857143,
1695
+ "n_examples": 100,
1696
  "per_subtask": {
1697
  "MD": {
1698
+ "accuracy": 0.9004739336492891,
1699
+ "count": 211
1700
  },
1701
  "MB": {
1702
+ "accuracy": 0.9304347826086956,
1703
+ "count": 115
1704
  },
1705
  "ME": {
1706
+ "accuracy": 0.9882352941176471,
1707
+ "count": 85
1708
  },
1709
  "UB": {
1710
+ "accuracy": 0.5469613259668509,
1711
+ "count": 181
1712
  },
1713
  "UD": {
1714
+ "accuracy": 0.9629629629629629,
1715
+ "count": 108
1716
  }
1717
  }
1718
  },
1719
  "sub_M3": {
1720
+ "full_accuracy": 0.05,
1721
+ "digit_accuracy": 0.74,
1722
+ "n_examples": 100,
1723
  "per_subtask": {
1724
  "MD": {
1725
+ "accuracy": 0.9553072625698324,
1726
+ "count": 179
1727
  },
1728
  "MB": {
1729
+ "accuracy": 0.970873786407767,
1730
+ "count": 103
1731
  },
1732
  "ME": {
1733
+ "accuracy": 0.9464285714285714,
1734
+ "count": 56
1735
  },
1736
  "UB": {
1737
+ "accuracy": 0.4966442953020134,
1738
+ "count": 149
1739
  },
1740
  "UD": {
1741
+ "accuracy": 0.5633802816901409,
1742
+ "count": 213
1743
  }
1744
  }
1745
  },
1746
  "sub_M4": {
1747
+ "full_accuracy": 0.05,
1748
+ "digit_accuracy": 0.6385714285714286,
1749
+ "n_examples": 100,
1750
  "per_subtask": {
1751
  "MD": {
1752
+ "accuracy": 0.965,
1753
+ "count": 200
1754
  },
1755
  "MB": {
1756
+ "accuracy": 0.96,
1757
+ "count": 100
1758
  },
1759
  "UB": {
1760
+ "accuracy": 0.57,
1761
+ "count": 100
1762
  },
1763
  "UD": {
1764
+ "accuracy": 0.33666666666666667,
1765
+ "count": 300
1766
  }
1767
  }
1768
  },
1769
  "sub_M5": {
1770
  "full_accuracy": 0.04,
1771
+ "digit_accuracy": 0.5,
1772
+ "n_examples": 100,
1773
  "per_subtask": {
1774
  "MD": {
1775
  "accuracy": 1.0,
1776
+ "count": 100
1777
  },
1778
  "MB": {
1779
+ "accuracy": 0.96,
1780
+ "count": 100
1781
  },
1782
  "UB": {
1783
  "accuracy": 0.56,
1784
+ "count": 100
1785
  },
1786
  "UD": {
1787
+ "accuracy": 0.245,
1788
+ "count": 400
1789
  }
1790
  }
1791
  },
1792
  "sub_random": {
1793
+ "full_accuracy": 0.24,
1794
+ "digit_accuracy": 0.8371428571428572,
1795
  "n_examples": 200,
1796
  "per_subtask": {
1797
  "MD": {
1798
+ "accuracy": 0.9083333333333333,
1799
+ "count": 600
1800
  },
1801
  "MB": {
1802
+ "accuracy": 0.9363295880149812,
1803
+ "count": 267
1804
  },
1805
  "ME": {
1806
  "accuracy": 0.9811320754716981,
1807
  "count": 53
1808
  },
1809
  "UB": {
1810
+ "accuracy": 0.6583143507972665,
1811
+ "count": 439
1812
  },
1813
  "UD": {
1814
+ "accuracy": 0.8780487804878049,
1815
+ "count": 41
1816
  }
1817
  }
1818
  },
1819
  "sub_B3": {
1820
+ "full_accuracy": 0.13,
1821
+ "digit_accuracy": 0.8028571428571428,
1822
+ "n_examples": 100,
1823
  "per_subtask": {
1824
  "MD": {
1825
+ "accuracy": 0.94,
1826
+ "count": 300
1827
  },
1828
  "MB": {
1829
+ "accuracy": 0.95,
1830
+ "count": 100
1831
  },
1832
  "UB": {
1833
+ "accuracy": 0.5583756345177665,
1834
+ "count": 197
1835
  },
1836
  "UD": {
1837
+ "accuracy": 0.7281553398058253,
1838
+ "count": 103
1839
  }
1840
  }
1841
  },
1842
  "sub_B4": {
1843
+ "full_accuracy": 0.05,
1844
+ "digit_accuracy": 0.72,
1845
+ "n_examples": 100,
1846
  "per_subtask": {
1847
  "MD": {
1848
+ "accuracy": 0.955,
1849
+ "count": 200
1850
  },
1851
  "MB": {
1852
+ "accuracy": 0.96,
1853
+ "count": 100
1854
  },
1855
  "UB": {
1856
+ "accuracy": 0.48582995951417,
1857
+ "count": 247
1858
  },
1859
  "UD": {
1860
+ "accuracy": 0.6339869281045751,
1861
+ "count": 153
1862
  }
1863
  }
1864
  },
1865
  "sub_B5": {
1866
+ "full_accuracy": 0.05,
1867
+ "digit_accuracy": 0.6914285714285714,
1868
+ "n_examples": 100,
1869
  "per_subtask": {
1870
  "MD": {
1871
  "accuracy": 1.0,
1872
+ "count": 100
1873
  },
1874
  "MB": {
1875
  "accuracy": 1.0,
1876
+ "count": 100
1877
  },
1878
  "UB": {
1879
+ "accuracy": 0.49328859060402686,
1880
+ "count": 298
1881
  },
1882
  "UD": {
1883
+ "accuracy": 0.6782178217821783,
1884
+ "count": 202
1885
  }
1886
  }
1887
  }
1888
  },
1889
  "summary": {
1890
+ "overall_accuracy": 0.22615384615384615,
1891
+ "digit_accuracy": 0.7815384615384615,
1892
+ "total_examples": 2600,
1893
  "n_splits": 24
1894
  }
1895
  }
add_sub_baseline_50K_2L1H128d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f042fb5952a5a3d094f33db76e3792f60cc38e235d390fab794cf6dc4983dad2
3
  size 157692826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6aa50140e3dc999e70ccc8a0ebc11a0caa554693342acc49ad8f60da90d1dd5
3
  size 157692826
add_sub_baseline_50K_2L1H128d/train_config.json CHANGED
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 39346560,
71
  "run_name": "add_sub_baseline_50K_2L1H128d",
72
- "git_commit": "17e935f460a7f9595b705c1d614101a6b0e520f7",
73
- "timestamp": "2026-04-14T04:37:53.146117+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "gvx3cabo",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/gvx3cabo",
81
- "final_accuracy": 0.1925,
82
- "sft_accuracy": 0.1925,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
69
  "no_wandb": false,
70
  "n_params": 39346560,
71
  "run_name": "add_sub_baseline_50K_2L1H128d",
72
+ "git_commit": "f835493c19eb98267697007042c9d440cad2afbb",
73
+ "timestamp": "2026-04-15T09:22:35.671141+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_50K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": "wnsxa3xn",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/wnsxa3xn",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.22615384615384615,
86
+ "sft_accuracy": 0.22615384615384615,
87
  "eval_method": "ArithmeticEvaluator"
88
  }