amirali1985 commited on
Commit
0fedc0a
·
verified ·
1 Parent(s): 01261e1

Upload add_sub_baseline_25K_2L1H128d

Browse files
add_sub_baseline_25K_2L1H128d/metrics.json CHANGED
@@ -159,320 +159,320 @@
159
  7800
160
  ],
161
  "loss": [
162
- 11.851262092590332,
163
- 11.322190284729004,
164
- 10.84399700164795,
165
- 10.476851463317871,
166
- 10.189689636230469,
167
- 9.838882446289062,
168
- 9.510722160339355,
169
- 9.290719032287598,
170
- 8.962188720703125,
171
- 8.611163139343262,
172
- 8.361300468444824,
173
- 8.08813190460205,
174
- 7.842682838439941,
175
- 7.567698001861572,
176
- 7.269339561462402,
177
- 7.015201091766357,
178
- 6.760293006896973,
179
- 6.50231409072876,
180
- 6.1420512199401855,
181
- 5.9626617431640625,
182
- 5.677587032318115,
183
- 5.316855430603027,
184
- 5.1820387840271,
185
- 4.774725914001465,
186
- 4.56818962097168,
187
- 4.361102104187012,
188
- 4.119393825531006,
189
- 3.8857994079589844,
190
- 3.673340320587158,
191
- 3.468486785888672,
192
- 3.2346725463867188,
193
- 3.0731041431427,
194
- 2.8872439861297607,
195
- 2.8238704204559326,
196
- 2.6868231296539307,
197
- 2.5601580142974854,
198
- 2.5200085639953613,
199
- 2.4465887546539307,
200
- 2.3342435359954834,
201
- 2.310267686843872,
202
- 2.2022716999053955,
203
- 2.2503445148468018,
204
- 2.2016828060150146,
205
- 2.1673545837402344,
206
- 2.134101152420044,
207
- 2.205885887145996,
208
- 2.1625630855560303,
209
- 2.090986967086792,
210
- 2.111237049102783,
211
- 2.0769805908203125,
212
- 2.029428243637085,
213
- 2.0649588108062744,
214
- 2.00223970413208,
215
- 2.041292667388916,
216
- 1.9394997358322144,
217
- 1.9607492685317993,
218
- 1.9083858728408813,
219
- 1.9098879098892212,
220
- 1.9169260263442993,
221
- 1.9707139730453491,
222
- 1.8977491855621338,
223
- 1.9636948108673096,
224
- 1.8278611898422241,
225
- 1.8901598453521729,
226
- 1.8808103799819946,
227
- 1.8042938709259033,
228
- 1.8307939767837524,
229
- 1.7672587633132935,
230
- 1.8975050449371338,
231
- 1.8720804452896118,
232
- 1.8167418241500854,
233
- 1.870581865310669,
234
- 1.6975057125091553,
235
- 1.7767753601074219,
236
- 1.7748700380325317,
237
- 1.65091073513031,
238
- 1.70918869972229,
239
- 1.8009940385818481,
240
- 1.6945215463638306,
241
- 1.7233848571777344,
242
- 1.6666440963745117,
243
- 1.6989432573318481,
244
- 1.657471776008606,
245
- 1.6441662311553955,
246
- 1.597665548324585,
247
- 1.5535640716552734,
248
- 1.5122039318084717,
249
- 1.5779234170913696,
250
- 1.5203800201416016,
251
- 1.5966535806655884,
252
- 1.5483477115631104,
253
- 1.4608439207077026,
254
- 1.493105173110962,
255
- 1.486636996269226,
256
- 1.4677892923355103,
257
- 1.4958831071853638,
258
- 1.5283870697021484,
259
- 1.3748594522476196,
260
- 1.3993589878082275,
261
- 1.444647192955017,
262
- 1.4109257459640503,
263
- 1.4192183017730713,
264
- 1.3186607360839844,
265
- 1.314361572265625,
266
- 1.3938757181167603,
267
- 1.3351393938064575,
268
- 1.3570326566696167,
269
- 1.3236685991287231,
270
- 1.376649022102356,
271
- 1.3282345533370972,
272
- 1.3094770908355713,
273
- 1.299924612045288,
274
- 1.3394296169281006,
275
- 1.2237088680267334,
276
- 1.267302393913269,
277
- 1.2889286279678345,
278
- 1.2387006282806396,
279
- 1.2579516172409058,
280
- 1.2616617679595947,
281
- 1.2706013917922974,
282
- 1.273461103439331,
283
- 1.2665029764175415,
284
- 1.3155572414398193,
285
- 1.2146965265274048,
286
- 1.2717746496200562,
287
- 1.2832603454589844,
288
- 1.2183363437652588,
289
- 1.2612063884735107,
290
- 1.2248671054840088,
291
- 1.2440804243087769,
292
- 1.26742684841156,
293
- 1.2262872457504272,
294
- 1.2662094831466675,
295
- 1.2455024719238281,
296
- 1.2313419580459595,
297
- 1.251754641532898,
298
- 1.2320165634155273,
299
- 1.2252769470214844,
300
- 1.2007458209991455,
301
- 1.2372609376907349,
302
- 1.20767080783844,
303
- 1.2022647857666016,
304
- 1.2439204454421997,
305
- 1.2493950128555298,
306
- 1.1735715866088867,
307
- 1.222267508506775,
308
- 1.2459700107574463,
309
- 1.2111799716949463,
310
- 1.1730115413665771,
311
- 1.1799018383026123,
312
- 1.217897653579712,
313
- 1.2276479005813599,
314
- 1.2698315382003784,
315
- 1.2276161909103394,
316
- 1.2415111064910889,
317
- 1.2021055221557617
318
  ],
319
  "base_loss": [
320
- 11.851262092590332,
321
- 11.322190284729004,
322
- 10.84399700164795,
323
- 10.476851463317871,
324
- 10.189689636230469,
325
- 9.838882446289062,
326
- 9.510722160339355,
327
- 9.290719032287598,
328
- 8.962188720703125,
329
- 8.611163139343262,
330
- 8.361300468444824,
331
- 8.08813190460205,
332
- 7.842682838439941,
333
- 7.567698001861572,
334
- 7.269339561462402,
335
- 7.015201091766357,
336
- 6.760293006896973,
337
- 6.50231409072876,
338
- 6.1420512199401855,
339
- 5.9626617431640625,
340
- 5.677587032318115,
341
- 5.316855430603027,
342
- 5.1820387840271,
343
- 4.774725914001465,
344
- 4.56818962097168,
345
- 4.361102104187012,
346
- 4.119393825531006,
347
- 3.8857994079589844,
348
- 3.673340320587158,
349
- 3.468486785888672,
350
- 3.2346725463867188,
351
- 3.0731041431427,
352
- 2.8872439861297607,
353
- 2.8238704204559326,
354
- 2.6868231296539307,
355
- 2.5601580142974854,
356
- 2.5200085639953613,
357
- 2.4465887546539307,
358
- 2.3342435359954834,
359
- 2.310267686843872,
360
- 2.2022716999053955,
361
- 2.2503445148468018,
362
- 2.2016828060150146,
363
- 2.1673545837402344,
364
- 2.134101152420044,
365
- 2.205885887145996,
366
- 2.1625630855560303,
367
- 2.090986967086792,
368
- 2.111237049102783,
369
- 2.0769805908203125,
370
- 2.029428243637085,
371
- 2.0649588108062744,
372
- 2.00223970413208,
373
- 2.041292667388916,
374
- 1.9394997358322144,
375
- 1.9607492685317993,
376
- 1.9083858728408813,
377
- 1.9098879098892212,
378
- 1.9169260263442993,
379
- 1.9707139730453491,
380
- 1.8977491855621338,
381
- 1.9636948108673096,
382
- 1.8278611898422241,
383
- 1.8901598453521729,
384
- 1.8808103799819946,
385
- 1.8042938709259033,
386
- 1.8307939767837524,
387
- 1.7672587633132935,
388
- 1.8975050449371338,
389
- 1.8720804452896118,
390
- 1.8167418241500854,
391
- 1.870581865310669,
392
- 1.6975057125091553,
393
- 1.7767753601074219,
394
- 1.7748700380325317,
395
- 1.65091073513031,
396
- 1.70918869972229,
397
- 1.8009940385818481,
398
- 1.6945215463638306,
399
- 1.7233848571777344,
400
- 1.6666440963745117,
401
- 1.6989432573318481,
402
- 1.657471776008606,
403
- 1.6441662311553955,
404
- 1.597665548324585,
405
- 1.5535640716552734,
406
- 1.5122039318084717,
407
- 1.5779234170913696,
408
- 1.5203800201416016,
409
- 1.5966535806655884,
410
- 1.5483477115631104,
411
- 1.4608439207077026,
412
- 1.493105173110962,
413
- 1.486636996269226,
414
- 1.4677892923355103,
415
- 1.4958831071853638,
416
- 1.5283870697021484,
417
- 1.3748594522476196,
418
- 1.3993589878082275,
419
- 1.444647192955017,
420
- 1.4109257459640503,
421
- 1.4192183017730713,
422
- 1.3186607360839844,
423
- 1.314361572265625,
424
- 1.3938757181167603,
425
- 1.3351393938064575,
426
- 1.3570326566696167,
427
- 1.3236685991287231,
428
- 1.376649022102356,
429
- 1.3282345533370972,
430
- 1.3094770908355713,
431
- 1.299924612045288,
432
- 1.3394296169281006,
433
- 1.2237088680267334,
434
- 1.267302393913269,
435
- 1.2889286279678345,
436
- 1.2387006282806396,
437
- 1.2579516172409058,
438
- 1.2616617679595947,
439
- 1.2706013917922974,
440
- 1.273461103439331,
441
- 1.2665029764175415,
442
- 1.3155572414398193,
443
- 1.2146965265274048,
444
- 1.2717746496200562,
445
- 1.2832603454589844,
446
- 1.2183363437652588,
447
- 1.2612063884735107,
448
- 1.2248671054840088,
449
- 1.2440804243087769,
450
- 1.26742684841156,
451
- 1.2262872457504272,
452
- 1.2662094831466675,
453
- 1.2455024719238281,
454
- 1.2313419580459595,
455
- 1.251754641532898,
456
- 1.2320165634155273,
457
- 1.2252769470214844,
458
- 1.2007458209991455,
459
- 1.2372609376907349,
460
- 1.20767080783844,
461
- 1.2022647857666016,
462
- 1.2439204454421997,
463
- 1.2493950128555298,
464
- 1.1735715866088867,
465
- 1.222267508506775,
466
- 1.2459700107574463,
467
- 1.2111799716949463,
468
- 1.1730115413665771,
469
- 1.1799018383026123,
470
- 1.217897653579712,
471
- 1.2276479005813599,
472
- 1.2698315382003784,
473
- 1.2276161909103394,
474
- 1.2415111064910889,
475
- 1.2021055221557617
476
  ],
477
  "lr": [
478
  4.188034188034189e-06,
@@ -682,590 +682,590 @@
682
  0.0,
683
  0.0,
684
  0.0,
 
 
685
  0.0,
686
  0.0,
687
- 0.0,
688
- 0.0033333333333333335,
689
- 0.006666666666666667,
690
- 0.008888888888888889,
691
- 0.01,
692
- 0.015555555555555555,
693
- 0.02666666666666667,
694
- 0.027777777777777776,
695
- 0.03,
696
- 0.04,
697
- 0.03333333333333333,
698
- 0.04111111111111111,
699
- 0.03777777777777778
700
  ]
701
  },
702
- "final_accuracy": 0.03791666666666667,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
- "n_per_split": 50
710
  },
711
  "splits": {
712
  "add_S0": {
713
- "full_accuracy": 0.04,
714
- "digit_accuracy": 0.6485714285714286,
715
- "n_examples": 50,
716
  "per_subtask": {
717
  "SA": {
718
- "accuracy": 0.6372881355932203,
719
- "count": 295
720
  },
721
  "SS": {
722
- "accuracy": 0.7090909090909091,
723
- "count": 55
724
  }
725
  }
726
  },
727
  "add_S1": {
728
- "full_accuracy": 0.0,
729
- "digit_accuracy": 0.5885714285714285,
730
- "n_examples": 50,
731
  "per_subtask": {
732
  "SA": {
733
- "accuracy": 0.6349206349206349,
734
- "count": 126
735
  },
736
  "SC": {
737
- "accuracy": 0.620253164556962,
738
- "count": 79
739
  },
740
  "SS": {
741
- "accuracy": 0.6666666666666666,
742
- "count": 21
743
  },
744
  "UC": {
745
- "accuracy": 0.5080645161290323,
746
- "count": 124
747
  }
748
  }
749
  },
750
  "add_S2": {
751
- "full_accuracy": 0.02,
752
- "digit_accuracy": 0.56,
753
- "n_examples": 50,
754
  "per_subtask": {
755
  "SA": {
756
- "accuracy": 0.6,
757
- "count": 75
758
  },
759
  "SC": {
760
- "accuracy": 0.46774193548387094,
761
- "count": 62
762
  },
763
  "SS": {
764
- "accuracy": 0.46153846153846156,
765
- "count": 39
766
  },
767
  "UC": {
768
- "accuracy": 0.5405405405405406,
769
- "count": 111
770
  },
771
  "US": {
772
- "accuracy": 0.6984126984126984,
773
- "count": 63
774
  }
775
  }
776
  },
777
  "add_S3": {
778
- "full_accuracy": 0.02,
779
- "digit_accuracy": 0.6085714285714285,
780
- "n_examples": 50,
781
  "per_subtask": {
782
  "SA": {
783
- "accuracy": 0.6166666666666667,
784
- "count": 60
785
  },
786
  "SC": {
787
- "accuracy": 0.5964912280701754,
788
- "count": 57
789
  },
790
  "SS": {
791
- "accuracy": 0.42105263157894735,
792
- "count": 19
793
  },
794
  "UC": {
795
- "accuracy": 0.47115384615384615,
796
- "count": 104
797
  },
798
  "US": {
799
- "accuracy": 0.7727272727272727,
800
- "count": 110
801
  }
802
  }
803
  },
804
  "add_S4": {
805
- "full_accuracy": 0.08,
806
- "digit_accuracy": 0.5942857142857143,
807
- "n_examples": 50,
808
  "per_subtask": {
809
  "SA": {
810
- "accuracy": 0.6458333333333334,
811
- "count": 48
812
  },
813
  "SC": {
814
- "accuracy": 0.6730769230769231,
815
- "count": 52
816
  },
817
  "SS": {
818
- "accuracy": 0.8571428571428571,
819
- "count": 7
820
  },
821
  "UC": {
822
- "accuracy": 0.5056179775280899,
823
- "count": 89
824
  },
825
  "US": {
826
- "accuracy": 0.5909090909090909,
827
- "count": 154
828
  }
829
  }
830
  },
831
  "add_S5": {
832
- "full_accuracy": 0.16,
833
- "digit_accuracy": 0.4685714285714286,
834
- "n_examples": 50,
835
  "per_subtask": {
836
  "SA": {
837
- "accuracy": 0.7,
838
- "count": 50
839
  },
840
  "SC": {
841
- "accuracy": 0.7,
842
- "count": 50
843
  },
844
  "UC": {
845
- "accuracy": 0.38,
846
- "count": 50
847
  },
848
  "US": {
849
- "accuracy": 0.375,
850
- "count": 200
851
  }
852
  }
853
  },
854
  "add_S6": {
855
- "full_accuracy": 0.28,
856
- "digit_accuracy": 0.3942857142857143,
857
- "n_examples": 50,
858
  "per_subtask": {
859
  "SC": {
860
- "accuracy": 0.5,
861
- "count": 50
862
  },
863
  "UC": {
864
- "accuracy": 0.34,
865
- "count": 50
866
  },
867
  "US": {
868
- "accuracy": 0.384,
869
- "count": 250
870
  }
871
  }
872
  },
873
  "add_random": {
874
- "full_accuracy": 0.02,
875
- "digit_accuracy": 0.5921428571428572,
876
  "n_examples": 200,
877
  "per_subtask": {
878
  "SA": {
879
- "accuracy": 0.5986078886310905,
880
- "count": 431
881
  },
882
  "SC": {
883
- "accuracy": 0.6360759493670886,
884
- "count": 316
885
  },
886
  "SS": {
887
- "accuracy": 0.7948717948717948,
888
- "count": 39
889
  },
890
  "UC": {
891
- "accuracy": 0.5232142857142857,
892
- "count": 560
893
  },
894
  "US": {
895
- "accuracy": 0.8518518518518519,
896
- "count": 54
897
  }
898
  }
899
  },
900
  "add_C1": {
901
  "full_accuracy": 0.0,
902
- "digit_accuracy": 0.5714285714285714,
903
- "n_examples": 50,
904
  "per_subtask": {
905
  "SA": {
906
- "accuracy": 0.596,
907
- "count": 250
908
  },
909
  "SC": {
910
- "accuracy": 0.62,
911
- "count": 50
912
  },
913
  "UC": {
914
- "accuracy": 0.4,
915
- "count": 50
916
  }
917
  }
918
  },
919
  "add_C2": {
920
- "full_accuracy": 0.02,
921
- "digit_accuracy": 0.5628571428571428,
922
- "n_examples": 50,
923
  "per_subtask": {
924
  "SA": {
925
- "accuracy": 0.64,
926
- "count": 200
927
  },
928
  "SC": {
929
- "accuracy": 0.54,
930
- "count": 50
931
  },
932
  "UC": {
933
- "accuracy": 0.3855421686746988,
934
- "count": 83
935
  },
936
  "US": {
937
- "accuracy": 0.5882352941176471,
938
- "count": 17
939
  }
940
  }
941
  },
942
  "add_C3": {
943
  "full_accuracy": 0.0,
944
- "digit_accuracy": 0.5428571428571428,
945
- "n_examples": 50,
946
  "per_subtask": {
947
  "SA": {
948
- "accuracy": 0.64,
949
- "count": 150
950
  },
951
  "SC": {
952
- "accuracy": 0.56,
953
- "count": 50
954
  },
955
  "UC": {
956
- "accuracy": 0.35,
957
- "count": 100
958
  },
959
  "US": {
960
- "accuracy": 0.62,
961
- "count": 50
962
  }
963
  }
964
  },
965
  "add_C4": {
966
- "full_accuracy": 0.06,
967
- "digit_accuracy": 0.5971428571428572,
968
- "n_examples": 50,
969
  "per_subtask": {
970
  "SA": {
971
- "accuracy": 0.71,
972
- "count": 100
973
  },
974
  "SC": {
975
- "accuracy": 0.66,
976
- "count": 50
977
  },
978
  "UC": {
979
- "accuracy": 0.4090909090909091,
980
- "count": 132
981
  },
982
  "US": {
983
- "accuracy": 0.75,
984
- "count": 68
985
  }
986
  }
987
  },
988
  "add_C5": {
989
- "full_accuracy": 0.02,
990
- "digit_accuracy": 0.6428571428571429,
991
- "n_examples": 50,
992
  "per_subtask": {
993
  "SA": {
994
- "accuracy": 0.68,
995
- "count": 50
996
  },
997
  "SC": {
998
- "accuracy": 0.68,
999
- "count": 50
1000
  },
1001
  "UC": {
1002
- "accuracy": 0.4520547945205479,
1003
- "count": 146
1004
  },
1005
  "US": {
1006
- "accuracy": 0.875,
1007
- "count": 104
1008
  }
1009
  }
1010
  },
1011
  "add_C6": {
1012
  "full_accuracy": 0.0,
1013
- "digit_accuracy": 0.5657142857142857,
1014
- "n_examples": 50,
1015
  "per_subtask": {
1016
  "SC": {
1017
- "accuracy": 0.66,
1018
- "count": 50
1019
  },
1020
  "UC": {
1021
- "accuracy": 0.5026455026455027,
1022
- "count": 189
1023
  },
1024
  "US": {
1025
- "accuracy": 0.6306306306306306,
1026
- "count": 111
1027
  }
1028
  }
1029
  },
1030
  "sub_M0": {
1031
- "full_accuracy": 0.04,
1032
- "digit_accuracy": 0.6571428571428571,
1033
- "n_examples": 50,
1034
  "per_subtask": {
1035
  "MD": {
1036
- "accuracy": 0.6039603960396039,
1037
- "count": 303
1038
  },
1039
  "ME": {
1040
- "accuracy": 1.0,
1041
- "count": 47
1042
  }
1043
  }
1044
  },
1045
  "sub_M1": {
1046
- "full_accuracy": 0.0,
1047
- "digit_accuracy": 0.6542857142857142,
1048
- "n_examples": 50,
1049
  "per_subtask": {
1050
  "MD": {
1051
- "accuracy": 0.7092198581560284,
1052
- "count": 141
1053
  },
1054
  "MB": {
1055
- "accuracy": 0.7083333333333334,
1056
- "count": 72
1057
  },
1058
  "ME": {
1059
- "accuracy": 0.9444444444444444,
1060
- "count": 18
1061
  },
1062
  "UB": {
1063
- "accuracy": 0.5126050420168067,
1064
- "count": 119
1065
  }
1066
  }
1067
  },
1068
  "sub_M2": {
1069
- "full_accuracy": 0.04,
1070
- "digit_accuracy": 0.6828571428571428,
1071
- "n_examples": 50,
1072
  "per_subtask": {
1073
  "MD": {
1074
- "accuracy": 0.8214285714285714,
1075
- "count": 112
1076
  },
1077
  "MB": {
1078
- "accuracy": 0.7169811320754716,
1079
- "count": 53
1080
  },
1081
  "ME": {
1082
- "accuracy": 0.9148936170212766,
1083
- "count": 47
1084
  },
1085
  "UB": {
1086
- "accuracy": 0.4470588235294118,
1087
- "count": 85
1088
  },
1089
  "UD": {
1090
- "accuracy": 0.5283018867924528,
1091
- "count": 53
1092
  }
1093
  }
1094
  },
1095
  "sub_M3": {
1096
  "full_accuracy": 0.0,
1097
- "digit_accuracy": 0.5085714285714286,
1098
- "n_examples": 50,
1099
  "per_subtask": {
1100
  "MD": {
1101
- "accuracy": 0.8144329896907216,
1102
- "count": 97
1103
  },
1104
  "MB": {
1105
- "accuracy": 0.6274509803921569,
1106
- "count": 51
1107
  },
1108
  "ME": {
1109
- "accuracy": 1.0,
1110
- "count": 27
1111
  },
1112
  "UB": {
1113
- "accuracy": 0.5135135135135135,
1114
- "count": 74
1115
  },
1116
  "UD": {
1117
- "accuracy": 0.019801980198019802,
1118
- "count": 101
1119
  }
1120
  }
1121
  },
1122
  "sub_M4": {
1123
  "full_accuracy": 0.0,
1124
- "digit_accuracy": 0.3485714285714286,
1125
- "n_examples": 50,
1126
  "per_subtask": {
1127
  "MD": {
1128
- "accuracy": 0.64,
1129
- "count": 100
1130
  },
1131
  "MB": {
1132
- "accuracy": 0.58,
1133
- "count": 50
1134
  },
1135
  "UB": {
1136
  "accuracy": 0.56,
1137
- "count": 50
1138
  },
1139
  "UD": {
1140
- "accuracy": 0.006666666666666667,
1141
- "count": 150
1142
  }
1143
  }
1144
  },
1145
  "sub_M5": {
1146
  "full_accuracy": 0.0,
1147
- "digit_accuracy": 0.34,
1148
- "n_examples": 50,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
- "count": 50
1153
  },
1154
  "MB": {
1155
- "accuracy": 0.74,
1156
- "count": 50
1157
  },
1158
  "UB": {
1159
- "accuracy": 0.64,
1160
- "count": 50
1161
  },
1162
  "UD": {
1163
  "accuracy": 0.0,
1164
- "count": 200
1165
  }
1166
  }
1167
  },
1168
  "sub_random": {
1169
- "full_accuracy": 0.035,
1170
- "digit_accuracy": 0.6064285714285714,
1171
  "n_examples": 200,
1172
  "per_subtask": {
1173
  "MD": {
1174
- "accuracy": 0.6543859649122807,
1175
- "count": 570
1176
  },
1177
  "MB": {
1178
- "accuracy": 0.7075812274368231,
1179
- "count": 277
1180
  },
1181
  "ME": {
1182
- "accuracy": 0.9245283018867925,
1183
  "count": 53
1184
  },
1185
  "UB": {
1186
- "accuracy": 0.46709129511677283,
1187
- "count": 471
1188
  },
1189
  "UD": {
1190
- "accuracy": 0.3793103448275862,
1191
- "count": 29
1192
  }
1193
  }
1194
  },
1195
  "sub_B3": {
1196
- "full_accuracy": 0.02,
1197
- "digit_accuracy": 0.5485714285714286,
1198
- "n_examples": 50,
1199
  "per_subtask": {
1200
  "MD": {
1201
- "accuracy": 0.6333333333333333,
1202
- "count": 150
1203
  },
1204
  "MB": {
1205
- "accuracy": 0.82,
1206
- "count": 50
1207
  },
1208
  "UB": {
1209
- "accuracy": 0.45544554455445546,
1210
- "count": 101
1211
  },
1212
  "UD": {
1213
- "accuracy": 0.20408163265306123,
1214
- "count": 49
1215
  }
1216
  }
1217
  },
1218
  "sub_B4": {
1219
  "full_accuracy": 0.0,
1220
- "digit_accuracy": 0.52,
1221
- "n_examples": 50,
1222
  "per_subtask": {
1223
  "MD": {
1224
- "accuracy": 0.77,
1225
- "count": 100
1226
  },
1227
  "MB": {
1228
- "accuracy": 0.78,
1229
- "count": 50
1230
  },
1231
  "UB": {
1232
- "accuracy": 0.4214876033057851,
1233
- "count": 121
1234
  },
1235
  "UD": {
1236
- "accuracy": 0.189873417721519,
1237
- "count": 79
1238
  }
1239
  }
1240
  },
1241
  "sub_B5": {
1242
  "full_accuracy": 0.0,
1243
- "digit_accuracy": 0.5057142857142857,
1244
- "n_examples": 50,
1245
  "per_subtask": {
1246
  "MD": {
1247
  "accuracy": 1.0,
1248
- "count": 50
1249
  },
1250
  "MB": {
1251
- "accuracy": 0.76,
1252
- "count": 50
1253
  },
1254
  "UB": {
1255
- "accuracy": 0.46710526315789475,
1256
- "count": 152
1257
  },
1258
  "UD": {
1259
- "accuracy": 0.1836734693877551,
1260
- "count": 98
1261
  }
1262
  }
1263
  }
1264
  },
1265
  "summary": {
1266
- "overall_accuracy": 0.034,
1267
- "digit_accuracy": 0.5633333333333334,
1268
- "total_examples": 1500,
1269
  "n_splits": 24
1270
  }
1271
  }
 
159
  7800
160
  ],
161
  "loss": [
162
+ 11.832879066467285,
163
+ 11.36864185333252,
164
+ 10.761643409729004,
165
+ 10.459257125854492,
166
+ 10.175871849060059,
167
+ 9.816800117492676,
168
+ 9.529030799865723,
169
+ 9.260773658752441,
170
+ 9.017507553100586,
171
+ 8.663485527038574,
172
+ 8.371795654296875,
173
+ 8.118546485900879,
174
+ 7.833754539489746,
175
+ 7.534058570861816,
176
+ 7.297992706298828,
177
+ 6.95162296295166,
178
+ 6.682138919830322,
179
+ 6.448472023010254,
180
+ 6.186150550842285,
181
+ 5.851781368255615,
182
+ 5.642083168029785,
183
+ 5.3920135498046875,
184
+ 5.093167781829834,
185
+ 4.796841621398926,
186
+ 4.534151554107666,
187
+ 4.4100446701049805,
188
+ 4.1128153800964355,
189
+ 3.8507931232452393,
190
+ 3.6341888904571533,
191
+ 3.4420320987701416,
192
+ 3.2039170265197754,
193
+ 3.1796298027038574,
194
+ 2.970296621322632,
195
+ 2.8707118034362793,
196
+ 2.7452502250671387,
197
+ 2.64337158203125,
198
+ 2.503178119659424,
199
+ 2.4706099033355713,
200
+ 2.370277166366577,
201
+ 2.268285036087036,
202
+ 2.318305492401123,
203
+ 2.2794229984283447,
204
+ 2.149564504623413,
205
+ 2.1994822025299072,
206
+ 2.2138619422912598,
207
+ 2.154664993286133,
208
+ 2.1253409385681152,
209
+ 2.104158401489258,
210
+ 2.0960659980773926,
211
+ 2.066467523574829,
212
+ 2.087989568710327,
213
+ 2.050243616104126,
214
+ 1.993398666381836,
215
+ 2.037946939468384,
216
+ 1.9801470041275024,
217
+ 2.0205399990081787,
218
+ 2.0452141761779785,
219
+ 1.938254475593567,
220
+ 1.964411973953247,
221
+ 1.9696029424667358,
222
+ 1.9631099700927734,
223
+ 1.9499931335449219,
224
+ 1.9534265995025635,
225
+ 1.930572271347046,
226
+ 1.9394501447677612,
227
+ 1.9909099340438843,
228
+ 1.8681749105453491,
229
+ 1.8672945499420166,
230
+ 1.9460028409957886,
231
+ 1.8758708238601685,
232
+ 1.8508617877960205,
233
+ 1.8957277536392212,
234
+ 1.89703369140625,
235
+ 1.8883601427078247,
236
+ 1.9217368364334106,
237
+ 1.8644115924835205,
238
+ 1.8522149324417114,
239
+ 1.8996679782867432,
240
+ 1.7724077701568604,
241
+ 1.8279880285263062,
242
+ 1.8713520765304565,
243
+ 1.7754944562911987,
244
+ 1.8203290700912476,
245
+ 1.8123414516448975,
246
+ 1.718186616897583,
247
+ 1.7727686166763306,
248
+ 1.7215014696121216,
249
+ 1.6721407175064087,
250
+ 1.7613623142242432,
251
+ 1.7179336547851562,
252
+ 1.773058533668518,
253
+ 1.7030664682388306,
254
+ 1.6822694540023804,
255
+ 1.708242654800415,
256
+ 1.6574541330337524,
257
+ 1.7228068113327026,
258
+ 1.6863057613372803,
259
+ 1.7814959287643433,
260
+ 1.6393948793411255,
261
+ 1.602840781211853,
262
+ 1.6753469705581665,
263
+ 1.6772491931915283,
264
+ 1.6624025106430054,
265
+ 1.6662027835845947,
266
+ 1.6071633100509644,
267
+ 1.6065205335617065,
268
+ 1.6259610652923584,
269
+ 1.6235558986663818,
270
+ 1.545785665512085,
271
+ 1.5571601390838623,
272
+ 1.6024692058563232,
273
+ 1.5312445163726807,
274
+ 1.5064446926116943,
275
+ 1.5606070756912231,
276
+ 1.5643519163131714,
277
+ 1.505636215209961,
278
+ 1.542928695678711,
279
+ 1.5629281997680664,
280
+ 1.5619080066680908,
281
+ 1.5639537572860718,
282
+ 1.4565213918685913,
283
+ 1.4847272634506226,
284
+ 1.502737045288086,
285
+ 1.4859784841537476,
286
+ 1.4928518533706665,
287
+ 1.4829928874969482,
288
+ 1.4601913690567017,
289
+ 1.531474232673645,
290
+ 1.4434651136398315,
291
+ 1.5020925998687744,
292
+ 1.47160804271698,
293
+ 1.4648197889328003,
294
+ 1.5810593366622925,
295
+ 1.4178014993667603,
296
+ 1.4882278442382812,
297
+ 1.4169243574142456,
298
+ 1.434967279434204,
299
+ 1.4791187047958374,
300
+ 1.4924046993255615,
301
+ 1.464010238647461,
302
+ 1.355211853981018,
303
+ 1.4422768354415894,
304
+ 1.4772409200668335,
305
+ 1.4855642318725586,
306
+ 1.3914214372634888,
307
+ 1.5317624807357788,
308
+ 1.4767916202545166,
309
+ 1.5525057315826416,
310
+ 1.4267528057098389,
311
+ 1.3921186923980713,
312
+ 1.4735260009765625,
313
+ 1.4076300859451294,
314
+ 1.4896430969238281,
315
+ 1.440387487411499,
316
+ 1.436928391456604,
317
+ 1.4201313257217407
318
  ],
319
  "base_loss": [
320
+ 11.832879066467285,
321
+ 11.36864185333252,
322
+ 10.761643409729004,
323
+ 10.459257125854492,
324
+ 10.175871849060059,
325
+ 9.816800117492676,
326
+ 9.529030799865723,
327
+ 9.260773658752441,
328
+ 9.017507553100586,
329
+ 8.663485527038574,
330
+ 8.371795654296875,
331
+ 8.118546485900879,
332
+ 7.833754539489746,
333
+ 7.534058570861816,
334
+ 7.297992706298828,
335
+ 6.95162296295166,
336
+ 6.682138919830322,
337
+ 6.448472023010254,
338
+ 6.186150550842285,
339
+ 5.851781368255615,
340
+ 5.642083168029785,
341
+ 5.3920135498046875,
342
+ 5.093167781829834,
343
+ 4.796841621398926,
344
+ 4.534151554107666,
345
+ 4.4100446701049805,
346
+ 4.1128153800964355,
347
+ 3.8507931232452393,
348
+ 3.6341888904571533,
349
+ 3.4420320987701416,
350
+ 3.2039170265197754,
351
+ 3.1796298027038574,
352
+ 2.970296621322632,
353
+ 2.8707118034362793,
354
+ 2.7452502250671387,
355
+ 2.64337158203125,
356
+ 2.503178119659424,
357
+ 2.4706099033355713,
358
+ 2.370277166366577,
359
+ 2.268285036087036,
360
+ 2.318305492401123,
361
+ 2.2794229984283447,
362
+ 2.149564504623413,
363
+ 2.1994822025299072,
364
+ 2.2138619422912598,
365
+ 2.154664993286133,
366
+ 2.1253409385681152,
367
+ 2.104158401489258,
368
+ 2.0960659980773926,
369
+ 2.066467523574829,
370
+ 2.087989568710327,
371
+ 2.050243616104126,
372
+ 1.993398666381836,
373
+ 2.037946939468384,
374
+ 1.9801470041275024,
375
+ 2.0205399990081787,
376
+ 2.0452141761779785,
377
+ 1.938254475593567,
378
+ 1.964411973953247,
379
+ 1.9696029424667358,
380
+ 1.9631099700927734,
381
+ 1.9499931335449219,
382
+ 1.9534265995025635,
383
+ 1.930572271347046,
384
+ 1.9394501447677612,
385
+ 1.9909099340438843,
386
+ 1.8681749105453491,
387
+ 1.8672945499420166,
388
+ 1.9460028409957886,
389
+ 1.8758708238601685,
390
+ 1.8508617877960205,
391
+ 1.8957277536392212,
392
+ 1.89703369140625,
393
+ 1.8883601427078247,
394
+ 1.9217368364334106,
395
+ 1.8644115924835205,
396
+ 1.8522149324417114,
397
+ 1.8996679782867432,
398
+ 1.7724077701568604,
399
+ 1.8279880285263062,
400
+ 1.8713520765304565,
401
+ 1.7754944562911987,
402
+ 1.8203290700912476,
403
+ 1.8123414516448975,
404
+ 1.718186616897583,
405
+ 1.7727686166763306,
406
+ 1.7215014696121216,
407
+ 1.6721407175064087,
408
+ 1.7613623142242432,
409
+ 1.7179336547851562,
410
+ 1.773058533668518,
411
+ 1.7030664682388306,
412
+ 1.6822694540023804,
413
+ 1.708242654800415,
414
+ 1.6574541330337524,
415
+ 1.7228068113327026,
416
+ 1.6863057613372803,
417
+ 1.7814959287643433,
418
+ 1.6393948793411255,
419
+ 1.602840781211853,
420
+ 1.6753469705581665,
421
+ 1.6772491931915283,
422
+ 1.6624025106430054,
423
+ 1.6662027835845947,
424
+ 1.6071633100509644,
425
+ 1.6065205335617065,
426
+ 1.6259610652923584,
427
+ 1.6235558986663818,
428
+ 1.545785665512085,
429
+ 1.5571601390838623,
430
+ 1.6024692058563232,
431
+ 1.5312445163726807,
432
+ 1.5064446926116943,
433
+ 1.5606070756912231,
434
+ 1.5643519163131714,
435
+ 1.505636215209961,
436
+ 1.542928695678711,
437
+ 1.5629281997680664,
438
+ 1.5619080066680908,
439
+ 1.5639537572860718,
440
+ 1.4565213918685913,
441
+ 1.4847272634506226,
442
+ 1.502737045288086,
443
+ 1.4859784841537476,
444
+ 1.4928518533706665,
445
+ 1.4829928874969482,
446
+ 1.4601913690567017,
447
+ 1.531474232673645,
448
+ 1.4434651136398315,
449
+ 1.5020925998687744,
450
+ 1.47160804271698,
451
+ 1.4648197889328003,
452
+ 1.5810593366622925,
453
+ 1.4178014993667603,
454
+ 1.4882278442382812,
455
+ 1.4169243574142456,
456
+ 1.434967279434204,
457
+ 1.4791187047958374,
458
+ 1.4924046993255615,
459
+ 1.464010238647461,
460
+ 1.355211853981018,
461
+ 1.4422768354415894,
462
+ 1.4772409200668335,
463
+ 1.4855642318725586,
464
+ 1.3914214372634888,
465
+ 1.5317624807357788,
466
+ 1.4767916202545166,
467
+ 1.5525057315826416,
468
+ 1.4267528057098389,
469
+ 1.3921186923980713,
470
+ 1.4735260009765625,
471
+ 1.4076300859451294,
472
+ 1.4896430969238281,
473
+ 1.440387487411499,
474
+ 1.436928391456604,
475
+ 1.4201313257217407
476
  ],
477
  "lr": [
478
  4.188034188034189e-06,
 
682
  0.0,
683
  0.0,
684
  0.0,
685
+ 0.003157894736842105,
686
+ 0.0010526315789473684,
687
  0.0,
688
  0.0,
689
+ 0.0010526315789473684,
690
+ 0.002105263157894737,
691
+ 0.010526315789473684,
692
+ 0.00631578947368421,
693
+ 0.002105263157894737,
694
+ 0.004210526315789474,
695
+ 0.010526315789473684,
696
+ 0.008421052631578947,
697
+ 0.011578947368421053,
698
+ 0.009473684210526316,
699
+ 0.009473684210526316
 
 
700
  ]
701
  },
702
+ "final_accuracy": 0.011153846153846153,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
+ "n_per_split": 100
710
  },
711
  "splits": {
712
  "add_S0": {
713
+ "full_accuracy": 0.01,
714
+ "digit_accuracy": 0.49142857142857144,
715
+ "n_examples": 100,
716
  "per_subtask": {
717
  "SA": {
718
+ "accuracy": 0.4396694214876033,
719
+ "count": 605
720
  },
721
  "SS": {
722
+ "accuracy": 0.8210526315789474,
723
+ "count": 95
724
  }
725
  }
726
  },
727
  "add_S1": {
728
+ "full_accuracy": 0.01,
729
+ "digit_accuracy": 0.4471428571428571,
730
+ "n_examples": 100,
731
  "per_subtask": {
732
  "SA": {
733
+ "accuracy": 0.5245098039215687,
734
+ "count": 204
735
  },
736
  "SC": {
737
+ "accuracy": 0.33136094674556216,
738
+ "count": 169
739
  },
740
  "SS": {
741
+ "accuracy": 0.6129032258064516,
742
+ "count": 31
743
  },
744
  "UC": {
745
+ "accuracy": 0.44256756756756754,
746
+ "count": 296
747
  }
748
  }
749
  },
750
  "add_S2": {
751
+ "full_accuracy": 0.0,
752
+ "digit_accuracy": 0.4957142857142857,
753
+ "n_examples": 100,
754
  "per_subtask": {
755
  "SA": {
756
+ "accuracy": 0.5460122699386503,
757
+ "count": 163
758
  },
759
  "SC": {
760
+ "accuracy": 0.38461538461538464,
761
+ "count": 130
762
  },
763
  "SS": {
764
+ "accuracy": 0.6091954022988506,
765
+ "count": 87
766
  },
767
  "UC": {
768
+ "accuracy": 0.43349753694581283,
769
+ "count": 203
770
  },
771
  "US": {
772
+ "accuracy": 0.5726495726495726,
773
+ "count": 117
774
  }
775
  }
776
  },
777
  "add_S3": {
778
+ "full_accuracy": 0.0,
779
+ "digit_accuracy": 0.4685714285714286,
780
+ "n_examples": 100,
781
  "per_subtask": {
782
  "SA": {
783
+ "accuracy": 0.5950413223140496,
784
+ "count": 121
785
  },
786
  "SC": {
787
+ "accuracy": 0.3305785123966942,
788
+ "count": 121
789
  },
790
  "SS": {
791
+ "accuracy": 0.673469387755102,
792
+ "count": 49
793
  },
794
  "UC": {
795
+ "accuracy": 0.3870967741935484,
796
+ "count": 186
797
  },
798
  "US": {
799
+ "accuracy": 0.4977578475336323,
800
+ "count": 223
801
  }
802
  }
803
  },
804
  "add_S4": {
805
+ "full_accuracy": 0.03,
806
+ "digit_accuracy": 0.44142857142857145,
807
+ "n_examples": 100,
808
  "per_subtask": {
809
  "SA": {
810
+ "accuracy": 0.6057692307692307,
811
+ "count": 104
812
  },
813
  "SC": {
814
+ "accuracy": 0.3584905660377358,
815
+ "count": 106
816
  },
817
  "SS": {
818
+ "accuracy": 0.7391304347826086,
819
+ "count": 23
820
  },
821
  "UC": {
822
+ "accuracy": 0.3875,
823
+ "count": 160
824
  },
825
  "US": {
826
+ "accuracy": 0.4201954397394137,
827
+ "count": 307
828
  }
829
  }
830
  },
831
  "add_S5": {
832
+ "full_accuracy": 0.07,
833
+ "digit_accuracy": 0.39571428571428574,
834
+ "n_examples": 100,
835
  "per_subtask": {
836
  "SA": {
837
+ "accuracy": 0.59,
838
+ "count": 100
839
  },
840
  "SC": {
841
+ "accuracy": 0.29,
842
+ "count": 100
843
  },
844
  "UC": {
845
+ "accuracy": 0.33,
846
+ "count": 100
847
  },
848
  "US": {
849
+ "accuracy": 0.39,
850
+ "count": 400
851
  }
852
  }
853
  },
854
  "add_S6": {
855
+ "full_accuracy": 0.13,
856
+ "digit_accuracy": 0.26,
857
+ "n_examples": 100,
858
  "per_subtask": {
859
  "SC": {
860
+ "accuracy": 0.29,
861
+ "count": 100
862
  },
863
  "UC": {
864
+ "accuracy": 0.25,
865
+ "count": 100
866
  },
867
  "US": {
868
+ "accuracy": 0.256,
869
+ "count": 500
870
  }
871
  }
872
  },
873
  "add_random": {
874
+ "full_accuracy": 0.0,
875
+ "digit_accuracy": 0.43857142857142856,
876
  "n_examples": 200,
877
  "per_subtask": {
878
  "SA": {
879
+ "accuracy": 0.4720357941834452,
880
+ "count": 447
881
  },
882
  "SC": {
883
+ "accuracy": 0.35625,
884
+ "count": 320
885
  },
886
  "SS": {
887
+ "accuracy": 0.625,
888
+ "count": 56
889
  },
890
  "UC": {
891
+ "accuracy": 0.42344045368620037,
892
+ "count": 529
893
  },
894
  "US": {
895
+ "accuracy": 0.625,
896
+ "count": 48
897
  }
898
  }
899
  },
900
  "add_C1": {
901
  "full_accuracy": 0.0,
902
+ "digit_accuracy": 0.4642857142857143,
903
+ "n_examples": 100,
904
  "per_subtask": {
905
  "SA": {
906
+ "accuracy": 0.526,
907
+ "count": 500
908
  },
909
  "SC": {
910
+ "accuracy": 0.28,
911
+ "count": 100
912
  },
913
  "UC": {
914
+ "accuracy": 0.34,
915
+ "count": 100
916
  }
917
  }
918
  },
919
  "add_C2": {
920
+ "full_accuracy": 0.0,
921
+ "digit_accuracy": 0.4714285714285714,
922
+ "n_examples": 100,
923
  "per_subtask": {
924
  "SA": {
925
+ "accuracy": 0.5275,
926
+ "count": 400
927
  },
928
  "SC": {
929
+ "accuracy": 0.42,
930
+ "count": 100
931
  },
932
  "UC": {
933
+ "accuracy": 0.3141025641025641,
934
+ "count": 156
935
  },
936
  "US": {
937
+ "accuracy": 0.6363636363636364,
938
+ "count": 44
939
  }
940
  }
941
  },
942
  "add_C3": {
943
  "full_accuracy": 0.0,
944
+ "digit_accuracy": 0.45571428571428574,
945
+ "n_examples": 100,
946
  "per_subtask": {
947
  "SA": {
948
+ "accuracy": 0.5733333333333334,
949
+ "count": 300
950
  },
951
  "SC": {
952
+ "accuracy": 0.38,
953
+ "count": 100
954
  },
955
  "UC": {
956
+ "accuracy": 0.2914572864321608,
957
+ "count": 199
958
  },
959
  "US": {
960
+ "accuracy": 0.504950495049505,
961
+ "count": 101
962
  }
963
  }
964
  },
965
  "add_C4": {
966
+ "full_accuracy": 0.0,
967
+ "digit_accuracy": 0.48142857142857143,
968
+ "n_examples": 100,
969
  "per_subtask": {
970
  "SA": {
971
+ "accuracy": 0.65,
972
+ "count": 200
973
  },
974
  "SC": {
975
+ "accuracy": 0.38,
976
+ "count": 100
977
  },
978
  "UC": {
979
+ "accuracy": 0.3333333333333333,
980
+ "count": 264
981
  },
982
  "US": {
983
+ "accuracy": 0.5955882352941176,
984
+ "count": 136
985
  }
986
  }
987
  },
988
  "add_C5": {
989
+ "full_accuracy": 0.0,
990
+ "digit_accuracy": 0.4928571428571429,
991
+ "n_examples": 100,
992
  "per_subtask": {
993
  "SA": {
994
+ "accuracy": 0.7,
995
+ "count": 100
996
  },
997
  "SC": {
998
+ "accuracy": 0.43,
999
+ "count": 100
1000
  },
1001
  "UC": {
1002
+ "accuracy": 0.3,
1003
+ "count": 310
1004
  },
1005
  "US": {
1006
+ "accuracy": 0.7315789473684211,
1007
+ "count": 190
1008
  }
1009
  }
1010
  },
1011
  "add_C6": {
1012
  "full_accuracy": 0.0,
1013
+ "digit_accuracy": 0.4957142857142857,
1014
+ "n_examples": 100,
1015
  "per_subtask": {
1016
  "SC": {
1017
+ "accuracy": 0.43,
1018
+ "count": 100
1019
  },
1020
  "UC": {
1021
+ "accuracy": 0.3621621621621622,
1022
+ "count": 370
1023
  },
1024
  "US": {
1025
+ "accuracy": 0.7391304347826086,
1026
+ "count": 230
1027
  }
1028
  }
1029
  },
1030
  "sub_M0": {
1031
+ "full_accuracy": 0.0,
1032
+ "digit_accuracy": 0.5628571428571428,
1033
+ "n_examples": 100,
1034
  "per_subtask": {
1035
  "MD": {
1036
+ "accuracy": 0.5203252032520326,
1037
+ "count": 615
1038
  },
1039
  "ME": {
1040
+ "accuracy": 0.8705882352941177,
1041
+ "count": 85
1042
  }
1043
  }
1044
  },
1045
  "sub_M1": {
1046
+ "full_accuracy": 0.01,
1047
+ "digit_accuracy": 0.45,
1048
+ "n_examples": 100,
1049
  "per_subtask": {
1050
  "MD": {
1051
+ "accuracy": 0.5821917808219178,
1052
+ "count": 292
1053
  },
1054
  "MB": {
1055
+ "accuracy": 0.3541666666666667,
1056
+ "count": 144
1057
  },
1058
  "ME": {
1059
+ "accuracy": 0.92,
1060
+ "count": 25
1061
  },
1062
  "UB": {
1063
+ "accuracy": 0.29707112970711297,
1064
+ "count": 239
1065
  }
1066
  }
1067
  },
1068
  "sub_M2": {
1069
+ "full_accuracy": 0.03,
1070
+ "digit_accuracy": 0.5485714285714286,
1071
+ "n_examples": 100,
1072
  "per_subtask": {
1073
  "MD": {
1074
+ "accuracy": 0.7488151658767772,
1075
+ "count": 211
1076
  },
1077
  "MB": {
1078
+ "accuracy": 0.26956521739130435,
1079
+ "count": 115
1080
  },
1081
  "ME": {
1082
+ "accuracy": 0.9647058823529412,
1083
+ "count": 85
1084
  },
1085
  "UB": {
1086
+ "accuracy": 0.4530386740331492,
1087
+ "count": 181
1088
  },
1089
  "UD": {
1090
+ "accuracy": 0.28703703703703703,
1091
+ "count": 108
1092
  }
1093
  }
1094
  },
1095
  "sub_M3": {
1096
  "full_accuracy": 0.0,
1097
+ "digit_accuracy": 0.43,
1098
+ "n_examples": 100,
1099
  "per_subtask": {
1100
  "MD": {
1101
+ "accuracy": 0.8435754189944135,
1102
+ "count": 179
1103
  },
1104
  "MB": {
1105
+ "accuracy": 0.14563106796116504,
1106
+ "count": 103
1107
  },
1108
  "ME": {
1109
+ "accuracy": 0.9642857142857143,
1110
+ "count": 56
1111
  },
1112
  "UB": {
1113
+ "accuracy": 0.48322147651006714,
1114
+ "count": 149
1115
  },
1116
  "UD": {
1117
+ "accuracy": 0.04225352112676056,
1118
+ "count": 213
1119
  }
1120
  }
1121
  },
1122
  "sub_M4": {
1123
  "full_accuracy": 0.0,
1124
+ "digit_accuracy": 0.31857142857142856,
1125
+ "n_examples": 100,
1126
  "per_subtask": {
1127
  "MD": {
1128
+ "accuracy": 0.705,
1129
+ "count": 200
1130
  },
1131
  "MB": {
1132
+ "accuracy": 0.08,
1133
+ "count": 100
1134
  },
1135
  "UB": {
1136
  "accuracy": 0.56,
1137
+ "count": 100
1138
  },
1139
  "UD": {
1140
+ "accuracy": 0.06,
1141
+ "count": 300
1142
  }
1143
  }
1144
  },
1145
  "sub_M5": {
1146
  "full_accuracy": 0.0,
1147
+ "digit_accuracy": 0.24428571428571427,
1148
+ "n_examples": 100,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
+ "count": 100
1153
  },
1154
  "MB": {
1155
+ "accuracy": 0.0,
1156
+ "count": 100
1157
  },
1158
  "UB": {
1159
+ "accuracy": 0.71,
1160
+ "count": 100
1161
  },
1162
  "UD": {
1163
  "accuracy": 0.0,
1164
+ "count": 400
1165
  }
1166
  }
1167
  },
1168
  "sub_random": {
1169
+ "full_accuracy": 0.0,
1170
+ "digit_accuracy": 0.4635714285714286,
1171
  "n_examples": 200,
1172
  "per_subtask": {
1173
  "MD": {
1174
+ "accuracy": 0.5716666666666667,
1175
+ "count": 600
1176
  },
1177
  "MB": {
1178
+ "accuracy": 0.3258426966292135,
1179
+ "count": 267
1180
  },
1181
  "ME": {
1182
+ "accuracy": 0.8113207547169812,
1183
  "count": 53
1184
  },
1185
  "UB": {
1186
+ "accuracy": 0.35990888382687924,
1187
+ "count": 439
1188
  },
1189
  "UD": {
1190
+ "accuracy": 0.43902439024390244,
1191
+ "count": 41
1192
  }
1193
  }
1194
  },
1195
  "sub_B3": {
1196
+ "full_accuracy": 0.0,
1197
+ "digit_accuracy": 0.38857142857142857,
1198
+ "n_examples": 100,
1199
  "per_subtask": {
1200
  "MD": {
1201
+ "accuracy": 0.57,
1202
+ "count": 300
1203
  },
1204
  "MB": {
1205
+ "accuracy": 0.22,
1206
+ "count": 100
1207
  },
1208
  "UB": {
1209
+ "accuracy": 0.3350253807106599,
1210
+ "count": 197
1211
  },
1212
  "UD": {
1213
+ "accuracy": 0.1262135922330097,
1214
+ "count": 103
1215
  }
1216
  }
1217
  },
1218
  "sub_B4": {
1219
  "full_accuracy": 0.0,
1220
+ "digit_accuracy": 0.3742857142857143,
1221
+ "n_examples": 100,
1222
  "per_subtask": {
1223
  "MD": {
1224
+ "accuracy": 0.69,
1225
+ "count": 200
1226
  },
1227
  "MB": {
1228
+ "accuracy": 0.18,
1229
+ "count": 100
1230
  },
1231
  "UB": {
1232
+ "accuracy": 0.3441295546558704,
1233
+ "count": 247
1234
  },
1235
  "UD": {
1236
+ "accuracy": 0.13725490196078433,
1237
+ "count": 153
1238
  }
1239
  }
1240
  },
1241
  "sub_B5": {
1242
  "full_accuracy": 0.0,
1243
+ "digit_accuracy": 0.35714285714285715,
1244
+ "n_examples": 100,
1245
  "per_subtask": {
1246
  "MD": {
1247
  "accuracy": 1.0,
1248
+ "count": 100
1249
  },
1250
  "MB": {
1251
+ "accuracy": 0.17,
1252
+ "count": 100
1253
  },
1254
  "UB": {
1255
+ "accuracy": 0.38926174496644295,
1256
+ "count": 298
1257
  },
1258
  "UD": {
1259
+ "accuracy": 0.08415841584158416,
1260
+ "count": 202
1261
  }
1262
  }
1263
  }
1264
  },
1265
  "summary": {
1266
+ "overall_accuracy": 0.011153846153846153,
1267
+ "digit_accuracy": 0.43615384615384617,
1268
+ "total_examples": 2600,
1269
  "n_splits": 24
1270
  }
1271
  }
add_sub_baseline_25K_2L1H128d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6dfc7feee43bb81e3190bd7c2981cb84962bbd16d1ec47f74806d132e2a2441d
3
  size 157692826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2da20f6f58edc03c2d50da20dd2e8f1236b5943d7843ea3a20205d1d47e945ab
3
  size 157692826
add_sub_baseline_25K_2L1H128d/train_config.json CHANGED
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 39346560,
71
  "run_name": "add_sub_baseline_25K_2L1H128d",
72
- "git_commit": "f447da529caceac8c7d256cbb2cd185cbc50feac",
73
- "timestamp": "2026-04-12T18:09:34.520594+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "5l0ckmy9",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/5l0ckmy9",
81
- "final_accuracy": 0.03791666666666667,
82
- "sft_accuracy": 0.03791666666666667,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
69
  "no_wandb": false,
70
  "n_params": 39346560,
71
  "run_name": "add_sub_baseline_25K_2L1H128d",
72
+ "git_commit": "1d5a160e16a5070d61b881494e832aa88149b15c",
73
+ "timestamp": "2026-04-15T04:56:24.356457+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_25K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": "rw1bb80x",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/rw1bb80x",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.011153846153846153,
86
+ "sft_accuracy": 0.011153846153846153,
87
  "eval_method": "ArithmeticEvaluator"
88
  }