amirali1985 commited on
Commit
79875e8
·
verified ·
1 Parent(s): 804cf44

Upload add_sub_baseline_25K_1L2H256d

Browse files
add_sub_baseline_25K_1L2H256d/metrics.json CHANGED
@@ -159,320 +159,320 @@
159
  7800
160
  ],
161
  "loss": [
162
- 11.671636581420898,
163
- 10.659524917602539,
164
- 9.829411506652832,
165
- 9.199652671813965,
166
- 8.631672859191895,
167
- 7.977789878845215,
168
- 7.31129789352417,
169
- 6.896409034729004,
170
- 6.216288089752197,
171
- 5.601308345794678,
172
- 5.060961723327637,
173
- 4.576255798339844,
174
- 4.051487922668457,
175
- 3.5682461261749268,
176
- 3.152874231338501,
177
- 2.812526226043701,
178
- 2.522318124771118,
179
- 2.3712620735168457,
180
- 2.1769790649414062,
181
- 2.142094135284424,
182
- 2.0678162574768066,
183
- 1.915800929069519,
184
- 2.0273938179016113,
185
- 1.892726182937622,
186
- 1.8822896480560303,
187
- 1.9177898168563843,
188
- 1.9286824464797974,
189
- 1.8752065896987915,
190
- 1.8638038635253906,
191
- 1.8708428144454956,
192
- 1.8120969533920288,
193
- 1.827845811843872,
194
- 1.7674380540847778,
195
- 1.86089026927948,
196
- 1.7976375818252563,
197
- 1.7858340740203857,
198
- 1.8120166063308716,
199
- 1.7935630083084106,
200
- 1.8080732822418213,
201
- 1.772081971168518,
202
- 1.7113384008407593,
203
- 1.8078746795654297,
204
- 1.7796181440353394,
205
- 1.7605429887771606,
206
- 1.749908685684204,
207
- 1.8526047468185425,
208
- 1.809884786605835,
209
- 1.744711995124817,
210
- 1.7835290431976318,
211
- 1.7533817291259766,
212
- 1.7311333417892456,
213
- 1.7385872602462769,
214
- 1.7065470218658447,
215
- 1.7380945682525635,
216
- 1.6556047201156616,
217
- 1.6597269773483276,
218
- 1.5854241847991943,
219
- 1.6379982233047485,
220
- 1.636677861213684,
221
- 1.708864450454712,
222
- 1.61447274684906,
223
- 1.7286317348480225,
224
- 1.5406564474105835,
225
- 1.5738743543624878,
226
- 1.599366307258606,
227
- 1.551311731338501,
228
- 1.5821548700332642,
229
- 1.4954397678375244,
230
- 1.6357351541519165,
231
- 1.610660433769226,
232
- 1.5579359531402588,
233
- 1.585968017578125,
234
- 1.4483888149261475,
235
- 1.5140514373779297,
236
- 1.4171068668365479,
237
- 1.3210866451263428,
238
- 1.3247480392456055,
239
- 1.3959987163543701,
240
- 1.2662498950958252,
241
- 1.314728856086731,
242
- 1.2179921865463257,
243
- 1.162067174911499,
244
- 1.156597375869751,
245
- 1.180772066116333,
246
- 1.093071460723877,
247
- 1.0152738094329834,
248
- 0.9924407601356506,
249
- 1.028732180595398,
250
- 1.0052201747894287,
251
- 1.0063453912734985,
252
- 0.9981479644775391,
253
- 0.8756313920021057,
254
- 0.9030128717422485,
255
- 0.9033327102661133,
256
- 0.9253665804862976,
257
- 0.9430330991744995,
258
- 0.9484753608703613,
259
- 0.8658924698829651,
260
- 0.8693656921386719,
261
- 0.8428267240524292,
262
- 0.8409420847892761,
263
- 0.8913314938545227,
264
- 0.7889167070388794,
265
- 0.786769449710846,
266
- 0.8332729339599609,
267
- 0.8155103325843811,
268
- 0.8029834628105164,
269
- 0.7777830362319946,
270
- 0.8465943932533264,
271
- 0.7783411741256714,
272
- 0.7732486724853516,
273
- 0.7279804944992065,
274
- 0.7854341864585876,
275
- 0.6857983469963074,
276
- 0.7444823384284973,
277
- 0.7447658181190491,
278
- 0.7315057516098022,
279
- 0.7095687985420227,
280
- 0.7285814881324768,
281
- 0.7516853213310242,
282
- 0.7119408845901489,
283
- 0.7268796563148499,
284
- 0.7778827548027039,
285
- 0.7403841018676758,
286
- 0.7479632496833801,
287
- 0.7538272738456726,
288
- 0.6861394643783569,
289
- 0.7120760679244995,
290
- 0.701282799243927,
291
- 0.6962236762046814,
292
- 0.7168822884559631,
293
- 0.7258835434913635,
294
- 0.7194567918777466,
295
- 0.702591061592102,
296
- 0.7357982993125916,
297
- 0.7088969349861145,
298
- 0.6972709894180298,
299
- 0.7118088603019714,
300
- 0.6672061085700989,
301
- 0.7101794481277466,
302
- 0.704630434513092,
303
- 0.678624153137207,
304
- 0.7467736005783081,
305
- 0.7612290978431702,
306
- 0.6677433848381042,
307
- 0.7306165099143982,
308
- 0.6715617179870605,
309
- 0.6923214197158813,
310
- 0.6501680016517639,
311
- 0.6605051755905151,
312
- 0.6703984141349792,
313
- 0.7571574449539185,
314
- 0.7139632105827332,
315
- 0.74234539270401,
316
- 0.776197612285614,
317
- 0.6756425499916077
318
  ],
319
  "base_loss": [
320
- 11.671636581420898,
321
- 10.659524917602539,
322
- 9.829411506652832,
323
- 9.199652671813965,
324
- 8.631672859191895,
325
- 7.977789878845215,
326
- 7.31129789352417,
327
- 6.896409034729004,
328
- 6.216288089752197,
329
- 5.601308345794678,
330
- 5.060961723327637,
331
- 4.576255798339844,
332
- 4.051487922668457,
333
- 3.5682461261749268,
334
- 3.152874231338501,
335
- 2.812526226043701,
336
- 2.522318124771118,
337
- 2.3712620735168457,
338
- 2.1769790649414062,
339
- 2.142094135284424,
340
- 2.0678162574768066,
341
- 1.915800929069519,
342
- 2.0273938179016113,
343
- 1.892726182937622,
344
- 1.8822896480560303,
345
- 1.9177898168563843,
346
- 1.9286824464797974,
347
- 1.8752065896987915,
348
- 1.8638038635253906,
349
- 1.8708428144454956,
350
- 1.8120969533920288,
351
- 1.827845811843872,
352
- 1.7674380540847778,
353
- 1.86089026927948,
354
- 1.7976375818252563,
355
- 1.7858340740203857,
356
- 1.8120166063308716,
357
- 1.7935630083084106,
358
- 1.8080732822418213,
359
- 1.772081971168518,
360
- 1.7113384008407593,
361
- 1.8078746795654297,
362
- 1.7796181440353394,
363
- 1.7605429887771606,
364
- 1.749908685684204,
365
- 1.8526047468185425,
366
- 1.809884786605835,
367
- 1.744711995124817,
368
- 1.7835290431976318,
369
- 1.7533817291259766,
370
- 1.7311333417892456,
371
- 1.7385872602462769,
372
- 1.7065470218658447,
373
- 1.7380945682525635,
374
- 1.6556047201156616,
375
- 1.6597269773483276,
376
- 1.5854241847991943,
377
- 1.6379982233047485,
378
- 1.636677861213684,
379
- 1.708864450454712,
380
- 1.61447274684906,
381
- 1.7286317348480225,
382
- 1.5406564474105835,
383
- 1.5738743543624878,
384
- 1.599366307258606,
385
- 1.551311731338501,
386
- 1.5821548700332642,
387
- 1.4954397678375244,
388
- 1.6357351541519165,
389
- 1.610660433769226,
390
- 1.5579359531402588,
391
- 1.585968017578125,
392
- 1.4483888149261475,
393
- 1.5140514373779297,
394
- 1.4171068668365479,
395
- 1.3210866451263428,
396
- 1.3247480392456055,
397
- 1.3959987163543701,
398
- 1.2662498950958252,
399
- 1.314728856086731,
400
- 1.2179921865463257,
401
- 1.162067174911499,
402
- 1.156597375869751,
403
- 1.180772066116333,
404
- 1.093071460723877,
405
- 1.0152738094329834,
406
- 0.9924407601356506,
407
- 1.028732180595398,
408
- 1.0052201747894287,
409
- 1.0063453912734985,
410
- 0.9981479644775391,
411
- 0.8756313920021057,
412
- 0.9030128717422485,
413
- 0.9033327102661133,
414
- 0.9253665804862976,
415
- 0.9430330991744995,
416
- 0.9484753608703613,
417
- 0.8658924698829651,
418
- 0.8693656921386719,
419
- 0.8428267240524292,
420
- 0.8409420847892761,
421
- 0.8913314938545227,
422
- 0.7889167070388794,
423
- 0.786769449710846,
424
- 0.8332729339599609,
425
- 0.8155103325843811,
426
- 0.8029834628105164,
427
- 0.7777830362319946,
428
- 0.8465943932533264,
429
- 0.7783411741256714,
430
- 0.7732486724853516,
431
- 0.7279804944992065,
432
- 0.7854341864585876,
433
- 0.6857983469963074,
434
- 0.7444823384284973,
435
- 0.7447658181190491,
436
- 0.7315057516098022,
437
- 0.7095687985420227,
438
- 0.7285814881324768,
439
- 0.7516853213310242,
440
- 0.7119408845901489,
441
- 0.7268796563148499,
442
- 0.7778827548027039,
443
- 0.7403841018676758,
444
- 0.7479632496833801,
445
- 0.7538272738456726,
446
- 0.6861394643783569,
447
- 0.7120760679244995,
448
- 0.701282799243927,
449
- 0.6962236762046814,
450
- 0.7168822884559631,
451
- 0.7258835434913635,
452
- 0.7194567918777466,
453
- 0.702591061592102,
454
- 0.7357982993125916,
455
- 0.7088969349861145,
456
- 0.6972709894180298,
457
- 0.7118088603019714,
458
- 0.6672061085700989,
459
- 0.7101794481277466,
460
- 0.704630434513092,
461
- 0.678624153137207,
462
- 0.7467736005783081,
463
- 0.7612290978431702,
464
- 0.6677433848381042,
465
- 0.7306165099143982,
466
- 0.6715617179870605,
467
- 0.6923214197158813,
468
- 0.6501680016517639,
469
- 0.6605051755905151,
470
- 0.6703984141349792,
471
- 0.7571574449539185,
472
- 0.7139632105827332,
473
- 0.74234539270401,
474
- 0.776197612285614,
475
- 0.6756425499916077
476
  ],
477
  "lr": [
478
  4.188034188034189e-06,
@@ -677,595 +677,595 @@
677
  20
678
  ],
679
  "eval_accuracy": [
 
680
  0.0,
681
- 0.0,
682
- 0.0077777777777777776,
683
- 0.011111111111111112,
684
- 0.0033333333333333335,
685
- 0.006666666666666667,
686
- 0.005555555555555556,
687
- 0.0033333333333333335,
688
- 0.01,
689
- 0.015555555555555555,
690
- 0.024444444444444446,
691
- 0.03333333333333333,
692
- 0.056666666666666664,
693
- 0.06,
694
- 0.05555555555555555,
695
- 0.05555555555555555,
696
- 0.057777777777777775,
697
- 0.056666666666666664,
698
- 0.058888888888888886,
699
- 0.06444444444444444
700
  ]
701
  },
702
- "final_accuracy": 0.06916666666666667,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
- "n_per_split": 50
710
  },
711
  "splits": {
712
  "add_S0": {
713
- "full_accuracy": 0.48,
714
- "digit_accuracy": 0.8971428571428571,
715
- "n_examples": 50,
716
  "per_subtask": {
717
  "SA": {
718
- "accuracy": 0.8813559322033898,
719
- "count": 295
720
  },
721
  "SS": {
722
- "accuracy": 0.9818181818181818,
723
- "count": 55
724
  }
725
  }
726
  },
727
  "add_S1": {
728
- "full_accuracy": 0.06,
729
- "digit_accuracy": 0.6514285714285715,
730
- "n_examples": 50,
731
  "per_subtask": {
732
  "SA": {
733
- "accuracy": 0.8968253968253969,
734
- "count": 126
735
  },
736
  "SC": {
737
- "accuracy": 0.7848101265822784,
738
- "count": 79
739
  },
740
  "SS": {
741
- "accuracy": 0.7619047619047619,
742
- "count": 21
743
  },
744
  "UC": {
745
- "accuracy": 0.29838709677419356,
746
- "count": 124
747
  }
748
  }
749
  },
750
  "add_S2": {
751
- "full_accuracy": 0.08,
752
- "digit_accuracy": 0.64,
753
- "n_examples": 50,
754
  "per_subtask": {
755
  "SA": {
756
- "accuracy": 0.8666666666666667,
757
- "count": 75
758
  },
759
  "SC": {
760
- "accuracy": 0.7096774193548387,
761
- "count": 62
762
  },
763
  "SS": {
764
- "accuracy": 0.8461538461538461,
765
- "count": 39
766
  },
767
  "UC": {
768
- "accuracy": 0.44144144144144143,
769
- "count": 111
770
  },
771
  "US": {
772
- "accuracy": 0.5238095238095238,
773
- "count": 63
774
  }
775
  }
776
  },
777
  "add_S3": {
778
- "full_accuracy": 0.08,
779
- "digit_accuracy": 0.5142857142857142,
780
- "n_examples": 50,
781
  "per_subtask": {
782
  "SA": {
783
- "accuracy": 0.9333333333333333,
784
- "count": 60
785
  },
786
  "SC": {
787
- "accuracy": 0.7543859649122807,
788
- "count": 57
789
  },
790
  "SS": {
791
- "accuracy": 0.8421052631578947,
792
- "count": 19
793
  },
794
  "UC": {
795
- "accuracy": 0.375,
796
- "count": 104
797
  },
798
  "US": {
799
- "accuracy": 0.23636363636363636,
800
- "count": 110
801
  }
802
  }
803
  },
804
  "add_S4": {
805
- "full_accuracy": 0.06,
806
- "digit_accuracy": 0.44857142857142857,
807
- "n_examples": 50,
808
  "per_subtask": {
809
  "SA": {
810
- "accuracy": 0.9583333333333334,
811
- "count": 48
812
  },
813
  "SC": {
814
- "accuracy": 0.8076923076923077,
815
- "count": 52
816
  },
817
  "SS": {
818
- "accuracy": 1.0,
819
- "count": 7
820
  },
821
  "UC": {
822
- "accuracy": 0.3595505617977528,
823
- "count": 89
824
  },
825
  "US": {
826
- "accuracy": 0.19480519480519481,
827
- "count": 154
828
  }
829
  }
830
  },
831
  "add_S5": {
832
- "full_accuracy": 0.1,
833
- "digit_accuracy": 0.37714285714285717,
834
- "n_examples": 50,
835
  "per_subtask": {
836
  "SA": {
837
  "accuracy": 0.98,
838
- "count": 50
839
  },
840
  "SC": {
841
- "accuracy": 0.56,
842
- "count": 50
843
  },
844
  "UC": {
845
- "accuracy": 0.34,
846
- "count": 50
847
  },
848
  "US": {
849
- "accuracy": 0.19,
850
- "count": 200
851
  }
852
  }
853
  },
854
  "add_S6": {
855
- "full_accuracy": 0.28,
856
- "digit_accuracy": 0.4342857142857143,
857
- "n_examples": 50,
858
  "per_subtask": {
859
  "SC": {
860
- "accuracy": 0.6,
861
- "count": 50
862
  },
863
  "UC": {
864
- "accuracy": 0.46,
865
- "count": 50
866
  },
867
  "US": {
868
- "accuracy": 0.396,
869
- "count": 250
870
  }
871
  }
872
  },
873
  "add_random": {
874
- "full_accuracy": 0.05,
875
- "digit_accuracy": 0.635,
876
  "n_examples": 200,
877
  "per_subtask": {
878
  "SA": {
879
- "accuracy": 0.8723897911832946,
880
- "count": 431
881
  },
882
  "SC": {
883
- "accuracy": 0.7974683544303798,
884
- "count": 316
885
  },
886
  "SS": {
887
- "accuracy": 0.9230769230769231,
888
- "count": 39
889
  },
890
  "UC": {
891
- "accuracy": 0.36607142857142855,
892
- "count": 560
893
  },
894
  "US": {
895
- "accuracy": 0.37037037037037035,
896
- "count": 54
897
  }
898
  }
899
  },
900
  "add_C1": {
901
  "full_accuracy": 0.16,
902
- "digit_accuracy": 0.8114285714285714,
903
- "n_examples": 50,
904
  "per_subtask": {
905
  "SA": {
906
- "accuracy": 0.928,
907
- "count": 250
908
  },
909
  "SC": {
910
- "accuracy": 0.78,
911
- "count": 50
912
  },
913
  "UC": {
914
- "accuracy": 0.26,
915
- "count": 50
916
  }
917
  }
918
  },
919
  "add_C2": {
920
- "full_accuracy": 0.08,
921
- "digit_accuracy": 0.74,
922
- "n_examples": 50,
923
  "per_subtask": {
924
  "SA": {
925
- "accuracy": 0.94,
926
- "count": 200
927
  },
928
  "SC": {
929
- "accuracy": 0.74,
930
- "count": 50
931
  },
932
  "UC": {
933
- "accuracy": 0.3132530120481928,
934
- "count": 83
935
  },
936
  "US": {
937
- "accuracy": 0.47058823529411764,
938
- "count": 17
939
  }
940
  }
941
  },
942
  "add_C3": {
943
  "full_accuracy": 0.02,
944
- "digit_accuracy": 0.6457142857142857,
945
- "n_examples": 50,
946
  "per_subtask": {
947
  "SA": {
948
- "accuracy": 0.9133333333333333,
949
- "count": 150
950
  },
951
  "SC": {
952
- "accuracy": 0.8,
953
- "count": 50
954
  },
955
  "UC": {
956
- "accuracy": 0.33,
957
- "count": 100
958
  },
959
  "US": {
960
- "accuracy": 0.32,
961
- "count": 50
962
  }
963
  }
964
  },
965
  "add_C4": {
966
- "full_accuracy": 0.02,
967
- "digit_accuracy": 0.5885714285714285,
968
- "n_examples": 50,
969
  "per_subtask": {
970
  "SA": {
971
- "accuracy": 0.99,
972
- "count": 100
973
  },
974
  "SC": {
975
- "accuracy": 0.76,
976
- "count": 50
977
  },
978
  "UC": {
979
- "accuracy": 0.3484848484848485,
980
- "count": 132
981
  },
982
  "US": {
983
- "accuracy": 0.3382352941176471,
984
- "count": 68
985
  }
986
  }
987
  },
988
  "add_C5": {
989
- "full_accuracy": 0.0,
990
- "digit_accuracy": 0.4657142857142857,
991
- "n_examples": 50,
992
  "per_subtask": {
993
  "SA": {
994
- "accuracy": 0.94,
995
- "count": 50
996
  },
997
  "SC": {
998
- "accuracy": 0.72,
999
- "count": 50
1000
  },
1001
  "UC": {
1002
- "accuracy": 0.3835616438356164,
1003
- "count": 146
1004
  },
1005
  "US": {
1006
- "accuracy": 0.23076923076923078,
1007
- "count": 104
1008
  }
1009
  }
1010
  },
1011
  "add_C6": {
1012
- "full_accuracy": 0.0,
1013
- "digit_accuracy": 0.44571428571428573,
1014
- "n_examples": 50,
1015
  "per_subtask": {
1016
  "SC": {
1017
- "accuracy": 0.64,
1018
- "count": 50
1019
  },
1020
  "UC": {
1021
- "accuracy": 0.4444444444444444,
1022
- "count": 189
1023
  },
1024
  "US": {
1025
- "accuracy": 0.36036036036036034,
1026
- "count": 111
1027
  }
1028
  }
1029
  },
1030
  "sub_M0": {
1031
- "full_accuracy": 0.26,
1032
- "digit_accuracy": 0.8657142857142858,
1033
- "n_examples": 50,
1034
  "per_subtask": {
1035
  "MD": {
1036
- "accuracy": 0.8481848184818482,
1037
- "count": 303
1038
  },
1039
  "ME": {
1040
- "accuracy": 0.9787234042553191,
1041
- "count": 47
1042
  }
1043
  }
1044
  },
1045
  "sub_M1": {
1046
- "full_accuracy": 0.0,
1047
- "digit_accuracy": 0.6371428571428571,
1048
- "n_examples": 50,
1049
  "per_subtask": {
1050
  "MD": {
1051
- "accuracy": 0.8652482269503546,
1052
- "count": 141
1053
  },
1054
  "MB": {
1055
- "accuracy": 0.6527777777777778,
1056
- "count": 72
1057
  },
1058
  "ME": {
1059
- "accuracy": 0.9444444444444444,
1060
- "count": 18
1061
  },
1062
  "UB": {
1063
- "accuracy": 0.31092436974789917,
1064
- "count": 119
1065
  }
1066
  }
1067
  },
1068
  "sub_M2": {
1069
  "full_accuracy": 0.0,
1070
- "digit_accuracy": 0.62,
1071
- "n_examples": 50,
1072
  "per_subtask": {
1073
  "MD": {
1074
- "accuracy": 0.9285714285714286,
1075
- "count": 112
1076
  },
1077
  "MB": {
1078
- "accuracy": 0.6981132075471698,
1079
- "count": 53
1080
  },
1081
  "ME": {
1082
- "accuracy": 0.9361702127659575,
1083
- "count": 47
1084
  },
1085
  "UB": {
1086
- "accuracy": 0.2823529411764706,
1087
- "count": 85
1088
  },
1089
  "UD": {
1090
- "accuracy": 0.1509433962264151,
1091
- "count": 53
1092
  }
1093
  }
1094
  },
1095
  "sub_M3": {
1096
- "full_accuracy": 0.0,
1097
- "digit_accuracy": 0.48857142857142855,
1098
- "n_examples": 50,
1099
  "per_subtask": {
1100
  "MD": {
1101
- "accuracy": 0.9381443298969072,
1102
- "count": 97
1103
  },
1104
  "MB": {
1105
- "accuracy": 0.5686274509803921,
1106
- "count": 51
1107
  },
1108
  "ME": {
1109
- "accuracy": 1.0,
1110
- "count": 27
1111
  },
1112
  "UB": {
1113
- "accuracy": 0.1891891891891892,
1114
- "count": 74
1115
  },
1116
  "UD": {
1117
- "accuracy": 0.09900990099009901,
1118
- "count": 101
1119
  }
1120
  }
1121
  },
1122
  "sub_M4": {
1123
- "full_accuracy": 0.0,
1124
- "digit_accuracy": 0.4342857142857143,
1125
- "n_examples": 50,
1126
  "per_subtask": {
1127
  "MD": {
1128
- "accuracy": 0.88,
1129
- "count": 100
1130
  },
1131
  "MB": {
1132
- "accuracy": 0.68,
1133
- "count": 50
1134
  },
1135
  "UB": {
1136
- "accuracy": 0.4,
1137
- "count": 50
1138
  },
1139
  "UD": {
1140
- "accuracy": 0.06666666666666667,
1141
- "count": 150
1142
  }
1143
  }
1144
  },
1145
  "sub_M5": {
1146
  "full_accuracy": 0.0,
1147
- "digit_accuracy": 0.30857142857142855,
1148
- "n_examples": 50,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
- "count": 50
1153
  },
1154
  "MB": {
1155
- "accuracy": 0.78,
1156
- "count": 50
1157
  },
1158
  "UB": {
1159
- "accuracy": 0.38,
1160
- "count": 50
1161
  },
1162
  "UD": {
1163
- "accuracy": 0.0,
1164
- "count": 200
1165
  }
1166
  }
1167
  },
1168
  "sub_random": {
1169
- "full_accuracy": 0.02,
1170
- "digit_accuracy": 0.62,
1171
  "n_examples": 200,
1172
  "per_subtask": {
1173
  "MD": {
1174
- "accuracy": 0.8807017543859649,
1175
- "count": 570
1176
  },
1177
  "MB": {
1178
- "accuracy": 0.6534296028880866,
1179
- "count": 277
1180
  },
1181
  "ME": {
1182
- "accuracy": 0.9245283018867925,
1183
  "count": 53
1184
  },
1185
  "UB": {
1186
- "accuracy": 0.2823779193205945,
1187
- "count": 471
1188
  },
1189
  "UD": {
1190
- "accuracy": 0.10344827586206896,
1191
- "count": 29
1192
  }
1193
  }
1194
  },
1195
  "sub_B3": {
1196
- "full_accuracy": 0.0,
1197
- "digit_accuracy": 0.5657142857142857,
1198
- "n_examples": 50,
1199
  "per_subtask": {
1200
  "MD": {
1201
- "accuracy": 0.8733333333333333,
1202
- "count": 150
1203
  },
1204
  "MB": {
1205
- "accuracy": 0.68,
1206
- "count": 50
1207
  },
1208
  "UB": {
1209
- "accuracy": 0.21782178217821782,
1210
- "count": 101
1211
  },
1212
  "UD": {
1213
- "accuracy": 0.22448979591836735,
1214
- "count": 49
1215
  }
1216
  }
1217
  },
1218
  "sub_B4": {
1219
  "full_accuracy": 0.0,
1220
- "digit_accuracy": 0.52,
1221
- "n_examples": 50,
1222
  "per_subtask": {
1223
  "MD": {
1224
- "accuracy": 0.96,
1225
- "count": 100
1226
  },
1227
  "MB": {
1228
  "accuracy": 0.74,
1229
- "count": 50
1230
  },
1231
  "UB": {
1232
- "accuracy": 0.30578512396694213,
1233
- "count": 121
1234
  },
1235
  "UD": {
1236
- "accuracy": 0.1518987341772152,
1237
- "count": 79
1238
  }
1239
  }
1240
  },
1241
  "sub_B5": {
1242
  "full_accuracy": 0.0,
1243
- "digit_accuracy": 0.4142857142857143,
1244
- "n_examples": 50,
1245
  "per_subtask": {
1246
  "MD": {
1247
  "accuracy": 1.0,
1248
- "count": 50
1249
  },
1250
  "MB": {
1251
- "accuracy": 0.76,
1252
- "count": 50
1253
  },
1254
  "UB": {
1255
- "accuracy": 0.2565789473684211,
1256
- "count": 152
1257
  },
1258
  "UD": {
1259
- "accuracy": 0.1836734693877551,
1260
- "count": 98
1261
  }
1262
  }
1263
  }
1264
  },
1265
  "summary": {
1266
- "overall_accuracy": 0.06533333333333333,
1267
- "digit_accuracy": 0.5842857142857143,
1268
- "total_examples": 1500,
1269
  "n_splits": 24
1270
  }
1271
  }
 
159
  7800
160
  ],
161
  "loss": [
162
+ 11.67835807800293,
163
+ 10.719292640686035,
164
+ 9.762801170349121,
165
+ 9.028444290161133,
166
+ 8.50659465789795,
167
+ 7.853487491607666,
168
+ 7.278311729431152,
169
+ 6.883544921875,
170
+ 6.277066230773926,
171
+ 5.69489049911499,
172
+ 5.099170207977295,
173
+ 4.710115909576416,
174
+ 4.105507850646973,
175
+ 3.5853869915008545,
176
+ 3.1561012268066406,
177
+ 2.732867956161499,
178
+ 2.5105342864990234,
179
+ 2.380223035812378,
180
+ 2.1963541507720947,
181
+ 2.0700221061706543,
182
+ 2.055406332015991,
183
+ 1.9924981594085693,
184
+ 1.9652398824691772,
185
+ 1.9360522031784058,
186
+ 1.9294606447219849,
187
+ 1.9191540479660034,
188
+ 1.884749174118042,
189
+ 1.8851999044418335,
190
+ 1.875937819480896,
191
+ 1.8560281991958618,
192
+ 1.820598840713501,
193
+ 1.7985869646072388,
194
+ 1.8134841918945312,
195
+ 1.8033429384231567,
196
+ 1.843862771987915,
197
+ 1.8056700229644775,
198
+ 1.7657887935638428,
199
+ 1.771441102027893,
200
+ 1.7327417135238647,
201
+ 1.7670358419418335,
202
+ 1.7790080308914185,
203
+ 1.7342703342437744,
204
+ 1.8182449340820312,
205
+ 1.758160948753357,
206
+ 1.8148043155670166,
207
+ 1.7672239542007446,
208
+ 1.759755253791809,
209
+ 1.6406434774398804,
210
+ 1.6834887266159058,
211
+ 1.6384862661361694,
212
+ 1.669707179069519,
213
+ 1.6000508069992065,
214
+ 1.7436786890029907,
215
+ 1.712514877319336,
216
+ 1.666881799697876,
217
+ 1.7062987089157104,
218
+ 1.604015588760376,
219
+ 1.6764469146728516,
220
+ 1.584958791732788,
221
+ 1.647741675376892,
222
+ 1.6784383058547974,
223
+ 1.5880528688430786,
224
+ 1.6094725131988525,
225
+ 1.5886987447738647,
226
+ 1.5662020444869995,
227
+ 1.6310920715332031,
228
+ 1.5743287801742554,
229
+ 1.5807675123214722,
230
+ 1.537028431892395,
231
+ 1.5652409791946411,
232
+ 1.475896954536438,
233
+ 1.5633798837661743,
234
+ 1.318359136581421,
235
+ 1.489592432975769,
236
+ 1.4318926334381104,
237
+ 1.295494794845581,
238
+ 1.4096946716308594,
239
+ 1.255828857421875,
240
+ 1.237991452217102,
241
+ 1.1837003231048584,
242
+ 1.2067559957504272,
243
+ 1.183036208152771,
244
+ 1.0562609434127808,
245
+ 1.0683873891830444,
246
+ 1.002286672592163,
247
+ 1.0516374111175537,
248
+ 1.0503524541854858,
249
+ 1.0116995573043823,
250
+ 0.9874834418296814,
251
+ 0.9922329783439636,
252
+ 1.0121840238571167,
253
+ 0.9221497774124146,
254
+ 0.9718131422996521,
255
+ 0.9580082893371582,
256
+ 0.9458693265914917,
257
+ 0.8837714791297913,
258
+ 0.852050244808197,
259
+ 0.8929813504219055,
260
+ 0.863052248954773,
261
+ 0.8303223848342896,
262
+ 0.912526547908783,
263
+ 0.8493172526359558,
264
+ 0.836484432220459,
265
+ 0.8957289457321167,
266
+ 0.8345073461532593,
267
+ 0.8347916007041931,
268
+ 0.8315057754516602,
269
+ 0.8784408569335938,
270
+ 0.774987518787384,
271
+ 0.7976661324501038,
272
+ 0.7938314080238342,
273
+ 0.7878810167312622,
274
+ 0.7915247082710266,
275
+ 0.7997151613235474,
276
+ 0.7489715218544006,
277
+ 0.7730088829994202,
278
+ 0.8028727769851685,
279
+ 0.8076448440551758,
280
+ 0.7823832631111145,
281
+ 0.762302815914154,
282
+ 0.714794933795929,
283
+ 0.7424928545951843,
284
+ 0.7266560196876526,
285
+ 0.734574019908905,
286
+ 0.7426309585571289,
287
+ 0.7433751821517944,
288
+ 0.6983920335769653,
289
+ 0.76390141248703,
290
+ 0.7495347857475281,
291
+ 0.783293604850769,
292
+ 0.7293379902839661,
293
+ 0.7455155253410339,
294
+ 0.7366787791252136,
295
+ 0.7058432698249817,
296
+ 0.6945399045944214,
297
+ 0.7149715423583984,
298
+ 0.7223644256591797,
299
+ 0.742652416229248,
300
+ 0.7686190009117126,
301
+ 0.7120343446731567,
302
+ 0.6914497017860413,
303
+ 0.7294840812683105,
304
+ 0.6813774108886719,
305
+ 0.7342352271080017,
306
+ 0.7158920168876648,
307
+ 0.7304601073265076,
308
+ 0.7657988667488098,
309
+ 0.7277460098266602,
310
+ 0.7255235314369202,
311
+ 0.6857752799987793,
312
+ 0.7248324155807495,
313
+ 0.7265051603317261,
314
+ 0.7114661335945129,
315
+ 0.7851969599723816,
316
+ 0.7334709167480469,
317
+ 0.7515817284584045
318
  ],
319
  "base_loss": [
320
+ 11.67835807800293,
321
+ 10.719292640686035,
322
+ 9.762801170349121,
323
+ 9.028444290161133,
324
+ 8.50659465789795,
325
+ 7.853487491607666,
326
+ 7.278311729431152,
327
+ 6.883544921875,
328
+ 6.277066230773926,
329
+ 5.69489049911499,
330
+ 5.099170207977295,
331
+ 4.710115909576416,
332
+ 4.105507850646973,
333
+ 3.5853869915008545,
334
+ 3.1561012268066406,
335
+ 2.732867956161499,
336
+ 2.5105342864990234,
337
+ 2.380223035812378,
338
+ 2.1963541507720947,
339
+ 2.0700221061706543,
340
+ 2.055406332015991,
341
+ 1.9924981594085693,
342
+ 1.9652398824691772,
343
+ 1.9360522031784058,
344
+ 1.9294606447219849,
345
+ 1.9191540479660034,
346
+ 1.884749174118042,
347
+ 1.8851999044418335,
348
+ 1.875937819480896,
349
+ 1.8560281991958618,
350
+ 1.820598840713501,
351
+ 1.7985869646072388,
352
+ 1.8134841918945312,
353
+ 1.8033429384231567,
354
+ 1.843862771987915,
355
+ 1.8056700229644775,
356
+ 1.7657887935638428,
357
+ 1.771441102027893,
358
+ 1.7327417135238647,
359
+ 1.7670358419418335,
360
+ 1.7790080308914185,
361
+ 1.7342703342437744,
362
+ 1.8182449340820312,
363
+ 1.758160948753357,
364
+ 1.8148043155670166,
365
+ 1.7672239542007446,
366
+ 1.759755253791809,
367
+ 1.6406434774398804,
368
+ 1.6834887266159058,
369
+ 1.6384862661361694,
370
+ 1.669707179069519,
371
+ 1.6000508069992065,
372
+ 1.7436786890029907,
373
+ 1.712514877319336,
374
+ 1.666881799697876,
375
+ 1.7062987089157104,
376
+ 1.604015588760376,
377
+ 1.6764469146728516,
378
+ 1.584958791732788,
379
+ 1.647741675376892,
380
+ 1.6784383058547974,
381
+ 1.5880528688430786,
382
+ 1.6094725131988525,
383
+ 1.5886987447738647,
384
+ 1.5662020444869995,
385
+ 1.6310920715332031,
386
+ 1.5743287801742554,
387
+ 1.5807675123214722,
388
+ 1.537028431892395,
389
+ 1.5652409791946411,
390
+ 1.475896954536438,
391
+ 1.5633798837661743,
392
+ 1.318359136581421,
393
+ 1.489592432975769,
394
+ 1.4318926334381104,
395
+ 1.295494794845581,
396
+ 1.4096946716308594,
397
+ 1.255828857421875,
398
+ 1.237991452217102,
399
+ 1.1837003231048584,
400
+ 1.2067559957504272,
401
+ 1.183036208152771,
402
+ 1.0562609434127808,
403
+ 1.0683873891830444,
404
+ 1.002286672592163,
405
+ 1.0516374111175537,
406
+ 1.0503524541854858,
407
+ 1.0116995573043823,
408
+ 0.9874834418296814,
409
+ 0.9922329783439636,
410
+ 1.0121840238571167,
411
+ 0.9221497774124146,
412
+ 0.9718131422996521,
413
+ 0.9580082893371582,
414
+ 0.9458693265914917,
415
+ 0.8837714791297913,
416
+ 0.852050244808197,
417
+ 0.8929813504219055,
418
+ 0.863052248954773,
419
+ 0.8303223848342896,
420
+ 0.912526547908783,
421
+ 0.8493172526359558,
422
+ 0.836484432220459,
423
+ 0.8957289457321167,
424
+ 0.8345073461532593,
425
+ 0.8347916007041931,
426
+ 0.8315057754516602,
427
+ 0.8784408569335938,
428
+ 0.774987518787384,
429
+ 0.7976661324501038,
430
+ 0.7938314080238342,
431
+ 0.7878810167312622,
432
+ 0.7915247082710266,
433
+ 0.7997151613235474,
434
+ 0.7489715218544006,
435
+ 0.7730088829994202,
436
+ 0.8028727769851685,
437
+ 0.8076448440551758,
438
+ 0.7823832631111145,
439
+ 0.762302815914154,
440
+ 0.714794933795929,
441
+ 0.7424928545951843,
442
+ 0.7266560196876526,
443
+ 0.734574019908905,
444
+ 0.7426309585571289,
445
+ 0.7433751821517944,
446
+ 0.6983920335769653,
447
+ 0.76390141248703,
448
+ 0.7495347857475281,
449
+ 0.783293604850769,
450
+ 0.7293379902839661,
451
+ 0.7455155253410339,
452
+ 0.7366787791252136,
453
+ 0.7058432698249817,
454
+ 0.6945399045944214,
455
+ 0.7149715423583984,
456
+ 0.7223644256591797,
457
+ 0.742652416229248,
458
+ 0.7686190009117126,
459
+ 0.7120343446731567,
460
+ 0.6914497017860413,
461
+ 0.7294840812683105,
462
+ 0.6813774108886719,
463
+ 0.7342352271080017,
464
+ 0.7158920168876648,
465
+ 0.7304601073265076,
466
+ 0.7657988667488098,
467
+ 0.7277460098266602,
468
+ 0.7255235314369202,
469
+ 0.6857752799987793,
470
+ 0.7248324155807495,
471
+ 0.7265051603317261,
472
+ 0.7114661335945129,
473
+ 0.7851969599723816,
474
+ 0.7334709167480469,
475
+ 0.7515817284584045
476
  ],
477
  "lr": [
478
  4.188034188034189e-06,
 
677
  20
678
  ],
679
  "eval_accuracy": [
680
+ 0.002105263157894737,
681
  0.0,
682
+ 0.010526315789473684,
683
+ 0.004210526315789474,
684
+ 0.00631578947368421,
685
+ 0.004210526315789474,
686
+ 0.009473684210526316,
687
+ 0.008421052631578947,
688
+ 0.015789473684210527,
689
+ 0.018947368421052633,
690
+ 0.037894736842105266,
691
+ 0.04842105263157895,
692
+ 0.034736842105263156,
693
+ 0.05263157894736842,
694
+ 0.07263157894736842,
695
+ 0.05052631578947368,
696
+ 0.06421052631578947,
697
+ 0.05157894736842105,
698
+ 0.05578947368421053,
699
+ 0.06526315789473684
 
700
  ]
701
  },
702
+ "final_accuracy": 0.07653846153846154,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
+ "n_per_split": 100
710
  },
711
  "splits": {
712
  "add_S0": {
713
+ "full_accuracy": 0.4,
714
+ "digit_accuracy": 0.8628571428571429,
715
+ "n_examples": 100,
716
  "per_subtask": {
717
  "SA": {
718
+ "accuracy": 0.8495867768595041,
719
+ "count": 605
720
  },
721
  "SS": {
722
+ "accuracy": 0.9473684210526315,
723
+ "count": 95
724
  }
725
  }
726
  },
727
  "add_S1": {
728
+ "full_accuracy": 0.02,
729
+ "digit_accuracy": 0.64,
730
+ "n_examples": 100,
731
  "per_subtask": {
732
  "SA": {
733
+ "accuracy": 0.8823529411764706,
734
+ "count": 204
735
  },
736
  "SC": {
737
+ "accuracy": 0.7455621301775148,
738
+ "count": 169
739
  },
740
  "SS": {
741
+ "accuracy": 0.8709677419354839,
742
+ "count": 31
743
  },
744
  "UC": {
745
+ "accuracy": 0.3885135135135135,
746
+ "count": 296
747
  }
748
  }
749
  },
750
  "add_S2": {
751
+ "full_accuracy": 0.09,
752
+ "digit_accuracy": 0.6614285714285715,
753
+ "n_examples": 100,
754
  "per_subtask": {
755
  "SA": {
756
+ "accuracy": 0.9202453987730062,
757
+ "count": 163
758
  },
759
  "SC": {
760
+ "accuracy": 0.7230769230769231,
761
+ "count": 130
762
  },
763
  "SS": {
764
+ "accuracy": 0.8390804597701149,
765
+ "count": 87
766
  },
767
  "UC": {
768
+ "accuracy": 0.4236453201970443,
769
+ "count": 203
770
  },
771
  "US": {
772
+ "accuracy": 0.5128205128205128,
773
+ "count": 117
774
  }
775
  }
776
  },
777
  "add_S3": {
778
+ "full_accuracy": 0.07,
779
+ "digit_accuracy": 0.49714285714285716,
780
+ "n_examples": 100,
781
  "per_subtask": {
782
  "SA": {
783
+ "accuracy": 0.9256198347107438,
784
+ "count": 121
785
  },
786
  "SC": {
787
+ "accuracy": 0.6611570247933884,
788
+ "count": 121
789
  },
790
  "SS": {
791
+ "accuracy": 0.9387755102040817,
792
+ "count": 49
793
  },
794
  "UC": {
795
+ "accuracy": 0.34946236559139787,
796
+ "count": 186
797
  },
798
  "US": {
799
+ "accuracy": 0.20179372197309417,
800
+ "count": 223
801
  }
802
  }
803
  },
804
  "add_S4": {
805
+ "full_accuracy": 0.11,
806
+ "digit_accuracy": 0.48428571428571426,
807
+ "n_examples": 100,
808
  "per_subtask": {
809
  "SA": {
810
+ "accuracy": 0.9230769230769231,
811
+ "count": 104
812
  },
813
  "SC": {
814
+ "accuracy": 0.6981132075471698,
815
+ "count": 106
816
  },
817
  "SS": {
818
+ "accuracy": 0.8695652173913043,
819
+ "count": 23
820
  },
821
  "UC": {
822
+ "accuracy": 0.49375,
823
+ "count": 160
824
  },
825
  "US": {
826
+ "accuracy": 0.2280130293159609,
827
+ "count": 307
828
  }
829
  }
830
  },
831
  "add_S5": {
832
+ "full_accuracy": 0.13,
833
+ "digit_accuracy": 0.37,
834
+ "n_examples": 100,
835
  "per_subtask": {
836
  "SA": {
837
  "accuracy": 0.98,
838
+ "count": 100
839
  },
840
  "SC": {
841
+ "accuracy": 0.55,
842
+ "count": 100
843
  },
844
  "UC": {
845
+ "accuracy": 0.32,
846
+ "count": 100
847
  },
848
  "US": {
849
+ "accuracy": 0.185,
850
+ "count": 400
851
  }
852
  }
853
  },
854
  "add_S6": {
855
+ "full_accuracy": 0.19,
856
+ "digit_accuracy": 0.3342857142857143,
857
+ "n_examples": 100,
858
  "per_subtask": {
859
  "SC": {
860
+ "accuracy": 0.64,
861
+ "count": 100
862
  },
863
  "UC": {
864
+ "accuracy": 0.32,
865
+ "count": 100
866
  },
867
  "US": {
868
+ "accuracy": 0.276,
869
+ "count": 500
870
  }
871
  }
872
  },
873
  "add_random": {
874
+ "full_accuracy": 0.035,
875
+ "digit_accuracy": 0.6421428571428571,
876
  "n_examples": 200,
877
  "per_subtask": {
878
  "SA": {
879
+ "accuracy": 0.8702460850111857,
880
+ "count": 447
881
  },
882
  "SC": {
883
+ "accuracy": 0.771875,
884
+ "count": 320
885
  },
886
  "SS": {
887
+ "accuracy": 0.9464285714285714,
888
+ "count": 56
889
  },
890
  "UC": {
891
+ "accuracy": 0.3648393194706994,
892
+ "count": 529
893
  },
894
  "US": {
895
+ "accuracy": 0.3541666666666667,
896
+ "count": 48
897
  }
898
  }
899
  },
900
  "add_C1": {
901
  "full_accuracy": 0.16,
902
+ "digit_accuracy": 0.8028571428571428,
903
+ "n_examples": 100,
904
  "per_subtask": {
905
  "SA": {
906
+ "accuracy": 0.912,
907
+ "count": 500
908
  },
909
  "SC": {
910
+ "accuracy": 0.71,
911
+ "count": 100
912
  },
913
  "UC": {
914
+ "accuracy": 0.35,
915
+ "count": 100
916
  }
917
  }
918
  },
919
  "add_C2": {
920
+ "full_accuracy": 0.07,
921
+ "digit_accuracy": 0.7271428571428571,
922
+ "n_examples": 100,
923
  "per_subtask": {
924
  "SA": {
925
+ "accuracy": 0.91,
926
+ "count": 400
927
  },
928
  "SC": {
929
+ "accuracy": 0.7,
930
+ "count": 100
931
  },
932
  "UC": {
933
+ "accuracy": 0.3525641025641026,
934
+ "count": 156
935
  },
936
  "US": {
937
+ "accuracy": 0.45454545454545453,
938
+ "count": 44
939
  }
940
  }
941
  },
942
  "add_C3": {
943
  "full_accuracy": 0.02,
944
+ "digit_accuracy": 0.6271428571428571,
945
+ "n_examples": 100,
946
  "per_subtask": {
947
  "SA": {
948
+ "accuracy": 0.89,
949
+ "count": 300
950
  },
951
  "SC": {
952
+ "accuracy": 0.74,
953
+ "count": 100
954
  },
955
  "UC": {
956
+ "accuracy": 0.3417085427135678,
957
+ "count": 199
958
  },
959
  "US": {
960
+ "accuracy": 0.297029702970297,
961
+ "count": 101
962
  }
963
  }
964
  },
965
  "add_C4": {
966
+ "full_accuracy": 0.03,
967
+ "digit_accuracy": 0.55,
968
+ "n_examples": 100,
969
  "per_subtask": {
970
  "SA": {
971
+ "accuracy": 0.89,
972
+ "count": 200
973
  },
974
  "SC": {
975
+ "accuracy": 0.69,
976
+ "count": 100
977
  },
978
  "UC": {
979
+ "accuracy": 0.32954545454545453,
980
+ "count": 264
981
  },
982
  "US": {
983
+ "accuracy": 0.375,
984
+ "count": 136
985
  }
986
  }
987
  },
988
  "add_C5": {
989
+ "full_accuracy": 0.01,
990
+ "digit_accuracy": 0.45714285714285713,
991
+ "n_examples": 100,
992
  "per_subtask": {
993
  "SA": {
994
+ "accuracy": 0.92,
995
+ "count": 100
996
  },
997
  "SC": {
998
+ "accuracy": 0.71,
999
+ "count": 100
1000
  },
1001
  "UC": {
1002
+ "accuracy": 0.33225806451612905,
1003
+ "count": 310
1004
  },
1005
  "US": {
1006
+ "accuracy": 0.28421052631578947,
1007
+ "count": 190
1008
  }
1009
  }
1010
  },
1011
  "add_C6": {
1012
+ "full_accuracy": 0.01,
1013
+ "digit_accuracy": 0.43,
1014
+ "n_examples": 100,
1015
  "per_subtask": {
1016
  "SC": {
1017
+ "accuracy": 0.65,
1018
+ "count": 100
1019
  },
1020
  "UC": {
1021
+ "accuracy": 0.41081081081081083,
1022
+ "count": 370
1023
  },
1024
  "US": {
1025
+ "accuracy": 0.3652173913043478,
1026
+ "count": 230
1027
  }
1028
  }
1029
  },
1030
  "sub_M0": {
1031
+ "full_accuracy": 0.49,
1032
+ "digit_accuracy": 0.8942857142857142,
1033
+ "n_examples": 100,
1034
  "per_subtask": {
1035
  "MD": {
1036
+ "accuracy": 0.8845528455284553,
1037
+ "count": 615
1038
  },
1039
  "ME": {
1040
+ "accuracy": 0.9647058823529412,
1041
+ "count": 85
1042
  }
1043
  }
1044
  },
1045
  "sub_M1": {
1046
+ "full_accuracy": 0.02,
1047
+ "digit_accuracy": 0.6257142857142857,
1048
+ "n_examples": 100,
1049
  "per_subtask": {
1050
  "MD": {
1051
+ "accuracy": 0.8561643835616438,
1052
+ "count": 292
1053
  },
1054
  "MB": {
1055
+ "accuracy": 0.6319444444444444,
1056
+ "count": 144
1057
  },
1058
  "ME": {
1059
+ "accuracy": 0.88,
1060
+ "count": 25
1061
  },
1062
  "UB": {
1063
+ "accuracy": 0.3138075313807531,
1064
+ "count": 239
1065
  }
1066
  }
1067
  },
1068
  "sub_M2": {
1069
  "full_accuracy": 0.0,
1070
+ "digit_accuracy": 0.5928571428571429,
1071
+ "n_examples": 100,
1072
  "per_subtask": {
1073
  "MD": {
1074
+ "accuracy": 0.95260663507109,
1075
+ "count": 211
1076
  },
1077
  "MB": {
1078
+ "accuracy": 0.6,
1079
+ "count": 115
1080
  },
1081
  "ME": {
1082
+ "accuracy": 0.9529411764705882,
1083
+ "count": 85
1084
  },
1085
  "UB": {
1086
+ "accuracy": 0.2541436464088398,
1087
+ "count": 181
1088
  },
1089
  "UD": {
1090
+ "accuracy": 0.16666666666666666,
1091
+ "count": 108
1092
  }
1093
  }
1094
  },
1095
  "sub_M3": {
1096
+ "full_accuracy": 0.01,
1097
+ "digit_accuracy": 0.48428571428571426,
1098
+ "n_examples": 100,
1099
  "per_subtask": {
1100
  "MD": {
1101
+ "accuracy": 0.9720670391061452,
1102
+ "count": 179
1103
  },
1104
  "MB": {
1105
+ "accuracy": 0.5048543689320388,
1106
+ "count": 103
1107
  },
1108
  "ME": {
1109
+ "accuracy": 0.9464285714285714,
1110
+ "count": 56
1111
  },
1112
  "UB": {
1113
+ "accuracy": 0.28187919463087246,
1114
+ "count": 149
1115
  },
1116
  "UD": {
1117
+ "accuracy": 0.08450704225352113,
1118
+ "count": 213
1119
  }
1120
  }
1121
  },
1122
  "sub_M4": {
1123
+ "full_accuracy": 0.01,
1124
+ "digit_accuracy": 0.4085714285714286,
1125
+ "n_examples": 100,
1126
  "per_subtask": {
1127
  "MD": {
1128
+ "accuracy": 0.85,
1129
+ "count": 200
1130
  },
1131
  "MB": {
1132
+ "accuracy": 0.71,
1133
+ "count": 100
1134
  },
1135
  "UB": {
1136
+ "accuracy": 0.28,
1137
+ "count": 100
1138
  },
1139
  "UD": {
1140
+ "accuracy": 0.056666666666666664,
1141
+ "count": 300
1142
  }
1143
  }
1144
  },
1145
  "sub_M5": {
1146
  "full_accuracy": 0.0,
1147
+ "digit_accuracy": 0.3157142857142857,
1148
+ "n_examples": 100,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
+ "count": 100
1153
  },
1154
  "MB": {
1155
+ "accuracy": 0.7,
1156
+ "count": 100
1157
  },
1158
  "UB": {
1159
+ "accuracy": 0.41,
1160
+ "count": 100
1161
  },
1162
  "UD": {
1163
+ "accuracy": 0.025,
1164
+ "count": 400
1165
  }
1166
  }
1167
  },
1168
  "sub_random": {
1169
+ "full_accuracy": 0.035,
1170
+ "digit_accuracy": 0.6085714285714285,
1171
  "n_examples": 200,
1172
  "per_subtask": {
1173
  "MD": {
1174
+ "accuracy": 0.87,
1175
+ "count": 600
1176
  },
1177
  "MB": {
1178
+ "accuracy": 0.5767790262172284,
1179
+ "count": 267
1180
  },
1181
  "ME": {
1182
+ "accuracy": 0.9811320754716981,
1183
  "count": 53
1184
  },
1185
  "UB": {
1186
+ "accuracy": 0.275626423690205,
1187
+ "count": 439
1188
  },
1189
  "UD": {
1190
+ "accuracy": 0.07317073170731707,
1191
+ "count": 41
1192
  }
1193
  }
1194
  },
1195
  "sub_B3": {
1196
+ "full_accuracy": 0.01,
1197
+ "digit_accuracy": 0.5728571428571428,
1198
+ "n_examples": 100,
1199
  "per_subtask": {
1200
  "MD": {
1201
+ "accuracy": 0.8566666666666667,
1202
+ "count": 300
1203
  },
1204
  "MB": {
1205
+ "accuracy": 0.75,
1206
+ "count": 100
1207
  },
1208
  "UB": {
1209
+ "accuracy": 0.28426395939086296,
1210
+ "count": 197
1211
  },
1212
  "UD": {
1213
+ "accuracy": 0.1262135922330097,
1214
+ "count": 103
1215
  }
1216
  }
1217
  },
1218
  "sub_B4": {
1219
  "full_accuracy": 0.0,
1220
+ "digit_accuracy": 0.48428571428571426,
1221
+ "n_examples": 100,
1222
  "per_subtask": {
1223
  "MD": {
1224
+ "accuracy": 0.905,
1225
+ "count": 200
1226
  },
1227
  "MB": {
1228
  "accuracy": 0.74,
1229
+ "count": 100
1230
  },
1231
  "UB": {
1232
+ "accuracy": 0.27125506072874495,
1233
+ "count": 247
1234
  },
1235
  "UD": {
1236
+ "accuracy": 0.1111111111111111,
1237
+ "count": 153
1238
  }
1239
  }
1240
  },
1241
  "sub_B5": {
1242
  "full_accuracy": 0.0,
1243
+ "digit_accuracy": 0.4014285714285714,
1244
+ "n_examples": 100,
1245
  "per_subtask": {
1246
  "MD": {
1247
  "accuracy": 1.0,
1248
+ "count": 100
1249
  },
1250
  "MB": {
1251
+ "accuracy": 0.69,
1252
+ "count": 100
1253
  },
1254
  "UB": {
1255
+ "accuracy": 0.28187919463087246,
1256
+ "count": 298
1257
  },
1258
  "UD": {
1259
+ "accuracy": 0.13861386138613863,
1260
+ "count": 202
1261
  }
1262
  }
1263
  }
1264
  },
1265
  "summary": {
1266
+ "overall_accuracy": 0.07653846153846154,
1267
+ "digit_accuracy": 0.5660439560439561,
1268
+ "total_examples": 2600,
1269
  "n_splits": 24
1270
  }
1271
  }
add_sub_baseline_25K_1L2H256d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2419df932e118212192de7abd45dde4a7ef65934c47e2f930ab92a12790c2905
3
  size 315072674
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb5c0fa94196e6867184d451f1f6bfc8c18f863c5aacd3016a5f2039f86c3c1
3
  size 315072674
add_sub_baseline_25K_1L2H256d/train_config.json CHANGED
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 78691840,
71
  "run_name": "add_sub_baseline_25K_1L2H256d",
72
- "git_commit": "f447da529caceac8c7d256cbb2cd185cbc50feac",
73
- "timestamp": "2026-04-12T18:21:44.884269+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "122jv7wp",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/122jv7wp",
81
- "final_accuracy": 0.06916666666666667,
82
- "sft_accuracy": 0.06916666666666667,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
69
  "no_wandb": false,
70
  "n_params": 78691840,
71
  "run_name": "add_sub_baseline_25K_1L2H256d",
72
+ "git_commit": "f835493c19eb98267697007042c9d440cad2afbb",
73
+ "timestamp": "2026-04-15T11:19:56.467154+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_25K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": "0dg4rvu7",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/0dg4rvu7",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.07653846153846154,
86
+ "sft_accuracy": 0.07653846153846154,
87
  "eval_method": "ArithmeticEvaluator"
88
  }