amirali1985 commited on
Commit
e3753ea
·
verified ·
1 Parent(s): e1a604e

Upload add_sub_baseline_25K_1L3H510d

Browse files
add_sub_baseline_25K_1L3H510d/metrics.json CHANGED
@@ -159,320 +159,320 @@
159
  7800
160
  ],
161
  "loss": [
162
- 8.74937915802002,
163
- 6.198915004730225,
164
- 4.366815090179443,
165
- 2.2759313583374023,
166
- 1.9526265859603882,
167
- 1.877150535583496,
168
- 1.8139151334762573,
169
- 1.822188377380371,
170
- 1.71466064453125,
171
- 1.590683937072754,
172
- 1.4826921224594116,
173
- 1.133190393447876,
174
- 0.8561835885047913,
175
- 0.7075616121292114,
176
- 0.6399551033973694,
177
- 0.6669952273368835,
178
- 0.6160100698471069,
179
- 0.5750561952590942,
180
- 0.5242418646812439,
181
- 0.5324519872665405,
182
- 0.49814528226852417,
183
- 0.4612176716327667,
184
- 0.5026556849479675,
185
- 0.47276973724365234,
186
- 0.43902602791786194,
187
- 0.446272611618042,
188
- 0.38574811816215515,
189
- 0.3711676001548767,
190
- 0.3969157338142395,
191
- 0.39037615060806274,
192
- 0.3659220337867737,
193
- 0.2901519238948822,
194
- 0.372765451669693,
195
- 0.33219239115715027,
196
- 0.34733083844184875,
197
- 0.31526169180870056,
198
- 0.33172616362571716,
199
- 0.3349834382534027,
200
- 0.3454124927520752,
201
- 0.32427385449409485,
202
- 0.29763302206993103,
203
- 0.2891975939273834,
204
- 0.3104265332221985,
205
- 0.2937973141670227,
206
- 0.29948028922080994,
207
- 0.26262804865837097,
208
- 0.26943641901016235,
209
- 0.2750188410282135,
210
- 0.2822719216346741,
211
- 0.2253115326166153,
212
- 0.2866693139076233,
213
- 0.29621443152427673,
214
- 0.2518032193183899,
215
- 0.25790685415267944,
216
- 0.33121928572654724,
217
- 0.2776740789413452,
218
- 0.240325927734375,
219
- 0.25502628087997437,
220
- 0.2516288459300995,
221
- 0.2600439190864563,
222
- 0.22953486442565918,
223
- 0.25047045946121216,
224
- 0.287687748670578,
225
- 0.24002419412136078,
226
- 0.20791330933570862,
227
- 0.24204131960868835,
228
- 0.27317410707473755,
229
- 0.24345578253269196,
230
- 0.21374334394931793,
231
- 0.21283268928527832,
232
- 0.21549828350543976,
233
- 0.19832827150821686,
234
- 0.21149209141731262,
235
- 0.1855495721101761,
236
- 0.19525133073329926,
237
- 0.23278847336769104,
238
- 0.2606830298900604,
239
- 0.20044934749603271,
240
- 0.181396022439003,
241
- 0.21328875422477722,
242
- 0.212825208902359,
243
- 0.21052296459674835,
244
- 0.19027332961559296,
245
- 0.2117026001214981,
246
- 0.22618059813976288,
247
- 0.22302715480327606,
248
- 0.17805133759975433,
249
- 0.18642711639404297,
250
- 0.2146042138338089,
251
- 0.19574260711669922,
252
- 0.16053321957588196,
253
- 0.21608540415763855,
254
- 0.20082899928092957,
255
- 0.23242345452308655,
256
- 0.17946086823940277,
257
- 0.2614651918411255,
258
- 0.1651460975408554,
259
- 0.1777617186307907,
260
- 0.15014827251434326,
261
- 0.20371000468730927,
262
- 0.1573261022567749,
263
- 0.19635023176670074,
264
- 0.18772493302822113,
265
- 0.1814812421798706,
266
- 0.18410103023052216,
267
- 0.20629878342151642,
268
- 0.1857832968235016,
269
- 0.17055557668209076,
270
- 0.16345703601837158,
271
- 0.16755636036396027,
272
- 0.17432273924350739,
273
- 0.15670685470104218,
274
- 0.15046896040439606,
275
- 0.14622943103313446,
276
- 0.1660774052143097,
277
- 0.16316993534564972,
278
- 0.16458259522914886,
279
- 0.16533777117729187,
280
- 0.18796680867671967,
281
- 0.16055859625339508,
282
- 0.16241776943206787,
283
- 0.15573906898498535,
284
- 0.18253065645694733,
285
- 0.15446387231349945,
286
- 0.16669152677059174,
287
- 0.16877718269824982,
288
- 0.15042708814144135,
289
- 0.14171156287193298,
290
- 0.1469859778881073,
291
- 0.16647039353847504,
292
- 0.1766190528869629,
293
- 0.15534673631191254,
294
- 0.15092872083187103,
295
- 0.16818329691886902,
296
- 0.14259760081768036,
297
- 0.16949117183685303,
298
- 0.13747766613960266,
299
- 0.16281220316886902,
300
- 0.1641203612089157,
301
- 0.15849816799163818,
302
- 0.14388667047023773,
303
- 0.14095112681388855,
304
- 0.14720116555690765,
305
- 0.15021368861198425,
306
- 0.15137387812137604,
307
- 0.15634380280971527,
308
- 0.14748618006706238,
309
- 0.1435571163892746,
310
- 0.13644962012767792,
311
- 0.16987796127796173,
312
- 0.15995793044567108,
313
- 0.17382332682609558,
314
- 0.157388374209404,
315
- 0.13605724275112152,
316
- 0.174538716673851,
317
- 0.15863211452960968
318
  ],
319
  "base_loss": [
320
- 8.74937915802002,
321
- 6.198915004730225,
322
- 4.366815090179443,
323
- 2.2759313583374023,
324
- 1.9526265859603882,
325
- 1.877150535583496,
326
- 1.8139151334762573,
327
- 1.822188377380371,
328
- 1.71466064453125,
329
- 1.590683937072754,
330
- 1.4826921224594116,
331
- 1.133190393447876,
332
- 0.8561835885047913,
333
- 0.7075616121292114,
334
- 0.6399551033973694,
335
- 0.6669952273368835,
336
- 0.6160100698471069,
337
- 0.5750561952590942,
338
- 0.5242418646812439,
339
- 0.5324519872665405,
340
- 0.49814528226852417,
341
- 0.4612176716327667,
342
- 0.5026556849479675,
343
- 0.47276973724365234,
344
- 0.43902602791786194,
345
- 0.446272611618042,
346
- 0.38574811816215515,
347
- 0.3711676001548767,
348
- 0.3969157338142395,
349
- 0.39037615060806274,
350
- 0.3659220337867737,
351
- 0.2901519238948822,
352
- 0.372765451669693,
353
- 0.33219239115715027,
354
- 0.34733083844184875,
355
- 0.31526169180870056,
356
- 0.33172616362571716,
357
- 0.3349834382534027,
358
- 0.3454124927520752,
359
- 0.32427385449409485,
360
- 0.29763302206993103,
361
- 0.2891975939273834,
362
- 0.3104265332221985,
363
- 0.2937973141670227,
364
- 0.29948028922080994,
365
- 0.26262804865837097,
366
- 0.26943641901016235,
367
- 0.2750188410282135,
368
- 0.2822719216346741,
369
- 0.2253115326166153,
370
- 0.2866693139076233,
371
- 0.29621443152427673,
372
- 0.2518032193183899,
373
- 0.25790685415267944,
374
- 0.33121928572654724,
375
- 0.2776740789413452,
376
- 0.240325927734375,
377
- 0.25502628087997437,
378
- 0.2516288459300995,
379
- 0.2600439190864563,
380
- 0.22953486442565918,
381
- 0.25047045946121216,
382
- 0.287687748670578,
383
- 0.24002419412136078,
384
- 0.20791330933570862,
385
- 0.24204131960868835,
386
- 0.27317410707473755,
387
- 0.24345578253269196,
388
- 0.21374334394931793,
389
- 0.21283268928527832,
390
- 0.21549828350543976,
391
- 0.19832827150821686,
392
- 0.21149209141731262,
393
- 0.1855495721101761,
394
- 0.19525133073329926,
395
- 0.23278847336769104,
396
- 0.2606830298900604,
397
- 0.20044934749603271,
398
- 0.181396022439003,
399
- 0.21328875422477722,
400
- 0.212825208902359,
401
- 0.21052296459674835,
402
- 0.19027332961559296,
403
- 0.2117026001214981,
404
- 0.22618059813976288,
405
- 0.22302715480327606,
406
- 0.17805133759975433,
407
- 0.18642711639404297,
408
- 0.2146042138338089,
409
- 0.19574260711669922,
410
- 0.16053321957588196,
411
- 0.21608540415763855,
412
- 0.20082899928092957,
413
- 0.23242345452308655,
414
- 0.17946086823940277,
415
- 0.2614651918411255,
416
- 0.1651460975408554,
417
- 0.1777617186307907,
418
- 0.15014827251434326,
419
- 0.20371000468730927,
420
- 0.1573261022567749,
421
- 0.19635023176670074,
422
- 0.18772493302822113,
423
- 0.1814812421798706,
424
- 0.18410103023052216,
425
- 0.20629878342151642,
426
- 0.1857832968235016,
427
- 0.17055557668209076,
428
- 0.16345703601837158,
429
- 0.16755636036396027,
430
- 0.17432273924350739,
431
- 0.15670685470104218,
432
- 0.15046896040439606,
433
- 0.14622943103313446,
434
- 0.1660774052143097,
435
- 0.16316993534564972,
436
- 0.16458259522914886,
437
- 0.16533777117729187,
438
- 0.18796680867671967,
439
- 0.16055859625339508,
440
- 0.16241776943206787,
441
- 0.15573906898498535,
442
- 0.18253065645694733,
443
- 0.15446387231349945,
444
- 0.16669152677059174,
445
- 0.16877718269824982,
446
- 0.15042708814144135,
447
- 0.14171156287193298,
448
- 0.1469859778881073,
449
- 0.16647039353847504,
450
- 0.1766190528869629,
451
- 0.15534673631191254,
452
- 0.15092872083187103,
453
- 0.16818329691886902,
454
- 0.14259760081768036,
455
- 0.16949117183685303,
456
- 0.13747766613960266,
457
- 0.16281220316886902,
458
- 0.1641203612089157,
459
- 0.15849816799163818,
460
- 0.14388667047023773,
461
- 0.14095112681388855,
462
- 0.14720116555690765,
463
- 0.15021368861198425,
464
- 0.15137387812137604,
465
- 0.15634380280971527,
466
- 0.14748618006706238,
467
- 0.1435571163892746,
468
- 0.13644962012767792,
469
- 0.16987796127796173,
470
- 0.15995793044567108,
471
- 0.17382332682609558,
472
- 0.157388374209404,
473
- 0.13605724275112152,
474
- 0.174538716673851,
475
- 0.15863211452960968
476
  ],
477
  "lr": [
478
  1.6752136752136756e-05,
@@ -677,595 +677,595 @@
677
  20
678
  ],
679
  "eval_accuracy": [
680
- 0.005555555555555556,
681
- 0.05333333333333334,
682
- 0.20555555555555555,
683
- 0.29333333333333333,
684
- 0.3055555555555556,
685
- 0.31777777777777777,
686
- 0.36777777777777776,
687
- 0.3988888888888889,
688
- 0.4388888888888889,
689
- 0.5322222222222223,
690
- 0.5355555555555556,
691
- 0.5366666666666666,
692
- 0.62,
693
- 0.5488888888888889,
694
- 0.5488888888888889,
695
- 0.5866666666666667,
696
- 0.5811111111111111,
697
- 0.5811111111111111,
698
- 0.5733333333333334,
699
- 0.5777777777777777
700
  ]
701
  },
702
- "final_accuracy": 0.49541666666666667,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
- "n_per_split": 50
710
  },
711
  "splits": {
712
  "add_S0": {
713
- "full_accuracy": 0.74,
714
- "digit_accuracy": 0.96,
715
- "n_examples": 50,
716
  "per_subtask": {
717
  "SA": {
718
- "accuracy": 0.9525423728813559,
719
- "count": 295
720
  },
721
  "SS": {
722
- "accuracy": 1.0,
723
- "count": 55
724
  }
725
  }
726
  },
727
  "add_S1": {
728
- "full_accuracy": 0.74,
729
  "digit_accuracy": 0.9628571428571429,
730
- "n_examples": 50,
731
  "per_subtask": {
732
  "SA": {
733
- "accuracy": 0.9682539682539683,
734
- "count": 126
735
  },
736
  "SC": {
737
- "accuracy": 0.9620253164556962,
738
- "count": 79
739
  },
740
  "SS": {
741
- "accuracy": 1.0,
742
- "count": 21
743
  },
744
  "UC": {
745
- "accuracy": 0.9516129032258065,
746
- "count": 124
747
  }
748
  }
749
  },
750
  "add_S2": {
751
- "full_accuracy": 0.6,
752
- "digit_accuracy": 0.9342857142857143,
753
- "n_examples": 50,
754
  "per_subtask": {
755
  "SA": {
756
- "accuracy": 0.9733333333333334,
757
- "count": 75
758
  },
759
  "SC": {
760
- "accuracy": 0.967741935483871,
761
- "count": 62
762
  },
763
  "SS": {
764
- "accuracy": 0.9743589743589743,
765
- "count": 39
766
  },
767
  "UC": {
768
- "accuracy": 0.8558558558558559,
769
- "count": 111
770
  },
771
  "US": {
772
- "accuracy": 0.9682539682539683,
773
- "count": 63
774
  }
775
  }
776
  },
777
  "add_S3": {
778
- "full_accuracy": 0.38,
779
- "digit_accuracy": 0.8428571428571429,
780
- "n_examples": 50,
781
  "per_subtask": {
782
  "SA": {
783
- "accuracy": 0.9833333333333333,
784
- "count": 60
785
  },
786
  "SC": {
787
- "accuracy": 1.0,
788
- "count": 57
789
  },
790
  "SS": {
791
- "accuracy": 1.0,
792
- "count": 19
793
  },
794
  "UC": {
795
- "accuracy": 0.7115384615384616,
796
- "count": 104
797
  },
798
  "US": {
799
- "accuracy": 0.7818181818181819,
800
- "count": 110
801
  }
802
  }
803
  },
804
  "add_S4": {
805
- "full_accuracy": 0.3,
806
- "digit_accuracy": 0.7371428571428571,
807
- "n_examples": 50,
808
  "per_subtask": {
809
  "SA": {
810
  "accuracy": 1.0,
811
- "count": 48
812
  },
813
  "SC": {
814
- "accuracy": 1.0,
815
- "count": 52
816
  },
817
  "SS": {
818
  "accuracy": 1.0,
819
- "count": 7
820
  },
821
  "UC": {
822
- "accuracy": 0.6629213483146067,
823
- "count": 89
824
  },
825
  "US": {
826
- "accuracy": 0.5974025974025974,
827
- "count": 154
828
  }
829
  }
830
  },
831
  "add_S5": {
832
- "full_accuracy": 0.24,
833
- "digit_accuracy": 0.6028571428571429,
834
- "n_examples": 50,
835
  "per_subtask": {
836
  "SA": {
837
  "accuracy": 1.0,
838
- "count": 50
839
  },
840
  "SC": {
841
- "accuracy": 1.0,
842
- "count": 50
843
  },
844
  "UC": {
845
- "accuracy": 0.52,
846
- "count": 50
847
  },
848
  "US": {
849
- "accuracy": 0.425,
850
- "count": 200
851
  }
852
  }
853
  },
854
  "add_S6": {
855
- "full_accuracy": 0.48,
856
- "digit_accuracy": 0.6371428571428571,
857
- "n_examples": 50,
858
  "per_subtask": {
859
  "SC": {
860
  "accuracy": 1.0,
861
- "count": 50
862
  },
863
  "UC": {
864
- "accuracy": 0.54,
865
- "count": 50
866
  },
867
  "US": {
868
- "accuracy": 0.584,
869
- "count": 250
870
  }
871
  }
872
  },
873
  "add_random": {
874
  "full_accuracy": 0.765,
875
- "digit_accuracy": 0.9621428571428572,
876
  "n_examples": 200,
877
  "per_subtask": {
878
  "SA": {
879
- "accuracy": 0.988399071925754,
880
- "count": 431
881
  },
882
  "SC": {
883
- "accuracy": 0.9936708860759493,
884
- "count": 316
885
  },
886
  "SS": {
887
- "accuracy": 1.0,
888
- "count": 39
889
  },
890
  "UC": {
891
- "accuracy": 0.9285714285714286,
892
- "count": 560
893
  },
894
  "US": {
895
- "accuracy": 0.8888888888888888,
896
- "count": 54
897
  }
898
  }
899
  },
900
  "add_C1": {
901
- "full_accuracy": 0.84,
902
  "digit_accuracy": 0.9742857142857143,
903
- "n_examples": 50,
904
  "per_subtask": {
905
  "SA": {
906
  "accuracy": 0.976,
907
- "count": 250
908
  },
909
  "SC": {
910
  "accuracy": 1.0,
911
- "count": 50
912
  },
913
  "UC": {
914
  "accuracy": 0.94,
915
- "count": 50
916
  }
917
  }
918
  },
919
  "add_C2": {
920
- "full_accuracy": 0.76,
921
- "digit_accuracy": 0.96,
922
- "n_examples": 50,
923
  "per_subtask": {
924
  "SA": {
925
- "accuracy": 0.975,
926
- "count": 200
927
  },
928
  "SC": {
929
  "accuracy": 1.0,
930
- "count": 50
931
  },
932
  "UC": {
933
- "accuracy": 0.9156626506024096,
934
- "count": 83
935
  },
936
  "US": {
937
- "accuracy": 0.8823529411764706,
938
- "count": 17
939
  }
940
  }
941
  },
942
  "add_C3": {
943
- "full_accuracy": 0.38,
944
- "digit_accuracy": 0.8857142857142857,
945
- "n_examples": 50,
946
  "per_subtask": {
947
  "SA": {
948
- "accuracy": 0.9866666666666667,
949
- "count": 150
950
  },
951
  "SC": {
952
  "accuracy": 1.0,
953
- "count": 50
954
  },
955
  "UC": {
956
- "accuracy": 0.78,
957
- "count": 100
958
  },
959
  "US": {
960
- "accuracy": 0.68,
961
- "count": 50
962
  }
963
  }
964
  },
965
  "add_C4": {
966
- "full_accuracy": 0.5,
967
- "digit_accuracy": 0.8971428571428571,
968
- "n_examples": 50,
969
  "per_subtask": {
970
  "SA": {
971
- "accuracy": 0.99,
972
- "count": 100
973
  },
974
  "SC": {
975
- "accuracy": 1.0,
976
- "count": 50
977
  },
978
  "UC": {
979
- "accuracy": 0.8333333333333334,
980
- "count": 132
981
  },
982
  "US": {
983
- "accuracy": 0.8088235294117647,
984
- "count": 68
985
  }
986
  }
987
  },
988
  "add_C5": {
989
- "full_accuracy": 0.42,
990
- "digit_accuracy": 0.8371428571428572,
991
- "n_examples": 50,
992
  "per_subtask": {
993
  "SA": {
994
  "accuracy": 1.0,
995
- "count": 50
996
  },
997
  "SC": {
998
  "accuracy": 1.0,
999
- "count": 50
1000
  },
1001
  "UC": {
1002
- "accuracy": 0.7808219178082192,
1003
- "count": 146
1004
  },
1005
  "US": {
1006
- "accuracy": 0.7596153846153846,
1007
- "count": 104
1008
  }
1009
  }
1010
  },
1011
  "add_C6": {
1012
- "full_accuracy": 0.48,
1013
- "digit_accuracy": 0.86,
1014
- "n_examples": 50,
1015
  "per_subtask": {
1016
  "SC": {
1017
  "accuracy": 1.0,
1018
- "count": 50
1019
  },
1020
  "UC": {
1021
- "accuracy": 0.8518518518518519,
1022
- "count": 189
1023
  },
1024
  "US": {
1025
- "accuracy": 0.8108108108108109,
1026
- "count": 111
1027
  }
1028
  }
1029
  },
1030
  "sub_M0": {
1031
- "full_accuracy": 0.88,
1032
- "digit_accuracy": 0.9828571428571429,
1033
- "n_examples": 50,
1034
  "per_subtask": {
1035
  "MD": {
1036
- "accuracy": 0.9801980198019802,
1037
- "count": 303
1038
  },
1039
  "ME": {
1040
  "accuracy": 1.0,
1041
- "count": 47
1042
  }
1043
  }
1044
  },
1045
  "sub_M1": {
1046
- "full_accuracy": 0.76,
1047
- "digit_accuracy": 0.96,
1048
- "n_examples": 50,
1049
  "per_subtask": {
1050
  "MD": {
1051
- "accuracy": 0.9858156028368794,
1052
- "count": 141
1053
  },
1054
  "MB": {
1055
- "accuracy": 0.9583333333333334,
1056
- "count": 72
1057
  },
1058
  "ME": {
1059
- "accuracy": 0.9444444444444444,
1060
- "count": 18
1061
  },
1062
  "UB": {
1063
- "accuracy": 0.9327731092436975,
1064
- "count": 119
1065
  }
1066
  }
1067
  },
1068
  "sub_M2": {
1069
- "full_accuracy": 0.46,
1070
- "digit_accuracy": 0.9,
1071
- "n_examples": 50,
1072
  "per_subtask": {
1073
  "MD": {
1074
- "accuracy": 0.9732142857142857,
1075
- "count": 112
1076
  },
1077
  "MB": {
1078
- "accuracy": 0.9056603773584906,
1079
- "count": 53
1080
  },
1081
  "ME": {
1082
- "accuracy": 0.9787234042553191,
1083
- "count": 47
1084
  },
1085
  "UB": {
1086
- "accuracy": 0.7176470588235294,
1087
- "count": 85
1088
  },
1089
  "UD": {
1090
- "accuracy": 0.9622641509433962,
1091
- "count": 53
1092
  }
1093
  }
1094
  },
1095
  "sub_M3": {
1096
- "full_accuracy": 0.28,
1097
- "digit_accuracy": 0.84,
1098
- "n_examples": 50,
1099
  "per_subtask": {
1100
  "MD": {
1101
  "accuracy": 1.0,
1102
- "count": 97
1103
  },
1104
  "MB": {
1105
- "accuracy": 0.9803921568627451,
1106
- "count": 51
1107
  },
1108
  "ME": {
1109
  "accuracy": 1.0,
1110
- "count": 27
1111
  },
1112
  "UB": {
1113
- "accuracy": 0.6486486486486487,
1114
- "count": 74
1115
  },
1116
  "UD": {
1117
- "accuracy": 0.7128712871287128,
1118
- "count": 101
1119
  }
1120
  }
1121
  },
1122
  "sub_M4": {
1123
- "full_accuracy": 0.04,
1124
- "digit_accuracy": 0.6285714285714286,
1125
- "n_examples": 50,
1126
  "per_subtask": {
1127
  "MD": {
1128
- "accuracy": 0.99,
1129
- "count": 100
1130
  },
1131
  "MB": {
1132
  "accuracy": 1.0,
1133
- "count": 50
1134
  },
1135
  "UB": {
1136
- "accuracy": 0.34,
1137
- "count": 50
1138
  },
1139
  "UD": {
1140
- "accuracy": 0.36,
1141
- "count": 150
1142
  }
1143
  }
1144
  },
1145
  "sub_M5": {
1146
- "full_accuracy": 0.1,
1147
- "digit_accuracy": 0.5771428571428572,
1148
- "n_examples": 50,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
- "count": 50
1153
  },
1154
  "MB": {
1155
  "accuracy": 1.0,
1156
- "count": 50
1157
  },
1158
  "UB": {
1159
- "accuracy": 0.44,
1160
- "count": 50
1161
  },
1162
  "UD": {
1163
- "accuracy": 0.4,
1164
- "count": 200
1165
  }
1166
  }
1167
  },
1168
  "sub_random": {
1169
- "full_accuracy": 0.75,
1170
- "digit_accuracy": 0.9578571428571429,
1171
  "n_examples": 200,
1172
  "per_subtask": {
1173
  "MD": {
1174
- "accuracy": 0.9912280701754386,
1175
- "count": 570
1176
  },
1177
  "MB": {
1178
- "accuracy": 0.9675090252707581,
1179
- "count": 277
1180
  },
1181
  "ME": {
1182
  "accuracy": 1.0,
1183
  "count": 53
1184
  },
1185
  "UB": {
1186
- "accuracy": 0.9087048832271762,
1187
- "count": 471
1188
  },
1189
  "UD": {
1190
- "accuracy": 0.9310344827586207,
1191
- "count": 29
1192
  }
1193
  }
1194
  },
1195
  "sub_B3": {
1196
- "full_accuracy": 0.4,
1197
- "digit_accuracy": 0.8828571428571429,
1198
- "n_examples": 50,
1199
  "per_subtask": {
1200
  "MD": {
1201
- "accuracy": 1.0,
1202
- "count": 150
1203
  },
1204
  "MB": {
1205
- "accuracy": 0.98,
1206
- "count": 50
1207
  },
1208
  "UB": {
1209
- "accuracy": 0.7425742574257426,
1210
- "count": 101
1211
  },
1212
  "UD": {
1213
- "accuracy": 0.7142857142857143,
1214
- "count": 49
1215
  }
1216
  }
1217
  },
1218
  "sub_B4": {
1219
- "full_accuracy": 0.3,
1220
- "digit_accuracy": 0.8171428571428572,
1221
- "n_examples": 50,
1222
  "per_subtask": {
1223
  "MD": {
1224
- "accuracy": 1.0,
1225
- "count": 100
1226
  },
1227
  "MB": {
1228
  "accuracy": 1.0,
1229
- "count": 50
1230
  },
1231
  "UB": {
1232
- "accuracy": 0.7107438016528925,
1233
- "count": 121
1234
  },
1235
  "UD": {
1236
- "accuracy": 0.6329113924050633,
1237
- "count": 79
1238
  }
1239
  }
1240
  },
1241
  "sub_B5": {
1242
- "full_accuracy": 0.28,
1243
- "digit_accuracy": 0.8,
1244
- "n_examples": 50,
1245
  "per_subtask": {
1246
  "MD": {
1247
  "accuracy": 1.0,
1248
- "count": 50
1249
  },
1250
  "MB": {
1251
  "accuracy": 1.0,
1252
- "count": 50
1253
  },
1254
  "UB": {
1255
- "accuracy": 0.75,
1256
- "count": 152
1257
  },
1258
  "UD": {
1259
- "accuracy": 0.673469387755102,
1260
- "count": 98
1261
  }
1262
  }
1263
  }
1264
  },
1265
  "summary": {
1266
- "overall_accuracy": 0.5473333333333333,
1267
- "digit_accuracy": 0.872,
1268
- "total_examples": 1500,
1269
  "n_splits": 24
1270
  }
1271
  }
 
159
  7800
160
  ],
161
  "loss": [
162
+ 9.03099536895752,
163
+ 6.222628116607666,
164
+ 4.243917942047119,
165
+ 2.320626974105835,
166
+ 1.865071415901184,
167
+ 1.8302096128463745,
168
+ 1.8747438192367554,
169
+ 1.8183810710906982,
170
+ 1.750946283340454,
171
+ 1.6808611154556274,
172
+ 1.4773333072662354,
173
+ 1.0558074712753296,
174
+ 0.8097242712974548,
175
+ 0.6758731603622437,
176
+ 0.6198462247848511,
177
+ 0.5841843485832214,
178
+ 0.5899673104286194,
179
+ 0.5845558047294617,
180
+ 0.5196164846420288,
181
+ 0.47981908917427063,
182
+ 0.5051271319389343,
183
+ 0.45223522186279297,
184
+ 0.4028424322605133,
185
+ 0.41621074080467224,
186
+ 0.3799597918987274,
187
+ 0.4078504741191864,
188
+ 0.3867304027080536,
189
+ 0.3984403610229492,
190
+ 0.3459450602531433,
191
+ 0.44437745213508606,
192
+ 0.3283034861087799,
193
+ 0.33987003564834595,
194
+ 0.32265350222587585,
195
+ 0.39120978116989136,
196
+ 0.2970849573612213,
197
+ 0.28256362676620483,
198
+ 0.3255704343318939,
199
+ 0.3375805914402008,
200
+ 0.37903186678886414,
201
+ 0.3113180696964264,
202
+ 0.3064514696598053,
203
+ 0.3122137188911438,
204
+ 0.33857011795043945,
205
+ 0.284681111574173,
206
+ 0.2882470488548279,
207
+ 0.27921387553215027,
208
+ 0.25749853253364563,
209
+ 0.28591188788414,
210
+ 0.2735363841056824,
211
+ 0.32027482986450195,
212
+ 0.23003888130187988,
213
+ 0.2517407238483429,
214
+ 0.2710196077823639,
215
+ 0.23125974833965302,
216
+ 0.29058152437210083,
217
+ 0.30273309350013733,
218
+ 0.2546815276145935,
219
+ 0.23205725848674774,
220
+ 0.24823474884033203,
221
+ 0.2757126986980438,
222
+ 0.244472473859787,
223
+ 0.2154902219772339,
224
+ 0.22570113837718964,
225
+ 0.2101653665304184,
226
+ 0.26213961839675903,
227
+ 0.2357279509305954,
228
+ 0.20209209620952606,
229
+ 0.22399720549583435,
230
+ 0.24680034816265106,
231
+ 0.2356012910604477,
232
+ 0.21086706221103668,
233
+ 0.23208554089069366,
234
+ 0.21862395107746124,
235
+ 0.23082730174064636,
236
+ 0.18721143901348114,
237
+ 0.20573721826076508,
238
+ 0.22153320908546448,
239
+ 0.23551729321479797,
240
+ 0.19480657577514648,
241
+ 0.21060045063495636,
242
+ 0.17743727564811707,
243
+ 0.18577153980731964,
244
+ 0.2063155472278595,
245
+ 0.2116478681564331,
246
+ 0.2169320285320282,
247
+ 0.21249254047870636,
248
+ 0.20075924694538116,
249
+ 0.20089474320411682,
250
+ 0.16138426959514618,
251
+ 0.1542191505432129,
252
+ 0.1778700351715088,
253
+ 0.17714276909828186,
254
+ 0.1895652860403061,
255
+ 0.20230571925640106,
256
+ 0.19294473528862,
257
+ 0.17365707457065582,
258
+ 0.14464251697063446,
259
+ 0.16429129242897034,
260
+ 0.1803891807794571,
261
+ 0.18245069682598114,
262
+ 0.18170282244682312,
263
+ 0.16513392329216003,
264
+ 0.18308500945568085,
265
+ 0.15528549253940582,
266
+ 0.19069239497184753,
267
+ 0.1680574268102646,
268
+ 0.17276549339294434,
269
+ 0.1893104612827301,
270
+ 0.16881202161312103,
271
+ 0.1330322027206421,
272
+ 0.14430834352970123,
273
+ 0.140597864985466,
274
+ 0.18033921718597412,
275
+ 0.14561453461647034,
276
+ 0.15060050785541534,
277
+ 0.14696337282657623,
278
+ 0.1570371687412262,
279
+ 0.13974200189113617,
280
+ 0.15948486328125,
281
+ 0.14828327298164368,
282
+ 0.17222817242145538,
283
+ 0.15832778811454773,
284
+ 0.13328474760055542,
285
+ 0.1352883279323578,
286
+ 0.146428182721138,
287
+ 0.14305901527404785,
288
+ 0.14752517640590668,
289
+ 0.14967180788516998,
290
+ 0.16033518314361572,
291
+ 0.15050823986530304,
292
+ 0.12478389590978622,
293
+ 0.13973259925842285,
294
+ 0.16433657705783844,
295
+ 0.15506653487682343,
296
+ 0.10899337381124496,
297
+ 0.13583232462406158,
298
+ 0.1452769786119461,
299
+ 0.1488191783428192,
300
+ 0.16326965391635895,
301
+ 0.13210436701774597,
302
+ 0.1348404884338379,
303
+ 0.15470661222934723,
304
+ 0.0996326431632042,
305
+ 0.14851771295070648,
306
+ 0.14729616045951843,
307
+ 0.13585257530212402,
308
+ 0.16751345992088318,
309
+ 0.12513908743858337,
310
+ 0.12792132794857025,
311
+ 0.14990487694740295,
312
+ 0.14224524796009064,
313
+ 0.14584632217884064,
314
+ 0.1600850373506546,
315
+ 0.12832213938236237,
316
+ 0.12461556494235992,
317
+ 0.14382870495319366
318
  ],
319
  "base_loss": [
320
+ 9.03099536895752,
321
+ 6.222628116607666,
322
+ 4.243917942047119,
323
+ 2.320626974105835,
324
+ 1.865071415901184,
325
+ 1.8302096128463745,
326
+ 1.8747438192367554,
327
+ 1.8183810710906982,
328
+ 1.750946283340454,
329
+ 1.6808611154556274,
330
+ 1.4773333072662354,
331
+ 1.0558074712753296,
332
+ 0.8097242712974548,
333
+ 0.6758731603622437,
334
+ 0.6198462247848511,
335
+ 0.5841843485832214,
336
+ 0.5899673104286194,
337
+ 0.5845558047294617,
338
+ 0.5196164846420288,
339
+ 0.47981908917427063,
340
+ 0.5051271319389343,
341
+ 0.45223522186279297,
342
+ 0.4028424322605133,
343
+ 0.41621074080467224,
344
+ 0.3799597918987274,
345
+ 0.4078504741191864,
346
+ 0.3867304027080536,
347
+ 0.3984403610229492,
348
+ 0.3459450602531433,
349
+ 0.44437745213508606,
350
+ 0.3283034861087799,
351
+ 0.33987003564834595,
352
+ 0.32265350222587585,
353
+ 0.39120978116989136,
354
+ 0.2970849573612213,
355
+ 0.28256362676620483,
356
+ 0.3255704343318939,
357
+ 0.3375805914402008,
358
+ 0.37903186678886414,
359
+ 0.3113180696964264,
360
+ 0.3064514696598053,
361
+ 0.3122137188911438,
362
+ 0.33857011795043945,
363
+ 0.284681111574173,
364
+ 0.2882470488548279,
365
+ 0.27921387553215027,
366
+ 0.25749853253364563,
367
+ 0.28591188788414,
368
+ 0.2735363841056824,
369
+ 0.32027482986450195,
370
+ 0.23003888130187988,
371
+ 0.2517407238483429,
372
+ 0.2710196077823639,
373
+ 0.23125974833965302,
374
+ 0.29058152437210083,
375
+ 0.30273309350013733,
376
+ 0.2546815276145935,
377
+ 0.23205725848674774,
378
+ 0.24823474884033203,
379
+ 0.2757126986980438,
380
+ 0.244472473859787,
381
+ 0.2154902219772339,
382
+ 0.22570113837718964,
383
+ 0.2101653665304184,
384
+ 0.26213961839675903,
385
+ 0.2357279509305954,
386
+ 0.20209209620952606,
387
+ 0.22399720549583435,
388
+ 0.24680034816265106,
389
+ 0.2356012910604477,
390
+ 0.21086706221103668,
391
+ 0.23208554089069366,
392
+ 0.21862395107746124,
393
+ 0.23082730174064636,
394
+ 0.18721143901348114,
395
+ 0.20573721826076508,
396
+ 0.22153320908546448,
397
+ 0.23551729321479797,
398
+ 0.19480657577514648,
399
+ 0.21060045063495636,
400
+ 0.17743727564811707,
401
+ 0.18577153980731964,
402
+ 0.2063155472278595,
403
+ 0.2116478681564331,
404
+ 0.2169320285320282,
405
+ 0.21249254047870636,
406
+ 0.20075924694538116,
407
+ 0.20089474320411682,
408
+ 0.16138426959514618,
409
+ 0.1542191505432129,
410
+ 0.1778700351715088,
411
+ 0.17714276909828186,
412
+ 0.1895652860403061,
413
+ 0.20230571925640106,
414
+ 0.19294473528862,
415
+ 0.17365707457065582,
416
+ 0.14464251697063446,
417
+ 0.16429129242897034,
418
+ 0.1803891807794571,
419
+ 0.18245069682598114,
420
+ 0.18170282244682312,
421
+ 0.16513392329216003,
422
+ 0.18308500945568085,
423
+ 0.15528549253940582,
424
+ 0.19069239497184753,
425
+ 0.1680574268102646,
426
+ 0.17276549339294434,
427
+ 0.1893104612827301,
428
+ 0.16881202161312103,
429
+ 0.1330322027206421,
430
+ 0.14430834352970123,
431
+ 0.140597864985466,
432
+ 0.18033921718597412,
433
+ 0.14561453461647034,
434
+ 0.15060050785541534,
435
+ 0.14696337282657623,
436
+ 0.1570371687412262,
437
+ 0.13974200189113617,
438
+ 0.15948486328125,
439
+ 0.14828327298164368,
440
+ 0.17222817242145538,
441
+ 0.15832778811454773,
442
+ 0.13328474760055542,
443
+ 0.1352883279323578,
444
+ 0.146428182721138,
445
+ 0.14305901527404785,
446
+ 0.14752517640590668,
447
+ 0.14967180788516998,
448
+ 0.16033518314361572,
449
+ 0.15050823986530304,
450
+ 0.12478389590978622,
451
+ 0.13973259925842285,
452
+ 0.16433657705783844,
453
+ 0.15506653487682343,
454
+ 0.10899337381124496,
455
+ 0.13583232462406158,
456
+ 0.1452769786119461,
457
+ 0.1488191783428192,
458
+ 0.16326965391635895,
459
+ 0.13210436701774597,
460
+ 0.1348404884338379,
461
+ 0.15470661222934723,
462
+ 0.0996326431632042,
463
+ 0.14851771295070648,
464
+ 0.14729616045951843,
465
+ 0.13585257530212402,
466
+ 0.16751345992088318,
467
+ 0.12513908743858337,
468
+ 0.12792132794857025,
469
+ 0.14990487694740295,
470
+ 0.14224524796009064,
471
+ 0.14584632217884064,
472
+ 0.1600850373506546,
473
+ 0.12832213938236237,
474
+ 0.12461556494235992,
475
+ 0.14382870495319366
476
  ],
477
  "lr": [
478
  1.6752136752136756e-05,
 
677
  20
678
  ],
679
  "eval_accuracy": [
680
+ 0.002105263157894737,
681
+ 0.06736842105263158,
682
+ 0.17894736842105263,
683
+ 0.28526315789473683,
684
+ 0.3336842105263158,
685
+ 0.32842105263157895,
686
+ 0.4421052631578947,
687
+ 0.4168421052631579,
688
+ 0.43894736842105264,
689
+ 0.4305263157894737,
690
+ 0.4431578947368421,
691
+ 0.5505263157894736,
692
+ 0.5305263157894737,
693
+ 0.52,
694
+ 0.5726315789473684,
695
+ 0.5810526315789474,
696
+ 0.5768421052631579,
697
+ 0.5810526315789474,
698
+ 0.5852631578947368,
699
+ 0.5863157894736842
700
  ]
701
  },
702
+ "final_accuracy": 0.46307692307692305,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
+ "n_per_split": 100
710
  },
711
  "splits": {
712
  "add_S0": {
713
+ "full_accuracy": 0.78,
714
+ "digit_accuracy": 0.9614285714285714,
715
+ "n_examples": 100,
716
  "per_subtask": {
717
  "SA": {
718
+ "accuracy": 0.9636363636363636,
719
+ "count": 605
720
  },
721
  "SS": {
722
+ "accuracy": 0.9473684210526315,
723
+ "count": 95
724
  }
725
  }
726
  },
727
  "add_S1": {
728
+ "full_accuracy": 0.76,
729
  "digit_accuracy": 0.9628571428571429,
730
+ "n_examples": 100,
731
  "per_subtask": {
732
  "SA": {
733
+ "accuracy": 0.9607843137254902,
734
+ "count": 204
735
  },
736
  "SC": {
737
+ "accuracy": 0.9704142011834319,
738
+ "count": 169
739
  },
740
  "SS": {
741
+ "accuracy": 0.9354838709677419,
742
+ "count": 31
743
  },
744
  "UC": {
745
+ "accuracy": 0.9628378378378378,
746
+ "count": 296
747
  }
748
  }
749
  },
750
  "add_S2": {
751
+ "full_accuracy": 0.5,
752
+ "digit_accuracy": 0.9114285714285715,
753
+ "n_examples": 100,
754
  "per_subtask": {
755
  "SA": {
756
+ "accuracy": 0.9631901840490797,
757
+ "count": 163
758
  },
759
  "SC": {
760
+ "accuracy": 0.9461538461538461,
761
+ "count": 130
762
  },
763
  "SS": {
764
+ "accuracy": 0.9195402298850575,
765
+ "count": 87
766
  },
767
  "UC": {
768
+ "accuracy": 0.8078817733990148,
769
+ "count": 203
770
  },
771
  "US": {
772
+ "accuracy": 0.9743589743589743,
773
+ "count": 117
774
  }
775
  }
776
  },
777
  "add_S3": {
778
+ "full_accuracy": 0.14,
779
+ "digit_accuracy": 0.7757142857142857,
780
+ "n_examples": 100,
781
  "per_subtask": {
782
  "SA": {
783
+ "accuracy": 0.9752066115702479,
784
+ "count": 121
785
  },
786
  "SC": {
787
+ "accuracy": 0.9256198347107438,
788
+ "count": 121
789
  },
790
  "SS": {
791
+ "accuracy": 0.9591836734693877,
792
+ "count": 49
793
  },
794
  "UC": {
795
+ "accuracy": 0.6129032258064516,
796
+ "count": 186
797
  },
798
  "US": {
799
+ "accuracy": 0.6816143497757847,
800
+ "count": 223
801
  }
802
  }
803
  },
804
  "add_S4": {
805
+ "full_accuracy": 0.22,
806
+ "digit_accuracy": 0.7,
807
+ "n_examples": 100,
808
  "per_subtask": {
809
  "SA": {
810
  "accuracy": 1.0,
811
+ "count": 104
812
  },
813
  "SC": {
814
+ "accuracy": 0.9905660377358491,
815
+ "count": 106
816
  },
817
  "SS": {
818
  "accuracy": 1.0,
819
+ "count": 23
820
  },
821
  "UC": {
822
+ "accuracy": 0.61875,
823
+ "count": 160
824
  },
825
  "US": {
826
+ "accuracy": 0.5179153094462541,
827
+ "count": 307
828
  }
829
  }
830
  },
831
  "add_S5": {
832
+ "full_accuracy": 0.21,
833
+ "digit_accuracy": 0.5942857142857143,
834
+ "n_examples": 100,
835
  "per_subtask": {
836
  "SA": {
837
  "accuracy": 1.0,
838
+ "count": 100
839
  },
840
  "SC": {
841
+ "accuracy": 0.97,
842
+ "count": 100
843
  },
844
  "UC": {
845
+ "accuracy": 0.43,
846
+ "count": 100
847
  },
848
  "US": {
849
+ "accuracy": 0.44,
850
+ "count": 400
851
  }
852
  }
853
  },
854
  "add_S6": {
855
+ "full_accuracy": 0.27,
856
+ "digit_accuracy": 0.5028571428571429,
857
+ "n_examples": 100,
858
  "per_subtask": {
859
  "SC": {
860
  "accuracy": 1.0,
861
+ "count": 100
862
  },
863
  "UC": {
864
+ "accuracy": 0.39,
865
+ "count": 100
866
  },
867
  "US": {
868
+ "accuracy": 0.426,
869
+ "count": 500
870
  }
871
  }
872
  },
873
  "add_random": {
874
  "full_accuracy": 0.765,
875
+ "digit_accuracy": 0.96,
876
  "n_examples": 200,
877
  "per_subtask": {
878
  "SA": {
879
+ "accuracy": 0.9753914988814317,
880
+ "count": 447
881
  },
882
  "SC": {
883
+ "accuracy": 0.984375,
884
+ "count": 320
885
  },
886
  "SS": {
887
+ "accuracy": 0.9642857142857143,
888
+ "count": 56
889
  },
890
  "UC": {
891
+ "accuracy": 0.9376181474480151,
892
+ "count": 529
893
  },
894
  "US": {
895
+ "accuracy": 0.8958333333333334,
896
+ "count": 48
897
  }
898
  }
899
  },
900
  "add_C1": {
901
+ "full_accuracy": 0.83,
902
  "digit_accuracy": 0.9742857142857143,
903
+ "n_examples": 100,
904
  "per_subtask": {
905
  "SA": {
906
  "accuracy": 0.976,
907
+ "count": 500
908
  },
909
  "SC": {
910
  "accuracy": 1.0,
911
+ "count": 100
912
  },
913
  "UC": {
914
  "accuracy": 0.94,
915
+ "count": 100
916
  }
917
  }
918
  },
919
  "add_C2": {
920
+ "full_accuracy": 0.69,
921
+ "digit_accuracy": 0.9485714285714286,
922
+ "n_examples": 100,
923
  "per_subtask": {
924
  "SA": {
925
+ "accuracy": 0.97,
926
+ "count": 400
927
  },
928
  "SC": {
929
  "accuracy": 1.0,
930
+ "count": 100
931
  },
932
  "UC": {
933
+ "accuracy": 0.8461538461538461,
934
+ "count": 156
935
  },
936
  "US": {
937
+ "accuracy": 1.0,
938
+ "count": 44
939
  }
940
  }
941
  },
942
  "add_C3": {
943
+ "full_accuracy": 0.5,
944
+ "digit_accuracy": 0.9085714285714286,
945
+ "n_examples": 100,
946
  "per_subtask": {
947
  "SA": {
948
+ "accuracy": 0.9933333333333333,
949
+ "count": 300
950
  },
951
  "SC": {
952
  "accuracy": 1.0,
953
+ "count": 100
954
  },
955
  "UC": {
956
+ "accuracy": 0.7587939698492462,
957
+ "count": 199
958
  },
959
  "US": {
960
+ "accuracy": 0.8613861386138614,
961
+ "count": 101
962
  }
963
  }
964
  },
965
  "add_C4": {
966
+ "full_accuracy": 0.47,
967
+ "digit_accuracy": 0.8842857142857142,
968
+ "n_examples": 100,
969
  "per_subtask": {
970
  "SA": {
971
+ "accuracy": 0.985,
972
+ "count": 200
973
  },
974
  "SC": {
975
+ "accuracy": 0.99,
976
+ "count": 100
977
  },
978
  "UC": {
979
+ "accuracy": 0.8143939393939394,
980
+ "count": 264
981
  },
982
  "US": {
983
+ "accuracy": 0.7941176470588235,
984
+ "count": 136
985
  }
986
  }
987
  },
988
  "add_C5": {
989
+ "full_accuracy": 0.29,
990
+ "digit_accuracy": 0.8228571428571428,
991
+ "n_examples": 100,
992
  "per_subtask": {
993
  "SA": {
994
  "accuracy": 1.0,
995
+ "count": 100
996
  },
997
  "SC": {
998
  "accuracy": 1.0,
999
+ "count": 100
1000
  },
1001
  "UC": {
1002
+ "accuracy": 0.7483870967741936,
1003
+ "count": 310
1004
  },
1005
  "US": {
1006
+ "accuracy": 0.7578947368421053,
1007
+ "count": 190
1008
  }
1009
  }
1010
  },
1011
  "add_C6": {
1012
+ "full_accuracy": 0.33,
1013
+ "digit_accuracy": 0.8,
1014
+ "n_examples": 100,
1015
  "per_subtask": {
1016
  "SC": {
1017
  "accuracy": 1.0,
1018
+ "count": 100
1019
  },
1020
  "UC": {
1021
+ "accuracy": 0.8081081081081081,
1022
+ "count": 370
1023
  },
1024
  "US": {
1025
+ "accuracy": 0.7,
1026
+ "count": 230
1027
  }
1028
  }
1029
  },
1030
  "sub_M0": {
1031
+ "full_accuracy": 0.84,
1032
+ "digit_accuracy": 0.9771428571428571,
1033
+ "n_examples": 100,
1034
  "per_subtask": {
1035
  "MD": {
1036
+ "accuracy": 0.9739837398373984,
1037
+ "count": 615
1038
  },
1039
  "ME": {
1040
  "accuracy": 1.0,
1041
+ "count": 85
1042
  }
1043
  }
1044
  },
1045
  "sub_M1": {
1046
+ "full_accuracy": 0.83,
1047
+ "digit_accuracy": 0.9728571428571429,
1048
+ "n_examples": 100,
1049
  "per_subtask": {
1050
  "MD": {
1051
+ "accuracy": 0.9897260273972602,
1052
+ "count": 292
1053
  },
1054
  "MB": {
1055
+ "accuracy": 0.9791666666666666,
1056
+ "count": 144
1057
  },
1058
  "ME": {
1059
+ "accuracy": 0.96,
1060
+ "count": 25
1061
  },
1062
  "UB": {
1063
+ "accuracy": 0.9497907949790795,
1064
+ "count": 239
1065
  }
1066
  }
1067
  },
1068
  "sub_M2": {
1069
+ "full_accuracy": 0.3,
1070
+ "digit_accuracy": 0.8842857142857142,
1071
+ "n_examples": 100,
1072
  "per_subtask": {
1073
  "MD": {
1074
+ "accuracy": 0.985781990521327,
1075
+ "count": 211
1076
  },
1077
  "MB": {
1078
+ "accuracy": 0.9565217391304348,
1079
+ "count": 115
1080
  },
1081
  "ME": {
1082
+ "accuracy": 0.9764705882352941,
1083
+ "count": 85
1084
  },
1085
  "UB": {
1086
+ "accuracy": 0.6243093922651933,
1087
+ "count": 181
1088
  },
1089
  "UD": {
1090
+ "accuracy": 0.9722222222222222,
1091
+ "count": 108
1092
  }
1093
  }
1094
  },
1095
  "sub_M3": {
1096
+ "full_accuracy": 0.11,
1097
+ "digit_accuracy": 0.7928571428571428,
1098
+ "n_examples": 100,
1099
  "per_subtask": {
1100
  "MD": {
1101
  "accuracy": 1.0,
1102
+ "count": 179
1103
  },
1104
  "MB": {
1105
+ "accuracy": 0.941747572815534,
1106
+ "count": 103
1107
  },
1108
  "ME": {
1109
  "accuracy": 1.0,
1110
+ "count": 56
1111
  },
1112
  "UB": {
1113
+ "accuracy": 0.5100671140939598,
1114
+ "count": 149
1115
  },
1116
  "UD": {
1117
+ "accuracy": 0.6901408450704225,
1118
+ "count": 213
1119
  }
1120
  }
1121
  },
1122
  "sub_M4": {
1123
+ "full_accuracy": 0.07,
1124
+ "digit_accuracy": 0.6642857142857143,
1125
+ "n_examples": 100,
1126
  "per_subtask": {
1127
  "MD": {
1128
+ "accuracy": 0.995,
1129
+ "count": 200
1130
  },
1131
  "MB": {
1132
  "accuracy": 1.0,
1133
+ "count": 100
1134
  },
1135
  "UB": {
1136
+ "accuracy": 0.46,
1137
+ "count": 100
1138
  },
1139
  "UD": {
1140
+ "accuracy": 0.4,
1141
+ "count": 300
1142
  }
1143
  }
1144
  },
1145
  "sub_M5": {
1146
+ "full_accuracy": 0.04,
1147
+ "digit_accuracy": 0.5014285714285714,
1148
+ "n_examples": 100,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
+ "count": 100
1153
  },
1154
  "MB": {
1155
  "accuracy": 1.0,
1156
+ "count": 100
1157
  },
1158
  "UB": {
1159
+ "accuracy": 0.51,
1160
+ "count": 100
1161
  },
1162
  "UD": {
1163
+ "accuracy": 0.25,
1164
+ "count": 400
1165
  }
1166
  }
1167
  },
1168
  "sub_random": {
1169
+ "full_accuracy": 0.775,
1170
+ "digit_accuracy": 0.9664285714285714,
1171
  "n_examples": 200,
1172
  "per_subtask": {
1173
  "MD": {
1174
+ "accuracy": 0.98,
1175
+ "count": 600
1176
  },
1177
  "MB": {
1178
+ "accuracy": 0.9737827715355806,
1179
+ "count": 267
1180
  },
1181
  "ME": {
1182
  "accuracy": 1.0,
1183
  "count": 53
1184
  },
1185
  "UB": {
1186
+ "accuracy": 0.9384965831435079,
1187
+ "count": 439
1188
  },
1189
  "UD": {
1190
+ "accuracy": 0.975609756097561,
1191
+ "count": 41
1192
  }
1193
  }
1194
  },
1195
  "sub_B3": {
1196
+ "full_accuracy": 0.38,
1197
+ "digit_accuracy": 0.8757142857142857,
1198
+ "n_examples": 100,
1199
  "per_subtask": {
1200
  "MD": {
1201
+ "accuracy": 0.9933333333333333,
1202
+ "count": 300
1203
  },
1204
  "MB": {
1205
+ "accuracy": 1.0,
1206
+ "count": 100
1207
  },
1208
  "UB": {
1209
+ "accuracy": 0.7208121827411168,
1210
+ "count": 197
1211
  },
1212
  "UD": {
1213
+ "accuracy": 0.7087378640776699,
1214
+ "count": 103
1215
  }
1216
  }
1217
  },
1218
  "sub_B4": {
1219
+ "full_accuracy": 0.2,
1220
+ "digit_accuracy": 0.8142857142857143,
1221
+ "n_examples": 100,
1222
  "per_subtask": {
1223
  "MD": {
1224
+ "accuracy": 0.99,
1225
+ "count": 200
1226
  },
1227
  "MB": {
1228
  "accuracy": 1.0,
1229
+ "count": 100
1230
  },
1231
  "UB": {
1232
+ "accuracy": 0.6963562753036437,
1233
+ "count": 247
1234
  },
1235
  "UD": {
1236
+ "accuracy": 0.6535947712418301,
1237
+ "count": 153
1238
  }
1239
  }
1240
  },
1241
  "sub_B5": {
1242
+ "full_accuracy": 0.21,
1243
+ "digit_accuracy": 0.77,
1244
+ "n_examples": 100,
1245
  "per_subtask": {
1246
  "MD": {
1247
  "accuracy": 1.0,
1248
+ "count": 100
1249
  },
1250
  "MB": {
1251
  "accuracy": 1.0,
1252
+ "count": 100
1253
  },
1254
  "UB": {
1255
+ "accuracy": 0.714765100671141,
1256
+ "count": 298
1257
  },
1258
  "UD": {
1259
+ "accuracy": 0.6237623762376238,
1260
+ "count": 202
1261
  }
1262
  }
1263
  }
1264
  },
1265
  "summary": {
1266
+ "overall_accuracy": 0.46307692307692305,
1267
+ "digit_accuracy": 0.8403846153846154,
1268
+ "total_examples": 2600,
1269
  "n_splits": 24
1270
  }
1271
  }
add_sub_baseline_25K_1L3H510d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5b3d0a4f5926074c2ad67eaf23945b1b503b7915d0fbafb817b3c7222125cc7
3
  size 634642298
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc1136aa94b4c06fff5761cf5e96a59cb0caae50c49067b0a3d00ef1072485dd
3
  size 634642298
add_sub_baseline_25K_1L3H510d/train_config.json CHANGED
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 158584246,
71
  "run_name": "add_sub_baseline_25K_1L3H510d",
72
- "git_commit": "dc8dd776fb0c30a4c9073052dcc5e943e0fd80c6",
73
- "timestamp": "2026-04-13T06:26:07.104526+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "wpi2qkei",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/wpi2qkei",
81
- "final_accuracy": 0.49541666666666667,
82
- "sft_accuracy": 0.49541666666666667,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
69
  "no_wandb": false,
70
  "n_params": 158584246,
71
  "run_name": "add_sub_baseline_25K_1L3H510d",
72
+ "git_commit": "f835493c19eb98267697007042c9d440cad2afbb",
73
+ "timestamp": "2026-04-15T12:26:18.246615+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_25K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "sft",
80
+ "wandb_run_id": "sqn2003t",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/sqn2003t",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.46307692307692305,
86
+ "sft_accuracy": 0.46307692307692305,
87
  "eval_method": "ArithmeticEvaluator"
88
  }