amirali1985 commited on
Commit
0bcafbc
·
verified ·
1 Parent(s): c0aa9f3

Upload add_sub_sorl_v1_abs30_K1_10K_2L1H128d

Browse files
add_sub_sorl_v1_abs30_K1_10K_2L1H128d/metrics.json CHANGED
@@ -63,377 +63,377 @@
63
  3133
64
  ],
65
  "loss": [
66
- 11.109960556030273,
67
- 9.487344741821289,
68
- 11.728270530700684,
69
- 13.046238899230957,
70
- 12.526779174804688,
71
- 11.766132354736328,
72
- 10.708438873291016,
73
- 9.716384887695312,
74
- 9.139434814453125,
75
- 8.302337646484375,
76
- 7.700814247131348,
77
- 7.2936859130859375,
78
- 6.779129981994629,
79
- 6.4132561683654785,
80
- 6.046442031860352,
81
- 5.605899333953857,
82
- 5.361966133117676,
83
- 4.989933013916016,
84
- 4.719845294952393,
85
- 4.400616645812988,
86
- 4.159879207611084,
87
- 3.9028968811035156,
88
- 3.719845771789551,
89
- 3.6537272930145264,
90
- 3.5202412605285645,
91
- 3.3910393714904785,
92
- 3.424415111541748,
93
- 3.3224616050720215,
94
- 3.213174343109131,
95
- 3.204472064971924,
96
- 3.1120765209198,
97
- 3.1674718856811523,
98
- 3.0288727283477783,
99
- 2.898710250854492,
100
- 3.0461621284484863,
101
- 2.9120559692382812,
102
- 2.9968926906585693,
103
- 2.7610628604888916,
104
- 2.805112600326538,
105
- 2.8641648292541504,
106
- 2.6749207973480225,
107
- 2.742555618286133,
108
- 2.5006370544433594,
109
- 2.5789735317230225,
110
- 2.5932741165161133,
111
- 2.3684425354003906,
112
- 2.1633200645446777,
113
- 1.9918112754821777,
114
- 1.792123794555664,
115
- 1.8161230087280273,
116
- 1.2393646240234375,
117
- 1.4787614345550537,
118
- 1.1252933740615845,
119
- 1.4857960939407349,
120
- 0.9767551422119141,
121
- 0.9414909482002258,
122
- 1.2717292308807373,
123
- 1.1875169277191162,
124
- 0.974738597869873,
125
- 0.9622255563735962
126
  ],
127
  "base_loss": [
128
- 11.86424446105957,
129
- 11.58115291595459,
130
- 11.0537748336792,
131
- 10.185273170471191,
132
- 9.717636108398438,
133
- 9.179335594177246,
134
- 8.59729290008545,
135
- 7.95011568069458,
136
- 7.552673816680908,
137
- 6.889076232910156,
138
- 6.402405738830566,
139
- 6.064190864562988,
140
- 5.633248329162598,
141
- 5.255946636199951,
142
- 4.898728370666504,
143
- 4.464768409729004,
144
- 4.268970489501953,
145
- 3.9639620780944824,
146
- 3.6311986446380615,
147
- 3.3756792545318604,
148
- 3.146327495574951,
149
- 2.862344980239868,
150
- 2.6884384155273438,
151
- 2.6278915405273438,
152
- 2.5437910556793213,
153
- 2.4695563316345215,
154
- 2.347290277481079,
155
- 2.332256317138672,
156
- 2.310865640640259,
157
- 2.2869811058044434,
158
- 2.1933040618896484,
159
- 2.2128100395202637,
160
- 2.1955173015594482,
161
- 2.168916940689087,
162
- 2.2345030307769775,
163
- 2.1226439476013184,
164
- 2.208935022354126,
165
- 2.1581814289093018,
166
- 2.1644232273101807,
167
- 2.1009058952331543,
168
- 2.1047756671905518,
169
- 2.0930335521698,
170
- 2.0997116565704346,
171
- 2.1156744956970215,
172
- 2.1169261932373047,
173
- 2.1085855960845947,
174
- 2.0818076133728027,
175
- 2.14859938621521,
176
- 2.0613577365875244,
177
- 2.130605459213257,
178
- 2.01482892036438,
179
- 2.1084914207458496,
180
- 2.082641124725342,
181
- 2.1299586296081543,
182
- 2.1353821754455566,
183
- 2.0989015102386475,
184
- 2.130192995071411,
185
- 2.1105315685272217,
186
- 2.1727616786956787,
187
- 2.1143510341644287
188
  ],
189
  "info_loss": [
190
- -0.801727294921875,
191
- -0.8869190216064453,
192
- -0.5174455642700195,
193
- -0.17232799530029297,
194
- -0.08138847351074219,
195
- -0.033415794372558594,
196
- -0.022771835327148438,
197
- -0.020429134368896484,
198
- -0.013097763061523438,
199
- -0.009570598602294922,
200
- -0.010950565338134766,
201
- -0.011286258697509766,
202
- -0.015323638916015625,
203
- -0.010664939880371094,
204
- -0.009275436401367188,
205
- -0.0075473785400390625,
206
- -0.011253833770751953,
207
- -0.016321420669555664,
208
- -0.009073734283447266,
209
- -0.014719724655151367,
210
- -0.015111207962036133,
211
- -0.011531829833984375,
212
- -0.011838674545288086,
213
- -0.011916875839233398,
214
- -0.01653766632080078,
215
- -0.021875381469726562,
216
- -0.0058193206787109375,
217
- -0.014473438262939453,
218
- -0.022913217544555664,
219
- -0.021147489547729492,
220
- -0.02110123634338379,
221
- -0.017413616180419922,
222
- -0.02963995933532715,
223
- -0.03967618942260742,
224
- -0.030973196029663086,
225
- -0.033582448959350586,
226
- -0.033734798431396484,
227
- -0.05214858055114746,
228
- -0.048038482666015625,
229
- -0.03609871864318848,
230
- -0.0551145076751709,
231
- -0.04743504524230957,
232
- -0.07212638854980469,
233
- -0.065765380859375,
234
- -0.06428670883178711,
235
- -0.08607649803161621,
236
- -0.1040034294128418,
237
- -0.1276688575744629,
238
- -0.13939201831817627,
239
- -0.14394164085388184,
240
- -0.1898949146270752,
241
- -0.17540061473846436,
242
- -0.20797789096832275,
243
- -0.17642903327941895,
244
- -0.22839045524597168,
245
- -0.2279418706893921,
246
- -0.19848155975341797,
247
- -0.20455431938171387,
248
- -0.2321758270263672,
249
- -0.2275463342666626
250
  ],
251
  "abs_loss": [
252
- 3.400404691696167,
253
- 3.3852317333221436,
254
- 3.3510189056396484,
255
- 3.2838191986083984,
256
- 3.229877471923828,
257
- 3.163647413253784,
258
- 3.0793673992156982,
259
- 3.0087764263153076,
260
- 2.875743865966797,
261
- 2.81463623046875,
262
- 2.793081521987915,
263
- 2.756037473678589,
264
- 2.7811977863311768,
265
- 2.746596336364746,
266
- 2.7444820404052734,
267
- 2.71988844871521,
268
- 2.747046709060669,
269
- 2.710127830505371,
270
- 2.7222626209259033,
271
- 2.735506057739258,
272
- 2.739694356918335,
273
- 2.7228400707244873,
274
- 2.7128937244415283,
275
- 2.7144927978515625,
276
- 2.7234694957733154,
277
- 2.7356929779052734,
278
- 2.7130095958709717,
279
- 2.736820936203003,
280
- 2.7188408374786377,
281
- 2.709321975708008,
282
- 2.731154203414917,
283
- 2.7337539196014404,
284
- 2.7499465942382812,
285
- 2.7255847454071045,
286
- 2.6828200817108154,
287
- 2.7293548583984375,
288
- 2.7345211505889893,
289
- 2.7277286052703857,
290
- 2.6977577209472656,
291
- 2.733896255493164,
292
- 2.709213972091675,
293
- 2.7374496459960938,
294
- 2.722898483276367,
295
- 2.7119290828704834,
296
- 2.696190118789673,
297
- 2.7109363079071045,
298
- 2.721041679382324,
299
- 2.7053773403167725,
300
- 2.75048828125,
301
- 2.748612642288208,
302
- 2.732274293899536,
303
- 2.737445831298828,
304
- 2.718538999557495,
305
- 2.693471670150757,
306
- 2.744067430496216,
307
- 2.7114646434783936,
308
- 2.7530784606933594,
309
- 2.7140703201293945,
310
- 2.7263448238372803,
311
- 2.7224409580230713
312
  ],
313
  "zipf_loss": [
314
- 6.922948837280273,
315
- 6.436858654022217,
316
- 5.513849258422852,
317
- 4.255864143371582,
318
- 3.3000402450561523,
319
- 2.6045901775360107,
320
- 2.0309271812438965,
321
- 1.6696820259094238,
322
- 1.4301646947860718,
323
- 1.227504014968872,
324
- 1.1286059617996216,
325
- 1.06675386428833,
326
- 1.0209985971450806,
327
- 0.9892993569374084,
328
- 0.966019868850708,
329
- 0.94461590051651,
330
- 0.930829644203186,
331
- 0.9181725382804871,
332
- 0.9071574807167053,
333
- 0.898584246635437,
334
- 0.8906944990158081,
335
- 0.8835861086845398,
336
- 0.8785046339035034,
337
- 0.8735551834106445,
338
- 0.8694800138473511,
339
- 0.8666675090789795,
340
- 0.8640170097351074,
341
- 0.8612576127052307,
342
- 0.8595567941665649,
343
- 0.8580335378646851,
344
- 0.8566694259643555,
345
- 0.8554226160049438,
346
- 0.8547603487968445,
347
- 0.8539966344833374,
348
- 0.8531091809272766,
349
- 0.8523011803627014,
350
- 0.851853609085083,
351
- 0.8515943288803101,
352
- 0.851298451423645,
353
- 0.8508565425872803,
354
- 0.8503687381744385,
355
- 0.8501275777816772,
356
- 0.8498992919921875,
357
- 0.8497598767280579,
358
- 0.8495959639549255,
359
- 0.8495282530784607,
360
- 0.849442720413208,
361
- 0.8493627309799194,
362
- 0.8496373295783997,
363
- 0.8500726819038391,
364
- 0.850257396697998,
365
- 0.8505315780639648,
366
- 0.8505772352218628,
367
- 0.8507806062698364,
368
- 0.8508707880973816,
369
- 0.850861668586731,
370
- 0.8510439991950989,
371
- 0.8511215448379517,
372
- 0.8511006832122803,
373
- 0.8510937690734863
374
  ],
375
  "denoise_loss": [],
376
  "ortho_loss": [
377
- 0.7170330882072449,
378
- 0.6676084399223328,
379
- 0.4536101818084717,
380
- 0.3363282084465027,
381
- 0.2717445492744446,
382
- 0.25135740637779236,
383
- 0.28221043944358826,
384
- 0.30178341269493103,
385
- 0.31571823358535767,
386
- 0.3284376859664917,
387
- 0.3331630527973175,
388
- 0.34331825375556946,
389
- 0.35866957902908325,
390
- 0.36964812874794006,
391
- 0.3801919221878052,
392
- 0.37727099657058716,
393
- 0.3913383483886719,
394
- 0.39448729157447815,
395
- 0.3997330367565155,
396
- 0.4072052836418152,
397
- 0.3953169882297516,
398
- 0.3841864764690399,
399
- 0.3772575259208679,
400
- 0.3756064772605896,
401
- 0.3684244751930237,
402
- 0.3618350923061371,
403
- 0.36770451068878174,
404
- 0.36369138956069946,
405
- 0.3568985164165497,
406
- 0.3539384603500366,
407
- 0.3554384112358093,
408
- 0.3487551510334015,
409
- 0.3574298024177551,
410
- 0.3522641360759735,
411
- 0.3537253141403198,
412
- 0.3676007091999054,
413
- 0.37232914566993713,
414
- 0.3805965781211853,
415
- 0.38131752610206604,
416
- 0.3908657133579254,
417
- 0.40144118666648865,
418
- 0.4037792682647705,
419
- 0.4099784791469574,
420
- 0.41942158341407776,
421
- 0.42823660373687744,
422
- 0.43726399540901184,
423
- 0.44310638308525085,
424
- 0.452615886926651,
425
- 0.4632907509803772,
426
- 0.4734920859336853,
427
- 0.4806124269962311,
428
- 0.4873966872692108,
429
- 0.49242761731147766,
430
- 0.49730294942855835,
431
- 0.5018022656440735,
432
- 0.5048937797546387,
433
- 0.5078352093696594,
434
- 0.5118813514709473,
435
- 0.5134710669517517,
436
- 0.5150383114814758
437
  ],
438
  "lr": [
439
  9.800000000000001e-06,
@@ -521,491 +521,491 @@
521
  3083
522
  ],
523
  "eval_accuracy": [
524
- 0.0,
525
- 0.0,
526
- 0.0,
527
- 0.0,
528
- 0.0,
529
- 0.0,
530
- 0.0,
531
  0.02,
532
- 0.01,
533
- 0.0,
534
- 0.0,
535
- 0.01,
536
- 0.0,
537
- 0.0,
538
  0.02,
539
- 0.0,
540
- 0.0,
541
- 0.01,
542
- 0.01,
543
- 0.01
 
 
 
 
 
 
 
 
 
 
 
 
 
544
  ]
545
  },
546
- "final_accuracy": 0.00125,
547
  "sft_eval": {
548
  "config": {
549
  "ops": "add_sub",
550
  "K": null,
551
  "mode": "sft",
552
  "n_digits": 6,
553
- "n_per_split": 50
554
  },
555
  "splits": {
556
  "add_S0": {
557
  "full_accuracy": 0.0,
558
- "digit_accuracy": 0.32285714285714284,
559
- "n_examples": 50,
560
  "per_subtask": {
561
  "SA": {
562
- "accuracy": 0.22033898305084745,
563
- "count": 295
564
  },
565
  "SS": {
566
- "accuracy": 0.8727272727272727,
567
- "count": 55
568
  }
569
  }
570
  },
571
  "add_S1": {
572
  "full_accuracy": 0.0,
573
- "digit_accuracy": 0.18571428571428572,
574
- "n_examples": 50,
575
  "per_subtask": {
576
  "SA": {
577
- "accuracy": 0.2619047619047619,
578
- "count": 126
579
  },
580
  "SC": {
581
- "accuracy": 0.0759493670886076,
582
- "count": 79
583
  },
584
  "SS": {
585
- "accuracy": 0.6190476190476191,
586
- "count": 21
587
  },
588
  "UC": {
589
- "accuracy": 0.10483870967741936,
590
- "count": 124
591
  }
592
  }
593
  },
594
  "add_S2": {
595
  "full_accuracy": 0.0,
596
- "digit_accuracy": 0.2542857142857143,
597
- "n_examples": 50,
598
  "per_subtask": {
599
  "SA": {
600
- "accuracy": 0.3466666666666667,
601
- "count": 75
602
  },
603
  "SC": {
604
- "accuracy": 0.08064516129032258,
605
- "count": 62
606
  },
607
  "SS": {
608
- "accuracy": 0.717948717948718,
609
- "count": 39
610
  },
611
  "UC": {
612
- "accuracy": 0.11711711711711711,
613
- "count": 111
614
  },
615
  "US": {
616
- "accuracy": 0.2698412698412698,
617
- "count": 63
618
  }
619
  }
620
  },
621
  "add_S3": {
622
  "full_accuracy": 0.0,
623
- "digit_accuracy": 0.2542857142857143,
624
- "n_examples": 50,
625
  "per_subtask": {
626
  "SA": {
627
- "accuracy": 0.43333333333333335,
628
- "count": 60
629
  },
630
  "SC": {
631
- "accuracy": 0.08771929824561403,
632
- "count": 57
633
  },
634
  "SS": {
635
- "accuracy": 0.6842105263157895,
636
- "count": 19
637
  },
638
  "UC": {
639
- "accuracy": 0.125,
640
- "count": 104
641
  },
642
  "US": {
643
- "accuracy": 0.2909090909090909,
644
- "count": 110
645
  }
646
  }
647
  },
648
  "add_S4": {
649
  "full_accuracy": 0.0,
650
- "digit_accuracy": 0.19714285714285715,
651
- "n_examples": 50,
652
  "per_subtask": {
653
  "SA": {
654
- "accuracy": 0.4166666666666667,
655
- "count": 48
656
  },
657
  "SC": {
658
- "accuracy": 0.038461538461538464,
659
- "count": 52
660
  },
661
  "SS": {
662
- "accuracy": 0.8571428571428571,
663
- "count": 7
664
  },
665
  "UC": {
666
- "accuracy": 0.1348314606741573,
667
- "count": 89
668
  },
669
  "US": {
670
- "accuracy": 0.18831168831168832,
671
- "count": 154
672
  }
673
  }
674
  },
675
  "add_S5": {
676
  "full_accuracy": 0.0,
677
- "digit_accuracy": 0.24,
678
- "n_examples": 50,
679
  "per_subtask": {
680
  "SA": {
681
- "accuracy": 0.46,
682
- "count": 50
683
  },
684
  "SC": {
685
- "accuracy": 0.12,
686
- "count": 50
687
  },
688
  "UC": {
689
  "accuracy": 0.12,
690
- "count": 50
691
  },
692
  "US": {
693
- "accuracy": 0.245,
694
- "count": 200
695
  }
696
  }
697
  },
698
  "add_S6": {
699
- "full_accuracy": 0.06,
700
- "digit_accuracy": 0.26,
701
- "n_examples": 50,
702
  "per_subtask": {
703
  "SC": {
704
- "accuracy": 0.08,
705
- "count": 50
706
  },
707
  "UC": {
708
- "accuracy": 0.28,
709
- "count": 50
710
  },
711
  "US": {
712
- "accuracy": 0.292,
713
- "count": 250
714
  }
715
  }
716
  },
717
  "add_random": {
718
  "full_accuracy": 0.0,
719
- "digit_accuracy": 0.15857142857142856,
720
  "n_examples": 200,
721
  "per_subtask": {
722
  "SA": {
723
- "accuracy": 0.22041763341067286,
724
- "count": 431
725
  },
726
  "SC": {
727
- "accuracy": 0.04430379746835443,
728
- "count": 316
729
  },
730
  "SS": {
731
- "accuracy": 0.7948717948717948,
732
- "count": 39
733
  },
734
  "UC": {
735
- "accuracy": 0.11071428571428571,
736
- "count": 560
737
  },
738
  "US": {
739
- "accuracy": 0.37037037037037035,
740
- "count": 54
741
  }
742
  }
743
  },
744
  "add_C1": {
745
  "full_accuracy": 0.0,
746
- "digit_accuracy": 0.14,
747
- "n_examples": 50,
748
  "per_subtask": {
749
  "SA": {
750
- "accuracy": 0.192,
751
- "count": 250
752
  },
753
  "SC": {
754
- "accuracy": 0.02,
755
- "count": 50
756
  },
757
  "UC": {
758
  "accuracy": 0.0,
759
- "count": 50
760
  }
761
  }
762
  },
763
  "add_C2": {
764
  "full_accuracy": 0.0,
765
- "digit_accuracy": 0.14,
766
- "n_examples": 50,
767
  "per_subtask": {
768
  "SA": {
769
- "accuracy": 0.21,
770
- "count": 200
771
  },
772
  "SC": {
773
- "accuracy": 0.02,
774
- "count": 50
775
  },
776
  "UC": {
777
- "accuracy": 0.04819277108433735,
778
- "count": 83
779
  },
780
  "US": {
781
- "accuracy": 0.11764705882352941,
782
- "count": 17
783
  }
784
  }
785
  },
786
  "add_C3": {
787
  "full_accuracy": 0.0,
788
- "digit_accuracy": 0.15428571428571428,
789
- "n_examples": 50,
790
  "per_subtask": {
791
  "SA": {
792
- "accuracy": 0.2733333333333333,
793
- "count": 150
794
  },
795
  "SC": {
796
- "accuracy": 0.04,
797
- "count": 50
798
  },
799
  "UC": {
800
- "accuracy": 0.01,
801
- "count": 100
802
  },
803
  "US": {
804
- "accuracy": 0.2,
805
- "count": 50
806
  }
807
  }
808
  },
809
  "add_C4": {
810
  "full_accuracy": 0.0,
811
- "digit_accuracy": 0.18857142857142858,
812
- "n_examples": 50,
813
  "per_subtask": {
814
  "SA": {
815
- "accuracy": 0.38,
816
- "count": 100
817
  },
818
  "SC": {
819
- "accuracy": 0.06,
820
- "count": 50
821
  },
822
  "UC": {
823
- "accuracy": 0.030303030303030304,
824
- "count": 132
825
  },
826
  "US": {
827
- "accuracy": 0.3088235294117647,
828
- "count": 68
829
  }
830
  }
831
  },
832
  "add_C5": {
833
  "full_accuracy": 0.0,
834
- "digit_accuracy": 0.17142857142857143,
835
- "n_examples": 50,
836
  "per_subtask": {
837
  "SA": {
838
- "accuracy": 0.4,
839
- "count": 50
840
  },
841
  "SC": {
842
- "accuracy": 0.04,
843
- "count": 50
844
  },
845
  "UC": {
846
- "accuracy": 0.0547945205479452,
847
- "count": 146
848
  },
849
  "US": {
850
- "accuracy": 0.28846153846153844,
851
- "count": 104
852
  }
853
  }
854
  },
855
  "add_C6": {
856
  "full_accuracy": 0.0,
857
- "digit_accuracy": 0.21142857142857144,
858
- "n_examples": 50,
859
  "per_subtask": {
860
  "SC": {
861
- "accuracy": 0.08,
862
- "count": 50
863
  },
864
  "UC": {
865
- "accuracy": 0.09523809523809523,
866
- "count": 189
867
  },
868
  "US": {
869
- "accuracy": 0.46846846846846846,
870
- "count": 111
871
  }
872
  }
873
  },
874
  "sub_M0": {
875
  "full_accuracy": 0.0,
876
- "digit_accuracy": 0.3057142857142857,
877
- "n_examples": 50,
878
  "per_subtask": {
879
  "MD": {
880
- "accuracy": 0.19801980198019803,
881
- "count": 303
882
  },
883
  "ME": {
884
  "accuracy": 1.0,
885
- "count": 47
886
  }
887
  }
888
  },
889
  "sub_M1": {
890
  "full_accuracy": 0.0,
891
- "digit_accuracy": 0.26285714285714284,
892
- "n_examples": 50,
893
  "per_subtask": {
894
  "MD": {
895
- "accuracy": 0.3900709219858156,
896
- "count": 141
897
  },
898
  "MB": {
899
  "accuracy": 0.0,
900
- "count": 72
901
  },
902
  "ME": {
903
  "accuracy": 1.0,
904
- "count": 18
905
  },
906
  "UB": {
907
- "accuracy": 0.15966386554621848,
908
- "count": 119
909
  }
910
  }
911
  },
912
  "sub_M2": {
913
  "full_accuracy": 0.0,
914
- "digit_accuracy": 0.38285714285714284,
915
- "n_examples": 50,
916
  "per_subtask": {
917
  "MD": {
918
- "accuracy": 0.6428571428571429,
919
- "count": 112
920
  },
921
  "MB": {
922
  "accuracy": 0.0,
923
- "count": 53
924
  },
925
  "ME": {
926
  "accuracy": 1.0,
927
- "count": 47
928
  },
929
  "UB": {
930
- "accuracy": 0.17647058823529413,
931
- "count": 85
932
  },
933
  "UD": {
934
  "accuracy": 0.0,
935
- "count": 53
936
  }
937
  }
938
  },
939
  "sub_M3": {
940
  "full_accuracy": 0.0,
941
- "digit_accuracy": 0.28,
942
- "n_examples": 50,
943
  "per_subtask": {
944
  "MD": {
945
- "accuracy": 0.6494845360824743,
946
- "count": 97
947
  },
948
  "MB": {
949
  "accuracy": 0.0,
950
- "count": 51
951
  },
952
  "ME": {
953
  "accuracy": 1.0,
954
- "count": 27
955
  },
956
  "UB": {
957
- "accuracy": 0.10810810810810811,
958
- "count": 74
959
  },
960
  "UD": {
961
  "accuracy": 0.0,
962
- "count": 101
963
  }
964
  }
965
  },
966
  "sub_M4": {
967
  "full_accuracy": 0.0,
968
- "digit_accuracy": 0.21142857142857144,
969
- "n_examples": 50,
970
  "per_subtask": {
971
  "MD": {
972
  "accuracy": 0.5,
973
- "count": 100
974
  },
975
  "MB": {
976
  "accuracy": 0.0,
977
- "count": 50
978
  },
979
  "UB": {
980
- "accuracy": 0.48,
981
- "count": 50
982
  },
983
  "UD": {
984
  "accuracy": 0.0,
985
- "count": 150
986
  }
987
  }
988
  },
989
  "sub_M5": {
990
  "full_accuracy": 0.0,
991
- "digit_accuracy": 0.18285714285714286,
992
- "n_examples": 50,
993
  "per_subtask": {
994
  "MD": {
995
  "accuracy": 1.0,
996
- "count": 50
997
  },
998
  "MB": {
999
  "accuracy": 0.0,
1000
- "count": 50
1001
  },
1002
  "UB": {
1003
- "accuracy": 0.28,
1004
- "count": 50
1005
  },
1006
  "UD": {
1007
  "accuracy": 0.0,
1008
- "count": 200
1009
  }
1010
  }
1011
  },
@@ -1015,101 +1015,101 @@
1015
  "n_examples": 200,
1016
  "per_subtask": {
1017
  "MD": {
1018
- "accuracy": 0.37719298245614036,
1019
- "count": 570
1020
  },
1021
  "MB": {
1022
  "accuracy": 0.0,
1023
- "count": 277
1024
  },
1025
  "ME": {
1026
  "accuracy": 1.0,
1027
  "count": 53
1028
  },
1029
  "UB": {
1030
- "accuracy": 0.11677282377919321,
1031
- "count": 471
1032
  },
1033
  "UD": {
1034
  "accuracy": 0.0,
1035
- "count": 29
1036
  }
1037
  }
1038
  },
1039
  "sub_B3": {
1040
  "full_accuracy": 0.0,
1041
- "digit_accuracy": 0.18857142857142858,
1042
- "n_examples": 50,
1043
  "per_subtask": {
1044
  "MD": {
1045
  "accuracy": 0.3333333333333333,
1046
- "count": 150
1047
  },
1048
  "MB": {
1049
  "accuracy": 0.0,
1050
- "count": 50
1051
  },
1052
  "UB": {
1053
- "accuracy": 0.15841584158415842,
1054
- "count": 101
1055
  },
1056
  "UD": {
1057
  "accuracy": 0.0,
1058
- "count": 49
1059
  }
1060
  }
1061
  },
1062
  "sub_B4": {
1063
  "full_accuracy": 0.0,
1064
- "digit_accuracy": 0.17714285714285713,
1065
- "n_examples": 50,
1066
  "per_subtask": {
1067
  "MD": {
1068
  "accuracy": 0.5,
1069
- "count": 100
1070
  },
1071
  "MB": {
1072
  "accuracy": 0.0,
1073
- "count": 50
1074
  },
1075
  "UB": {
1076
- "accuracy": 0.09917355371900827,
1077
- "count": 121
1078
  },
1079
  "UD": {
1080
  "accuracy": 0.0,
1081
- "count": 79
1082
  }
1083
  }
1084
  },
1085
  "sub_B5": {
1086
  "full_accuracy": 0.0,
1087
- "digit_accuracy": 0.18,
1088
- "n_examples": 50,
1089
  "per_subtask": {
1090
  "MD": {
1091
  "accuracy": 1.0,
1092
- "count": 50
1093
  },
1094
  "MB": {
1095
  "accuracy": 0.0,
1096
- "count": 50
1097
  },
1098
  "UB": {
1099
- "accuracy": 0.08552631578947369,
1100
- "count": 152
1101
  },
1102
  "UD": {
1103
  "accuracy": 0.0,
1104
- "count": 98
1105
  }
1106
  }
1107
  }
1108
  },
1109
  "summary": {
1110
- "overall_accuracy": 0.002,
1111
- "digit_accuracy": 0.21466666666666667,
1112
- "total_examples": 1500,
1113
  "n_splits": 24
1114
  }
1115
  },
@@ -1119,569 +1119,569 @@
1119
  "K": 1,
1120
  "mode": "sorl",
1121
  "n_digits": 6,
1122
- "n_per_split": 50
1123
  },
1124
  "splits": {
1125
  "add_S0": {
1126
  "full_accuracy": 0.0,
1127
- "digit_accuracy": 0.2714285714285714,
1128
- "n_examples": 50,
1129
  "per_subtask": {
1130
  "SA": {
1131
- "accuracy": 0.22033898305084745,
1132
- "count": 295
1133
  },
1134
  "SS": {
1135
- "accuracy": 0.5454545454545454,
1136
- "count": 55
1137
  }
1138
  }
1139
  },
1140
  "add_S1": {
1141
  "full_accuracy": 0.0,
1142
- "digit_accuracy": 0.23714285714285716,
1143
- "n_examples": 50,
1144
  "per_subtask": {
1145
  "SA": {
1146
- "accuracy": 0.29365079365079366,
1147
- "count": 126
1148
  },
1149
  "SC": {
1150
- "accuracy": 0.189873417721519,
1151
- "count": 79
1152
  },
1153
  "SS": {
1154
- "accuracy": 0.5238095238095238,
1155
- "count": 21
1156
  },
1157
  "UC": {
1158
- "accuracy": 0.16129032258064516,
1159
- "count": 124
1160
  }
1161
  }
1162
  },
1163
  "add_S2": {
1164
- "full_accuracy": 0.0,
1165
- "digit_accuracy": 0.38285714285714284,
1166
- "n_examples": 50,
1167
  "per_subtask": {
1168
  "SA": {
1169
- "accuracy": 0.3333333333333333,
1170
- "count": 75
1171
  },
1172
  "SC": {
1173
- "accuracy": 0.25806451612903225,
1174
- "count": 62
1175
  },
1176
  "SS": {
1177
- "accuracy": 0.5128205128205128,
1178
- "count": 39
1179
  },
1180
  "UC": {
1181
- "accuracy": 0.2702702702702703,
1182
- "count": 111
1183
  },
1184
  "US": {
1185
- "accuracy": 0.6825396825396826,
1186
- "count": 63
1187
  }
1188
  }
1189
  },
1190
  "add_S3": {
1191
- "full_accuracy": 0.0,
1192
- "digit_accuracy": 0.38,
1193
- "n_examples": 50,
1194
  "per_subtask": {
1195
  "SA": {
1196
- "accuracy": 0.36666666666666664,
1197
- "count": 60
1198
  },
1199
  "SC": {
1200
- "accuracy": 0.22807017543859648,
1201
- "count": 57
1202
  },
1203
  "SS": {
1204
- "accuracy": 0.3157894736842105,
1205
- "count": 19
1206
  },
1207
  "UC": {
1208
- "accuracy": 0.2403846153846154,
1209
- "count": 104
1210
  },
1211
  "US": {
1212
- "accuracy": 0.6090909090909091,
1213
- "count": 110
1214
  }
1215
  }
1216
  },
1217
  "add_S4": {
1218
  "full_accuracy": 0.0,
1219
- "digit_accuracy": 0.36857142857142855,
1220
- "n_examples": 50,
1221
  "per_subtask": {
1222
  "SA": {
1223
- "accuracy": 0.25,
1224
- "count": 48
1225
  },
1226
  "SC": {
1227
- "accuracy": 0.11538461538461539,
1228
- "count": 52
1229
  },
1230
  "SS": {
1231
- "accuracy": 0.7142857142857143,
1232
- "count": 7
1233
  },
1234
  "UC": {
1235
- "accuracy": 0.24719101123595505,
1236
- "count": 89
1237
  },
1238
  "US": {
1239
- "accuracy": 0.5454545454545454,
1240
- "count": 154
1241
  }
1242
  }
1243
  },
1244
  "add_S5": {
1245
  "full_accuracy": 0.0,
1246
- "digit_accuracy": 0.26571428571428574,
1247
- "n_examples": 50,
1248
  "per_subtask": {
1249
  "SA": {
1250
- "accuracy": 0.54,
1251
- "count": 50
1252
  },
1253
  "SC": {
1254
- "accuracy": 0.16,
1255
- "count": 50
1256
  },
1257
  "UC": {
1258
- "accuracy": 0.12,
1259
- "count": 50
1260
  },
1261
  "US": {
1262
- "accuracy": 0.26,
1263
- "count": 200
1264
  }
1265
  }
1266
  },
1267
  "add_S6": {
1268
- "full_accuracy": 0.06,
1269
- "digit_accuracy": 0.07714285714285714,
1270
- "n_examples": 50,
1271
  "per_subtask": {
1272
  "SC": {
1273
- "accuracy": 0.06,
1274
- "count": 50
1275
  },
1276
  "UC": {
1277
- "accuracy": 0.08,
1278
- "count": 50
1279
  },
1280
  "US": {
1281
- "accuracy": 0.08,
1282
- "count": 250
1283
  }
1284
  }
1285
  },
1286
  "add_random": {
1287
  "full_accuracy": 0.0,
1288
- "digit_accuracy": 0.2507142857142857,
1289
  "n_examples": 200,
1290
  "per_subtask": {
1291
  "SA": {
1292
- "accuracy": 0.2505800464037123,
1293
- "count": 431
1294
  },
1295
  "SC": {
1296
- "accuracy": 0.18354430379746836,
1297
- "count": 316
1298
  },
1299
  "SS": {
1300
- "accuracy": 0.5641025641025641,
1301
- "count": 39
1302
  },
1303
  "UC": {
1304
- "accuracy": 0.21607142857142858,
1305
- "count": 560
1306
  },
1307
  "US": {
1308
- "accuracy": 0.7777777777777778,
1309
- "count": 54
1310
  }
1311
  }
1312
  },
1313
  "add_C1": {
1314
  "full_accuracy": 0.0,
1315
- "digit_accuracy": 0.23142857142857143,
1316
- "n_examples": 50,
1317
  "per_subtask": {
1318
  "SA": {
1319
- "accuracy": 0.288,
1320
- "count": 250
1321
  },
1322
  "SC": {
1323
- "accuracy": 0.12,
1324
- "count": 50
1325
  },
1326
  "UC": {
1327
- "accuracy": 0.06,
1328
- "count": 50
1329
  }
1330
  }
1331
  },
1332
  "add_C2": {
1333
  "full_accuracy": 0.0,
1334
- "digit_accuracy": 0.25142857142857145,
1335
- "n_examples": 50,
1336
  "per_subtask": {
1337
  "SA": {
1338
- "accuracy": 0.33,
1339
- "count": 200
1340
  },
1341
  "SC": {
1342
- "accuracy": 0.16,
1343
- "count": 50
1344
  },
1345
  "UC": {
1346
- "accuracy": 0.07228915662650602,
1347
- "count": 83
1348
  },
1349
  "US": {
1350
- "accuracy": 0.47058823529411764,
1351
- "count": 17
1352
  }
1353
  }
1354
  },
1355
  "add_C3": {
1356
  "full_accuracy": 0.0,
1357
- "digit_accuracy": 0.3142857142857143,
1358
- "n_examples": 50,
1359
  "per_subtask": {
1360
  "SA": {
1361
- "accuracy": 0.38,
1362
- "count": 150
1363
  },
1364
  "SC": {
1365
- "accuracy": 0.18,
1366
- "count": 50
1367
  },
1368
  "UC": {
1369
- "accuracy": 0.09,
1370
- "count": 100
1371
  },
1372
  "US": {
1373
- "accuracy": 0.7,
1374
- "count": 50
1375
  }
1376
  }
1377
  },
1378
  "add_C4": {
1379
  "full_accuracy": 0.0,
1380
- "digit_accuracy": 0.32857142857142857,
1381
- "n_examples": 50,
1382
  "per_subtask": {
1383
  "SA": {
1384
- "accuracy": 0.44,
1385
- "count": 100
1386
  },
1387
  "SC": {
1388
- "accuracy": 0.14,
1389
- "count": 50
1390
  },
1391
  "UC": {
1392
- "accuracy": 0.11363636363636363,
1393
- "count": 132
1394
  },
1395
  "US": {
1396
- "accuracy": 0.7205882352941176,
1397
- "count": 68
1398
  }
1399
  }
1400
  },
1401
  "add_C5": {
1402
  "full_accuracy": 0.0,
1403
- "digit_accuracy": 0.33714285714285713,
1404
- "n_examples": 50,
1405
  "per_subtask": {
1406
  "SA": {
1407
- "accuracy": 0.48,
1408
- "count": 50
1409
  },
1410
  "SC": {
1411
- "accuracy": 0.18,
1412
- "count": 50
1413
  },
1414
  "UC": {
1415
- "accuracy": 0.15753424657534246,
1416
- "count": 146
1417
  },
1418
  "US": {
1419
- "accuracy": 0.5961538461538461,
1420
- "count": 104
1421
  }
1422
  }
1423
  },
1424
  "add_C6": {
1425
  "full_accuracy": 0.0,
1426
- "digit_accuracy": 0.42857142857142855,
1427
- "n_examples": 50,
1428
  "per_subtask": {
1429
  "SC": {
1430
- "accuracy": 0.2,
1431
- "count": 50
1432
  },
1433
  "UC": {
1434
- "accuracy": 0.2804232804232804,
1435
- "count": 189
1436
  },
1437
  "US": {
1438
- "accuracy": 0.7837837837837838,
1439
- "count": 111
1440
  }
1441
  }
1442
  },
1443
  "sub_M0": {
1444
  "full_accuracy": 0.0,
1445
- "digit_accuracy": 0.34285714285714286,
1446
- "n_examples": 50,
1447
  "per_subtask": {
1448
  "MD": {
1449
- "accuracy": 0.24092409240924093,
1450
- "count": 303
1451
  },
1452
  "ME": {
1453
  "accuracy": 1.0,
1454
- "count": 47
1455
  }
1456
  }
1457
  },
1458
  "sub_M1": {
1459
  "full_accuracy": 0.0,
1460
- "digit_accuracy": 0.2914285714285714,
1461
- "n_examples": 50,
1462
  "per_subtask": {
1463
  "MD": {
1464
- "accuracy": 0.4326241134751773,
1465
- "count": 141
1466
  },
1467
  "MB": {
1468
- "accuracy": 0.027777777777777776,
1469
- "count": 72
1470
  },
1471
  "ME": {
1472
  "accuracy": 1.0,
1473
- "count": 18
1474
  },
1475
  "UB": {
1476
- "accuracy": 0.17647058823529413,
1477
- "count": 119
1478
  }
1479
  }
1480
  },
1481
  "sub_M2": {
1482
  "full_accuracy": 0.0,
1483
- "digit_accuracy": 0.4057142857142857,
1484
- "n_examples": 50,
1485
  "per_subtask": {
1486
  "MD": {
1487
- "accuracy": 0.6517857142857143,
1488
- "count": 112
1489
  },
1490
  "MB": {
1491
- "accuracy": 0.03773584905660377,
1492
- "count": 53
1493
  },
1494
  "ME": {
1495
  "accuracy": 1.0,
1496
- "count": 47
1497
  },
1498
  "UB": {
1499
- "accuracy": 0.2235294117647059,
1500
- "count": 85
1501
  },
1502
  "UD": {
1503
- "accuracy": 0.018867924528301886,
1504
- "count": 53
1505
  }
1506
  }
1507
  },
1508
  "sub_M3": {
1509
  "full_accuracy": 0.0,
1510
- "digit_accuracy": 0.3,
1511
- "n_examples": 50,
1512
  "per_subtask": {
1513
  "MD": {
1514
- "accuracy": 0.6597938144329897,
1515
- "count": 97
1516
  },
1517
  "MB": {
1518
- "accuracy": 0.0,
1519
- "count": 51
1520
  },
1521
  "ME": {
1522
  "accuracy": 1.0,
1523
- "count": 27
1524
  },
1525
  "UB": {
1526
- "accuracy": 0.16216216216216217,
1527
- "count": 74
1528
  },
1529
  "UD": {
1530
- "accuracy": 0.019801980198019802,
1531
- "count": 101
1532
  }
1533
  }
1534
  },
1535
  "sub_M4": {
1536
  "full_accuracy": 0.0,
1537
- "digit_accuracy": 0.2257142857142857,
1538
- "n_examples": 50,
1539
  "per_subtask": {
1540
  "MD": {
1541
  "accuracy": 0.54,
1542
- "count": 100
1543
  },
1544
  "MB": {
1545
- "accuracy": 0.0,
1546
- "count": 50
1547
  },
1548
  "UB": {
1549
- "accuracy": 0.5,
1550
- "count": 50
1551
  },
1552
  "UD": {
1553
  "accuracy": 0.0,
1554
- "count": 150
1555
  }
1556
  }
1557
  },
1558
  "sub_M5": {
1559
  "full_accuracy": 0.0,
1560
- "digit_accuracy": 0.19714285714285715,
1561
- "n_examples": 50,
1562
  "per_subtask": {
1563
  "MD": {
1564
  "accuracy": 1.0,
1565
- "count": 50
1566
  },
1567
  "MB": {
1568
  "accuracy": 0.0,
1569
- "count": 50
1570
  },
1571
  "UB": {
1572
- "accuracy": 0.38,
1573
- "count": 50
1574
  },
1575
  "UD": {
1576
  "accuracy": 0.0,
1577
- "count": 200
1578
  }
1579
  }
1580
  },
1581
  "sub_random": {
1582
  "full_accuracy": 0.0,
1583
- "digit_accuracy": 0.25857142857142856,
1584
  "n_examples": 200,
1585
  "per_subtask": {
1586
  "MD": {
1587
- "accuracy": 0.39473684210526316,
1588
- "count": 570
1589
  },
1590
  "MB": {
1591
- "accuracy": 0.032490974729241874,
1592
- "count": 277
1593
  },
1594
  "ME": {
1595
- "accuracy": 0.9622641509433962,
1596
  "count": 53
1597
  },
1598
  "UB": {
1599
- "accuracy": 0.1613588110403397,
1600
- "count": 471
1601
  },
1602
  "UD": {
1603
- "accuracy": 0.034482758620689655,
1604
- "count": 29
1605
  }
1606
  }
1607
  },
1608
  "sub_B3": {
1609
  "full_accuracy": 0.0,
1610
- "digit_accuracy": 0.21428571428571427,
1611
- "n_examples": 50,
1612
  "per_subtask": {
1613
  "MD": {
1614
- "accuracy": 0.35333333333333333,
1615
- "count": 150
1616
  },
1617
  "MB": {
1618
- "accuracy": 0.06,
1619
- "count": 50
1620
  },
1621
  "UB": {
1622
- "accuracy": 0.18811881188118812,
1623
- "count": 101
1624
  },
1625
  "UD": {
1626
  "accuracy": 0.0,
1627
- "count": 49
1628
  }
1629
  }
1630
  },
1631
  "sub_B4": {
1632
  "full_accuracy": 0.0,
1633
- "digit_accuracy": 0.2057142857142857,
1634
- "n_examples": 50,
1635
  "per_subtask": {
1636
  "MD": {
1637
- "accuracy": 0.54,
1638
- "count": 100
1639
  },
1640
  "MB": {
1641
- "accuracy": 0.04,
1642
- "count": 50
1643
  },
1644
  "UB": {
1645
- "accuracy": 0.1322314049586777,
1646
- "count": 121
1647
  },
1648
  "UD": {
1649
  "accuracy": 0.0,
1650
- "count": 79
1651
  }
1652
  }
1653
  },
1654
  "sub_B5": {
1655
  "full_accuracy": 0.0,
1656
- "digit_accuracy": 0.20285714285714285,
1657
- "n_examples": 50,
1658
  "per_subtask": {
1659
  "MD": {
1660
  "accuracy": 1.0,
1661
- "count": 50
1662
  },
1663
  "MB": {
1664
- "accuracy": 0.02,
1665
- "count": 50
1666
  },
1667
  "UB": {
1668
- "accuracy": 0.13157894736842105,
1669
- "count": 152
1670
  },
1671
  "UD": {
1672
  "accuracy": 0.0,
1673
- "count": 98
1674
  }
1675
  }
1676
  }
1677
  },
1678
  "summary": {
1679
- "overall_accuracy": 0.002,
1680
- "digit_accuracy": 0.2763809523809524,
1681
- "total_examples": 1500,
1682
  "n_splits": 24
1683
  }
1684
  },
1685
- "sorl_overall_accuracy": 0.00125,
1686
- "sft_overall_accuracy": 0.0016666666666666668
1687
  }
 
63
  3133
64
  ],
65
  "loss": [
66
+ 11.08719539642334,
67
+ 9.882888793945312,
68
+ 11.4771728515625,
69
+ 13.394209861755371,
70
+ 12.636463165283203,
71
+ 11.754125595092773,
72
+ 10.885461807250977,
73
+ 9.980756759643555,
74
+ 9.17142391204834,
75
+ 8.430733680725098,
76
+ 7.6542534828186035,
77
+ 7.293120384216309,
78
+ 6.862236499786377,
79
+ 6.354031562805176,
80
+ 6.001899719238281,
81
+ 5.606424331665039,
82
+ 5.264089584350586,
83
+ 4.9789347648620605,
84
+ 4.6479172706604,
85
+ 4.434387683868408,
86
+ 4.143214225769043,
87
+ 3.9974446296691895,
88
+ 3.7414889335632324,
89
+ 3.581064224243164,
90
+ 3.542850971221924,
91
+ 3.4644646644592285,
92
+ 3.242861270904541,
93
+ 3.382519245147705,
94
+ 3.218432903289795,
95
+ 3.201049327850342,
96
+ 2.9937853813171387,
97
+ 3.0674819946289062,
98
+ 3.1582376956939697,
99
+ 3.025876522064209,
100
+ 3.013111114501953,
101
+ 2.821368932723999,
102
+ 2.875208854675293,
103
+ 2.663424253463745,
104
+ 2.536762237548828,
105
+ 2.6094765663146973,
106
+ 2.504122734069824,
107
+ 2.469931125640869,
108
+ 2.3832502365112305,
109
+ 2.1861073970794678,
110
+ 1.937701940536499,
111
+ 1.7912967205047607,
112
+ 1.3576332330703735,
113
+ 1.3698662519454956,
114
+ 1.4696470499038696,
115
+ 1.02448308467865,
116
+ 0.7232930660247803,
117
+ 0.8976768255233765,
118
+ 0.8781657814979553,
119
+ 0.36818739771842957,
120
+ 0.5743820667266846,
121
+ 0.18251502513885498,
122
+ 0.3461122512817383,
123
+ 0.08204305171966553,
124
+ 0.28174304962158203,
125
+ 0.3735370934009552
126
  ],
127
  "base_loss": [
128
+ 11.855416297912598,
129
+ 11.582043647766113,
130
+ 10.906392097473145,
131
+ 10.266691207885742,
132
+ 9.720009803771973,
133
+ 9.272294998168945,
134
+ 8.666574478149414,
135
+ 8.154210090637207,
136
+ 7.616458415985107,
137
+ 6.998318195343018,
138
+ 6.393656253814697,
139
+ 6.0761284828186035,
140
+ 5.678290367126465,
141
+ 5.230454444885254,
142
+ 4.891925811767578,
143
+ 4.48134708404541,
144
+ 4.189845085144043,
145
+ 3.902498245239258,
146
+ 3.601158618927002,
147
+ 3.3346099853515625,
148
+ 3.106675386428833,
149
+ 2.981271266937256,
150
+ 2.795375347137451,
151
+ 2.646695375442505,
152
+ 2.5778772830963135,
153
+ 2.52484393119812,
154
+ 2.3839688301086426,
155
+ 2.4521782398223877,
156
+ 2.3377792835235596,
157
+ 2.3254542350769043,
158
+ 2.270014524459839,
159
+ 2.2316880226135254,
160
+ 2.2519335746765137,
161
+ 2.232677698135376,
162
+ 2.2708544731140137,
163
+ 2.127883195877075,
164
+ 2.219191312789917,
165
+ 2.1732959747314453,
166
+ 2.038266658782959,
167
+ 2.107933282852173,
168
+ 2.0626580715179443,
169
+ 2.143627166748047,
170
+ 2.118760585784912,
171
+ 2.1483101844787598,
172
+ 2.1393425464630127,
173
+ 2.1049411296844482,
174
+ 2.007964849472046,
175
+ 2.0863873958587646,
176
+ 2.121614933013916,
177
+ 2.1018025875091553,
178
+ 2.010502338409424,
179
+ 2.0667836666107178,
180
+ 2.0408687591552734,
181
+ 2.058290958404541,
182
+ 2.1121718883514404,
183
+ 2.0361921787261963,
184
+ 2.0869758129119873,
185
+ 2.087939977645874,
186
+ 2.131622076034546,
187
+ 2.101810932159424
188
  ],
189
  "info_loss": [
190
+ -0.8031711578369141,
191
+ -0.8466024398803711,
192
+ -0.5249614715576172,
193
+ -0.14881229400634766,
194
+ -0.07111930847167969,
195
+ -0.04557037353515625,
196
+ -0.018856048583984375,
197
+ -0.018548965454101562,
198
+ -0.01670694351196289,
199
+ -0.008439064025878906,
200
+ -0.014551639556884766,
201
+ -0.012627601623535156,
202
+ -0.011160850524902344,
203
+ -0.013494491577148438,
204
+ -0.012564659118652344,
205
+ -0.009298324584960938,
206
+ -0.012515544891357422,
207
+ -0.011241912841796875,
208
+ -0.013210296630859375,
209
+ -0.007313251495361328,
210
+ -0.012842655181884766,
211
+ -0.013678312301635742,
212
+ -0.02021336555480957,
213
+ -0.02120494842529297,
214
+ -0.01790618896484375,
215
+ -0.019744157791137695,
216
+ -0.028128623962402344,
217
+ -0.020328998565673828,
218
+ -0.025052547454833984,
219
+ -0.025846242904663086,
220
+ -0.0404818058013916,
221
+ -0.02934575080871582,
222
+ -0.022233009338378906,
223
+ -0.03363370895385742,
224
+ -0.038091421127319336,
225
+ -0.0435643196105957,
226
+ -0.04671955108642578,
227
+ -0.06309247016906738,
228
+ -0.06224644184112549,
229
+ -0.06188392639160156,
230
+ -0.06756675243377686,
231
+ -0.07924365997314453,
232
+ -0.08580851554870605,
233
+ -0.10824298858642578,
234
+ -0.1323099136352539,
235
+ -0.14376580715179443,
236
+ -0.17741644382476807,
237
+ -0.18436694145202637,
238
+ -0.17743921279907227,
239
+ -0.22029244899749756,
240
+ -0.24071002006530762,
241
+ -0.22940337657928467,
242
+ -0.2283191680908203,
243
+ -0.2808573246002197,
244
+ -0.26602745056152344,
245
+ -0.29790258407592773,
246
+ -0.28659355640411377,
247
+ -0.31294679641723633,
248
+ -0.2972530126571655,
249
+ -0.28541290760040283
250
  ],
251
  "abs_loss": [
252
+ 3.39985728263855,
253
+ 3.386993646621704,
254
+ 3.3492507934570312,
255
+ 3.2870309352874756,
256
+ 3.2153663635253906,
257
+ 3.1640560626983643,
258
+ 3.0829689502716064,
259
+ 2.9905052185058594,
260
+ 2.8724257946014404,
261
+ 2.830716848373413,
262
+ 2.7724239826202393,
263
+ 2.7584240436553955,
264
+ 2.753511667251587,
265
+ 2.706627130508423,
266
+ 2.728182554244995,
267
+ 2.7566611766815186,
268
+ 2.7176859378814697,
269
+ 2.729097604751587,
270
+ 2.7399768829345703,
271
+ 2.7603671550750732,
272
+ 2.7478103637695312,
273
+ 2.6929080486297607,
274
+ 2.698748826980591,
275
+ 2.7124502658843994,
276
+ 2.729304075241089,
277
+ 2.687528610229492,
278
+ 2.7349331378936768,
279
+ 2.7035627365112305,
280
+ 2.699801445007324,
281
+ 2.7429096698760986,
282
+ 2.7024919986724854,
283
+ 2.7189064025878906,
284
+ 2.72162127494812,
285
+ 2.742133140563965,
286
+ 2.687856674194336,
287
+ 2.7535006999969482,
288
+ 2.7046287059783936,
289
+ 2.689242362976074,
290
+ 2.695612668991089,
291
+ 2.6956450939178467,
292
+ 2.6737430095672607,
293
+ 2.688300371170044,
294
+ 2.730506181716919,
295
+ 2.705724000930786,
296
+ 2.7161991596221924,
297
+ 2.7383010387420654,
298
+ 2.731405019760132,
299
+ 2.762179136276245,
300
+ 2.7152442932128906,
301
+ 2.7449119091033936,
302
+ 2.6875953674316406,
303
+ 2.7378857135772705,
304
+ 2.691694974899292,
305
+ 2.6705048084259033,
306
+ 2.7100117206573486,
307
+ 2.737171173095703,
308
+ 2.7343177795410156,
309
+ 2.7189133167266846,
310
+ 2.710089921951294,
311
+ 2.7421464920043945
312
  ],
313
  "zipf_loss": [
314
+ 6.923504829406738,
315
+ 6.428170680999756,
316
+ 5.485470294952393,
317
+ 4.286938667297363,
318
+ 3.3061094284057617,
319
+ 2.621128797531128,
320
+ 2.099151134490967,
321
+ 1.7129861116409302,
322
+ 1.4347922801971436,
323
+ 1.2337348461151123,
324
+ 1.1288714408874512,
325
+ 1.0674259662628174,
326
+ 1.0202035903930664,
327
+ 0.9878593683242798,
328
+ 0.9628022909164429,
329
+ 0.9423943758010864,
330
+ 0.9276311993598938,
331
+ 0.9159458875656128,
332
+ 0.9048639535903931,
333
+ 0.896873414516449,
334
+ 0.8901845216751099,
335
+ 0.8836656212806702,
336
+ 0.8783724308013916,
337
+ 0.8751733303070068,
338
+ 0.8711051344871521,
339
+ 0.8683096170425415,
340
+ 0.8666852712631226,
341
+ 0.8632747530937195,
342
+ 0.861198902130127,
343
+ 0.8597666025161743,
344
+ 0.8583396673202515,
345
+ 0.8573608994483948,
346
+ 0.8564719557762146,
347
+ 0.8553227186203003,
348
+ 0.8543850183486938,
349
+ 0.8537788987159729,
350
+ 0.8527502417564392,
351
+ 0.8521287441253662,
352
+ 0.8513987064361572,
353
+ 0.8508179187774658,
354
+ 0.8497580289840698,
355
+ 0.8499106168746948,
356
+ 0.8495240807533264,
357
+ 0.8496546745300293,
358
+ 0.8498386144638062,
359
+ 0.8501836061477661,
360
+ 0.8506922721862793,
361
+ 0.8509303331375122,
362
+ 0.8508998155593872,
363
+ 0.8511137962341309,
364
+ 0.8511313796043396,
365
+ 0.8511383533477783,
366
+ 0.8513191938400269,
367
+ 0.85141921043396,
368
+ 0.8514835238456726,
369
+ 0.8516315221786499,
370
+ 0.8516402244567871,
371
+ 0.8516796827316284,
372
+ 0.8516421318054199,
373
+ 0.8516405820846558
374
  ],
375
  "denoise_loss": [],
376
  "ortho_loss": [
377
+ 0.6979529857635498,
378
+ 0.6298285126686096,
379
+ 0.4650823473930359,
380
+ 0.36667466163635254,
381
+ 0.32546141743659973,
382
+ 0.2925472557544708,
383
+ 0.31016528606414795,
384
+ 0.35216864943504333,
385
+ 0.35810449719429016,
386
+ 0.3601681888103485,
387
+ 0.363854318857193,
388
+ 0.3894629180431366,
389
+ 0.4043414294719696,
390
+ 0.4201587736606598,
391
+ 0.4274826645851135,
392
+ 0.4372390806674957,
393
+ 0.43081748485565186,
394
+ 0.42623206973075867,
395
+ 0.421860009431839,
396
+ 0.42481598258018494,
397
+ 0.41381338238716125,
398
+ 0.4135570526123047,
399
+ 0.41264602541923523,
400
+ 0.39332830905914307,
401
+ 0.4006246030330658,
402
+ 0.39508968591690063,
403
+ 0.38602590560913086,
404
+ 0.3827188014984131,
405
+ 0.3881952464580536,
406
+ 0.394728422164917,
407
+ 0.39518600702285767,
408
+ 0.3969350755214691,
409
+ 0.39978182315826416,
410
+ 0.4028274714946747,
411
+ 0.41307270526885986,
412
+ 0.4215640425682068,
413
+ 0.4269300401210785,
414
+ 0.42162781953811646,
415
+ 0.4252450466156006,
416
+ 0.44025519490242004,
417
+ 0.4450463354587555,
418
+ 0.4435034692287445,
419
+ 0.4542127251625061,
420
+ 0.46466323733329773,
421
+ 0.47896111011505127,
422
+ 0.4957045316696167,
423
+ 0.5050824284553528,
424
+ 0.5143278241157532,
425
+ 0.5217639207839966,
426
+ 0.5282729268074036,
427
+ 0.5332797169685364,
428
+ 0.5378868579864502,
429
+ 0.5416458249092102,
430
+ 0.5473858714103699,
431
+ 0.5503306984901428,
432
+ 0.5527967810630798,
433
+ 0.5556142330169678,
434
+ 0.5579712390899658,
435
+ 0.5602691173553467,
436
+ 0.5617437362670898
437
  ],
438
  "lr": [
439
  9.800000000000001e-06,
 
521
  3083
522
  ],
523
  "eval_accuracy": [
 
 
 
 
 
 
 
524
  0.02,
 
 
 
 
 
 
525
  0.02,
526
+ 0.02,
527
+ 0.02,
528
+ 0.02,
529
+ 0.02,
530
+ 0.02,
531
+ 0.02,
532
+ 0.02,
533
+ 0.02,
534
+ 0.02,
535
+ 0.02,
536
+ 0.02,
537
+ 0.02,
538
+ 0.02,
539
+ 0.02,
540
+ 0.02,
541
+ 0.02,
542
+ 0.02,
543
+ 0.02
544
  ]
545
  },
546
+ "final_accuracy": 0.001153846153846154,
547
  "sft_eval": {
548
  "config": {
549
  "ops": "add_sub",
550
  "K": null,
551
  "mode": "sft",
552
  "n_digits": 6,
553
+ "n_per_split": 100
554
  },
555
  "splits": {
556
  "add_S0": {
557
  "full_accuracy": 0.0,
558
+ "digit_accuracy": 0.2642857142857143,
559
+ "n_examples": 100,
560
  "per_subtask": {
561
  "SA": {
562
+ "accuracy": 0.19669421487603306,
563
+ "count": 605
564
  },
565
  "SS": {
566
+ "accuracy": 0.6947368421052632,
567
+ "count": 95
568
  }
569
  }
570
  },
571
  "add_S1": {
572
  "full_accuracy": 0.0,
573
+ "digit_accuracy": 0.15571428571428572,
574
+ "n_examples": 100,
575
  "per_subtask": {
576
  "SA": {
577
+ "accuracy": 0.22549019607843138,
578
+ "count": 204
579
  },
580
  "SC": {
581
+ "accuracy": 0.1301775147928994,
582
+ "count": 169
583
  },
584
  "SS": {
585
+ "accuracy": 0.5161290322580645,
586
+ "count": 31
587
  },
588
  "UC": {
589
+ "accuracy": 0.08445945945945946,
590
+ "count": 296
591
  }
592
  }
593
  },
594
  "add_S2": {
595
  "full_accuracy": 0.0,
596
+ "digit_accuracy": 0.29285714285714287,
597
+ "n_examples": 100,
598
  "per_subtask": {
599
  "SA": {
600
+ "accuracy": 0.32515337423312884,
601
+ "count": 163
602
  },
603
  "SC": {
604
+ "accuracy": 0.13076923076923078,
605
+ "count": 130
606
  },
607
  "SS": {
608
+ "accuracy": 0.47126436781609193,
609
+ "count": 87
610
  },
611
  "UC": {
612
+ "accuracy": 0.12315270935960591,
613
+ "count": 203
614
  },
615
  "US": {
616
+ "accuracy": 0.5897435897435898,
617
+ "count": 117
618
  }
619
  }
620
  },
621
  "add_S3": {
622
  "full_accuracy": 0.0,
623
+ "digit_accuracy": 0.31857142857142856,
624
+ "n_examples": 100,
625
  "per_subtask": {
626
  "SA": {
627
+ "accuracy": 0.36363636363636365,
628
+ "count": 121
629
  },
630
  "SC": {
631
+ "accuracy": 0.10743801652892562,
632
+ "count": 121
633
  },
634
  "SS": {
635
+ "accuracy": 0.40816326530612246,
636
+ "count": 49
637
  },
638
  "UC": {
639
+ "accuracy": 0.14516129032258066,
640
+ "count": 186
641
  },
642
  "US": {
643
+ "accuracy": 0.5336322869955157,
644
+ "count": 223
645
  }
646
  }
647
  },
648
  "add_S4": {
649
  "full_accuracy": 0.0,
650
+ "digit_accuracy": 0.3242857142857143,
651
+ "n_examples": 100,
652
  "per_subtask": {
653
  "SA": {
654
+ "accuracy": 0.41346153846153844,
655
+ "count": 104
656
  },
657
  "SC": {
658
+ "accuracy": 0.11320754716981132,
659
+ "count": 106
660
  },
661
  "SS": {
662
+ "accuracy": 0.5217391304347826,
663
+ "count": 23
664
  },
665
  "UC": {
666
+ "accuracy": 0.15625,
667
+ "count": 160
668
  },
669
  "US": {
670
+ "accuracy": 0.43973941368078173,
671
+ "count": 307
672
  }
673
  }
674
  },
675
  "add_S5": {
676
  "full_accuracy": 0.0,
677
+ "digit_accuracy": 0.37142857142857144,
678
+ "n_examples": 100,
679
  "per_subtask": {
680
  "SA": {
681
+ "accuracy": 0.42,
682
+ "count": 100
683
  },
684
  "SC": {
685
+ "accuracy": 0.15,
686
+ "count": 100
687
  },
688
  "UC": {
689
  "accuracy": 0.12,
690
+ "count": 100
691
  },
692
  "US": {
693
+ "accuracy": 0.4775,
694
+ "count": 400
695
  }
696
  }
697
  },
698
  "add_S6": {
699
+ "full_accuracy": 0.05,
700
+ "digit_accuracy": 0.37857142857142856,
701
+ "n_examples": 100,
702
  "per_subtask": {
703
  "SC": {
704
+ "accuracy": 0.09,
705
+ "count": 100
706
  },
707
  "UC": {
708
+ "accuracy": 0.21,
709
+ "count": 100
710
  },
711
  "US": {
712
+ "accuracy": 0.47,
713
+ "count": 500
714
  }
715
  }
716
  },
717
  "add_random": {
718
  "full_accuracy": 0.0,
719
+ "digit_accuracy": 0.1692857142857143,
720
  "n_examples": 200,
721
  "per_subtask": {
722
  "SA": {
723
+ "accuracy": 0.24161073825503357,
724
+ "count": 447
725
  },
726
  "SC": {
727
+ "accuracy": 0.096875,
728
+ "count": 320
729
  },
730
  "SS": {
731
+ "accuracy": 0.625,
732
+ "count": 56
733
  },
734
  "UC": {
735
+ "accuracy": 0.0888468809073724,
736
+ "count": 529
737
  },
738
  "US": {
739
+ "accuracy": 0.3333333333333333,
740
+ "count": 48
741
  }
742
  }
743
  },
744
  "add_C1": {
745
  "full_accuracy": 0.0,
746
+ "digit_accuracy": 0.13285714285714287,
747
+ "n_examples": 100,
748
  "per_subtask": {
749
  "SA": {
750
+ "accuracy": 0.178,
751
+ "count": 500
752
  },
753
  "SC": {
754
+ "accuracy": 0.04,
755
+ "count": 100
756
  },
757
  "UC": {
758
  "accuracy": 0.0,
759
+ "count": 100
760
  }
761
  }
762
  },
763
  "add_C2": {
764
  "full_accuracy": 0.0,
765
+ "digit_accuracy": 0.16,
766
+ "n_examples": 100,
767
  "per_subtask": {
768
  "SA": {
769
+ "accuracy": 0.22,
770
+ "count": 400
771
  },
772
  "SC": {
773
+ "accuracy": 0.07,
774
+ "count": 100
775
  },
776
  "UC": {
777
+ "accuracy": 0.02564102564102564,
778
+ "count": 156
779
  },
780
  "US": {
781
+ "accuracy": 0.29545454545454547,
782
+ "count": 44
783
  }
784
  }
785
  },
786
  "add_C3": {
787
  "full_accuracy": 0.0,
788
+ "digit_accuracy": 0.17142857142857143,
789
+ "n_examples": 100,
790
  "per_subtask": {
791
  "SA": {
792
+ "accuracy": 0.24333333333333335,
793
+ "count": 300
794
  },
795
  "SC": {
796
+ "accuracy": 0.06,
797
+ "count": 100
798
  },
799
  "UC": {
800
+ "accuracy": 0.04020100502512563,
801
+ "count": 199
802
  },
803
  "US": {
804
+ "accuracy": 0.32673267326732675,
805
+ "count": 101
806
  }
807
  }
808
  },
809
  "add_C4": {
810
  "full_accuracy": 0.0,
811
+ "digit_accuracy": 0.19285714285714287,
812
+ "n_examples": 100,
813
  "per_subtask": {
814
  "SA": {
815
+ "accuracy": 0.32,
816
+ "count": 200
817
  },
818
  "SC": {
819
+ "accuracy": 0.11,
820
+ "count": 100
821
  },
822
  "UC": {
823
+ "accuracy": 0.03409090909090909,
824
+ "count": 264
825
  },
826
  "US": {
827
+ "accuracy": 0.375,
828
+ "count": 136
829
  }
830
  }
831
  },
832
  "add_C5": {
833
  "full_accuracy": 0.0,
834
+ "digit_accuracy": 0.26571428571428574,
835
+ "n_examples": 100,
836
  "per_subtask": {
837
  "SA": {
838
+ "accuracy": 0.49,
839
+ "count": 100
840
  },
841
  "SC": {
842
+ "accuracy": 0.11,
843
+ "count": 100
844
  },
845
  "UC": {
846
+ "accuracy": 0.05806451612903226,
847
+ "count": 310
848
  },
849
  "US": {
850
+ "accuracy": 0.5684210526315789,
851
+ "count": 190
852
  }
853
  }
854
  },
855
  "add_C6": {
856
  "full_accuracy": 0.0,
857
+ "digit_accuracy": 0.24142857142857144,
858
+ "n_examples": 100,
859
  "per_subtask": {
860
  "SC": {
861
+ "accuracy": 0.11,
862
+ "count": 100
863
  },
864
  "UC": {
865
+ "accuracy": 0.0891891891891892,
866
+ "count": 370
867
  },
868
  "US": {
869
+ "accuracy": 0.5434782608695652,
870
+ "count": 230
871
  }
872
  }
873
  },
874
  "sub_M0": {
875
  "full_accuracy": 0.0,
876
+ "digit_accuracy": 0.29285714285714287,
877
+ "n_examples": 100,
878
  "per_subtask": {
879
  "MD": {
880
+ "accuracy": 0.1951219512195122,
881
+ "count": 615
882
  },
883
  "ME": {
884
  "accuracy": 1.0,
885
+ "count": 85
886
  }
887
  }
888
  },
889
  "sub_M1": {
890
  "full_accuracy": 0.0,
891
+ "digit_accuracy": 0.22428571428571428,
892
+ "n_examples": 100,
893
  "per_subtask": {
894
  "MD": {
895
+ "accuracy": 0.3698630136986301,
896
+ "count": 292
897
  },
898
  "MB": {
899
  "accuracy": 0.0,
900
+ "count": 144
901
  },
902
  "ME": {
903
  "accuracy": 1.0,
904
+ "count": 25
905
  },
906
  "UB": {
907
+ "accuracy": 0.100418410041841,
908
+ "count": 239
909
  }
910
  }
911
  },
912
  "sub_M2": {
913
  "full_accuracy": 0.0,
914
+ "digit_accuracy": 0.35428571428571426,
915
+ "n_examples": 100,
916
  "per_subtask": {
917
  "MD": {
918
+ "accuracy": 0.6208530805687204,
919
+ "count": 211
920
  },
921
  "MB": {
922
  "accuracy": 0.0,
923
+ "count": 115
924
  },
925
  "ME": {
926
  "accuracy": 1.0,
927
+ "count": 85
928
  },
929
  "UB": {
930
+ "accuracy": 0.17679558011049723,
931
+ "count": 181
932
  },
933
  "UD": {
934
  "accuracy": 0.0,
935
+ "count": 108
936
  }
937
  }
938
  },
939
  "sub_M3": {
940
  "full_accuracy": 0.0,
941
+ "digit_accuracy": 0.3,
942
+ "n_examples": 100,
943
  "per_subtask": {
944
  "MD": {
945
+ "accuracy": 0.7597765363128491,
946
+ "count": 179
947
  },
948
  "MB": {
949
  "accuracy": 0.0,
950
+ "count": 103
951
  },
952
  "ME": {
953
  "accuracy": 1.0,
954
+ "count": 56
955
  },
956
  "UB": {
957
+ "accuracy": 0.12080536912751678,
958
+ "count": 149
959
  },
960
  "UD": {
961
  "accuracy": 0.0,
962
+ "count": 213
963
  }
964
  }
965
  },
966
  "sub_M4": {
967
  "full_accuracy": 0.0,
968
+ "digit_accuracy": 0.18571428571428572,
969
+ "n_examples": 100,
970
  "per_subtask": {
971
  "MD": {
972
  "accuracy": 0.5,
973
+ "count": 200
974
  },
975
  "MB": {
976
  "accuracy": 0.0,
977
+ "count": 100
978
  },
979
  "UB": {
980
+ "accuracy": 0.3,
981
+ "count": 100
982
  },
983
  "UD": {
984
  "accuracy": 0.0,
985
+ "count": 300
986
  }
987
  }
988
  },
989
  "sub_M5": {
990
  "full_accuracy": 0.0,
991
+ "digit_accuracy": 0.18714285714285714,
992
+ "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
995
  "accuracy": 1.0,
996
+ "count": 100
997
  },
998
  "MB": {
999
  "accuracy": 0.0,
1000
+ "count": 100
1001
  },
1002
  "UB": {
1003
+ "accuracy": 0.31,
1004
+ "count": 100
1005
  },
1006
  "UD": {
1007
  "accuracy": 0.0,
1008
+ "count": 400
1009
  }
1010
  }
1011
  },
 
1015
  "n_examples": 200,
1016
  "per_subtask": {
1017
  "MD": {
1018
+ "accuracy": 0.3616666666666667,
1019
+ "count": 600
1020
  },
1021
  "MB": {
1022
  "accuracy": 0.0,
1023
+ "count": 267
1024
  },
1025
  "ME": {
1026
  "accuracy": 1.0,
1027
  "count": 53
1028
  },
1029
  "UB": {
1030
+ "accuracy": 0.12072892938496584,
1031
+ "count": 439
1032
  },
1033
  "UD": {
1034
  "accuracy": 0.0,
1035
+ "count": 41
1036
  }
1037
  }
1038
  },
1039
  "sub_B3": {
1040
  "full_accuracy": 0.0,
1041
+ "digit_accuracy": 0.19285714285714287,
1042
+ "n_examples": 100,
1043
  "per_subtask": {
1044
  "MD": {
1045
  "accuracy": 0.3333333333333333,
1046
+ "count": 300
1047
  },
1048
  "MB": {
1049
  "accuracy": 0.0,
1050
+ "count": 100
1051
  },
1052
  "UB": {
1053
+ "accuracy": 0.17766497461928935,
1054
+ "count": 197
1055
  },
1056
  "UD": {
1057
  "accuracy": 0.0,
1058
+ "count": 103
1059
  }
1060
  }
1061
  },
1062
  "sub_B4": {
1063
  "full_accuracy": 0.0,
1064
+ "digit_accuracy": 0.19428571428571428,
1065
+ "n_examples": 100,
1066
  "per_subtask": {
1067
  "MD": {
1068
  "accuracy": 0.5,
1069
+ "count": 200
1070
  },
1071
  "MB": {
1072
  "accuracy": 0.0,
1073
+ "count": 100
1074
  },
1075
  "UB": {
1076
+ "accuracy": 0.145748987854251,
1077
+ "count": 247
1078
  },
1079
  "UD": {
1080
  "accuracy": 0.0,
1081
+ "count": 153
1082
  }
1083
  }
1084
  },
1085
  "sub_B5": {
1086
  "full_accuracy": 0.0,
1087
+ "digit_accuracy": 0.19,
1088
+ "n_examples": 100,
1089
  "per_subtask": {
1090
  "MD": {
1091
  "accuracy": 1.0,
1092
+ "count": 100
1093
  },
1094
  "MB": {
1095
  "accuracy": 0.0,
1096
+ "count": 100
1097
  },
1098
  "UB": {
1099
+ "accuracy": 0.11073825503355705,
1100
+ "count": 298
1101
  },
1102
  "UD": {
1103
  "accuracy": 0.0,
1104
+ "count": 202
1105
  }
1106
  }
1107
  }
1108
  },
1109
  "summary": {
1110
+ "overall_accuracy": 0.0019230769230769232,
1111
+ "digit_accuracy": 0.23807692307692307,
1112
+ "total_examples": 2600,
1113
  "n_splits": 24
1114
  }
1115
  },
 
1119
  "K": 1,
1120
  "mode": "sorl",
1121
  "n_digits": 6,
1122
+ "n_per_split": 100
1123
  },
1124
  "splits": {
1125
  "add_S0": {
1126
  "full_accuracy": 0.0,
1127
+ "digit_accuracy": 0.38,
1128
+ "n_examples": 100,
1129
  "per_subtask": {
1130
  "SA": {
1131
+ "accuracy": 0.3239669421487603,
1132
+ "count": 605
1133
  },
1134
  "SS": {
1135
+ "accuracy": 0.7368421052631579,
1136
+ "count": 95
1137
  }
1138
  }
1139
  },
1140
  "add_S1": {
1141
  "full_accuracy": 0.0,
1142
+ "digit_accuracy": 0.31,
1143
+ "n_examples": 100,
1144
  "per_subtask": {
1145
  "SA": {
1146
+ "accuracy": 0.35294117647058826,
1147
+ "count": 204
1148
  },
1149
  "SC": {
1150
+ "accuracy": 0.28402366863905326,
1151
+ "count": 169
1152
  },
1153
  "SS": {
1154
+ "accuracy": 0.4838709677419355,
1155
+ "count": 31
1156
  },
1157
  "UC": {
1158
+ "accuracy": 0.27702702702702703,
1159
+ "count": 296
1160
  }
1161
  }
1162
  },
1163
  "add_S2": {
1164
+ "full_accuracy": 0.01,
1165
+ "digit_accuracy": 0.39,
1166
+ "n_examples": 100,
1167
  "per_subtask": {
1168
  "SA": {
1169
+ "accuracy": 0.4294478527607362,
1170
+ "count": 163
1171
  },
1172
  "SC": {
1173
+ "accuracy": 0.34615384615384615,
1174
+ "count": 130
1175
  },
1176
  "SS": {
1177
+ "accuracy": 0.45977011494252873,
1178
+ "count": 87
1179
  },
1180
  "UC": {
1181
+ "accuracy": 0.28078817733990147,
1182
+ "count": 203
1183
  },
1184
  "US": {
1185
+ "accuracy": 0.5213675213675214,
1186
+ "count": 117
1187
  }
1188
  }
1189
  },
1190
  "add_S3": {
1191
+ "full_accuracy": 0.02,
1192
+ "digit_accuracy": 0.37285714285714283,
1193
+ "n_examples": 100,
1194
  "per_subtask": {
1195
  "SA": {
1196
+ "accuracy": 0.45454545454545453,
1197
+ "count": 121
1198
  },
1199
  "SC": {
1200
+ "accuracy": 0.2892561983471074,
1201
+ "count": 121
1202
  },
1203
  "SS": {
1204
+ "accuracy": 0.46938775510204084,
1205
+ "count": 49
1206
  },
1207
  "UC": {
1208
+ "accuracy": 0.24193548387096775,
1209
+ "count": 186
1210
  },
1211
  "US": {
1212
+ "accuracy": 0.4618834080717489,
1213
+ "count": 223
1214
  }
1215
  }
1216
  },
1217
  "add_S4": {
1218
  "full_accuracy": 0.0,
1219
+ "digit_accuracy": 0.35714285714285715,
1220
+ "n_examples": 100,
1221
  "per_subtask": {
1222
  "SA": {
1223
+ "accuracy": 0.49038461538461536,
1224
+ "count": 104
1225
  },
1226
  "SC": {
1227
+ "accuracy": 0.2358490566037736,
1228
+ "count": 106
1229
  },
1230
  "SS": {
1231
+ "accuracy": 0.43478260869565216,
1232
+ "count": 23
1233
  },
1234
  "UC": {
1235
+ "accuracy": 0.26875,
1236
+ "count": 160
1237
  },
1238
  "US": {
1239
+ "accuracy": 0.3941368078175896,
1240
+ "count": 307
1241
  }
1242
  }
1243
  },
1244
  "add_S5": {
1245
  "full_accuracy": 0.0,
1246
+ "digit_accuracy": 0.2357142857142857,
1247
+ "n_examples": 100,
1248
  "per_subtask": {
1249
  "SA": {
1250
+ "accuracy": 0.46,
1251
+ "count": 100
1252
  },
1253
  "SC": {
1254
+ "accuracy": 0.26,
1255
+ "count": 100
1256
  },
1257
  "UC": {
1258
+ "accuracy": 0.09,
1259
+ "count": 100
1260
  },
1261
  "US": {
1262
+ "accuracy": 0.21,
1263
+ "count": 400
1264
  }
1265
  }
1266
  },
1267
  "add_S6": {
1268
+ "full_accuracy": 0.0,
1269
+ "digit_accuracy": 0.008571428571428572,
1270
+ "n_examples": 100,
1271
  "per_subtask": {
1272
  "SC": {
1273
+ "accuracy": 0.0,
1274
+ "count": 100
1275
  },
1276
  "UC": {
1277
+ "accuracy": 0.01,
1278
+ "count": 100
1279
  },
1280
  "US": {
1281
+ "accuracy": 0.01,
1282
+ "count": 500
1283
  }
1284
  }
1285
  },
1286
  "add_random": {
1287
  "full_accuracy": 0.0,
1288
+ "digit_accuracy": 0.3192857142857143,
1289
  "n_examples": 200,
1290
  "per_subtask": {
1291
  "SA": {
1292
+ "accuracy": 0.33557046979865773,
1293
+ "count": 447
1294
  },
1295
  "SC": {
1296
+ "accuracy": 0.284375,
1297
+ "count": 320
1298
  },
1299
  "SS": {
1300
+ "accuracy": 0.5178571428571429,
1301
+ "count": 56
1302
  },
1303
  "UC": {
1304
+ "accuracy": 0.2835538752362949,
1305
+ "count": 529
1306
  },
1307
  "US": {
1308
+ "accuracy": 0.5625,
1309
+ "count": 48
1310
  }
1311
  }
1312
  },
1313
  "add_C1": {
1314
  "full_accuracy": 0.0,
1315
+ "digit_accuracy": 0.3585714285714286,
1316
+ "n_examples": 100,
1317
  "per_subtask": {
1318
  "SA": {
1319
+ "accuracy": 0.408,
1320
+ "count": 500
1321
  },
1322
  "SC": {
1323
+ "accuracy": 0.3,
1324
+ "count": 100
1325
  },
1326
  "UC": {
1327
+ "accuracy": 0.17,
1328
+ "count": 100
1329
  }
1330
  }
1331
  },
1332
  "add_C2": {
1333
  "full_accuracy": 0.0,
1334
+ "digit_accuracy": 0.33285714285714285,
1335
+ "n_examples": 100,
1336
  "per_subtask": {
1337
  "SA": {
1338
+ "accuracy": 0.3925,
1339
+ "count": 400
1340
  },
1341
  "SC": {
1342
+ "accuracy": 0.3,
1343
+ "count": 100
1344
  },
1345
  "UC": {
1346
+ "accuracy": 0.09615384615384616,
1347
+ "count": 156
1348
  },
1349
  "US": {
1350
+ "accuracy": 0.7045454545454546,
1351
+ "count": 44
1352
  }
1353
  }
1354
  },
1355
  "add_C3": {
1356
  "full_accuracy": 0.0,
1357
+ "digit_accuracy": 0.36,
1358
+ "n_examples": 100,
1359
  "per_subtask": {
1360
  "SA": {
1361
+ "accuracy": 0.41333333333333333,
1362
+ "count": 300
1363
  },
1364
  "SC": {
1365
+ "accuracy": 0.38,
1366
+ "count": 100
1367
  },
1368
  "UC": {
1369
+ "accuracy": 0.135678391959799,
1370
+ "count": 199
1371
  },
1372
  "US": {
1373
+ "accuracy": 0.6237623762376238,
1374
+ "count": 101
1375
  }
1376
  }
1377
  },
1378
  "add_C4": {
1379
  "full_accuracy": 0.0,
1380
+ "digit_accuracy": 0.35428571428571426,
1381
+ "n_examples": 100,
1382
  "per_subtask": {
1383
  "SA": {
1384
+ "accuracy": 0.46,
1385
+ "count": 200
1386
  },
1387
  "SC": {
1388
+ "accuracy": 0.33,
1389
+ "count": 100
1390
  },
1391
  "UC": {
1392
+ "accuracy": 0.13636363636363635,
1393
+ "count": 264
1394
  },
1395
  "US": {
1396
+ "accuracy": 0.6397058823529411,
1397
+ "count": 136
1398
  }
1399
  }
1400
  },
1401
  "add_C5": {
1402
  "full_accuracy": 0.0,
1403
+ "digit_accuracy": 0.37857142857142856,
1404
+ "n_examples": 100,
1405
  "per_subtask": {
1406
  "SA": {
1407
+ "accuracy": 0.56,
1408
+ "count": 100
1409
  },
1410
  "SC": {
1411
+ "accuracy": 0.28,
1412
+ "count": 100
1413
  },
1414
  "UC": {
1415
+ "accuracy": 0.16451612903225807,
1416
+ "count": 310
1417
  },
1418
  "US": {
1419
+ "accuracy": 0.6842105263157895,
1420
+ "count": 190
1421
  }
1422
  }
1423
  },
1424
  "add_C6": {
1425
  "full_accuracy": 0.0,
1426
+ "digit_accuracy": 0.3842857142857143,
1427
+ "n_examples": 100,
1428
  "per_subtask": {
1429
  "SC": {
1430
+ "accuracy": 0.18,
1431
+ "count": 100
1432
  },
1433
  "UC": {
1434
+ "accuracy": 0.25135135135135134,
1435
+ "count": 370
1436
  },
1437
  "US": {
1438
+ "accuracy": 0.6869565217391305,
1439
+ "count": 230
1440
  }
1441
  }
1442
  },
1443
  "sub_M0": {
1444
  "full_accuracy": 0.0,
1445
+ "digit_accuracy": 0.3314285714285714,
1446
+ "n_examples": 100,
1447
  "per_subtask": {
1448
  "MD": {
1449
+ "accuracy": 0.23902439024390243,
1450
+ "count": 615
1451
  },
1452
  "ME": {
1453
  "accuracy": 1.0,
1454
+ "count": 85
1455
  }
1456
  }
1457
  },
1458
  "sub_M1": {
1459
  "full_accuracy": 0.0,
1460
+ "digit_accuracy": 0.2571428571428571,
1461
+ "n_examples": 100,
1462
  "per_subtask": {
1463
  "MD": {
1464
+ "accuracy": 0.4006849315068493,
1465
+ "count": 292
1466
  },
1467
  "MB": {
1468
+ "accuracy": 0.034722222222222224,
1469
+ "count": 144
1470
  },
1471
  "ME": {
1472
  "accuracy": 1.0,
1473
+ "count": 25
1474
  },
1475
  "UB": {
1476
+ "accuracy": 0.13807531380753138,
1477
+ "count": 239
1478
  }
1479
  }
1480
  },
1481
  "sub_M2": {
1482
  "full_accuracy": 0.0,
1483
+ "digit_accuracy": 0.39,
1484
+ "n_examples": 100,
1485
  "per_subtask": {
1486
  "MD": {
1487
+ "accuracy": 0.6445497630331753,
1488
+ "count": 211
1489
  },
1490
  "MB": {
1491
+ "accuracy": 0.02608695652173913,
1492
+ "count": 115
1493
  },
1494
  "ME": {
1495
  "accuracy": 1.0,
1496
+ "count": 85
1497
  },
1498
  "UB": {
1499
+ "accuracy": 0.26519337016574585,
1500
+ "count": 181
1501
  },
1502
  "UD": {
1503
+ "accuracy": 0.009259259259259259,
1504
+ "count": 108
1505
  }
1506
  }
1507
  },
1508
  "sub_M3": {
1509
  "full_accuracy": 0.0,
1510
+ "digit_accuracy": 0.31285714285714283,
1511
+ "n_examples": 100,
1512
  "per_subtask": {
1513
  "MD": {
1514
+ "accuracy": 0.7821229050279329,
1515
+ "count": 179
1516
  },
1517
  "MB": {
1518
+ "accuracy": 0.009708737864077669,
1519
+ "count": 103
1520
  },
1521
  "ME": {
1522
  "accuracy": 1.0,
1523
+ "count": 56
1524
  },
1525
  "UB": {
1526
+ "accuracy": 0.1476510067114094,
1527
+ "count": 149
1528
  },
1529
  "UD": {
1530
+ "accuracy": 0.0,
1531
+ "count": 213
1532
  }
1533
  }
1534
  },
1535
  "sub_M4": {
1536
  "full_accuracy": 0.0,
1537
+ "digit_accuracy": 0.21428571428571427,
1538
+ "n_examples": 100,
1539
  "per_subtask": {
1540
  "MD": {
1541
  "accuracy": 0.54,
1542
+ "count": 200
1543
  },
1544
  "MB": {
1545
+ "accuracy": 0.06,
1546
+ "count": 100
1547
  },
1548
  "UB": {
1549
+ "accuracy": 0.36,
1550
+ "count": 100
1551
  },
1552
  "UD": {
1553
  "accuracy": 0.0,
1554
+ "count": 300
1555
  }
1556
  }
1557
  },
1558
  "sub_M5": {
1559
  "full_accuracy": 0.0,
1560
+ "digit_accuracy": 0.18857142857142858,
1561
+ "n_examples": 100,
1562
  "per_subtask": {
1563
  "MD": {
1564
  "accuracy": 1.0,
1565
+ "count": 100
1566
  },
1567
  "MB": {
1568
  "accuracy": 0.0,
1569
+ "count": 100
1570
  },
1571
  "UB": {
1572
+ "accuracy": 0.32,
1573
+ "count": 100
1574
  },
1575
  "UD": {
1576
  "accuracy": 0.0,
1577
+ "count": 400
1578
  }
1579
  }
1580
  },
1581
  "sub_random": {
1582
  "full_accuracy": 0.0,
1583
+ "digit_accuracy": 0.2621428571428571,
1584
  "n_examples": 200,
1585
  "per_subtask": {
1586
  "MD": {
1587
+ "accuracy": 0.37833333333333335,
1588
+ "count": 600
1589
  },
1590
  "MB": {
1591
+ "accuracy": 0.03745318352059925,
1592
+ "count": 267
1593
  },
1594
  "ME": {
1595
+ "accuracy": 1.0,
1596
  "count": 53
1597
  },
1598
  "UB": {
1599
+ "accuracy": 0.17539863325740318,
1600
+ "count": 439
1601
  },
1602
  "UD": {
1603
+ "accuracy": 0.0,
1604
+ "count": 41
1605
  }
1606
  }
1607
  },
1608
  "sub_B3": {
1609
  "full_accuracy": 0.0,
1610
+ "digit_accuracy": 0.22857142857142856,
1611
+ "n_examples": 100,
1612
  "per_subtask": {
1613
  "MD": {
1614
+ "accuracy": 0.36333333333333334,
1615
+ "count": 300
1616
  },
1617
  "MB": {
1618
+ "accuracy": 0.05,
1619
+ "count": 100
1620
  },
1621
  "UB": {
1622
+ "accuracy": 0.233502538071066,
1623
+ "count": 197
1624
  },
1625
  "UD": {
1626
  "accuracy": 0.0,
1627
+ "count": 103
1628
  }
1629
  }
1630
  },
1631
  "sub_B4": {
1632
  "full_accuracy": 0.0,
1633
+ "digit_accuracy": 0.21,
1634
+ "n_examples": 100,
1635
  "per_subtask": {
1636
  "MD": {
1637
+ "accuracy": 0.525,
1638
+ "count": 200
1639
  },
1640
  "MB": {
1641
+ "accuracy": 0.01,
1642
+ "count": 100
1643
  },
1644
  "UB": {
1645
+ "accuracy": 0.1659919028340081,
1646
+ "count": 247
1647
  },
1648
  "UD": {
1649
  "accuracy": 0.0,
1650
+ "count": 153
1651
  }
1652
  }
1653
  },
1654
  "sub_B5": {
1655
  "full_accuracy": 0.0,
1656
+ "digit_accuracy": 0.19285714285714287,
1657
+ "n_examples": 100,
1658
  "per_subtask": {
1659
  "MD": {
1660
  "accuracy": 1.0,
1661
+ "count": 100
1662
  },
1663
  "MB": {
1664
+ "accuracy": 0.0,
1665
+ "count": 100
1666
  },
1667
  "UB": {
1668
+ "accuracy": 0.1174496644295302,
1669
+ "count": 298
1670
  },
1671
  "UD": {
1672
  "accuracy": 0.0,
1673
+ "count": 202
1674
  }
1675
  }
1676
  }
1677
  },
1678
  "summary": {
1679
+ "overall_accuracy": 0.001153846153846154,
1680
+ "digit_accuracy": 0.29637362637362635,
1681
+ "total_examples": 2600,
1682
  "n_splits": 24
1683
  }
1684
  },
1685
+ "sorl_overall_accuracy": 0.001153846153846154,
1686
+ "sft_overall_accuracy": 0.0019230769230769232
1687
  }
add_sub_sorl_v1_abs30_K1_10K_2L1H128d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6472ccf29af6fd4a4dbd18dfd61ef27a518076dbed42e6ade264b2a1ae69f11
3
  size 157722580
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25d4c381a7edfa9205467f1839da87f38a3c53638694a57a1ec4181fe5dc9334
3
  size 157722580
add_sub_sorl_v1_abs30_K1_10K_2L1H128d/train_config.json CHANGED
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 39353984,
71
  "run_name": "add_sub_sorl_v1_abs30_K1_10K_2L1H128d",
72
- "git_commit": "17e935f460a7f9595b705c1d614101a6b0e520f7",
73
- "timestamp": "2026-04-14T03:28:47.578176+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "evjie1zh",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/evjie1zh",
81
- "final_accuracy": 0.00125,
82
- "sft_accuracy": 0.0016666666666666668,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
69
  "no_wandb": false,
70
  "n_params": 39353984,
71
  "run_name": "add_sub_sorl_v1_abs30_K1_10K_2L1H128d",
72
+ "git_commit": "1d5a160e16a5070d61b881494e832aa88149b15c",
73
+ "timestamp": "2026-04-15T07:41:24.020201+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_10K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "v1",
80
+ "wandb_run_id": "stx7wi4h",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/stx7wi4h",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.001153846153846154,
86
+ "sft_accuracy": 0.0019230769230769232,
87
  "eval_method": "ArithmeticEvaluator"
88
  }