amirali1985 commited on
Commit
44ff9e8
·
verified ·
1 Parent(s): 11cfda9

Upload add_sub_sorl_v1_abs10_K1_10K

Browse files
add_sub_sorl_v1_abs10_K1_10K/metrics.json CHANGED
@@ -63,377 +63,377 @@
63
  3133
64
  ],
65
  "loss": [
66
- 9.340689659118652,
67
- 4.987771987915039,
68
- 3.1618072986602783,
69
- 2.856262445449829,
70
- 2.5268046855926514,
71
- 2.1091880798339844,
72
- -0.44331324100494385,
73
- -4.092588424682617,
74
- -4.768612384796143,
75
- -3.7999463081359863,
76
- -1.6939505338668823,
77
- -2.5403966903686523,
78
- -2.0438146591186523,
79
- -1.7041206359863281,
80
- -1.5684587955474854,
81
- -1.9102389812469482,
82
- -1.880913257598877,
83
- -1.9313758611679077,
84
- -1.8197805881500244,
85
- -1.7383019924163818,
86
- -2.4177093505859375,
87
- -1.3264975547790527,
88
- -1.2404942512512207,
89
- -1.5992217063903809,
90
- -1.9741289615631104,
91
- -1.0616717338562012,
92
- -1.5642304420471191,
93
- -1.3654898405075073,
94
- -1.3151346445083618,
95
- -0.7584792971611023,
96
- -1.3367366790771484,
97
- -1.2703979015350342,
98
- -1.0298550128936768,
99
- -1.6327556371688843,
100
- -1.734535813331604,
101
- -1.002307653427124,
102
- -0.9018746018409729,
103
- -2.051492214202881,
104
- -0.6267629861831665,
105
- -0.9340898990631104,
106
- -0.7012732028961182,
107
- -0.9501388669013977,
108
- -0.9147610664367676,
109
- -0.5015538930892944,
110
- -1.2906075716018677,
111
- -0.9741727113723755,
112
- -0.7373249530792236,
113
- -0.8238300085067749,
114
- -0.48194777965545654,
115
- -0.7015870213508606,
116
- -0.4147356152534485,
117
- -0.649371325969696,
118
- -0.7445440292358398,
119
- -0.7792887687683105,
120
- -0.3351507782936096,
121
- -0.4870011508464813,
122
- -0.2853846549987793,
123
- -0.4475783109664917,
124
- -0.42194119095802307,
125
- -0.4129505455493927
126
  ],
127
  "base_loss": [
128
- 7.390318870544434,
129
- 3.826241970062256,
130
- 1.9951651096343994,
131
- 1.8394466638565063,
132
- 1.8613300323486328,
133
- 1.835070252418518,
134
- 1.8304622173309326,
135
- 1.7413225173950195,
136
- 1.6582175493240356,
137
- 1.3251668214797974,
138
- 0.9685406684875488,
139
- 0.9228308796882629,
140
- 0.7192235589027405,
141
- 0.5874363780021667,
142
- 0.5402608513832092,
143
- 0.5179902911186218,
144
- 0.47875410318374634,
145
- 0.4619639217853546,
146
- 0.39008235931396484,
147
- 0.43836668133735657,
148
- 0.48746830224990845,
149
- 0.2996414601802826,
150
- 0.26576855778694153,
151
- 0.3262496590614319,
152
- 0.33699464797973633,
153
- 0.24052418768405914,
154
- 0.2867306172847748,
155
- 0.23591551184654236,
156
- 0.233742356300354,
157
- 0.15799184143543243,
158
- 0.2535308301448822,
159
- 0.22049279510974884,
160
- 0.1870928704738617,
161
- 0.2405395358800888,
162
- 0.25070902705192566,
163
- 0.16667716205120087,
164
- 0.15809106826782227,
165
- 0.31137070059776306,
166
- 0.11368274688720703,
167
- 0.14004917442798615,
168
- 0.13582302629947662,
169
- 0.14286470413208008,
170
- 0.13543015718460083,
171
- 0.08498942852020264,
172
- 0.18833109736442566,
173
- 0.13422875106334686,
174
- 0.10775727778673172,
175
- 0.12300104647874832,
176
- 0.07167976349592209,
177
- 0.11587230116128922,
178
- 0.0645364299416542,
179
- 0.09650751203298569,
180
- 0.09376537799835205,
181
- 0.09722624719142914,
182
- 0.045721422880887985,
183
- 0.06196039915084839,
184
- 0.05605100467801094,
185
- 0.06696300953626633,
186
- 0.053617555648088455,
187
- 0.052670858800411224
188
  ],
189
  "info_loss": [
190
- -0.4859275817871094,
191
- -0.1129457950592041,
192
- -0.07287442684173584,
193
- -0.0863027572631836,
194
- -0.12096810340881348,
195
- -0.1600182056427002,
196
- -0.41505467891693115,
197
- -0.7713239789009094,
198
- -0.8302392959594727,
199
- -0.6981891989707947,
200
- -0.4436832070350647,
201
- -0.5114461183547974,
202
- -0.42171621322631836,
203
- -0.35522836446762085,
204
- -0.32737311720848083,
205
- -0.3521292805671692,
206
- -0.33850210905075073,
207
- -0.33436745405197144,
208
- -0.3098219931125641,
209
- -0.3004074692726135,
210
- -0.3650834560394287,
211
- -0.2377147525548935,
212
- -0.21807271242141724,
213
- -0.2605586349964142,
214
- -0.2963896691799164,
215
- -0.19201940298080444,
216
- -0.2403343915939331,
217
- -0.2125036120414734,
218
- -0.2005012035369873,
219
- -0.14176680147647858,
220
- -0.20001220703125,
221
- -0.18811163306236267,
222
- -0.16207079589366913,
223
- -0.22838185727596283,
224
- -0.23002460598945618,
225
- -0.1475900262594223,
226
- -0.1343834400177002,
227
- -0.26728555560112,
228
- -0.10240201652050018,
229
- -0.1339702010154724,
230
- -0.11256629973649979,
231
- -0.13482710719108582,
232
- -0.12813962996006012,
233
- -0.08094710111618042,
234
- -0.1647377759218216,
235
- -0.13163283467292786,
236
- -0.1023663803935051,
237
- -0.11182866990566254,
238
- -0.06797388941049576,
239
- -0.09571453928947449,
240
- -0.06312839686870575,
241
- -0.0817064419388771,
242
- -0.09321703016757965,
243
- -0.09572642296552658,
244
- -0.04448504000902176,
245
- -0.06147210672497749,
246
- -0.040348708629608154,
247
- -0.05947120487689972,
248
- -0.05293218418955803,
249
- -0.05189599096775055
250
  ],
251
  "abs_loss": [
252
- 2.2262747287750244,
253
- 1.8807449340820312,
254
- 1.833791732788086,
255
- 1.8724111318588257,
256
- 1.8476206064224243,
257
- 1.8559261560440063,
258
- 1.87335205078125,
259
- 1.8458162546157837,
260
- 1.813267707824707,
261
- 1.6594558954238892,
262
- 1.4701725244522095,
263
- 1.2008872032165527,
264
- 0.8842588067054749,
265
- 0.7724447250366211,
266
- 0.6772302985191345,
267
- 0.6123369336128235,
268
- 0.49044322967529297,
269
- 0.4312553107738495,
270
- 0.38433346152305603,
271
- 0.3575459420681,
272
- 0.2791888415813446,
273
- 0.3738294541835785,
274
- 0.2938316762447357,
275
- 0.2500404715538025,
276
- 0.24418945610523224,
277
- 0.2506295442581177,
278
- 0.1918429583311081,
279
- 0.21762271225452423,
280
- 0.19322265684604645,
281
- 0.174521803855896,
282
- 0.1660517454147339,
283
- 0.1862434595823288,
284
- 0.14573685824871063,
285
- 0.13860176503658295,
286
- 0.13301999866962433,
287
- 0.12413140386343002,
288
- 0.0981183648109436,
289
- 0.1176028847694397,
290
- 0.12784387171268463,
291
- 0.08845017105340958,
292
- 0.10072199255228043,
293
- 0.08419985324144363,
294
- 0.10068891197443008,
295
- 0.08726515620946884,
296
- 0.08775246143341064,
297
- 0.06938570737838745,
298
- 0.07202962785959244,
299
- 0.07672454416751862,
300
- 0.06882075220346451,
301
- 0.0632840245962143,
302
- 0.06827760487794876,
303
- 0.05087992176413536,
304
- 0.04930369183421135,
305
- 0.05453367158770561,
306
- 0.04361484572291374,
307
- 0.05198704078793526,
308
- 0.042146146297454834,
309
- 0.0432199127972126,
310
- 0.042553484439849854,
311
- 0.054908428341150284
312
  ],
313
  "zipf_loss": [
314
- 6.587018966674805,
315
- 2.1029131412506104,
316
- 1.7120072841644287,
317
- 1.692602276802063,
318
- 1.6903936862945557,
319
- 1.6887073516845703,
320
- 1.6894363164901733,
321
- 1.6947468519210815,
322
- 1.694236397743225,
323
- 1.690833568572998,
324
- 1.6273235082626343,
325
- 1.531144618988037,
326
- 1.3656980991363525,
327
- 1.1834821701049805,
328
- 1.0972886085510254,
329
- 1.031829595565796,
330
- 0.9763095378875732,
331
- 0.9072092771530151,
332
- 0.8499235510826111,
333
- 0.7916512489318848,
334
- 0.7177382707595825,
335
- 0.7136257290840149,
336
- 0.6450810432434082,
337
- 0.655110776424408,
338
- 0.6283543109893799,
339
- 0.5929351449012756,
340
- 0.5331985950469971,
341
- 0.5018686056137085,
342
- 0.43681272864341736,
343
- 0.48374468088150024,
344
- 0.39324942231178284,
345
- 0.37160131335258484,
346
- 0.3891863524913788,
347
- 0.3966630697250366,
348
- 0.3016990125179291,
349
- 0.2945023775100708,
350
- 0.2740568518638611,
351
- 0.29823243618011475,
352
- 0.27079012989997864,
353
- 0.2567179501056671,
354
- 0.278494656085968,
355
- 0.24684756994247437,
356
- 0.22113612294197083,
357
- 0.21420112252235413,
358
- 0.1596638262271881,
359
- 0.2009882926940918,
360
- 0.17137852311134338,
361
- 0.1637832671403885,
362
- 0.1192292720079422,
363
- 0.13335762917995453,
364
- 0.14518415927886963,
365
- 0.06609760969877243,
366
- 0.08893045783042908,
367
- 0.07529586553573608,
368
- 0.05961669981479645,
369
- 0.06056078150868416,
370
- 0.05783679336309433,
371
- 0.07584869861602783,
372
- 0.0495077446103096,
373
- 0.04784762114286423
374
  ],
375
  "denoise_loss": [],
376
  "ortho_loss": [
377
- 0.3918377161026001,
378
- 0.20118887722492218,
379
- 0.12065954506397247,
380
- 0.0842815414071083,
381
- 0.07851361483335495,
382
- 0.08886810392141342,
383
- 0.1118798479437828,
384
- 0.16310755908489227,
385
- 0.21554774045944214,
386
- 0.2462739199399948,
387
- 0.2753961980342865,
388
- 0.30527833104133606,
389
- 0.33816447854042053,
390
- 0.3525509536266327,
391
- 0.36425891518592834,
392
- 0.3624407649040222,
393
- 0.36967605352401733,
394
- 0.3742343485355377,
395
- 0.3982963562011719,
396
- 0.3951874375343323,
397
- 0.3906378149986267,
398
- 0.3858691453933716,
399
- 0.390056312084198,
400
- 0.3822699785232544,
401
- 0.3838372230529785,
402
- 0.3822362422943115,
403
- 0.375870019197464,
404
- 0.38709157705307007,
405
- 0.3905230164527893,
406
- 0.39558085799217224,
407
- 0.3952634334564209,
408
- 0.39085277915000916,
409
- 0.39921504259109497,
410
- 0.39666083455085754,
411
- 0.3942829966545105,
412
- 0.3991093337535858,
413
- 0.39491263031959534,
414
- 0.3742454946041107,
415
- 0.3708445429801941,
416
- 0.37462660670280457,
417
- 0.3777347207069397,
418
- 0.36790767312049866,
419
- 0.36087462306022644,
420
- 0.3528926968574524,
421
- 0.3517568111419678,
422
- 0.3517394959926605,
423
- 0.3531302809715271,
424
- 0.35252636671066284,
425
- 0.35542812943458557,
426
- 0.35618269443511963,
427
- 0.3573307693004608,
428
- 0.3604697287082672,
429
- 0.36059707403182983,
430
- 0.36255255341529846,
431
- 0.362486869096756,
432
- 0.3620503544807434,
433
- 0.363882839679718,
434
- 0.36408552527427673,
435
- 0.3636517822742462,
436
- 0.3643637001514435
437
  ],
438
  "lr": [
439
  3.9200000000000004e-05,
@@ -521,8 +521,8 @@
521
  3083
522
  ],
523
  "eval_accuracy": [
524
- 0.01,
525
- 0.0,
526
  0.0,
527
  0.0,
528
  0.0,
@@ -543,573 +543,573 @@
543
  0.0
544
  ]
545
  },
546
- "final_accuracy": 0.9470833333333334,
547
  "sft_eval": {
548
  "config": {
549
  "ops": "add_sub",
550
  "K": null,
551
  "mode": "sft",
552
  "n_digits": 6,
553
- "n_per_split": 50
554
  },
555
  "splits": {
556
  "add_S0": {
557
- "full_accuracy": 0.9,
558
- "digit_accuracy": 0.9857142857142858,
559
- "n_examples": 50,
560
  "per_subtask": {
561
  "SA": {
562
- "accuracy": 0.9864406779661017,
563
- "count": 295
564
  },
565
  "SS": {
566
- "accuracy": 0.9818181818181818,
567
- "count": 55
568
  }
569
  }
570
  },
571
  "add_S1": {
572
  "full_accuracy": 1.0,
573
  "digit_accuracy": 1.0,
574
- "n_examples": 50,
575
  "per_subtask": {
576
  "SA": {
577
  "accuracy": 1.0,
578
- "count": 126
579
  },
580
  "SC": {
581
  "accuracy": 1.0,
582
- "count": 79
583
  },
584
  "SS": {
585
  "accuracy": 1.0,
586
- "count": 21
587
  },
588
  "UC": {
589
  "accuracy": 1.0,
590
- "count": 124
591
  }
592
  }
593
  },
594
  "add_S2": {
595
- "full_accuracy": 0.88,
596
- "digit_accuracy": 0.9771428571428571,
597
- "n_examples": 50,
598
  "per_subtask": {
599
  "SA": {
600
- "accuracy": 0.9733333333333334,
601
- "count": 75
602
  },
603
  "SC": {
604
- "accuracy": 0.967741935483871,
605
- "count": 62
606
  },
607
  "SS": {
608
- "accuracy": 0.9230769230769231,
609
- "count": 39
610
  },
611
  "UC": {
612
- "accuracy": 0.990990990990991,
613
- "count": 111
614
  },
615
  "US": {
616
  "accuracy": 1.0,
617
- "count": 63
618
  }
619
  }
620
  },
621
  "add_S3": {
622
- "full_accuracy": 0.72,
623
- "digit_accuracy": 0.96,
624
- "n_examples": 50,
625
  "per_subtask": {
626
  "SA": {
627
  "accuracy": 1.0,
628
- "count": 60
629
  },
630
  "SC": {
631
- "accuracy": 0.9824561403508771,
632
- "count": 57
633
  },
634
  "SS": {
635
  "accuracy": 1.0,
636
- "count": 19
637
  },
638
  "UC": {
639
- "accuracy": 0.875,
640
- "count": 104
641
  },
642
  "US": {
643
  "accuracy": 1.0,
644
- "count": 110
645
  }
646
  }
647
  },
648
  "add_S4": {
649
- "full_accuracy": 0.62,
650
- "digit_accuracy": 0.9228571428571428,
651
- "n_examples": 50,
652
  "per_subtask": {
653
  "SA": {
654
  "accuracy": 1.0,
655
- "count": 48
656
  },
657
  "SC": {
658
- "accuracy": 0.9807692307692307,
659
- "count": 52
660
  },
661
  "SS": {
662
  "accuracy": 1.0,
663
- "count": 7
664
  },
665
  "UC": {
666
- "accuracy": 0.8314606741573034,
667
- "count": 89
668
  },
669
  "US": {
670
- "accuracy": 0.9285714285714286,
671
- "count": 154
672
  }
673
  }
674
  },
675
  "add_S5": {
676
- "full_accuracy": 0.42,
677
- "digit_accuracy": 0.7971428571428572,
678
- "n_examples": 50,
679
  "per_subtask": {
680
  "SA": {
681
  "accuracy": 1.0,
682
- "count": 50
683
  },
684
  "SC": {
685
  "accuracy": 1.0,
686
- "count": 50
687
  },
688
  "UC": {
689
- "accuracy": 0.48,
690
- "count": 50
691
  },
692
  "US": {
693
- "accuracy": 0.775,
694
- "count": 200
695
  }
696
  }
697
  },
698
  "add_S6": {
699
- "full_accuracy": 0.9,
700
- "digit_accuracy": 0.9457142857142857,
701
- "n_examples": 50,
702
  "per_subtask": {
703
  "SC": {
704
  "accuracy": 1.0,
705
- "count": 50
706
  },
707
  "UC": {
708
- "accuracy": 0.9,
709
- "count": 50
710
  },
711
  "US": {
712
- "accuracy": 0.944,
713
- "count": 250
714
  }
715
  }
716
  },
717
  "add_random": {
718
- "full_accuracy": 0.98,
719
- "digit_accuracy": 0.9971428571428571,
720
  "n_examples": 200,
721
  "per_subtask": {
722
  "SA": {
723
- "accuracy": 1.0,
724
- "count": 431
725
  },
726
  "SC": {
727
  "accuracy": 1.0,
728
- "count": 316
729
  },
730
  "SS": {
731
  "accuracy": 1.0,
732
- "count": 39
733
  },
734
  "UC": {
735
- "accuracy": 0.9928571428571429,
736
- "count": 560
737
  },
738
  "US": {
739
  "accuracy": 1.0,
740
- "count": 54
741
  }
742
  }
743
  },
744
  "add_C1": {
745
  "full_accuracy": 1.0,
746
  "digit_accuracy": 1.0,
747
- "n_examples": 50,
748
  "per_subtask": {
749
  "SA": {
750
  "accuracy": 1.0,
751
- "count": 250
752
  },
753
  "SC": {
754
  "accuracy": 1.0,
755
- "count": 50
756
  },
757
  "UC": {
758
  "accuracy": 1.0,
759
- "count": 50
760
  }
761
  }
762
  },
763
  "add_C2": {
764
- "full_accuracy": 0.96,
765
- "digit_accuracy": 0.9914285714285714,
766
- "n_examples": 50,
767
  "per_subtask": {
768
  "SA": {
769
  "accuracy": 1.0,
770
- "count": 200
771
  },
772
  "SC": {
773
  "accuracy": 1.0,
774
- "count": 50
775
  },
776
  "UC": {
777
- "accuracy": 0.9759036144578314,
778
- "count": 83
779
  },
780
  "US": {
781
- "accuracy": 0.9411764705882353,
782
- "count": 17
783
  }
784
  }
785
  },
786
  "add_C3": {
787
- "full_accuracy": 0.88,
788
- "digit_accuracy": 0.98,
789
- "n_examples": 50,
790
  "per_subtask": {
791
  "SA": {
792
  "accuracy": 1.0,
793
- "count": 150
794
  },
795
  "SC": {
796
  "accuracy": 1.0,
797
- "count": 50
798
  },
799
  "UC": {
800
- "accuracy": 0.94,
801
- "count": 100
802
  },
803
  "US": {
804
- "accuracy": 0.98,
805
- "count": 50
806
  }
807
  }
808
  },
809
  "add_C4": {
810
- "full_accuracy": 0.82,
811
- "digit_accuracy": 0.9714285714285714,
812
- "n_examples": 50,
813
  "per_subtask": {
814
  "SA": {
815
  "accuracy": 1.0,
816
- "count": 100
817
  },
818
  "SC": {
819
  "accuracy": 1.0,
820
- "count": 50
821
  },
822
  "UC": {
823
- "accuracy": 0.9393939393939394,
824
- "count": 132
825
  },
826
  "US": {
827
- "accuracy": 0.9705882352941176,
828
- "count": 68
829
  }
830
  }
831
  },
832
  "add_C5": {
833
- "full_accuracy": 0.68,
834
- "digit_accuracy": 0.9457142857142857,
835
- "n_examples": 50,
836
  "per_subtask": {
837
  "SA": {
838
  "accuracy": 1.0,
839
- "count": 50
840
  },
841
  "SC": {
842
  "accuracy": 1.0,
843
- "count": 50
844
  },
845
  "UC": {
846
- "accuracy": 0.8904109589041096,
847
- "count": 146
848
  },
849
  "US": {
850
- "accuracy": 0.9711538461538461,
851
- "count": 104
852
  }
853
  }
854
  },
855
  "add_C6": {
856
- "full_accuracy": 0.88,
857
- "digit_accuracy": 0.9742857142857143,
858
- "n_examples": 50,
859
  "per_subtask": {
860
  "SC": {
861
  "accuracy": 1.0,
862
- "count": 50
863
  },
864
  "UC": {
865
- "accuracy": 0.9682539682539683,
866
- "count": 189
867
  },
868
  "US": {
869
- "accuracy": 0.972972972972973,
870
- "count": 111
871
  }
872
  }
873
  },
874
  "sub_M0": {
875
  "full_accuracy": 1.0,
876
  "digit_accuracy": 1.0,
877
- "n_examples": 50,
878
  "per_subtask": {
879
  "MD": {
880
  "accuracy": 1.0,
881
- "count": 303
882
  },
883
  "ME": {
884
  "accuracy": 1.0,
885
- "count": 47
886
  }
887
  }
888
  },
889
  "sub_M1": {
890
- "full_accuracy": 0.96,
891
- "digit_accuracy": 0.9942857142857143,
892
- "n_examples": 50,
893
  "per_subtask": {
894
  "MD": {
895
- "accuracy": 1.0,
896
- "count": 141
897
  },
898
  "MB": {
899
- "accuracy": 0.9861111111111112,
900
- "count": 72
901
  },
902
  "ME": {
903
  "accuracy": 1.0,
904
- "count": 18
905
  },
906
  "UB": {
907
- "accuracy": 0.9915966386554622,
908
- "count": 119
909
  }
910
  }
911
  },
912
  "sub_M2": {
913
- "full_accuracy": 0.92,
914
- "digit_accuracy": 0.9885714285714285,
915
- "n_examples": 50,
916
  "per_subtask": {
917
  "MD": {
918
- "accuracy": 0.9910714285714286,
919
- "count": 112
920
  },
921
  "MB": {
922
  "accuracy": 1.0,
923
- "count": 53
924
  },
925
  "ME": {
926
  "accuracy": 1.0,
927
- "count": 47
928
  },
929
  "UB": {
930
- "accuracy": 0.9764705882352941,
931
- "count": 85
932
  },
933
  "UD": {
934
- "accuracy": 0.9811320754716981,
935
- "count": 53
936
  }
937
  }
938
  },
939
  "sub_M3": {
940
- "full_accuracy": 0.22,
941
- "digit_accuracy": 0.8857142857142857,
942
- "n_examples": 50,
943
  "per_subtask": {
944
  "MD": {
945
  "accuracy": 1.0,
946
- "count": 97
947
  },
948
  "MB": {
949
  "accuracy": 1.0,
950
- "count": 51
951
  },
952
  "ME": {
953
  "accuracy": 1.0,
954
- "count": 27
955
  },
956
  "UB": {
957
- "accuracy": 0.5,
958
- "count": 74
959
  },
960
  "UD": {
961
- "accuracy": 0.9702970297029703,
962
- "count": 101
963
  }
964
  }
965
  },
966
  "sub_M4": {
967
- "full_accuracy": 0.02,
968
- "digit_accuracy": 0.7,
969
- "n_examples": 50,
970
  "per_subtask": {
971
  "MD": {
972
  "accuracy": 1.0,
973
- "count": 100
974
  },
975
  "MB": {
976
  "accuracy": 1.0,
977
- "count": 50
978
  },
979
  "UB": {
980
- "accuracy": 0.14,
981
- "count": 50
982
  },
983
  "UD": {
984
- "accuracy": 0.5866666666666667,
985
- "count": 150
986
  }
987
  }
988
  },
989
  "sub_M5": {
990
- "full_accuracy": 0.0,
991
- "digit_accuracy": 0.5428571428571428,
992
- "n_examples": 50,
993
  "per_subtask": {
994
  "MD": {
995
  "accuracy": 1.0,
996
- "count": 50
997
  },
998
  "MB": {
999
  "accuracy": 1.0,
1000
- "count": 50
1001
  },
1002
  "UB": {
1003
- "accuracy": 0.14,
1004
- "count": 50
1005
  },
1006
  "UD": {
1007
- "accuracy": 0.415,
1008
- "count": 200
1009
  }
1010
  }
1011
  },
1012
  "sub_random": {
1013
- "full_accuracy": 0.98,
1014
- "digit_accuracy": 0.9971428571428571,
1015
  "n_examples": 200,
1016
  "per_subtask": {
1017
  "MD": {
1018
- "accuracy": 0.9982456140350877,
1019
- "count": 570
1020
  },
1021
  "MB": {
1022
  "accuracy": 1.0,
1023
- "count": 277
1024
  },
1025
  "ME": {
1026
  "accuracy": 1.0,
1027
  "count": 53
1028
  },
1029
  "UB": {
1030
- "accuracy": 0.9957537154989384,
1031
- "count": 471
1032
  },
1033
  "UD": {
1034
- "accuracy": 0.9655172413793104,
1035
- "count": 29
1036
  }
1037
  }
1038
  },
1039
  "sub_B3": {
1040
- "full_accuracy": 0.68,
1041
- "digit_accuracy": 0.9542857142857143,
1042
- "n_examples": 50,
1043
  "per_subtask": {
1044
  "MD": {
1045
  "accuracy": 1.0,
1046
- "count": 150
1047
  },
1048
  "MB": {
1049
  "accuracy": 1.0,
1050
- "count": 50
1051
  },
1052
  "UB": {
1053
- "accuracy": 0.8415841584158416,
1054
- "count": 101
1055
  },
1056
  "UD": {
1057
  "accuracy": 1.0,
1058
- "count": 49
1059
  }
1060
  }
1061
  },
1062
  "sub_B4": {
1063
- "full_accuracy": 0.58,
1064
- "digit_accuracy": 0.9171428571428571,
1065
- "n_examples": 50,
1066
  "per_subtask": {
1067
  "MD": {
1068
  "accuracy": 1.0,
1069
- "count": 100
1070
  },
1071
  "MB": {
1072
  "accuracy": 1.0,
1073
- "count": 50
1074
  },
1075
  "UB": {
1076
- "accuracy": 0.8264462809917356,
1077
- "count": 121
1078
  },
1079
  "UD": {
1080
- "accuracy": 0.8987341772151899,
1081
- "count": 79
1082
  }
1083
  }
1084
  },
1085
  "sub_B5": {
1086
- "full_accuracy": 0.58,
1087
- "digit_accuracy": 0.9228571428571428,
1088
- "n_examples": 50,
1089
  "per_subtask": {
1090
  "MD": {
1091
  "accuracy": 1.0,
1092
- "count": 50
1093
  },
1094
  "MB": {
1095
  "accuracy": 1.0,
1096
- "count": 50
1097
  },
1098
  "UB": {
1099
- "accuracy": 0.881578947368421,
1100
- "count": 152
1101
  },
1102
  "UD": {
1103
- "accuracy": 0.9081632653061225,
1104
- "count": 98
1105
  }
1106
  }
1107
  }
1108
  },
1109
  "summary": {
1110
- "overall_accuracy": 0.7806666666666666,
1111
- "digit_accuracy": 0.9444761904761905,
1112
- "total_examples": 1500,
1113
  "n_splits": 24
1114
  }
1115
  },
@@ -1119,167 +1119,167 @@
1119
  "K": 1,
1120
  "mode": "sorl",
1121
  "n_digits": 6,
1122
- "n_per_split": 50
1123
  },
1124
  "splits": {
1125
  "add_S0": {
1126
  "full_accuracy": 1.0,
1127
  "digit_accuracy": 1.0,
1128
- "n_examples": 50,
1129
  "per_subtask": {
1130
  "SA": {
1131
  "accuracy": 1.0,
1132
- "count": 295
1133
  },
1134
  "SS": {
1135
  "accuracy": 1.0,
1136
- "count": 55
1137
  }
1138
  }
1139
  },
1140
  "add_S1": {
1141
  "full_accuracy": 1.0,
1142
  "digit_accuracy": 1.0,
1143
- "n_examples": 50,
1144
  "per_subtask": {
1145
  "SA": {
1146
  "accuracy": 1.0,
1147
- "count": 126
1148
  },
1149
  "SC": {
1150
  "accuracy": 1.0,
1151
- "count": 79
1152
  },
1153
  "SS": {
1154
  "accuracy": 1.0,
1155
- "count": 21
1156
  },
1157
  "UC": {
1158
  "accuracy": 1.0,
1159
- "count": 124
1160
  }
1161
  }
1162
  },
1163
  "add_S2": {
1164
  "full_accuracy": 1.0,
1165
  "digit_accuracy": 1.0,
1166
- "n_examples": 50,
1167
  "per_subtask": {
1168
  "SA": {
1169
  "accuracy": 1.0,
1170
- "count": 75
1171
  },
1172
  "SC": {
1173
  "accuracy": 1.0,
1174
- "count": 62
1175
  },
1176
  "SS": {
1177
  "accuracy": 1.0,
1178
- "count": 39
1179
  },
1180
  "UC": {
1181
  "accuracy": 1.0,
1182
- "count": 111
1183
  },
1184
  "US": {
1185
  "accuracy": 1.0,
1186
- "count": 63
1187
  }
1188
  }
1189
  },
1190
  "add_S3": {
1191
  "full_accuracy": 1.0,
1192
  "digit_accuracy": 1.0,
1193
- "n_examples": 50,
1194
  "per_subtask": {
1195
  "SA": {
1196
  "accuracy": 1.0,
1197
- "count": 60
1198
  },
1199
  "SC": {
1200
  "accuracy": 1.0,
1201
- "count": 57
1202
  },
1203
  "SS": {
1204
  "accuracy": 1.0,
1205
- "count": 19
1206
  },
1207
  "UC": {
1208
  "accuracy": 1.0,
1209
- "count": 104
1210
  },
1211
  "US": {
1212
  "accuracy": 1.0,
1213
- "count": 110
1214
  }
1215
  }
1216
  },
1217
  "add_S4": {
1218
  "full_accuracy": 1.0,
1219
  "digit_accuracy": 1.0,
1220
- "n_examples": 50,
1221
  "per_subtask": {
1222
  "SA": {
1223
  "accuracy": 1.0,
1224
- "count": 48
1225
  },
1226
  "SC": {
1227
  "accuracy": 1.0,
1228
- "count": 52
1229
  },
1230
  "SS": {
1231
  "accuracy": 1.0,
1232
- "count": 7
1233
  },
1234
  "UC": {
1235
  "accuracy": 1.0,
1236
- "count": 89
1237
  },
1238
  "US": {
1239
  "accuracy": 1.0,
1240
- "count": 154
1241
  }
1242
  }
1243
  },
1244
  "add_S5": {
1245
- "full_accuracy": 0.54,
1246
- "digit_accuracy": 0.9342857142857143,
1247
- "n_examples": 50,
1248
  "per_subtask": {
1249
  "SA": {
1250
  "accuracy": 1.0,
1251
- "count": 50
1252
  },
1253
  "SC": {
1254
  "accuracy": 1.0,
1255
- "count": 50
1256
  },
1257
  "UC": {
1258
- "accuracy": 0.54,
1259
- "count": 50
1260
  },
1261
  "US": {
1262
  "accuracy": 1.0,
1263
- "count": 200
1264
  }
1265
  }
1266
  },
1267
  "add_S6": {
1268
- "full_accuracy": 1.0,
1269
- "digit_accuracy": 1.0,
1270
- "n_examples": 50,
1271
  "per_subtask": {
1272
  "SC": {
1273
  "accuracy": 1.0,
1274
- "count": 50
1275
  },
1276
  "UC": {
1277
- "accuracy": 1.0,
1278
- "count": 50
1279
  },
1280
  "US": {
1281
- "accuracy": 1.0,
1282
- "count": 250
1283
  }
1284
  }
1285
  },
@@ -1290,291 +1290,291 @@
1290
  "per_subtask": {
1291
  "SA": {
1292
  "accuracy": 1.0,
1293
- "count": 431
1294
  },
1295
  "SC": {
1296
  "accuracy": 1.0,
1297
- "count": 316
1298
  },
1299
  "SS": {
1300
  "accuracy": 1.0,
1301
- "count": 39
1302
  },
1303
  "UC": {
1304
  "accuracy": 1.0,
1305
- "count": 560
1306
  },
1307
  "US": {
1308
  "accuracy": 1.0,
1309
- "count": 54
1310
  }
1311
  }
1312
  },
1313
  "add_C1": {
1314
  "full_accuracy": 1.0,
1315
  "digit_accuracy": 1.0,
1316
- "n_examples": 50,
1317
  "per_subtask": {
1318
  "SA": {
1319
  "accuracy": 1.0,
1320
- "count": 250
1321
  },
1322
  "SC": {
1323
  "accuracy": 1.0,
1324
- "count": 50
1325
  },
1326
  "UC": {
1327
  "accuracy": 1.0,
1328
- "count": 50
1329
  }
1330
  }
1331
  },
1332
  "add_C2": {
1333
  "full_accuracy": 1.0,
1334
  "digit_accuracy": 1.0,
1335
- "n_examples": 50,
1336
  "per_subtask": {
1337
  "SA": {
1338
  "accuracy": 1.0,
1339
- "count": 200
1340
  },
1341
  "SC": {
1342
  "accuracy": 1.0,
1343
- "count": 50
1344
  },
1345
  "UC": {
1346
  "accuracy": 1.0,
1347
- "count": 83
1348
  },
1349
  "US": {
1350
  "accuracy": 1.0,
1351
- "count": 17
1352
  }
1353
  }
1354
  },
1355
  "add_C3": {
1356
  "full_accuracy": 1.0,
1357
  "digit_accuracy": 1.0,
1358
- "n_examples": 50,
1359
  "per_subtask": {
1360
  "SA": {
1361
  "accuracy": 1.0,
1362
- "count": 150
1363
  },
1364
  "SC": {
1365
  "accuracy": 1.0,
1366
- "count": 50
1367
  },
1368
  "UC": {
1369
  "accuracy": 1.0,
1370
- "count": 100
1371
  },
1372
  "US": {
1373
  "accuracy": 1.0,
1374
- "count": 50
1375
  }
1376
  }
1377
  },
1378
  "add_C4": {
1379
- "full_accuracy": 1.0,
1380
- "digit_accuracy": 1.0,
1381
- "n_examples": 50,
1382
  "per_subtask": {
1383
  "SA": {
1384
  "accuracy": 1.0,
1385
- "count": 100
1386
  },
1387
  "SC": {
1388
  "accuracy": 1.0,
1389
- "count": 50
1390
  },
1391
  "UC": {
1392
- "accuracy": 1.0,
1393
- "count": 132
1394
  },
1395
  "US": {
1396
  "accuracy": 1.0,
1397
- "count": 68
1398
  }
1399
  }
1400
  },
1401
  "add_C5": {
1402
- "full_accuracy": 0.94,
1403
- "digit_accuracy": 0.9914285714285714,
1404
- "n_examples": 50,
1405
  "per_subtask": {
1406
  "SA": {
1407
  "accuracy": 1.0,
1408
- "count": 50
1409
  },
1410
  "SC": {
1411
  "accuracy": 1.0,
1412
- "count": 50
1413
  },
1414
  "UC": {
1415
- "accuracy": 0.9794520547945206,
1416
- "count": 146
1417
  },
1418
  "US": {
1419
  "accuracy": 1.0,
1420
- "count": 104
1421
  }
1422
  }
1423
  },
1424
  "add_C6": {
1425
- "full_accuracy": 1.0,
1426
- "digit_accuracy": 1.0,
1427
- "n_examples": 50,
1428
  "per_subtask": {
1429
  "SC": {
1430
  "accuracy": 1.0,
1431
- "count": 50
1432
  },
1433
  "UC": {
1434
- "accuracy": 1.0,
1435
- "count": 189
1436
  },
1437
  "US": {
1438
  "accuracy": 1.0,
1439
- "count": 111
1440
  }
1441
  }
1442
  },
1443
  "sub_M0": {
1444
  "full_accuracy": 1.0,
1445
  "digit_accuracy": 1.0,
1446
- "n_examples": 50,
1447
  "per_subtask": {
1448
  "MD": {
1449
  "accuracy": 1.0,
1450
- "count": 303
1451
  },
1452
  "ME": {
1453
  "accuracy": 1.0,
1454
- "count": 47
1455
  }
1456
  }
1457
  },
1458
  "sub_M1": {
1459
  "full_accuracy": 1.0,
1460
  "digit_accuracy": 1.0,
1461
- "n_examples": 50,
1462
  "per_subtask": {
1463
  "MD": {
1464
  "accuracy": 1.0,
1465
- "count": 141
1466
  },
1467
  "MB": {
1468
  "accuracy": 1.0,
1469
- "count": 72
1470
  },
1471
  "ME": {
1472
  "accuracy": 1.0,
1473
- "count": 18
1474
  },
1475
  "UB": {
1476
  "accuracy": 1.0,
1477
- "count": 119
1478
  }
1479
  }
1480
  },
1481
  "sub_M2": {
1482
  "full_accuracy": 1.0,
1483
  "digit_accuracy": 1.0,
1484
- "n_examples": 50,
1485
  "per_subtask": {
1486
  "MD": {
1487
  "accuracy": 1.0,
1488
- "count": 112
1489
  },
1490
  "MB": {
1491
  "accuracy": 1.0,
1492
- "count": 53
1493
  },
1494
  "ME": {
1495
  "accuracy": 1.0,
1496
- "count": 47
1497
  },
1498
  "UB": {
1499
  "accuracy": 1.0,
1500
- "count": 85
1501
  },
1502
  "UD": {
1503
  "accuracy": 1.0,
1504
- "count": 53
1505
  }
1506
  }
1507
  },
1508
  "sub_M3": {
1509
  "full_accuracy": 1.0,
1510
  "digit_accuracy": 1.0,
1511
- "n_examples": 50,
1512
  "per_subtask": {
1513
  "MD": {
1514
  "accuracy": 1.0,
1515
- "count": 97
1516
  },
1517
  "MB": {
1518
  "accuracy": 1.0,
1519
- "count": 51
1520
  },
1521
  "ME": {
1522
  "accuracy": 1.0,
1523
- "count": 27
1524
  },
1525
  "UB": {
1526
  "accuracy": 1.0,
1527
- "count": 74
1528
  },
1529
  "UD": {
1530
  "accuracy": 1.0,
1531
- "count": 101
1532
  }
1533
  }
1534
  },
1535
  "sub_M4": {
1536
  "full_accuracy": 1.0,
1537
  "digit_accuracy": 1.0,
1538
- "n_examples": 50,
1539
  "per_subtask": {
1540
  "MD": {
1541
  "accuracy": 1.0,
1542
- "count": 100
1543
  },
1544
  "MB": {
1545
  "accuracy": 1.0,
1546
- "count": 50
1547
  },
1548
  "UB": {
1549
  "accuracy": 1.0,
1550
- "count": 50
1551
  },
1552
  "UD": {
1553
  "accuracy": 1.0,
1554
- "count": 150
1555
  }
1556
  }
1557
  },
1558
  "sub_M5": {
1559
- "full_accuracy": 0.34,
1560
- "digit_accuracy": 0.9057142857142857,
1561
- "n_examples": 50,
1562
  "per_subtask": {
1563
  "MD": {
1564
  "accuracy": 1.0,
1565
- "count": 50
1566
  },
1567
  "MB": {
1568
  "accuracy": 1.0,
1569
- "count": 50
1570
  },
1571
  "UB": {
1572
- "accuracy": 0.34,
1573
- "count": 50
1574
  },
1575
  "UD": {
1576
  "accuracy": 1.0,
1577
- "count": 200
1578
  }
1579
  }
1580
  },
@@ -1585,11 +1585,11 @@
1585
  "per_subtask": {
1586
  "MD": {
1587
  "accuracy": 1.0,
1588
- "count": 570
1589
  },
1590
  "MB": {
1591
  "accuracy": 1.0,
1592
- "count": 277
1593
  },
1594
  "ME": {
1595
  "accuracy": 1.0,
@@ -1597,91 +1597,91 @@
1597
  },
1598
  "UB": {
1599
  "accuracy": 1.0,
1600
- "count": 471
1601
  },
1602
  "UD": {
1603
  "accuracy": 1.0,
1604
- "count": 29
1605
  }
1606
  }
1607
  },
1608
  "sub_B3": {
1609
  "full_accuracy": 1.0,
1610
  "digit_accuracy": 1.0,
1611
- "n_examples": 50,
1612
  "per_subtask": {
1613
  "MD": {
1614
  "accuracy": 1.0,
1615
- "count": 150
1616
  },
1617
  "MB": {
1618
  "accuracy": 1.0,
1619
- "count": 50
1620
  },
1621
  "UB": {
1622
  "accuracy": 1.0,
1623
- "count": 101
1624
  },
1625
  "UD": {
1626
  "accuracy": 1.0,
1627
- "count": 49
1628
  }
1629
  }
1630
  },
1631
  "sub_B4": {
1632
  "full_accuracy": 1.0,
1633
  "digit_accuracy": 1.0,
1634
- "n_examples": 50,
1635
  "per_subtask": {
1636
  "MD": {
1637
  "accuracy": 1.0,
1638
- "count": 100
1639
  },
1640
  "MB": {
1641
  "accuracy": 1.0,
1642
- "count": 50
1643
  },
1644
  "UB": {
1645
  "accuracy": 1.0,
1646
- "count": 121
1647
  },
1648
  "UD": {
1649
  "accuracy": 1.0,
1650
- "count": 79
1651
  }
1652
  }
1653
  },
1654
  "sub_B5": {
1655
- "full_accuracy": 0.98,
1656
- "digit_accuracy": 0.9971428571428571,
1657
- "n_examples": 50,
1658
  "per_subtask": {
1659
  "MD": {
1660
  "accuracy": 1.0,
1661
- "count": 50
1662
  },
1663
  "MB": {
1664
  "accuracy": 1.0,
1665
- "count": 50
1666
  },
1667
  "UB": {
1668
- "accuracy": 0.993421052631579,
1669
- "count": 152
1670
  },
1671
  "UD": {
1672
  "accuracy": 1.0,
1673
- "count": 98
1674
  }
1675
  }
1676
  }
1677
  },
1678
  "summary": {
1679
- "overall_accuracy": 0.96,
1680
- "digit_accuracy": 0.9942857142857143,
1681
- "total_examples": 1500,
1682
  "n_splits": 24
1683
  }
1684
  },
1685
- "sorl_overall_accuracy": 0.9470833333333334,
1686
- "sft_overall_accuracy": 0.7258333333333333
1687
  }
 
63
  3133
64
  ],
65
  "loss": [
66
+ 7.942777633666992,
67
+ 4.879890441894531,
68
+ 3.230745553970337,
69
+ 3.171808958053589,
70
+ 3.3972392082214355,
71
+ 2.463961124420166,
72
+ -0.31970155239105225,
73
+ -5.494422912597656,
74
+ -6.8470611572265625,
75
+ -4.793292045593262,
76
+ -2.040498733520508,
77
+ -2.3940882682800293,
78
+ -2.170124053955078,
79
+ -1.7399487495422363,
80
+ -2.3086109161376953,
81
+ -2.7756190299987793,
82
+ -2.891526699066162,
83
+ -2.4666450023651123,
84
+ -2.295619010925293,
85
+ -2.048872470855713,
86
+ -1.5986868143081665,
87
+ -2.349621057510376,
88
+ -1.2452442646026611,
89
+ -2.901057004928589,
90
+ -1.3245774507522583,
91
+ -0.9554328918457031,
92
+ -0.9006512761116028,
93
+ -0.7657713890075684,
94
+ -0.7965363264083862,
95
+ -1.0751973390579224,
96
+ -0.8257421255111694,
97
+ -0.7674775123596191,
98
+ -0.6108952164649963,
99
+ -0.39728718996047974,
100
+ -1.0187804698944092,
101
+ -0.31591135263442993,
102
+ -0.39469149708747864,
103
+ -0.2729785442352295,
104
+ -0.10930931568145752,
105
+ -0.24461984634399414,
106
+ -0.08862721920013428,
107
+ -0.2117801159620285,
108
+ -0.16590479016304016,
109
+ -0.38362351059913635,
110
+ -0.4167384207248688,
111
+ -0.40334945917129517,
112
+ -0.19316883385181427,
113
+ -0.14879119396209717,
114
+ -0.19406011700630188,
115
+ -0.17914623022079468,
116
+ -0.2789972126483917,
117
+ -0.05913069099187851,
118
+ -0.29538771510124207,
119
+ -0.36722680926322937,
120
+ -0.35507896542549133,
121
+ -0.05119173973798752,
122
+ -0.11240553855895996,
123
+ -0.19777432084083557,
124
+ -0.007915813475847244,
125
+ -0.033937860280275345
126
  ],
127
  "base_loss": [
128
+ 7.671196937561035,
129
+ 4.010580062866211,
130
+ 1.8913097381591797,
131
+ 1.9108556509017944,
132
+ 1.8557425737380981,
133
+ 1.833831548690796,
134
+ 1.82988440990448,
135
+ 1.787339448928833,
136
+ 1.7512098550796509,
137
+ 1.2430487871170044,
138
+ 0.9070897698402405,
139
+ 0.9038087725639343,
140
+ 0.7104665637016296,
141
+ 0.6114657521247864,
142
+ 0.6148020625114441,
143
+ 0.5949119329452515,
144
+ 0.5328211188316345,
145
+ 0.44292375445365906,
146
+ 0.41273045539855957,
147
+ 0.3532269299030304,
148
+ 0.2902839481830597,
149
+ 0.36093834042549133,
150
+ 0.24082647264003754,
151
+ 0.40677782893180847,
152
+ 0.2435729205608368,
153
+ 0.17918619513511658,
154
+ 0.16348111629486084,
155
+ 0.14683635532855988,
156
+ 0.1624801903963089,
157
+ 0.17330823838710785,
158
+ 0.14925424754619598,
159
+ 0.13458098471164703,
160
+ 0.11640487611293793,
161
+ 0.09549444168806076,
162
+ 0.16984343528747559,
163
+ 0.0791398137807846,
164
+ 0.08832497149705887,
165
+ 0.061722755432128906,
166
+ 0.04690014570951462,
167
+ 0.0503598190844059,
168
+ 0.04861803725361824,
169
+ 0.04905056953430176,
170
+ 0.04119657352566719,
171
+ 0.06299275904893875,
172
+ 0.0651763305068016,
173
+ 0.05840466544032097,
174
+ 0.03202011436223984,
175
+ 0.029487838968634605,
176
+ 0.035392142832279205,
177
+ 0.03208036348223686,
178
+ 0.04259788244962692,
179
+ 0.020311372354626656,
180
+ 0.041775498539209366,
181
+ 0.050391655415296555,
182
+ 0.05066566914319992,
183
+ 0.008960098959505558,
184
+ 0.018476324155926704,
185
+ 0.027055857703089714,
186
+ 0.005176883656531572,
187
+ 0.008925193920731544
188
  ],
189
  "info_loss": [
190
+ -0.6543827056884766,
191
+ -0.1371753215789795,
192
+ -0.05590975284576416,
193
+ -0.062001824378967285,
194
+ -0.032317399978637695,
195
+ -0.11670446395874023,
196
+ -0.3843038082122803,
197
+ -0.8943668007850647,
198
+ -1.0231645107269287,
199
+ -0.7599178552627563,
200
+ -0.449214369058609,
201
+ -0.482033371925354,
202
+ -0.43131735920906067,
203
+ -0.3736041486263275,
204
+ -0.4215145707130432,
205
+ -0.4602014720439911,
206
+ -0.4593234658241272,
207
+ -0.3962600529193878,
208
+ -0.36444565653800964,
209
+ -0.3282434344291687,
210
+ -0.27206987142562866,
211
+ -0.3457140326499939,
212
+ -0.21850009262561798,
213
+ -0.3961044251918793,
214
+ -0.22760552167892456,
215
+ -0.16711612045764923,
216
+ -0.1588182896375656,
217
+ -0.14292679727077484,
218
+ -0.14594416320323944,
219
+ -0.16852590441703796,
220
+ -0.14467473328113556,
221
+ -0.13348457217216492,
222
+ -0.11509168893098831,
223
+ -0.09416339546442032,
224
+ -0.15572039783000946,
225
+ -0.07744807004928589,
226
+ -0.08698518574237823,
227
+ -0.06068040058016777,
228
+ -0.0375378355383873,
229
+ -0.04975220188498497,
230
+ -0.04101487994194031,
231
+ -0.04862057790160179,
232
+ -0.0394514799118042,
233
+ -0.05956646800041199,
234
+ -0.06477057933807373,
235
+ -0.05815031751990318,
236
+ -0.03168302774429321,
237
+ -0.029237886890769005,
238
+ -0.03526085987687111,
239
+ -0.03186190128326416,
240
+ -0.04240777716040611,
241
+ -0.020102767273783684,
242
+ -0.0416855663061142,
243
+ -0.0503324493765831,
244
+ -0.05060190334916115,
245
+ -0.008832032792270184,
246
+ -0.01841319166123867,
247
+ -0.026989510282874107,
248
+ -0.0051260534673929214,
249
+ -0.008811188861727715
250
  ],
251
  "abs_loss": [
252
+ 2.223989725112915,
253
+ 1.871252179145813,
254
+ 1.8544975519180298,
255
+ 1.8529828786849976,
256
+ 1.7808314561843872,
257
+ 1.5479220151901245,
258
+ 1.4092364311218262,
259
+ 1.4009379148483276,
260
+ 1.320881962776184,
261
+ 1.1338235139846802,
262
+ 1.0362251996994019,
263
+ 0.9345109462738037,
264
+ 0.8529377579689026,
265
+ 0.7628306746482849,
266
+ 0.690753161907196,
267
+ 0.6221196055412292,
268
+ 0.5280961394309998,
269
+ 0.46778860688209534,
270
+ 0.3726791441440582,
271
+ 0.31298816204071045,
272
+ 0.2620382010936737,
273
+ 0.18087559938430786,
274
+ 0.21465563774108887,
275
+ 0.16152803599834442,
276
+ 0.15743668377399445,
277
+ 0.14526526629924774,
278
+ 0.14475920796394348,
279
+ 0.12059714645147324,
280
+ 0.1169460192322731,
281
+ 0.09149991720914841,
282
+ 0.09585744142532349,
283
+ 0.0799269825220108,
284
+ 0.08726473897695541,
285
+ 0.06842993944883347,
286
+ 0.0803660899400711,
287
+ 0.07159644365310669,
288
+ 0.07437893003225327,
289
+ 0.06340949982404709,
290
+ 0.06646980345249176,
291
+ 0.07242506742477417,
292
+ 0.05180174112319946,
293
+ 0.06437482684850693,
294
+ 0.07705993205308914,
295
+ 0.05421462655067444,
296
+ 0.04291941598057747,
297
+ 0.03475573658943176,
298
+ 0.03258237987756729,
299
+ 0.04605608060956001,
300
+ 0.029467642307281494,
301
+ 0.02811291068792343,
302
+ 0.016363779082894325,
303
+ 0.02535133622586727,
304
+ 0.01502001192420721,
305
+ 0.016885360702872276,
306
+ 0.029067935422062874,
307
+ 0.019427979364991188,
308
+ 0.028035461902618408,
309
+ 0.012935961596667767,
310
+ 0.03265028074383736,
311
+ 0.02496322989463806
312
  ],
313
  "zipf_loss": [
314
+ 6.593008518218994,
315
+ 2.053938388824463,
316
+ 1.7130836248397827,
317
+ 1.6956732273101807,
318
+ 1.6865873336791992,
319
+ 1.6423821449279785,
320
+ 1.5525283813476562,
321
+ 1.5218125581741333,
322
+ 1.5012855529785156,
323
+ 1.4494550228118896,
324
+ 1.4409327507019043,
325
+ 1.4289852380752563,
326
+ 1.3472892045974731,
327
+ 1.3083441257476807,
328
+ 1.222657322883606,
329
+ 1.169271469116211,
330
+ 1.1160770654678345,
331
+ 1.0062527656555176,
332
+ 0.8988392949104309,
333
+ 0.8490362763404846,
334
+ 0.8055242300033569,
335
+ 0.7284933924674988,
336
+ 0.6774646043777466,
337
+ 0.6370565295219421,
338
+ 0.6921612024307251,
339
+ 0.5220155715942383,
340
+ 0.5095745921134949,
341
+ 0.5046005249023438,
342
+ 0.4887305498123169,
343
+ 0.42760345339775085,
344
+ 0.4621652364730835,
345
+ 0.42479458451271057,
346
+ 0.4148903489112854,
347
+ 0.4420093297958374,
348
+ 0.36054351925849915,
349
+ 0.372269868850708,
350
+ 0.3793974816799164,
351
+ 0.2657617926597595,
352
+ 0.21252191066741943,
353
+ 0.19529986381530762,
354
+ 0.26772335171699524,
355
+ 0.21893762052059174,
356
+ 0.17970743775367737,
357
+ 0.14362695813179016,
358
+ 0.16149911284446716,
359
+ 0.11627347767353058,
360
+ 0.08838309347629547,
361
+ 0.10949420928955078,
362
+ 0.12020957469940186,
363
+ 0.10458115488290787,
364
+ 0.1008462905883789,
365
+ 0.11905048042535782,
366
+ 0.07819044589996338,
367
+ 0.08401751518249512,
368
+ 0.09736760705709457,
369
+ 0.02622569352388382,
370
+ 0.050446510314941406,
371
+ 0.04377133026719093,
372
+ 0.03490281105041504,
373
+ 0.04275251552462578
374
  ],
375
  "denoise_loss": [],
376
  "ortho_loss": [
377
+ 0.3635007441043854,
378
+ 0.18963280320167542,
379
+ 0.1485888510942459,
380
+ 0.08869273215532303,
381
+ 0.11493418365716934,
382
+ 0.1571045219898224,
383
+ 0.1805325299501419,
384
+ 0.18971112370491028,
385
+ 0.2055690884590149,
386
+ 0.22591449320316315,
387
+ 0.24307440221309662,
388
+ 0.23882627487182617,
389
+ 0.2520371079444885,
390
+ 0.25790753960609436,
391
+ 0.2630678117275238,
392
+ 0.30761033296585083,
393
+ 0.32258620858192444,
394
+ 0.32426348328590393,
395
+ 0.32995936274528503,
396
+ 0.33315232396125793,
397
+ 0.3359712064266205,
398
+ 0.3444741368293762,
399
+ 0.33952945470809937,
400
+ 0.34313759207725525,
401
+ 0.3325212001800537,
402
+ 0.3572663366794586,
403
+ 0.3538215458393097,
404
+ 0.35331991314888,
405
+ 0.34806913137435913,
406
+ 0.35233867168426514,
407
+ 0.3563789427280426,
408
+ 0.35214924812316895,
409
+ 0.3493984043598175,
410
+ 0.35396456718444824,
411
+ 0.33881986141204834,
412
+ 0.34869933128356934,
413
+ 0.3384784758090973,
414
+ 0.3430968225002289,
415
+ 0.33994701504707336,
416
+ 0.3194200396537781,
417
+ 0.3237259089946747,
418
+ 0.3362569808959961,
419
+ 0.3448117971420288,
420
+ 0.35400205850601196,
421
+ 0.3509945273399353,
422
+ 0.3481496572494507,
423
+ 0.34772804379463196,
424
+ 0.34519320726394653,
425
+ 0.3447076976299286,
426
+ 0.3428516387939453,
427
+ 0.34590229392051697,
428
+ 0.3494682312011719,
429
+ 0.3497217297554016,
430
+ 0.34933948516845703,
431
+ 0.35108673572540283,
432
+ 0.35249459743499756,
433
+ 0.35031837224960327,
434
+ 0.3520292043685913,
435
+ 0.35233885049819946,
436
+ 0.3538017272949219
437
  ],
438
  "lr": [
439
  3.9200000000000004e-05,
 
521
  3083
522
  ],
523
  "eval_accuracy": [
524
+ 0.02,
525
+ 0.02,
526
  0.0,
527
  0.0,
528
  0.0,
 
543
  0.0
544
  ]
545
  },
546
+ "final_accuracy": 0.9561538461538461,
547
  "sft_eval": {
548
  "config": {
549
  "ops": "add_sub",
550
  "K": null,
551
  "mode": "sft",
552
  "n_digits": 6,
553
+ "n_per_split": 100
554
  },
555
  "splits": {
556
  "add_S0": {
557
+ "full_accuracy": 0.98,
558
+ "digit_accuracy": 0.9971428571428571,
559
+ "n_examples": 100,
560
  "per_subtask": {
561
  "SA": {
562
+ "accuracy": 0.996694214876033,
563
+ "count": 605
564
  },
565
  "SS": {
566
+ "accuracy": 1.0,
567
+ "count": 95
568
  }
569
  }
570
  },
571
  "add_S1": {
572
  "full_accuracy": 1.0,
573
  "digit_accuracy": 1.0,
574
+ "n_examples": 100,
575
  "per_subtask": {
576
  "SA": {
577
  "accuracy": 1.0,
578
+ "count": 204
579
  },
580
  "SC": {
581
  "accuracy": 1.0,
582
+ "count": 169
583
  },
584
  "SS": {
585
  "accuracy": 1.0,
586
+ "count": 31
587
  },
588
  "UC": {
589
  "accuracy": 1.0,
590
+ "count": 296
591
  }
592
  }
593
  },
594
  "add_S2": {
595
+ "full_accuracy": 1.0,
596
+ "digit_accuracy": 1.0,
597
+ "n_examples": 100,
598
  "per_subtask": {
599
  "SA": {
600
+ "accuracy": 1.0,
601
+ "count": 163
602
  },
603
  "SC": {
604
+ "accuracy": 1.0,
605
+ "count": 130
606
  },
607
  "SS": {
608
+ "accuracy": 1.0,
609
+ "count": 87
610
  },
611
  "UC": {
612
+ "accuracy": 1.0,
613
+ "count": 203
614
  },
615
  "US": {
616
  "accuracy": 1.0,
617
+ "count": 117
618
  }
619
  }
620
  },
621
  "add_S3": {
622
+ "full_accuracy": 0.93,
623
+ "digit_accuracy": 0.99,
624
+ "n_examples": 100,
625
  "per_subtask": {
626
  "SA": {
627
  "accuracy": 1.0,
628
+ "count": 121
629
  },
630
  "SC": {
631
+ "accuracy": 1.0,
632
+ "count": 121
633
  },
634
  "SS": {
635
  "accuracy": 1.0,
636
+ "count": 49
637
  },
638
  "UC": {
639
+ "accuracy": 0.9623655913978495,
640
+ "count": 186
641
  },
642
  "US": {
643
  "accuracy": 1.0,
644
+ "count": 223
645
  }
646
  }
647
  },
648
  "add_S4": {
649
+ "full_accuracy": 0.71,
650
+ "digit_accuracy": 0.9585714285714285,
651
+ "n_examples": 100,
652
  "per_subtask": {
653
  "SA": {
654
  "accuracy": 1.0,
655
+ "count": 104
656
  },
657
  "SC": {
658
+ "accuracy": 1.0,
659
+ "count": 106
660
  },
661
  "SS": {
662
  "accuracy": 1.0,
663
+ "count": 23
664
  },
665
  "UC": {
666
+ "accuracy": 0.825,
667
+ "count": 160
668
  },
669
  "US": {
670
+ "accuracy": 0.996742671009772,
671
+ "count": 307
672
  }
673
  }
674
  },
675
  "add_S5": {
676
+ "full_accuracy": 0.61,
677
+ "digit_accuracy": 0.91,
678
+ "n_examples": 100,
679
  "per_subtask": {
680
  "SA": {
681
  "accuracy": 1.0,
682
+ "count": 100
683
  },
684
  "SC": {
685
  "accuracy": 1.0,
686
+ "count": 100
687
  },
688
  "UC": {
689
+ "accuracy": 0.71,
690
+ "count": 100
691
  },
692
  "US": {
693
+ "accuracy": 0.915,
694
+ "count": 400
695
  }
696
  }
697
  },
698
  "add_S6": {
699
+ "full_accuracy": 0.83,
700
+ "digit_accuracy": 0.9528571428571428,
701
+ "n_examples": 100,
702
  "per_subtask": {
703
  "SC": {
704
  "accuracy": 1.0,
705
+ "count": 100
706
  },
707
  "UC": {
708
+ "accuracy": 0.93,
709
+ "count": 100
710
  },
711
  "US": {
712
+ "accuracy": 0.948,
713
+ "count": 500
714
  }
715
  }
716
  },
717
  "add_random": {
718
+ "full_accuracy": 0.985,
719
+ "digit_accuracy": 0.9978571428571429,
720
  "n_examples": 200,
721
  "per_subtask": {
722
  "SA": {
723
+ "accuracy": 0.9977628635346756,
724
+ "count": 447
725
  },
726
  "SC": {
727
  "accuracy": 1.0,
728
+ "count": 320
729
  },
730
  "SS": {
731
  "accuracy": 1.0,
732
+ "count": 56
733
  },
734
  "UC": {
735
+ "accuracy": 0.996219281663516,
736
+ "count": 529
737
  },
738
  "US": {
739
  "accuracy": 1.0,
740
+ "count": 48
741
  }
742
  }
743
  },
744
  "add_C1": {
745
  "full_accuracy": 1.0,
746
  "digit_accuracy": 1.0,
747
+ "n_examples": 100,
748
  "per_subtask": {
749
  "SA": {
750
  "accuracy": 1.0,
751
+ "count": 500
752
  },
753
  "SC": {
754
  "accuracy": 1.0,
755
+ "count": 100
756
  },
757
  "UC": {
758
  "accuracy": 1.0,
759
+ "count": 100
760
  }
761
  }
762
  },
763
  "add_C2": {
764
+ "full_accuracy": 0.98,
765
+ "digit_accuracy": 0.9971428571428571,
766
+ "n_examples": 100,
767
  "per_subtask": {
768
  "SA": {
769
  "accuracy": 1.0,
770
+ "count": 400
771
  },
772
  "SC": {
773
  "accuracy": 1.0,
774
+ "count": 100
775
  },
776
  "UC": {
777
+ "accuracy": 0.9871794871794872,
778
+ "count": 156
779
  },
780
  "US": {
781
+ "accuracy": 1.0,
782
+ "count": 44
783
  }
784
  }
785
  },
786
  "add_C3": {
787
+ "full_accuracy": 0.95,
788
+ "digit_accuracy": 0.9928571428571429,
789
+ "n_examples": 100,
790
  "per_subtask": {
791
  "SA": {
792
  "accuracy": 1.0,
793
+ "count": 300
794
  },
795
  "SC": {
796
  "accuracy": 1.0,
797
+ "count": 100
798
  },
799
  "UC": {
800
+ "accuracy": 0.9748743718592965,
801
+ "count": 199
802
  },
803
  "US": {
804
+ "accuracy": 1.0,
805
+ "count": 101
806
  }
807
  }
808
  },
809
  "add_C4": {
810
+ "full_accuracy": 0.94,
811
+ "digit_accuracy": 0.9914285714285714,
812
+ "n_examples": 100,
813
  "per_subtask": {
814
  "SA": {
815
  "accuracy": 1.0,
816
+ "count": 200
817
  },
818
  "SC": {
819
  "accuracy": 1.0,
820
+ "count": 100
821
  },
822
  "UC": {
823
+ "accuracy": 0.9772727272727273,
824
+ "count": 264
825
  },
826
  "US": {
827
+ "accuracy": 1.0,
828
+ "count": 136
829
  }
830
  }
831
  },
832
  "add_C5": {
833
+ "full_accuracy": 0.92,
834
+ "digit_accuracy": 0.9885714285714285,
835
+ "n_examples": 100,
836
  "per_subtask": {
837
  "SA": {
838
  "accuracy": 1.0,
839
+ "count": 100
840
  },
841
  "SC": {
842
  "accuracy": 1.0,
843
+ "count": 100
844
  },
845
  "UC": {
846
+ "accuracy": 0.9741935483870968,
847
+ "count": 310
848
  },
849
  "US": {
850
+ "accuracy": 1.0,
851
+ "count": 190
852
  }
853
  }
854
  },
855
  "add_C6": {
856
+ "full_accuracy": 0.87,
857
+ "digit_accuracy": 0.9771428571428571,
858
+ "n_examples": 100,
859
  "per_subtask": {
860
  "SC": {
861
  "accuracy": 1.0,
862
+ "count": 100
863
  },
864
  "UC": {
865
+ "accuracy": 0.9675675675675676,
866
+ "count": 370
867
  },
868
  "US": {
869
+ "accuracy": 0.9826086956521739,
870
+ "count": 230
871
  }
872
  }
873
  },
874
  "sub_M0": {
875
  "full_accuracy": 1.0,
876
  "digit_accuracy": 1.0,
877
+ "n_examples": 100,
878
  "per_subtask": {
879
  "MD": {
880
  "accuracy": 1.0,
881
+ "count": 615
882
  },
883
  "ME": {
884
  "accuracy": 1.0,
885
+ "count": 85
886
  }
887
  }
888
  },
889
  "sub_M1": {
890
+ "full_accuracy": 0.97,
891
+ "digit_accuracy": 0.9957142857142857,
892
+ "n_examples": 100,
893
  "per_subtask": {
894
  "MD": {
895
+ "accuracy": 0.9931506849315068,
896
+ "count": 292
897
  },
898
  "MB": {
899
+ "accuracy": 1.0,
900
+ "count": 144
901
  },
902
  "ME": {
903
  "accuracy": 1.0,
904
+ "count": 25
905
  },
906
  "UB": {
907
+ "accuracy": 0.99581589958159,
908
+ "count": 239
909
  }
910
  }
911
  },
912
  "sub_M2": {
913
+ "full_accuracy": 1.0,
914
+ "digit_accuracy": 1.0,
915
+ "n_examples": 100,
916
  "per_subtask": {
917
  "MD": {
918
+ "accuracy": 1.0,
919
+ "count": 211
920
  },
921
  "MB": {
922
  "accuracy": 1.0,
923
+ "count": 115
924
  },
925
  "ME": {
926
  "accuracy": 1.0,
927
+ "count": 85
928
  },
929
  "UB": {
930
+ "accuracy": 1.0,
931
+ "count": 181
932
  },
933
  "UD": {
934
+ "accuracy": 1.0,
935
+ "count": 108
936
  }
937
  }
938
  },
939
  "sub_M3": {
940
+ "full_accuracy": 0.92,
941
+ "digit_accuracy": 0.9885714285714285,
942
+ "n_examples": 100,
943
  "per_subtask": {
944
  "MD": {
945
  "accuracy": 1.0,
946
+ "count": 179
947
  },
948
  "MB": {
949
  "accuracy": 1.0,
950
+ "count": 103
951
  },
952
  "ME": {
953
  "accuracy": 1.0,
954
+ "count": 56
955
  },
956
  "UB": {
957
+ "accuracy": 0.9463087248322147,
958
+ "count": 149
959
  },
960
  "UD": {
961
+ "accuracy": 1.0,
962
+ "count": 213
963
  }
964
  }
965
  },
966
  "sub_M4": {
967
+ "full_accuracy": 0.3,
968
+ "digit_accuracy": 0.8814285714285715,
969
+ "n_examples": 100,
970
  "per_subtask": {
971
  "MD": {
972
  "accuracy": 1.0,
973
+ "count": 200
974
  },
975
  "MB": {
976
  "accuracy": 1.0,
977
+ "count": 100
978
  },
979
  "UB": {
980
+ "accuracy": 0.33,
981
+ "count": 100
982
  },
983
  "UD": {
984
+ "accuracy": 0.9466666666666667,
985
+ "count": 300
986
  }
987
  }
988
  },
989
  "sub_M5": {
990
+ "full_accuracy": 0.04,
991
+ "digit_accuracy": 0.7242857142857143,
992
+ "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
995
  "accuracy": 1.0,
996
+ "count": 100
997
  },
998
  "MB": {
999
  "accuracy": 1.0,
1000
+ "count": 100
1001
  },
1002
  "UB": {
1003
+ "accuracy": 0.43,
1004
+ "count": 100
1005
  },
1006
  "UD": {
1007
+ "accuracy": 0.66,
1008
+ "count": 400
1009
  }
1010
  }
1011
  },
1012
  "sub_random": {
1013
+ "full_accuracy": 0.985,
1014
+ "digit_accuracy": 0.9978571428571429,
1015
  "n_examples": 200,
1016
  "per_subtask": {
1017
  "MD": {
1018
+ "accuracy": 0.9966666666666667,
1019
+ "count": 600
1020
  },
1021
  "MB": {
1022
  "accuracy": 1.0,
1023
+ "count": 267
1024
  },
1025
  "ME": {
1026
  "accuracy": 1.0,
1027
  "count": 53
1028
  },
1029
  "UB": {
1030
+ "accuracy": 0.9977220956719818,
1031
+ "count": 439
1032
  },
1033
  "UD": {
1034
+ "accuracy": 1.0,
1035
+ "count": 41
1036
  }
1037
  }
1038
  },
1039
  "sub_B3": {
1040
+ "full_accuracy": 0.96,
1041
+ "digit_accuracy": 0.9942857142857143,
1042
+ "n_examples": 100,
1043
  "per_subtask": {
1044
  "MD": {
1045
  "accuracy": 1.0,
1046
+ "count": 300
1047
  },
1048
  "MB": {
1049
  "accuracy": 1.0,
1050
+ "count": 100
1051
  },
1052
  "UB": {
1053
+ "accuracy": 0.9796954314720813,
1054
+ "count": 197
1055
  },
1056
  "UD": {
1057
  "accuracy": 1.0,
1058
+ "count": 103
1059
  }
1060
  }
1061
  },
1062
  "sub_B4": {
1063
+ "full_accuracy": 0.87,
1064
+ "digit_accuracy": 0.9771428571428571,
1065
+ "n_examples": 100,
1066
  "per_subtask": {
1067
  "MD": {
1068
  "accuracy": 1.0,
1069
+ "count": 200
1070
  },
1071
  "MB": {
1072
  "accuracy": 1.0,
1073
+ "count": 100
1074
  },
1075
  "UB": {
1076
+ "accuracy": 0.951417004048583,
1077
+ "count": 247
1078
  },
1079
  "UD": {
1080
+ "accuracy": 0.9738562091503268,
1081
+ "count": 153
1082
  }
1083
  }
1084
  },
1085
  "sub_B5": {
1086
+ "full_accuracy": 0.9,
1087
+ "digit_accuracy": 0.9757142857142858,
1088
+ "n_examples": 100,
1089
  "per_subtask": {
1090
  "MD": {
1091
  "accuracy": 1.0,
1092
+ "count": 100
1093
  },
1094
  "MB": {
1095
  "accuracy": 1.0,
1096
+ "count": 100
1097
  },
1098
  "UB": {
1099
+ "accuracy": 0.9731543624161074,
1100
+ "count": 298
1101
  },
1102
  "UD": {
1103
+ "accuracy": 0.9554455445544554,
1104
+ "count": 202
1105
  }
1106
  }
1107
  }
1108
  },
1109
  "summary": {
1110
+ "overall_accuracy": 0.87,
1111
+ "digit_accuracy": 0.9724175824175825,
1112
+ "total_examples": 2600,
1113
  "n_splits": 24
1114
  }
1115
  },
 
1119
  "K": 1,
1120
  "mode": "sorl",
1121
  "n_digits": 6,
1122
+ "n_per_split": 100
1123
  },
1124
  "splits": {
1125
  "add_S0": {
1126
  "full_accuracy": 1.0,
1127
  "digit_accuracy": 1.0,
1128
+ "n_examples": 100,
1129
  "per_subtask": {
1130
  "SA": {
1131
  "accuracy": 1.0,
1132
+ "count": 605
1133
  },
1134
  "SS": {
1135
  "accuracy": 1.0,
1136
+ "count": 95
1137
  }
1138
  }
1139
  },
1140
  "add_S1": {
1141
  "full_accuracy": 1.0,
1142
  "digit_accuracy": 1.0,
1143
+ "n_examples": 100,
1144
  "per_subtask": {
1145
  "SA": {
1146
  "accuracy": 1.0,
1147
+ "count": 204
1148
  },
1149
  "SC": {
1150
  "accuracy": 1.0,
1151
+ "count": 169
1152
  },
1153
  "SS": {
1154
  "accuracy": 1.0,
1155
+ "count": 31
1156
  },
1157
  "UC": {
1158
  "accuracy": 1.0,
1159
+ "count": 296
1160
  }
1161
  }
1162
  },
1163
  "add_S2": {
1164
  "full_accuracy": 1.0,
1165
  "digit_accuracy": 1.0,
1166
+ "n_examples": 100,
1167
  "per_subtask": {
1168
  "SA": {
1169
  "accuracy": 1.0,
1170
+ "count": 163
1171
  },
1172
  "SC": {
1173
  "accuracy": 1.0,
1174
+ "count": 130
1175
  },
1176
  "SS": {
1177
  "accuracy": 1.0,
1178
+ "count": 87
1179
  },
1180
  "UC": {
1181
  "accuracy": 1.0,
1182
+ "count": 203
1183
  },
1184
  "US": {
1185
  "accuracy": 1.0,
1186
+ "count": 117
1187
  }
1188
  }
1189
  },
1190
  "add_S3": {
1191
  "full_accuracy": 1.0,
1192
  "digit_accuracy": 1.0,
1193
+ "n_examples": 100,
1194
  "per_subtask": {
1195
  "SA": {
1196
  "accuracy": 1.0,
1197
+ "count": 121
1198
  },
1199
  "SC": {
1200
  "accuracy": 1.0,
1201
+ "count": 121
1202
  },
1203
  "SS": {
1204
  "accuracy": 1.0,
1205
+ "count": 49
1206
  },
1207
  "UC": {
1208
  "accuracy": 1.0,
1209
+ "count": 186
1210
  },
1211
  "US": {
1212
  "accuracy": 1.0,
1213
+ "count": 223
1214
  }
1215
  }
1216
  },
1217
  "add_S4": {
1218
  "full_accuracy": 1.0,
1219
  "digit_accuracy": 1.0,
1220
+ "n_examples": 100,
1221
  "per_subtask": {
1222
  "SA": {
1223
  "accuracy": 1.0,
1224
+ "count": 104
1225
  },
1226
  "SC": {
1227
  "accuracy": 1.0,
1228
+ "count": 106
1229
  },
1230
  "SS": {
1231
  "accuracy": 1.0,
1232
+ "count": 23
1233
  },
1234
  "UC": {
1235
  "accuracy": 1.0,
1236
+ "count": 160
1237
  },
1238
  "US": {
1239
  "accuracy": 1.0,
1240
+ "count": 307
1241
  }
1242
  }
1243
  },
1244
  "add_S5": {
1245
+ "full_accuracy": 0.61,
1246
+ "digit_accuracy": 0.9442857142857143,
1247
+ "n_examples": 100,
1248
  "per_subtask": {
1249
  "SA": {
1250
  "accuracy": 1.0,
1251
+ "count": 100
1252
  },
1253
  "SC": {
1254
  "accuracy": 1.0,
1255
+ "count": 100
1256
  },
1257
  "UC": {
1258
+ "accuracy": 0.61,
1259
+ "count": 100
1260
  },
1261
  "US": {
1262
  "accuracy": 1.0,
1263
+ "count": 400
1264
  }
1265
  }
1266
  },
1267
  "add_S6": {
1268
+ "full_accuracy": 0.96,
1269
+ "digit_accuracy": 0.9928571428571429,
1270
+ "n_examples": 100,
1271
  "per_subtask": {
1272
  "SC": {
1273
  "accuracy": 1.0,
1274
+ "count": 100
1275
  },
1276
  "UC": {
1277
+ "accuracy": 0.99,
1278
+ "count": 100
1279
  },
1280
  "US": {
1281
+ "accuracy": 0.992,
1282
+ "count": 500
1283
  }
1284
  }
1285
  },
 
1290
  "per_subtask": {
1291
  "SA": {
1292
  "accuracy": 1.0,
1293
+ "count": 447
1294
  },
1295
  "SC": {
1296
  "accuracy": 1.0,
1297
+ "count": 320
1298
  },
1299
  "SS": {
1300
  "accuracy": 1.0,
1301
+ "count": 56
1302
  },
1303
  "UC": {
1304
  "accuracy": 1.0,
1305
+ "count": 529
1306
  },
1307
  "US": {
1308
  "accuracy": 1.0,
1309
+ "count": 48
1310
  }
1311
  }
1312
  },
1313
  "add_C1": {
1314
  "full_accuracy": 1.0,
1315
  "digit_accuracy": 1.0,
1316
+ "n_examples": 100,
1317
  "per_subtask": {
1318
  "SA": {
1319
  "accuracy": 1.0,
1320
+ "count": 500
1321
  },
1322
  "SC": {
1323
  "accuracy": 1.0,
1324
+ "count": 100
1325
  },
1326
  "UC": {
1327
  "accuracy": 1.0,
1328
+ "count": 100
1329
  }
1330
  }
1331
  },
1332
  "add_C2": {
1333
  "full_accuracy": 1.0,
1334
  "digit_accuracy": 1.0,
1335
+ "n_examples": 100,
1336
  "per_subtask": {
1337
  "SA": {
1338
  "accuracy": 1.0,
1339
+ "count": 400
1340
  },
1341
  "SC": {
1342
  "accuracy": 1.0,
1343
+ "count": 100
1344
  },
1345
  "UC": {
1346
  "accuracy": 1.0,
1347
+ "count": 156
1348
  },
1349
  "US": {
1350
  "accuracy": 1.0,
1351
+ "count": 44
1352
  }
1353
  }
1354
  },
1355
  "add_C3": {
1356
  "full_accuracy": 1.0,
1357
  "digit_accuracy": 1.0,
1358
+ "n_examples": 100,
1359
  "per_subtask": {
1360
  "SA": {
1361
  "accuracy": 1.0,
1362
+ "count": 300
1363
  },
1364
  "SC": {
1365
  "accuracy": 1.0,
1366
+ "count": 100
1367
  },
1368
  "UC": {
1369
  "accuracy": 1.0,
1370
+ "count": 199
1371
  },
1372
  "US": {
1373
  "accuracy": 1.0,
1374
+ "count": 101
1375
  }
1376
  }
1377
  },
1378
  "add_C4": {
1379
+ "full_accuracy": 0.99,
1380
+ "digit_accuracy": 0.9985714285714286,
1381
+ "n_examples": 100,
1382
  "per_subtask": {
1383
  "SA": {
1384
  "accuracy": 1.0,
1385
+ "count": 200
1386
  },
1387
  "SC": {
1388
  "accuracy": 1.0,
1389
+ "count": 100
1390
  },
1391
  "UC": {
1392
+ "accuracy": 0.9962121212121212,
1393
+ "count": 264
1394
  },
1395
  "US": {
1396
  "accuracy": 1.0,
1397
+ "count": 136
1398
  }
1399
  }
1400
  },
1401
  "add_C5": {
1402
+ "full_accuracy": 0.99,
1403
+ "digit_accuracy": 0.9985714285714286,
1404
+ "n_examples": 100,
1405
  "per_subtask": {
1406
  "SA": {
1407
  "accuracy": 1.0,
1408
+ "count": 100
1409
  },
1410
  "SC": {
1411
  "accuracy": 1.0,
1412
+ "count": 100
1413
  },
1414
  "UC": {
1415
+ "accuracy": 0.9967741935483871,
1416
+ "count": 310
1417
  },
1418
  "US": {
1419
  "accuracy": 1.0,
1420
+ "count": 190
1421
  }
1422
  }
1423
  },
1424
  "add_C6": {
1425
+ "full_accuracy": 0.96,
1426
+ "digit_accuracy": 0.9942857142857143,
1427
+ "n_examples": 100,
1428
  "per_subtask": {
1429
  "SC": {
1430
  "accuracy": 1.0,
1431
+ "count": 100
1432
  },
1433
  "UC": {
1434
+ "accuracy": 0.9891891891891892,
1435
+ "count": 370
1436
  },
1437
  "US": {
1438
  "accuracy": 1.0,
1439
+ "count": 230
1440
  }
1441
  }
1442
  },
1443
  "sub_M0": {
1444
  "full_accuracy": 1.0,
1445
  "digit_accuracy": 1.0,
1446
+ "n_examples": 100,
1447
  "per_subtask": {
1448
  "MD": {
1449
  "accuracy": 1.0,
1450
+ "count": 615
1451
  },
1452
  "ME": {
1453
  "accuracy": 1.0,
1454
+ "count": 85
1455
  }
1456
  }
1457
  },
1458
  "sub_M1": {
1459
  "full_accuracy": 1.0,
1460
  "digit_accuracy": 1.0,
1461
+ "n_examples": 100,
1462
  "per_subtask": {
1463
  "MD": {
1464
  "accuracy": 1.0,
1465
+ "count": 292
1466
  },
1467
  "MB": {
1468
  "accuracy": 1.0,
1469
+ "count": 144
1470
  },
1471
  "ME": {
1472
  "accuracy": 1.0,
1473
+ "count": 25
1474
  },
1475
  "UB": {
1476
  "accuracy": 1.0,
1477
+ "count": 239
1478
  }
1479
  }
1480
  },
1481
  "sub_M2": {
1482
  "full_accuracy": 1.0,
1483
  "digit_accuracy": 1.0,
1484
+ "n_examples": 100,
1485
  "per_subtask": {
1486
  "MD": {
1487
  "accuracy": 1.0,
1488
+ "count": 211
1489
  },
1490
  "MB": {
1491
  "accuracy": 1.0,
1492
+ "count": 115
1493
  },
1494
  "ME": {
1495
  "accuracy": 1.0,
1496
+ "count": 85
1497
  },
1498
  "UB": {
1499
  "accuracy": 1.0,
1500
+ "count": 181
1501
  },
1502
  "UD": {
1503
  "accuracy": 1.0,
1504
+ "count": 108
1505
  }
1506
  }
1507
  },
1508
  "sub_M3": {
1509
  "full_accuracy": 1.0,
1510
  "digit_accuracy": 1.0,
1511
+ "n_examples": 100,
1512
  "per_subtask": {
1513
  "MD": {
1514
  "accuracy": 1.0,
1515
+ "count": 179
1516
  },
1517
  "MB": {
1518
  "accuracy": 1.0,
1519
+ "count": 103
1520
  },
1521
  "ME": {
1522
  "accuracy": 1.0,
1523
+ "count": 56
1524
  },
1525
  "UB": {
1526
  "accuracy": 1.0,
1527
+ "count": 149
1528
  },
1529
  "UD": {
1530
  "accuracy": 1.0,
1531
+ "count": 213
1532
  }
1533
  }
1534
  },
1535
  "sub_M4": {
1536
  "full_accuracy": 1.0,
1537
  "digit_accuracy": 1.0,
1538
+ "n_examples": 100,
1539
  "per_subtask": {
1540
  "MD": {
1541
  "accuracy": 1.0,
1542
+ "count": 200
1543
  },
1544
  "MB": {
1545
  "accuracy": 1.0,
1546
+ "count": 100
1547
  },
1548
  "UB": {
1549
  "accuracy": 1.0,
1550
+ "count": 100
1551
  },
1552
  "UD": {
1553
  "accuracy": 1.0,
1554
+ "count": 300
1555
  }
1556
  }
1557
  },
1558
  "sub_M5": {
1559
+ "full_accuracy": 0.39,
1560
+ "digit_accuracy": 0.9128571428571428,
1561
+ "n_examples": 100,
1562
  "per_subtask": {
1563
  "MD": {
1564
  "accuracy": 1.0,
1565
+ "count": 100
1566
  },
1567
  "MB": {
1568
  "accuracy": 1.0,
1569
+ "count": 100
1570
  },
1571
  "UB": {
1572
+ "accuracy": 0.39,
1573
+ "count": 100
1574
  },
1575
  "UD": {
1576
  "accuracy": 1.0,
1577
+ "count": 400
1578
  }
1579
  }
1580
  },
 
1585
  "per_subtask": {
1586
  "MD": {
1587
  "accuracy": 1.0,
1588
+ "count": 600
1589
  },
1590
  "MB": {
1591
  "accuracy": 1.0,
1592
+ "count": 267
1593
  },
1594
  "ME": {
1595
  "accuracy": 1.0,
 
1597
  },
1598
  "UB": {
1599
  "accuracy": 1.0,
1600
+ "count": 439
1601
  },
1602
  "UD": {
1603
  "accuracy": 1.0,
1604
+ "count": 41
1605
  }
1606
  }
1607
  },
1608
  "sub_B3": {
1609
  "full_accuracy": 1.0,
1610
  "digit_accuracy": 1.0,
1611
+ "n_examples": 100,
1612
  "per_subtask": {
1613
  "MD": {
1614
  "accuracy": 1.0,
1615
+ "count": 300
1616
  },
1617
  "MB": {
1618
  "accuracy": 1.0,
1619
+ "count": 100
1620
  },
1621
  "UB": {
1622
  "accuracy": 1.0,
1623
+ "count": 197
1624
  },
1625
  "UD": {
1626
  "accuracy": 1.0,
1627
+ "count": 103
1628
  }
1629
  }
1630
  },
1631
  "sub_B4": {
1632
  "full_accuracy": 1.0,
1633
  "digit_accuracy": 1.0,
1634
+ "n_examples": 100,
1635
  "per_subtask": {
1636
  "MD": {
1637
  "accuracy": 1.0,
1638
+ "count": 200
1639
  },
1640
  "MB": {
1641
  "accuracy": 1.0,
1642
+ "count": 100
1643
  },
1644
  "UB": {
1645
  "accuracy": 1.0,
1646
+ "count": 247
1647
  },
1648
  "UD": {
1649
  "accuracy": 1.0,
1650
+ "count": 153
1651
  }
1652
  }
1653
  },
1654
  "sub_B5": {
1655
+ "full_accuracy": 0.96,
1656
+ "digit_accuracy": 0.9942857142857143,
1657
+ "n_examples": 100,
1658
  "per_subtask": {
1659
  "MD": {
1660
  "accuracy": 1.0,
1661
+ "count": 100
1662
  },
1663
  "MB": {
1664
  "accuracy": 1.0,
1665
+ "count": 100
1666
  },
1667
  "UB": {
1668
+ "accuracy": 0.9865771812080537,
1669
+ "count": 298
1670
  },
1671
  "UD": {
1672
  "accuracy": 1.0,
1673
+ "count": 202
1674
  }
1675
  }
1676
  }
1677
  },
1678
  "summary": {
1679
+ "overall_accuracy": 0.9561538461538461,
1680
+ "digit_accuracy": 0.9936813186813187,
1681
+ "total_examples": 2600,
1682
  "n_splits": 24
1683
  }
1684
  },
1685
+ "sorl_overall_accuracy": 0.9561538461538461,
1686
+ "sft_overall_accuracy": 0.87
1687
  }
add_sub_sorl_v1_abs10_K1_10K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82f05b0b5afcd6ea5dc437d2363128885fcfd080a9b715032b42b76a2c61eb2d
3
  size 650303660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47ed228ea959d64128647e608261362d573f06e45dfd07c02259100cb8257aed
3
  size 650303660
add_sub_sorl_v1_abs10_K1_10K/train_config.json CHANGED
@@ -36,7 +36,7 @@
36
  "eval_every": 156,
37
  "save_every": 999999,
38
  "eval_samples": 100,
39
- "output_dir": "ckpt/sweep/as_sorl_abs10_K1_10K",
40
  "eval_K": 4,
41
  "alpha_traj": 0.0,
42
  "corrupt_method": "shuffle",
@@ -69,16 +69,21 @@
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_10K",
72
- "git_commit": "57deaa28d9c21e39ddac5ef448d6e1be992fba91",
73
- "timestamp": "2026-04-13T10:55:46.960375+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "0r6snjo4",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/0r6snjo4",
81
- "final_accuracy": 0.9470833333333334,
82
- "sft_accuracy": 0.7258333333333333,
 
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
36
  "eval_every": 156,
37
  "save_every": 999999,
38
  "eval_samples": 100,
39
+ "output_dir": "ckpt/sweep/as_sorl_abs10_K1_10K_2L3H510d",
40
  "eval_K": 4,
41
  "alpha_traj": 0.0,
42
  "corrupt_method": "shuffle",
 
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_10K",
72
+ "git_commit": "f835493c19eb98267697007042c9d440cad2afbb",
73
+ "timestamp": "2026-04-16T04:12:01.606714+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_10K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "v1",
80
+ "wandb_run_id": "lb17t8m0",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/lb17t8m0",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "config_hash": "5637ae880e02",
86
+ "final_accuracy": 0.9561538461538461,
87
+ "sft_accuracy": 0.87,
88
  "eval_method": "ArithmeticEvaluator"
89
  }