amirali1985 commited on
Commit
2ab001f
·
verified ·
1 Parent(s): e0a4b24

Upload add_sub_sorl_v1_abs10_K1_25K

Browse files
add_sub_sorl_v1_abs10_K1_25K/metrics.json CHANGED
@@ -73,509 +73,509 @@
73
  3869
74
  ],
75
  "loss": [
76
- 8.440483093261719,
77
- 3.670170307159424,
78
- 3.2093284130096436,
79
- 3.1004648208618164,
80
- 3.070091724395752,
81
- 3.0389091968536377,
82
- 2.5894856452941895,
83
- 0.7675281763076782,
84
- -3.0276317596435547,
85
- -4.935643196105957,
86
- -4.584414958953857,
87
- -2.691551923751831,
88
- -1.9160187244415283,
89
- -1.2295455932617188,
90
- -1.3427369594573975,
91
- -1.812907099723816,
92
- -1.9342525005340576,
93
- -1.293236255645752,
94
- -1.3251506090164185,
95
- -1.4479036331176758,
96
- -1.5105352401733398,
97
- -2.014406204223633,
98
- -1.9828301668167114,
99
- -1.7031790018081665,
100
- -2.5977749824523926,
101
- -2.3531222343444824,
102
- -2.0115716457366943,
103
- -1.536534070968628,
104
- -1.4918975830078125,
105
- -1.0497915744781494,
106
- -1.3855948448181152,
107
- -1.9143116474151611,
108
- -1.2018264532089233,
109
- -0.9971982836723328,
110
- -1.4220505952835083,
111
- -0.9136620759963989,
112
- -0.9068200588226318,
113
- -1.1527667045593262,
114
- -0.6265409588813782,
115
- -0.8574066162109375,
116
- -0.42161595821380615,
117
- -0.434919536113739,
118
- -0.4632154107093811,
119
- -0.5082250237464905,
120
- -0.5800994634628296,
121
- -0.789692759513855,
122
- -0.2702712416648865,
123
- -0.4394497275352478,
124
- -0.5490878820419312,
125
- -0.5185199975967407,
126
- -0.32845696806907654,
127
- -0.2897554337978363,
128
- -0.39504489302635193,
129
- -0.27258846163749695,
130
- -0.25301221013069153,
131
- -0.08269159495830536,
132
- -0.18398964405059814,
133
- -0.23646067082881927,
134
- -0.10407372564077377,
135
- -0.09299999475479126,
136
- -0.08140064775943756,
137
- -0.09688820689916611,
138
- -0.05483222380280495,
139
- -0.03655329346656799,
140
- -0.05594222992658615,
141
- -0.008253187872469425,
142
- -0.09734220057725906,
143
- -0.05176986753940582,
144
- -0.05214906856417656,
145
- -0.07431081682443619
146
  ],
147
  "base_loss": [
148
- 6.488893985748291,
149
- 2.3548998832702637,
150
- 1.9527490139007568,
151
- 1.8523368835449219,
152
- 1.8802011013031006,
153
- 1.8292043209075928,
154
- 1.8652232885360718,
155
- 1.815848469734192,
156
- 1.848860740661621,
157
- 1.741455316543579,
158
- 1.3914107084274292,
159
- 1.068152666091919,
160
- 0.9446887969970703,
161
- 0.7281270623207092,
162
- 0.599748432636261,
163
- 0.5804276466369629,
164
- 0.5249930024147034,
165
- 0.454721599817276,
166
- 0.3896333873271942,
167
- 0.32255008816719055,
168
- 0.34916767477989197,
169
- 0.32530543208122253,
170
- 0.2995271384716034,
171
- 0.2648261487483978,
172
- 0.3412800133228302,
173
- 0.28785938024520874,
174
- 0.24230030179023743,
175
- 0.19102659821510315,
176
- 0.17416465282440186,
177
- 0.1800290048122406,
178
- 0.1682046502828598,
179
- 0.24820947647094727,
180
- 0.14416953921318054,
181
- 0.11789846420288086,
182
- 0.16254489123821259,
183
- 0.11386360228061676,
184
- 0.12063971906900406,
185
- 0.13949914276599884,
186
- 0.07232778519392014,
187
- 0.10645522177219391,
188
- 0.04950258880853653,
189
- 0.05214657261967659,
190
- 0.0541020967066288,
191
- 0.06009664013981819,
192
- 0.07329648733139038,
193
- 0.09814182668924332,
194
- 0.031828977167606354,
195
- 0.05065033212304115,
196
- 0.06576144695281982,
197
- 0.06058066338300705,
198
- 0.037494949996471405,
199
- 0.03307747095823288,
200
- 0.04483410716056824,
201
- 0.03311517834663391,
202
- 0.030032988637685776,
203
- 0.00985543243587017,
204
- 0.02162509225308895,
205
- 0.02781860902905464,
206
- 0.012631254270672798,
207
- 0.011143012903630733,
208
- 0.009497015736997128,
209
- 0.01135705504566431,
210
- 0.008042754605412483,
211
- 0.005164582747966051,
212
- 0.006489872932434082,
213
- 0.0014688527444377542,
214
- 0.011420228518545628,
215
- 0.006360507570207119,
216
- 0.006299526430666447,
217
- 0.008574886247515678
218
  ],
219
  "info_loss": [
220
- -0.2627429962158203,
221
- -0.0675959587097168,
222
- -0.06391513347625732,
223
- -0.06318938732147217,
224
- -0.068672776222229,
225
- -0.06626904010772705,
226
- -0.11490964889526367,
227
- -0.29147613048553467,
228
- -0.6731091737747192,
229
- -0.8447501063346863,
230
- -0.7606219053268433,
231
- -0.516476035118103,
232
- -0.41259610652923584,
233
- -0.31304433941841125,
234
- -0.2955251634120941,
235
- -0.3318001329898834,
236
- -0.33346521854400635,
237
- -0.25125789642333984,
238
- -0.23548583686351776,
239
- -0.21368299424648285,
240
- -0.20429086685180664,
241
- -0.24210147559642792,
242
- -0.23295104503631592,
243
- -0.19994628429412842,
244
- -0.2960708439350128,
245
- -0.2660777270793915,
246
- -0.22968997061252594,
247
- -0.17489448189735413,
248
- -0.16804951429367065,
249
- -0.12430460005998611,
250
- -0.15695549547672272,
251
- -0.2182704359292984,
252
- -0.13539204001426697,
253
- -0.11300598084926605,
254
- -0.15940292179584503,
255
- -0.10396484285593033,
256
- -0.10412997007369995,
257
- -0.13012480735778809,
258
- -0.07035745680332184,
259
- -0.09725693613290787,
260
- -0.0479651540517807,
261
- -0.04964497312903404,
262
- -0.05318129435181618,
263
- -0.05739910155534744,
264
- -0.06728154420852661,
265
- -0.09058966487646103,
266
- -0.03128577023744583,
267
- -0.04990841820836067,
268
- -0.06233469396829605,
269
- -0.05823947489261627,
270
- -0.03711993992328644,
271
- -0.03283752128481865,
272
- -0.04459574073553085,
273
- -0.031027397140860558,
274
- -0.028812875971198082,
275
- -0.009709836915135384,
276
- -0.021456001326441765,
277
- -0.02661510370671749,
278
- -0.012435774318873882,
279
- -0.011000837199389935,
280
- -0.009408645331859589,
281
- -0.011241589672863483,
282
- -0.006964982487261295,
283
- -0.005082710646092892,
284
- -0.006405099760740995,
285
- -0.0013885911321267486,
286
- -0.011316562071442604,
287
- -0.006281006615608931,
288
- -0.006225514691323042,
289
- -0.008512690663337708
290
  ],
291
  "abs_loss": [
292
- 2.062779664993286,
293
- 1.8660074472427368,
294
- 1.8702964782714844,
295
- 1.8314086198806763,
296
- 1.8298479318618774,
297
- 1.8219594955444336,
298
- 1.854802131652832,
299
- 1.825403094291687,
300
- 1.8147176504135132,
301
- 1.718223214149475,
302
- 1.5588526725769043,
303
- 1.2659032344818115,
304
- 1.0050292015075684,
305
- 0.8381269574165344,
306
- 0.7240123748779297,
307
- 0.6084690690040588,
308
- 0.56196129322052,
309
- 0.43736353516578674,
310
- 0.4195970594882965,
311
- 0.3356196880340576,
312
- 0.3217657208442688,
313
- 0.18604372441768646,
314
- 0.1829766184091568,
315
- 0.15309858322143555,
316
- 0.12250169366598129,
317
- 0.09407112747430801,
318
- 0.11580777168273926,
319
- 0.09737056493759155,
320
- 0.06344643235206604,
321
- 0.06409666687250137,
322
- 0.06125688925385475,
323
- 0.04178151860833168,
324
- 0.03453303501009941,
325
- 0.045983314514160156,
326
- 0.0417625792324543,
327
- 0.037050385028123856,
328
- 0.03088521957397461,
329
- 0.01876666396856308,
330
- 0.02520829439163208,
331
- 0.032921046018600464,
332
- 0.02231394685804844,
333
- 0.013566468842327595,
334
- 0.01799575798213482,
335
- 0.020950788632035255,
336
- 0.025772780179977417,
337
- 0.014466878958046436,
338
- 0.010489404201507568,
339
- 0.007517005782574415,
340
- 0.012727208435535431,
341
- 0.010788698680698872,
342
- 0.012184095568954945,
343
- 0.009171529673039913,
344
- 0.003999281208962202,
345
- 0.00763775035738945,
346
- 0.008048650808632374,
347
- 0.007546226028352976,
348
- 0.04280756413936615,
349
- 0.004729445558041334,
350
- 0.0336616076529026,
351
- 0.003961015492677689,
352
- 0.003912051673978567,
353
- 0.002861393615603447,
354
- 0.006178874522447586,
355
- 0.002068899804726243,
356
- 0.004654192831367254,
357
- 0.004825535695999861,
358
- 0.0032716484274715185,
359
- 0.004482524935156107,
360
- 0.004428912419825792,
361
- 0.004628469236195087
362
  ],
363
  "zipf_loss": [
364
- 4.37274169921875,
365
- 1.8046292066574097,
366
- 1.7087011337280273,
367
- 1.6968810558319092,
368
- 1.6936335563659668,
369
- 1.690199375152588,
370
- 1.6878787279129028,
371
- 1.6839007139205933,
372
- 1.6731270551681519,
373
- 1.5985805988311768,
374
- 1.4745081663131714,
375
- 1.278465986251831,
376
- 1.1647508144378662,
377
- 1.0889580249786377,
378
- 0.9403650760650635,
379
- 0.8638197183609009,
380
- 0.8192105293273926,
381
- 0.7208845019340515,
382
- 0.5981146097183228,
383
- 0.33281421661376953,
384
- 0.15102912485599518,
385
- 0.06269875168800354,
386
- 0.02885546162724495,
387
- 0.016147876158356667,
388
- 0.009403230622410774,
389
- 0.010388587601482868,
390
- 0.031447142362594604,
391
- 0.011647149920463562,
392
- 0.008088257163763046,
393
- 0.006815792992711067,
394
- 0.009629786014556885,
395
- 0.01600518450140953,
396
- 0.004471144638955593,
397
- 0.010364695452153683,
398
- 0.00525753665715456,
399
- 0.008417721837759018,
400
- 0.010751370340585709,
401
- 0.007105482742190361,
402
- 0.0021849004551768303,
403
- 0.005415467079728842,
404
- 0.006301610730588436,
405
- 0.00802694447338581,
406
- 0.012695873156189919,
407
- 0.0035742726176977158,
408
- 0.016842156648635864,
409
- 0.01661541312932968,
410
- 0.009708529338240623,
411
- 0.008232427760958672,
412
- 0.007224865257740021,
413
- 0.002215202199295163,
414
- 0.004029060248285532,
415
- 0.004625140223652124,
416
- 0.005678504705429077,
417
- 0.0038065649569034576,
418
- 0.004278676584362984,
419
- 0.0037967078387737274,
420
- 0.0046645235270261765,
421
- 0.0013988213613629341,
422
- 0.004286607727408409,
423
- 0.0054692612029612064,
424
- 0.002797577530145645,
425
- 0.0038844943046569824,
426
- 0.006156958639621735,
427
- 0.00890234112739563,
428
- 0.0011534709483385086,
429
- 0.0036813169717788696,
430
- 0.0040760282427072525,
431
- 0.004231432918459177,
432
- 0.003363661468029022,
433
- 0.0017783557996153831
434
  ],
435
  "denoise_loss": [],
436
  "ortho_loss": [
437
- 0.3933292031288147,
438
- 0.19537021219730377,
439
- 0.1488562375307083,
440
- 0.13526654243469238,
441
- 0.1290234625339508,
442
- 0.12106598168611526,
443
- 0.13526804745197296,
444
- 0.1508757472038269,
445
- 0.21096870303153992,
446
- 0.2514957785606384,
447
- 0.27831581234931946,
448
- 0.2901284098625183,
449
- 0.29795151948928833,
450
- 0.32490643858909607,
451
- 0.3221665322780609,
452
- 0.3283940255641937,
453
- 0.3153337240219116,
454
- 0.3107602894306183,
455
- 0.30619168281555176,
456
- 0.3000865578651428,
457
- 0.29851052165031433,
458
- 0.29726409912109375,
459
- 0.2884874641895294,
460
- 0.28946927189826965,
461
- 0.27607646584510803,
462
- 0.28382158279418945,
463
- 0.28244829177856445,
464
- 0.29492366313934326,
465
- 0.28803184628486633,
466
- 0.2858043313026428,
467
- 0.27292492985725403,
468
- 0.2727271318435669,
469
- 0.27270883321762085,
470
- 0.277759850025177,
471
- 0.28240156173706055,
472
- 0.2703619599342346,
473
- 0.2714839279651642,
474
- 0.27064526081085205,
475
- 0.2735464572906494,
476
- 0.2756211757659912,
477
- 0.28339874744415283,
478
- 0.27706512808799744,
479
- 0.2764289081096649,
480
- 0.2683185040950775,
481
- 0.2532169818878174,
482
- 0.24946703016757965,
483
- 0.2466099113225937,
484
- 0.2510632872581482,
485
- 0.2696925401687622,
486
- 0.2670671343803406,
487
- 0.2715094983577728,
488
- 0.26723748445510864,
489
- 0.26079726219177246,
490
- 0.2603513300418854,
491
- 0.25771471858024597,
492
- 0.25748154520988464,
493
- 0.2570980489253998,
494
- 0.2605780363082886,
495
- 0.26496967673301697,
496
- 0.2605254054069519,
497
- 0.25853437185287476,
498
- 0.2556053102016449,
499
- 0.24565964937210083,
500
- 0.2401438057422638,
501
- 0.23765115439891815,
502
- 0.23894348740577698,
503
- 0.23828937113285065,
504
- 0.23862119019031525,
505
- 0.23789578676223755,
506
- 0.23580653965473175
507
  ],
508
  "lr": [
509
- 7.840000000000001e-05,
510
- 8e-05,
511
- 8e-05,
512
- 8e-05,
513
- 8e-05,
514
- 8e-05,
515
- 8e-05,
516
- 8e-05,
517
- 8e-05,
518
- 8e-05,
519
- 8e-05,
520
- 8e-05,
521
- 8e-05,
522
- 8e-05,
523
- 8e-05,
524
- 8e-05,
525
- 8e-05,
526
- 8e-05,
527
- 8e-05,
528
- 8e-05,
529
- 8e-05,
530
- 8e-05,
531
- 8e-05,
532
- 8e-05,
533
- 8e-05,
534
- 8e-05,
535
- 8e-05,
536
- 8e-05,
537
- 8e-05,
538
- 8e-05,
539
- 8e-05,
540
- 8e-05,
541
- 8e-05,
542
- 8e-05,
543
- 8e-05,
544
- 8e-05,
545
- 8e-05,
546
- 8e-05,
547
- 8e-05,
548
- 8e-05,
549
- 8e-05,
550
- 8e-05,
551
- 7.864766839378239e-05,
552
- 7.63160621761658e-05,
553
- 7.398445595854923e-05,
554
- 7.165284974093265e-05,
555
- 6.932124352331606e-05,
556
- 6.69896373056995e-05,
557
- 6.465803108808292e-05,
558
- 6.041450777202072e-05,
559
- 5.8082901554404154e-05,
560
- 5.5751295336787566e-05,
561
- 5.3419689119171e-05,
562
- 5.108808290155441e-05,
563
- 4.8756476683937825e-05,
564
- 4.642487046632125e-05,
565
- 4.218134715025906e-05,
566
- 3.98497409326425e-05,
567
- 3.7518134715025914e-05,
568
- 3.518652849740933e-05,
569
- 3.285492227979275e-05,
570
- 3.0523316062176166e-05,
571
- 2.8191709844559595e-05,
572
- 2.3948186528497416e-05,
573
- 2.1616580310880825e-05,
574
- 1.9284974093264255e-05,
575
- 1.6953367875647667e-05,
576
- 1.4621761658031097e-05,
577
- 1.2290155440414508e-05,
578
- 9.958549222797919e-06
579
  ],
580
  "emb_lr": [],
581
  "eval_step": [
@@ -591,7 +591,7 @@
591
  3869
592
  ],
593
  "eval_accuracy": [
594
- 0.0,
595
  0.0,
596
  0.0,
597
  0.0,
@@ -603,508 +603,508 @@
603
  0.0
604
  ]
605
  },
606
- "final_accuracy": 0.9707142857142858,
607
  "sft_eval": {
608
  "config": {
609
  "ops": "add_sub",
610
  "K": null,
611
  "mode": "sft",
612
  "n_digits": 6,
613
- "n_per_split": 50
614
  },
615
  "splits": {
616
  "add_S0": {
617
- "full_accuracy": 1.0,
618
- "n_examples": 50,
619
  "per_subtask": {
620
  "SA": {
621
- "accuracy": 1.0,
622
- "count": 295
623
  },
624
  "SS": {
625
  "accuracy": 1.0,
626
- "count": 55
627
  }
628
  }
629
  },
630
  "add_S1": {
631
- "full_accuracy": 0.98,
632
- "n_examples": 50,
633
  "per_subtask": {
634
  "SA": {
635
- "accuracy": 1.0,
636
- "count": 126
637
  },
638
  "SC": {
639
- "accuracy": 1.0,
640
- "count": 79
641
  },
642
  "SS": {
643
  "accuracy": 1.0,
644
- "count": 21
645
  },
646
  "UC": {
647
- "accuracy": 0.9919354838709677,
648
- "count": 124
649
  }
650
  }
651
  },
652
  "add_S2": {
653
- "full_accuracy": 1.0,
654
- "n_examples": 50,
655
  "per_subtask": {
656
  "SA": {
657
  "accuracy": 1.0,
658
- "count": 75
659
  },
660
  "SC": {
661
- "accuracy": 1.0,
662
- "count": 62
663
  },
664
  "SS": {
665
  "accuracy": 1.0,
666
- "count": 39
667
  },
668
  "UC": {
669
- "accuracy": 1.0,
670
- "count": 111
671
  },
672
  "US": {
673
- "accuracy": 1.0,
674
- "count": 63
675
  }
676
  }
677
  },
678
  "add_S3": {
679
- "full_accuracy": 1.0,
680
- "n_examples": 50,
681
  "per_subtask": {
682
  "SA": {
683
- "accuracy": 1.0,
684
- "count": 60
685
  },
686
  "SC": {
687
- "accuracy": 1.0,
688
- "count": 57
689
  },
690
  "SS": {
691
- "accuracy": 1.0,
692
- "count": 19
693
  },
694
  "UC": {
695
- "accuracy": 1.0,
696
- "count": 104
697
  },
698
  "US": {
699
- "accuracy": 1.0,
700
- "count": 110
701
  }
702
  }
703
  },
704
  "add_S4": {
705
- "full_accuracy": 0.88,
706
- "n_examples": 50,
707
  "per_subtask": {
708
  "SA": {
709
- "accuracy": 1.0,
710
- "count": 48
711
  },
712
  "SC": {
713
- "accuracy": 1.0,
714
- "count": 52
715
  },
716
  "SS": {
717
  "accuracy": 1.0,
718
- "count": 7
719
  },
720
  "UC": {
721
- "accuracy": 0.9325842696629213,
722
- "count": 89
723
  },
724
  "US": {
725
- "accuracy": 1.0,
726
- "count": 154
727
  }
728
  }
729
  },
730
  "add_S5": {
731
- "full_accuracy": 0.92,
732
- "n_examples": 50,
733
  "per_subtask": {
734
  "SA": {
735
  "accuracy": 1.0,
736
- "count": 50
737
  },
738
  "SC": {
739
  "accuracy": 1.0,
740
- "count": 50
741
  },
742
  "UC": {
743
- "accuracy": 0.96,
744
- "count": 50
745
  },
746
  "US": {
747
- "accuracy": 0.99,
748
- "count": 200
749
  }
750
  }
751
  },
752
  "add_S6": {
753
- "full_accuracy": 1.0,
754
- "n_examples": 50,
755
  "per_subtask": {
756
  "SC": {
757
  "accuracy": 1.0,
758
- "count": 50
759
  },
760
  "UC": {
761
- "accuracy": 1.0,
762
- "count": 50
763
  },
764
  "US": {
765
- "accuracy": 1.0,
766
- "count": 250
767
  }
768
  }
769
  },
770
  "add_random": {
771
- "full_accuracy": 1.0,
772
  "n_examples": 200,
773
  "per_subtask": {
774
  "SA": {
775
- "accuracy": 1.0,
776
- "count": 431
777
  },
778
  "SC": {
779
- "accuracy": 1.0,
780
- "count": 316
781
  },
782
  "SS": {
783
  "accuracy": 1.0,
784
- "count": 39
785
  },
786
  "UC": {
787
- "accuracy": 1.0,
788
- "count": 560
789
  },
790
  "US": {
791
- "accuracy": 1.0,
792
- "count": 54
793
  }
794
  }
795
  },
796
  "add_C3": {
797
- "full_accuracy": 0.98,
798
- "n_examples": 50,
799
  "per_subtask": {
800
  "SA": {
801
  "accuracy": 1.0,
802
- "count": 150
803
  },
804
  "SC": {
805
- "accuracy": 1.0,
806
- "count": 50
807
  },
808
  "UC": {
809
- "accuracy": 0.9903846153846154,
810
- "count": 104
811
  },
812
  "US": {
813
- "accuracy": 1.0,
814
- "count": 46
815
  }
816
  }
817
  },
818
  "add_C4": {
819
- "full_accuracy": 0.96,
820
- "n_examples": 50,
821
  "per_subtask": {
822
  "SA": {
823
  "accuracy": 1.0,
824
- "count": 100
825
  },
826
  "SC": {
827
  "accuracy": 1.0,
828
- "count": 50
829
  },
830
  "UC": {
831
- "accuracy": 0.983739837398374,
832
- "count": 123
833
  },
834
  "US": {
835
- "accuracy": 1.0,
836
- "count": 77
837
  }
838
  }
839
  },
840
  "add_C5": {
841
- "full_accuracy": 0.98,
842
- "n_examples": 50,
843
  "per_subtask": {
844
  "SA": {
845
- "accuracy": 1.0,
846
- "count": 50
847
  },
848
  "SC": {
849
- "accuracy": 1.0,
850
- "count": 50
851
  },
852
  "UC": {
853
- "accuracy": 0.9935064935064936,
854
- "count": 154
855
  },
856
  "US": {
857
- "accuracy": 1.0,
858
- "count": 96
859
  }
860
  }
861
  },
862
  "add_C6": {
863
- "full_accuracy": 0.94,
864
- "n_examples": 50,
865
  "per_subtask": {
866
  "SC": {
867
  "accuracy": 1.0,
868
- "count": 50
869
  },
870
  "UC": {
871
- "accuracy": 0.9835164835164835,
872
- "count": 182
873
  },
874
  "US": {
875
- "accuracy": 1.0,
876
- "count": 118
877
  }
878
  }
879
  },
880
  "sub_M0": {
881
- "full_accuracy": 1.0,
882
- "n_examples": 50,
883
  "per_subtask": {
884
  "MD": {
885
- "accuracy": 1.0,
886
- "count": 294
887
  },
888
  "ME": {
889
  "accuracy": 1.0,
890
- "count": 56
891
  }
892
  }
893
  },
894
  "sub_M1": {
895
- "full_accuracy": 1.0,
896
- "n_examples": 50,
897
  "per_subtask": {
898
  "MD": {
899
- "accuracy": 1.0,
900
- "count": 143
901
  },
902
  "MB": {
903
- "accuracy": 1.0,
904
- "count": 69
905
  },
906
  "ME": {
907
- "accuracy": 1.0,
908
- "count": 15
909
  },
910
  "UB": {
911
- "accuracy": 1.0,
912
- "count": 123
913
  }
914
  }
915
  },
916
  "sub_M2": {
917
- "full_accuracy": 1.0,
918
- "n_examples": 50,
919
  "per_subtask": {
920
  "MD": {
921
- "accuracy": 1.0,
922
- "count": 108
923
  },
924
  "MB": {
925
- "accuracy": 1.0,
926
- "count": 52
927
  },
928
  "ME": {
929
  "accuracy": 1.0,
930
- "count": 52
931
  },
932
  "UB": {
933
- "accuracy": 1.0,
934
- "count": 87
935
  },
936
  "UD": {
937
  "accuracy": 1.0,
938
- "count": 51
939
  }
940
  }
941
  },
942
  "sub_M3": {
943
- "full_accuracy": 1.0,
944
- "n_examples": 50,
945
  "per_subtask": {
946
  "MD": {
947
  "accuracy": 1.0,
948
- "count": 94
949
  },
950
  "MB": {
951
- "accuracy": 1.0,
952
- "count": 51
953
  },
954
  "ME": {
955
  "accuracy": 1.0,
956
- "count": 25
957
  },
958
  "UB": {
959
- "accuracy": 1.0,
960
- "count": 78
961
  },
962
  "UD": {
963
- "accuracy": 1.0,
964
- "count": 102
965
  }
966
  }
967
  },
968
  "sub_M4": {
969
- "full_accuracy": 0.48,
970
- "n_examples": 50,
971
  "per_subtask": {
972
  "MD": {
973
  "accuracy": 1.0,
974
- "count": 100
975
  },
976
  "MB": {
977
  "accuracy": 1.0,
978
- "count": 50
979
  },
980
  "UB": {
981
- "accuracy": 0.48,
982
- "count": 50
983
  },
984
  "UD": {
985
- "accuracy": 1.0,
986
- "count": 150
987
  }
988
  }
989
  },
990
  "sub_M5": {
991
- "full_accuracy": 0.6,
992
- "n_examples": 50,
993
  "per_subtask": {
994
  "MD": {
995
  "accuracy": 1.0,
996
- "count": 50
997
  },
998
  "MB": {
999
  "accuracy": 1.0,
1000
- "count": 50
1001
  },
1002
  "UB": {
1003
- "accuracy": 0.94,
1004
- "count": 50
1005
  },
1006
  "UD": {
1007
- "accuracy": 0.9,
1008
- "count": 200
1009
  }
1010
  }
1011
  },
1012
  "sub_random": {
1013
- "full_accuracy": 1.0,
1014
  "n_examples": 200,
1015
  "per_subtask": {
1016
  "MD": {
1017
- "accuracy": 1.0,
1018
- "count": 588
1019
  },
1020
  "MB": {
1021
- "accuracy": 1.0,
1022
- "count": 268
1023
  },
1024
  "ME": {
1025
  "accuracy": 1.0,
1026
- "count": 60
1027
  },
1028
  "UB": {
1029
- "accuracy": 1.0,
1030
- "count": 447
1031
  },
1032
  "UD": {
1033
- "accuracy": 1.0,
1034
- "count": 37
1035
  }
1036
  }
1037
  },
1038
  "sub_B3": {
1039
- "full_accuracy": 1.0,
1040
- "n_examples": 50,
1041
  "per_subtask": {
1042
  "MD": {
1043
  "accuracy": 1.0,
1044
- "count": 150
1045
  },
1046
  "MB": {
1047
  "accuracy": 1.0,
1048
- "count": 50
1049
  },
1050
  "UB": {
1051
- "accuracy": 1.0,
1052
- "count": 107
1053
  },
1054
  "UD": {
1055
- "accuracy": 1.0,
1056
- "count": 43
1057
  }
1058
  }
1059
  },
1060
  "sub_B4": {
1061
- "full_accuracy": 0.88,
1062
- "n_examples": 50,
1063
  "per_subtask": {
1064
  "MD": {
1065
  "accuracy": 1.0,
1066
- "count": 100
1067
  },
1068
  "MB": {
1069
  "accuracy": 1.0,
1070
- "count": 50
1071
  },
1072
  "UB": {
1073
- "accuracy": 0.9473684210526315,
1074
- "count": 114
1075
  },
1076
  "UD": {
1077
- "accuracy": 1.0,
1078
- "count": 86
1079
  }
1080
  }
1081
  },
1082
  "sub_B5": {
1083
- "full_accuracy": 0.84,
1084
- "n_examples": 50,
1085
  "per_subtask": {
1086
  "MD": {
1087
  "accuracy": 1.0,
1088
- "count": 50
1089
  },
1090
  "MB": {
1091
  "accuracy": 1.0,
1092
- "count": 50
1093
  },
1094
  "UB": {
1095
- "accuracy": 0.9738562091503268,
1096
- "count": 153
1097
  },
1098
  "UD": {
1099
- "accuracy": 0.9484536082474226,
1100
- "count": 97
1101
  }
1102
  }
1103
  }
1104
  },
1105
  "summary": {
1106
- "overall_accuracy": 0.9442857142857143,
1107
- "total_examples": 1400,
1108
  "n_splits": 22
1109
  }
1110
  },
@@ -1114,504 +1114,504 @@
1114
  "K": 1,
1115
  "mode": "sorl",
1116
  "n_digits": 6,
1117
- "n_per_split": 50
1118
  },
1119
  "splits": {
1120
  "add_S0": {
1121
  "full_accuracy": 1.0,
1122
- "n_examples": 50,
1123
  "per_subtask": {
1124
  "SA": {
1125
  "accuracy": 1.0,
1126
- "count": 295
1127
  },
1128
  "SS": {
1129
  "accuracy": 1.0,
1130
- "count": 55
1131
  }
1132
  }
1133
  },
1134
  "add_S1": {
1135
  "full_accuracy": 1.0,
1136
- "n_examples": 50,
1137
  "per_subtask": {
1138
  "SA": {
1139
  "accuracy": 1.0,
1140
- "count": 126
1141
  },
1142
  "SC": {
1143
  "accuracy": 1.0,
1144
- "count": 79
1145
  },
1146
  "SS": {
1147
  "accuracy": 1.0,
1148
- "count": 21
1149
  },
1150
  "UC": {
1151
  "accuracy": 1.0,
1152
- "count": 124
1153
  }
1154
  }
1155
  },
1156
  "add_S2": {
1157
- "full_accuracy": 1.0,
1158
- "n_examples": 50,
1159
  "per_subtask": {
1160
  "SA": {
1161
- "accuracy": 1.0,
1162
- "count": 75
1163
  },
1164
  "SC": {
1165
- "accuracy": 1.0,
1166
- "count": 62
1167
  },
1168
  "SS": {
1169
- "accuracy": 1.0,
1170
- "count": 39
1171
  },
1172
  "UC": {
1173
- "accuracy": 1.0,
1174
- "count": 111
1175
  },
1176
  "US": {
1177
  "accuracy": 1.0,
1178
- "count": 63
1179
  }
1180
  }
1181
  },
1182
  "add_S3": {
1183
- "full_accuracy": 1.0,
1184
- "n_examples": 50,
1185
  "per_subtask": {
1186
  "SA": {
1187
  "accuracy": 1.0,
1188
- "count": 60
1189
  },
1190
  "SC": {
1191
- "accuracy": 1.0,
1192
- "count": 57
1193
  },
1194
  "SS": {
1195
  "accuracy": 1.0,
1196
- "count": 19
1197
  },
1198
  "UC": {
1199
- "accuracy": 1.0,
1200
- "count": 104
1201
  },
1202
  "US": {
1203
- "accuracy": 1.0,
1204
- "count": 110
1205
  }
1206
  }
1207
  },
1208
  "add_S4": {
1209
- "full_accuracy": 1.0,
1210
- "n_examples": 50,
1211
  "per_subtask": {
1212
  "SA": {
1213
  "accuracy": 1.0,
1214
- "count": 48
1215
  },
1216
  "SC": {
1217
  "accuracy": 1.0,
1218
- "count": 52
1219
  },
1220
  "SS": {
1221
  "accuracy": 1.0,
1222
- "count": 7
1223
  },
1224
  "UC": {
1225
- "accuracy": 1.0,
1226
- "count": 89
1227
  },
1228
  "US": {
1229
- "accuracy": 1.0,
1230
- "count": 154
1231
  }
1232
  }
1233
  },
1234
  "add_S5": {
1235
- "full_accuracy": 0.62,
1236
- "n_examples": 50,
1237
  "per_subtask": {
1238
  "SA": {
1239
  "accuracy": 1.0,
1240
- "count": 50
1241
  },
1242
  "SC": {
1243
  "accuracy": 1.0,
1244
- "count": 50
1245
  },
1246
  "UC": {
1247
- "accuracy": 0.62,
1248
- "count": 50
1249
  },
1250
  "US": {
1251
- "accuracy": 1.0,
1252
- "count": 200
1253
  }
1254
  }
1255
  },
1256
  "add_S6": {
1257
- "full_accuracy": 1.0,
1258
- "n_examples": 50,
1259
  "per_subtask": {
1260
  "SC": {
1261
  "accuracy": 1.0,
1262
- "count": 50
1263
  },
1264
  "UC": {
1265
- "accuracy": 1.0,
1266
- "count": 50
1267
  },
1268
  "US": {
1269
- "accuracy": 1.0,
1270
- "count": 250
1271
  }
1272
  }
1273
  },
1274
  "add_random": {
1275
- "full_accuracy": 1.0,
1276
  "n_examples": 200,
1277
  "per_subtask": {
1278
  "SA": {
1279
- "accuracy": 1.0,
1280
- "count": 431
1281
  },
1282
  "SC": {
1283
  "accuracy": 1.0,
1284
- "count": 316
1285
  },
1286
  "SS": {
1287
- "accuracy": 1.0,
1288
- "count": 39
1289
  },
1290
  "UC": {
1291
- "accuracy": 1.0,
1292
- "count": 560
1293
  },
1294
  "US": {
1295
  "accuracy": 1.0,
1296
- "count": 54
1297
  }
1298
  }
1299
  },
1300
  "add_C3": {
1301
- "full_accuracy": 1.0,
1302
- "n_examples": 50,
1303
  "per_subtask": {
1304
  "SA": {
1305
  "accuracy": 1.0,
1306
- "count": 150
1307
  },
1308
  "SC": {
1309
  "accuracy": 1.0,
1310
- "count": 50
1311
  },
1312
  "UC": {
1313
- "accuracy": 1.0,
1314
- "count": 104
1315
  },
1316
  "US": {
1317
- "accuracy": 1.0,
1318
- "count": 46
1319
  }
1320
  }
1321
  },
1322
  "add_C4": {
1323
- "full_accuracy": 1.0,
1324
- "n_examples": 50,
1325
  "per_subtask": {
1326
  "SA": {
1327
  "accuracy": 1.0,
1328
- "count": 100
1329
  },
1330
  "SC": {
1331
  "accuracy": 1.0,
1332
- "count": 50
1333
  },
1334
  "UC": {
1335
- "accuracy": 1.0,
1336
- "count": 123
1337
  },
1338
  "US": {
1339
- "accuracy": 1.0,
1340
- "count": 77
1341
  }
1342
  }
1343
  },
1344
  "add_C5": {
1345
- "full_accuracy": 1.0,
1346
- "n_examples": 50,
1347
  "per_subtask": {
1348
  "SA": {
1349
  "accuracy": 1.0,
1350
- "count": 50
1351
  },
1352
  "SC": {
1353
  "accuracy": 1.0,
1354
- "count": 50
1355
  },
1356
  "UC": {
1357
- "accuracy": 1.0,
1358
- "count": 154
1359
  },
1360
  "US": {
1361
- "accuracy": 1.0,
1362
- "count": 96
1363
  }
1364
  }
1365
  },
1366
  "add_C6": {
1367
- "full_accuracy": 1.0,
1368
- "n_examples": 50,
1369
  "per_subtask": {
1370
  "SC": {
1371
  "accuracy": 1.0,
1372
- "count": 50
1373
  },
1374
  "UC": {
1375
- "accuracy": 1.0,
1376
- "count": 182
1377
  },
1378
  "US": {
1379
- "accuracy": 1.0,
1380
- "count": 118
1381
  }
1382
  }
1383
  },
1384
  "sub_M0": {
1385
- "full_accuracy": 1.0,
1386
- "n_examples": 50,
1387
  "per_subtask": {
1388
  "MD": {
1389
- "accuracy": 1.0,
1390
- "count": 294
1391
  },
1392
  "ME": {
1393
  "accuracy": 1.0,
1394
- "count": 56
1395
  }
1396
  }
1397
  },
1398
  "sub_M1": {
1399
  "full_accuracy": 1.0,
1400
- "n_examples": 50,
1401
  "per_subtask": {
1402
  "MD": {
1403
  "accuracy": 1.0,
1404
- "count": 143
1405
  },
1406
  "MB": {
1407
  "accuracy": 1.0,
1408
- "count": 69
1409
  },
1410
  "ME": {
1411
  "accuracy": 1.0,
1412
- "count": 15
1413
  },
1414
  "UB": {
1415
  "accuracy": 1.0,
1416
- "count": 123
1417
  }
1418
  }
1419
  },
1420
  "sub_M2": {
1421
- "full_accuracy": 1.0,
1422
- "n_examples": 50,
1423
  "per_subtask": {
1424
  "MD": {
1425
- "accuracy": 1.0,
1426
- "count": 108
1427
  },
1428
  "MB": {
1429
  "accuracy": 1.0,
1430
- "count": 52
1431
  },
1432
  "ME": {
1433
  "accuracy": 1.0,
1434
- "count": 52
1435
  },
1436
  "UB": {
1437
- "accuracy": 1.0,
1438
- "count": 87
1439
  },
1440
  "UD": {
1441
  "accuracy": 1.0,
1442
- "count": 51
1443
  }
1444
  }
1445
  },
1446
  "sub_M3": {
1447
- "full_accuracy": 1.0,
1448
- "n_examples": 50,
1449
  "per_subtask": {
1450
  "MD": {
1451
  "accuracy": 1.0,
1452
- "count": 94
1453
  },
1454
  "MB": {
1455
  "accuracy": 1.0,
1456
- "count": 51
1457
  },
1458
  "ME": {
1459
  "accuracy": 1.0,
1460
- "count": 25
1461
  },
1462
  "UB": {
1463
- "accuracy": 1.0,
1464
- "count": 78
1465
  },
1466
  "UD": {
1467
- "accuracy": 1.0,
1468
- "count": 102
1469
  }
1470
  }
1471
  },
1472
  "sub_M4": {
1473
- "full_accuracy": 1.0,
1474
- "n_examples": 50,
1475
  "per_subtask": {
1476
  "MD": {
1477
  "accuracy": 1.0,
1478
- "count": 100
1479
  },
1480
  "MB": {
1481
  "accuracy": 1.0,
1482
- "count": 50
1483
  },
1484
  "UB": {
1485
- "accuracy": 1.0,
1486
- "count": 50
1487
  },
1488
  "UD": {
1489
- "accuracy": 1.0,
1490
- "count": 150
1491
  }
1492
  }
1493
  },
1494
  "sub_M5": {
1495
- "full_accuracy": 0.58,
1496
- "n_examples": 50,
1497
  "per_subtask": {
1498
  "MD": {
1499
  "accuracy": 1.0,
1500
- "count": 50
1501
  },
1502
  "MB": {
1503
  "accuracy": 1.0,
1504
- "count": 50
1505
  },
1506
  "UB": {
1507
- "accuracy": 0.58,
1508
- "count": 50
1509
  },
1510
  "UD": {
1511
- "accuracy": 1.0,
1512
- "count": 200
1513
  }
1514
  }
1515
  },
1516
  "sub_random": {
1517
- "full_accuracy": 1.0,
1518
  "n_examples": 200,
1519
  "per_subtask": {
1520
  "MD": {
1521
- "accuracy": 1.0,
1522
- "count": 588
1523
  },
1524
  "MB": {
1525
- "accuracy": 1.0,
1526
- "count": 268
1527
  },
1528
  "ME": {
1529
  "accuracy": 1.0,
1530
- "count": 60
1531
  },
1532
  "UB": {
1533
  "accuracy": 1.0,
1534
- "count": 447
1535
  },
1536
  "UD": {
1537
  "accuracy": 1.0,
1538
- "count": 37
1539
  }
1540
  }
1541
  },
1542
  "sub_B3": {
1543
- "full_accuracy": 1.0,
1544
- "n_examples": 50,
1545
  "per_subtask": {
1546
  "MD": {
1547
- "accuracy": 1.0,
1548
- "count": 150
1549
  },
1550
  "MB": {
1551
  "accuracy": 1.0,
1552
- "count": 50
1553
  },
1554
  "UB": {
1555
- "accuracy": 1.0,
1556
- "count": 107
1557
  },
1558
  "UD": {
1559
- "accuracy": 1.0,
1560
- "count": 43
1561
  }
1562
  }
1563
  },
1564
  "sub_B4": {
1565
- "full_accuracy": 1.0,
1566
- "n_examples": 50,
1567
  "per_subtask": {
1568
  "MD": {
1569
  "accuracy": 1.0,
1570
- "count": 100
1571
  },
1572
  "MB": {
1573
  "accuracy": 1.0,
1574
- "count": 50
1575
  },
1576
  "UB": {
1577
- "accuracy": 1.0,
1578
- "count": 114
1579
  },
1580
  "UD": {
1581
- "accuracy": 1.0,
1582
- "count": 86
1583
  }
1584
  }
1585
  },
1586
  "sub_B5": {
1587
- "full_accuracy": 1.0,
1588
- "n_examples": 50,
1589
  "per_subtask": {
1590
  "MD": {
1591
  "accuracy": 1.0,
1592
- "count": 50
1593
  },
1594
  "MB": {
1595
  "accuracy": 1.0,
1596
- "count": 50
1597
  },
1598
  "UB": {
1599
- "accuracy": 1.0,
1600
- "count": 153
1601
  },
1602
  "UD": {
1603
- "accuracy": 1.0,
1604
- "count": 97
1605
  }
1606
  }
1607
  }
1608
  },
1609
  "summary": {
1610
- "overall_accuracy": 0.9707142857142858,
1611
- "total_examples": 1400,
1612
  "n_splits": 22
1613
  }
1614
  },
1615
- "sorl_overall_accuracy": 0.9707142857142858,
1616
- "sft_overall_accuracy": 0.9442857142857143
1617
  }
 
73
  3869
74
  ],
75
  "loss": [
76
+ 2.775237560272217,
77
+ 7.547610282897949,
78
+ 4.404772758483887,
79
+ 3.531778335571289,
80
+ 3.0909929275512695,
81
+ 3.0005178451538086,
82
+ 2.930567741394043,
83
+ 2.380660057067871,
84
+ 1.8402857780456543,
85
+ 1.5253196954727173,
86
+ 0.6456812620162964,
87
+ -0.922775387763977,
88
+ -5.972268104553223,
89
+ -7.329289436340332,
90
+ -3.549975872039795,
91
+ -2.491389274597168,
92
+ -0.8877972364425659,
93
+ -0.7063808441162109,
94
+ -1.138548493385315,
95
+ -0.44369208812713623,
96
+ -0.18519365787506104,
97
+ 0.001989603042602539,
98
+ -0.2683281898498535,
99
+ 0.038404107093811035,
100
+ 0.15909254550933838,
101
+ 0.3572588562965393,
102
+ -0.12331002950668335,
103
+ 0.23269200325012207,
104
+ -0.06346249580383301,
105
+ -0.600041389465332,
106
+ -0.17923784255981445,
107
+ -0.3249526619911194,
108
+ 0.10653692483901978,
109
+ -0.6031087636947632,
110
+ -0.2606835961341858,
111
+ -0.31056416034698486,
112
+ -0.71147620677948,
113
+ -0.4193435311317444,
114
+ -0.8142799139022827,
115
+ -0.7582582235336304,
116
+ -0.38260582089424133,
117
+ -0.4925423562526703,
118
+ -0.6277319192886353,
119
+ -0.5789560675621033,
120
+ -0.5000548958778381,
121
+ -0.24027329683303833,
122
+ -0.5376996994018555,
123
+ -0.7423339486122131,
124
+ -0.6099227070808411,
125
+ -0.734022855758667,
126
+ -0.5901182293891907,
127
+ -0.4833343029022217,
128
+ -0.50217604637146,
129
+ -0.4287641942501068,
130
+ -0.42581605911254883,
131
+ -0.32260385155677795,
132
+ -0.4461747407913208,
133
+ -0.4050253629684448,
134
+ -0.6488711833953857,
135
+ -0.3880271315574646,
136
+ -0.5698164105415344,
137
+ -0.6381738781929016,
138
+ -0.5308898091316223,
139
+ -0.3668437600135803,
140
+ -0.5520040988922119,
141
+ -0.463625431060791,
142
+ -0.6266764402389526,
143
+ -0.4801936149597168,
144
+ -0.22993049025535583,
145
+ -0.4441494047641754
146
  ],
147
  "base_loss": [
148
+ 9.28781795501709,
149
+ 6.408207416534424,
150
+ 4.156429767608643,
151
+ 2.373605966567993,
152
+ 2.0184969902038574,
153
+ 1.929623007774353,
154
+ 1.8580621480941772,
155
+ 1.8653138875961304,
156
+ 1.8420647382736206,
157
+ 1.7876325845718384,
158
+ 1.7426172494888306,
159
+ 1.8583365678787231,
160
+ 1.8177855014801025,
161
+ 1.7561644315719604,
162
+ 1.1391898393630981,
163
+ 0.8618606925010681,
164
+ 0.6386224627494812,
165
+ 0.6016677021980286,
166
+ 0.5611599087715149,
167
+ 0.43799519538879395,
168
+ 0.39136165380477905,
169
+ 0.35541364550590515,
170
+ 0.3650291860103607,
171
+ 0.28820252418518066,
172
+ 0.28319448232650757,
173
+ 0.27565693855285645,
174
+ 0.26511654257774353,
175
+ 0.17914986610412598,
176
+ 0.23786573112010956,
177
+ 0.3100714087486267,
178
+ 0.20881570875644684,
179
+ 0.2222459614276886,
180
+ 0.1805049628019333,
181
+ 0.21834571659564972,
182
+ 0.17398054897785187,
183
+ 0.17169272899627686,
184
+ 0.214468851685524,
185
+ 0.16018731892108917,
186
+ 0.21521098911762238,
187
+ 0.1976483017206192,
188
+ 0.16451016068458557,
189
+ 0.15567553043365479,
190
+ 0.1631745547056198,
191
+ 0.17332789301872253,
192
+ 0.16741959750652313,
193
+ 0.14030645787715912,
194
+ 0.16195133328437805,
195
+ 0.19442276656627655,
196
+ 0.23462989926338196,
197
+ 0.1750485599040985,
198
+ 0.15816131234169006,
199
+ 0.1345469057559967,
200
+ 0.11074493825435638,
201
+ 0.09530355036258698,
202
+ 0.12718015909194946,
203
+ 0.1237618550658226,
204
+ 0.15354827046394348,
205
+ 0.0893278494477272,
206
+ 0.1284264326095581,
207
+ 0.0889713391661644,
208
+ 0.10795746743679047,
209
+ 0.11995094269514084,
210
+ 0.11075253039598465,
211
+ 0.09607528150081635,
212
+ 0.0994424819946289,
213
+ 0.13344082236289978,
214
+ 0.12076304107904434,
215
+ 0.10045450925827026,
216
+ 0.07217186689376831,
217
+ 0.08645479381084442
218
  ],
219
  "info_loss": [
220
+ -1.4931654930114746,
221
+ -0.3287515640258789,
222
+ -0.20309758186340332,
223
+ -0.08040070533752441,
224
+ -0.08293402194976807,
225
+ -0.08145439624786377,
226
+ -0.08084726333618164,
227
+ -0.13623690605163574,
228
+ -0.18764793872833252,
229
+ -0.21354830265045166,
230
+ -0.2971622943878174,
231
+ -0.4655829668045044,
232
+ -0.9671027064323425,
233
+ -1.0975041389465332,
234
+ -0.6573877930641174,
235
+ -0.52280592918396,
236
+ -0.33613646030426025,
237
+ -0.30700165033340454,
238
+ -0.33531850576400757,
239
+ -0.24739503860473633,
240
+ -0.20575667917728424,
241
+ -0.1729479283094406,
242
+ -0.19086585938930511,
243
+ -0.1466362029314041,
244
+ -0.12648341059684753,
245
+ -0.09660688042640686,
246
+ -0.13731198012828827,
247
+ -0.090257927775383,
248
+ -0.11520253866910934,
249
+ -0.16884556412696838,
250
+ -0.1119970828294754,
251
+ -0.12500318884849548,
252
+ -0.07407157123088837,
253
+ -0.1408563256263733,
254
+ -0.10861283540725708,
255
+ -0.10317053645849228,
256
+ -0.1429494023323059,
257
+ -0.10317841172218323,
258
+ -0.14690537750720978,
259
+ -0.13502144813537598,
260
+ -0.0957949236035347,
261
+ -0.10858335345983505,
262
+ -0.11433392018079758,
263
+ -0.11800597608089447,
264
+ -0.1043647900223732,
265
+ -0.07300866395235062,
266
+ -0.10729879140853882,
267
+ -0.12777504324913025,
268
+ -0.1146274283528328,
269
+ -0.11757340282201767,
270
+ -0.10103149712085724,
271
+ -0.0873810350894928,
272
+ -0.08418776094913483,
273
+ -0.07348176836967468,
274
+ -0.08262161910533905,
275
+ -0.06471286714076996,
276
+ -0.08114015311002731,
277
+ -0.07110169529914856,
278
+ -0.09566681832075119,
279
+ -0.07004982978105545,
280
+ -0.08596527576446533,
281
+ -0.09278135001659393,
282
+ -0.08236707746982574,
283
+ -0.06275559961795807,
284
+ -0.08083701133728027,
285
+ -0.07647520303726196,
286
+ -0.08978942036628723,
287
+ -0.07253386080265045,
288
+ -0.04718723148107529,
289
+ -0.06880763173103333
290
  ],
291
  "abs_loss": [
292
+ 2.287022590637207,
293
+ 2.0916876792907715,
294
+ 1.904157280921936,
295
+ 1.865134596824646,
296
+ 1.8645482063293457,
297
+ 1.8384100198745728,
298
+ 1.8436139822006226,
299
+ 1.8555344343185425,
300
+ 1.8272294998168945,
301
+ 1.8342450857162476,
302
+ 1.8453730344772339,
303
+ 1.838832974433899,
304
+ 1.8428555727005005,
305
+ 1.8253898620605469,
306
+ 1.7948323488235474,
307
+ 1.7442383766174316,
308
+ 1.575785517692566,
309
+ 1.3669052124023438,
310
+ 1.1249511241912842,
311
+ 0.8671944737434387,
312
+ 0.7276226878166199,
313
+ 0.617413341999054,
314
+ 0.5250348448753357,
315
+ 0.44361987709999084,
316
+ 0.42031577229499817,
317
+ 0.41864728927612305,
318
+ 0.43554285168647766,
319
+ 0.3743334114551544,
320
+ 0.34732961654663086,
321
+ 0.33214130997657776,
322
+ 0.3264453709125519,
323
+ 0.31868138909339905,
324
+ 0.3359517753124237,
325
+ 0.2563074827194214,
326
+ 0.2655089795589447,
327
+ 0.24500541388988495,
328
+ 0.28663334250450134,
329
+ 0.2430928349494934,
330
+ 0.24621863663196564,
331
+ 0.22454464435577393,
332
+ 0.22838687896728516,
333
+ 0.23042821884155273,
334
+ 0.2049625664949417,
335
+ 0.21573372185230255,
336
+ 0.21731531620025635,
337
+ 0.21410906314849854,
338
+ 0.1734648495912552,
339
+ 0.17448820173740387,
340
+ 0.18207435309886932,
341
+ 0.17394252121448517,
342
+ 0.17870847880840302,
343
+ 0.1585991531610489,
344
+ 0.17257511615753174,
345
+ 0.18372474610805511,
346
+ 0.1582813858985901,
347
+ 0.1431683897972107,
348
+ 0.14724594354629517,
349
+ 0.15524940192699432,
350
+ 0.12837021052837372,
351
+ 0.15147118270397186,
352
+ 0.13836684823036194,
353
+ 0.14143264293670654,
354
+ 0.1055116131901741,
355
+ 0.12111660093069077,
356
+ 0.1088944599032402,
357
+ 0.12397897243499756,
358
+ 0.12244991213083267,
359
+ 0.11925437301397324,
360
+ 0.11537864059209824,
361
+ 0.12550200521945953
362
  ],
363
  "zipf_loss": [
364
+ 8.190372467041016,
365
+ 4.21774959564209,
366
+ 2.0889029502868652,
367
+ 1.7756659984588623,
368
+ 1.715381383895874,
369
+ 1.7015979290008545,
370
+ 1.6966168880462646,
371
+ 1.6921616792678833,
372
+ 1.6919775009155273,
373
+ 1.6897456645965576,
374
+ 1.6901496648788452,
375
+ 1.690834403038025,
376
+ 1.696687936782837,
377
+ 1.7070481777191162,
378
+ 1.7052288055419922,
379
+ 1.7003854513168335,
380
+ 1.6773663759231567,
381
+ 1.6252772808074951,
382
+ 1.540981650352478,
383
+ 1.5055437088012695,
384
+ 1.4082492589950562,
385
+ 1.3143138885498047,
386
+ 1.2227977514266968,
387
+ 1.1722016334533691,
388
+ 1.0987006425857544,
389
+ 1.0058059692382812,
390
+ 0.9411389231681824,
391
+ 0.9186881184577942,
392
+ 0.8159641623497009,
393
+ 0.7451286315917969,
394
+ 0.6992727518081665,
395
+ 0.6709651947021484,
396
+ 0.6331524848937988,
397
+ 0.5614780187606812,
398
+ 0.6249133348464966,
399
+ 0.5249479413032532,
400
+ 0.4748857021331787,
401
+ 0.4279439449310303,
402
+ 0.4149409234523773,
403
+ 0.37185347080230713,
404
+ 0.3879944980144501,
405
+ 0.4145728647708893,
406
+ 0.33193638920783997,
407
+ 0.40620243549346924,
408
+ 0.3544418215751648,
409
+ 0.3280959725379944,
410
+ 0.3559904098510742,
411
+ 0.32354503870010376,
412
+ 0.2835143208503723,
413
+ 0.24926838278770447,
414
+ 0.2441646009683609,
415
+ 0.2400692105293274,
416
+ 0.21169903874397278,
417
+ 0.1923774778842926,
418
+ 0.25739187002182007,
419
+ 0.1864461600780487,
420
+ 0.1969538927078247,
421
+ 0.20113880932331085,
422
+ 0.16653358936309814,
423
+ 0.208352729678154,
424
+ 0.16804218292236328,
425
+ 0.1555454134941101,
426
+ 0.17147725820541382,
427
+ 0.15252524614334106,
428
+ 0.1460340917110443,
429
+ 0.15528786182403564,
430
+ 0.13820970058441162,
431
+ 0.1327650099992752,
432
+ 0.15823212265968323,
433
+ 0.1449219286441803
434
  ],
435
  "denoise_loss": [],
436
  "ortho_loss": [
437
+ 0.6088240742683411,
438
+ 0.33010056614875793,
439
+ 0.22861000895500183,
440
+ 0.21283285319805145,
441
+ 0.17951442301273346,
442
+ 0.16018734872341156,
443
+ 0.11169280111789703,
444
+ 0.12691719830036163,
445
+ 0.1456661969423294,
446
+ 0.15746210515499115,
447
+ 0.1443333923816681,
448
+ 0.16156138479709625,
449
+ 0.20633621513843536,
450
+ 0.22460925579071045,
451
+ 0.26161104440689087,
452
+ 0.27444061636924744,
453
+ 0.28116294741630554,
454
+ 0.28717944025993347,
455
+ 0.3055819571018219,
456
+ 0.325307697057724,
457
+ 0.3221662938594818,
458
+ 0.3159293830394745,
459
+ 0.3048442602157593,
460
+ 0.30672410130500793,
461
+ 0.301876962184906,
462
+ 0.29914870858192444,
463
+ 0.31866511702537537,
464
+ 0.33082228899002075,
465
+ 0.33669188618659973,
466
+ 0.32973605394363403,
467
+ 0.32963281869888306,
468
+ 0.33795875310897827,
469
+ 0.3385995030403137,
470
+ 0.3364645540714264,
471
+ 0.33232346177101135,
472
+ 0.33818763494491577,
473
+ 0.3330787122249603,
474
+ 0.3314555287361145,
475
+ 0.33205369114875793,
476
+ 0.3319832682609558,
477
+ 0.32697418332099915,
478
+ 0.3278193771839142,
479
+ 0.3068898022174835,
480
+ 0.309592068195343,
481
+ 0.3145006597042084,
482
+ 0.3080871105194092,
483
+ 0.3092295825481415,
484
+ 0.3054082989692688,
485
+ 0.30440065264701843,
486
+ 0.299023300409317,
487
+ 0.3067370355129242,
488
+ 0.31260034441947937,
489
+ 0.3118235766887665,
490
+ 0.3144630491733551,
491
+ 0.31215304136276245,
492
+ 0.31217923760414124,
493
+ 0.31190818548202515,
494
+ 0.3100627064704895,
495
+ 0.3103569447994232,
496
+ 0.3121229112148285,
497
+ 0.3065374791622162,
498
+ 0.3073209226131439,
499
+ 0.3084685206413269,
500
+ 0.30692750215530396,
501
+ 0.3080828785896301,
502
+ 0.30695533752441406,
503
+ 0.3065243661403656,
504
+ 0.30688145756721497,
505
+ 0.3082342743873596,
506
+ 0.30828753113746643
507
  ],
508
  "lr": [
509
+ 1.6752136752136756e-05,
510
+ 3.384615384615385e-05,
511
+ 4e-05,
512
+ 4e-05,
513
+ 4e-05,
514
+ 4e-05,
515
+ 4e-05,
516
+ 4e-05,
517
+ 4e-05,
518
+ 4e-05,
519
+ 4e-05,
520
+ 4e-05,
521
+ 4e-05,
522
+ 4e-05,
523
+ 4e-05,
524
+ 4e-05,
525
+ 4e-05,
526
+ 4e-05,
527
+ 4e-05,
528
+ 4e-05,
529
+ 4e-05,
530
+ 4e-05,
531
+ 4e-05,
532
+ 4e-05,
533
+ 4e-05,
534
+ 4e-05,
535
+ 4e-05,
536
+ 4e-05,
537
+ 4e-05,
538
+ 4e-05,
539
+ 4e-05,
540
+ 4e-05,
541
+ 4e-05,
542
+ 4e-05,
543
+ 4e-05,
544
+ 4e-05,
545
+ 4e-05,
546
+ 4e-05,
547
+ 4e-05,
548
+ 4e-05,
549
+ 4e-05,
550
+ 4e-05,
551
+ 3.9947798576324814e-05,
552
+ 3.8761402583706826e-05,
553
+ 3.757500659108885e-05,
554
+ 3.6388610598470864e-05,
555
+ 3.5202214605852884e-05,
556
+ 3.401581861323491e-05,
557
+ 3.282942262061693e-05,
558
+ 3.0670181914052204e-05,
559
+ 2.948378592143422e-05,
560
+ 2.8297389928816243e-05,
561
+ 2.711099393619826e-05,
562
+ 2.5924597943580284e-05,
563
+ 2.4738201950962303e-05,
564
+ 2.3551805958344316e-05,
565
+ 2.1392565251779595e-05,
566
+ 2.020616925916161e-05,
567
+ 1.901977326654364e-05,
568
+ 1.783337727392566e-05,
569
+ 1.6646981281307675e-05,
570
+ 1.546058528868969e-05,
571
+ 1.427418929607171e-05,
572
+ 1.2114948589506984e-05,
573
+ 1.0928552596889013e-05,
574
+ 9.742156604271029e-06,
575
+ 8.555760611653046e-06,
576
+ 7.369364619035064e-06,
577
+ 6.182968626417082e-06,
578
+ 4.996572633799099e-06
579
  ],
580
  "emb_lr": [],
581
  "eval_step": [
 
591
  3869
592
  ],
593
  "eval_accuracy": [
594
+ 0.01,
595
  0.0,
596
  0.0,
597
  0.0,
 
603
  0.0
604
  ]
605
  },
606
+ "final_accuracy": 0.7783333333333333,
607
  "sft_eval": {
608
  "config": {
609
  "ops": "add_sub",
610
  "K": null,
611
  "mode": "sft",
612
  "n_digits": 6,
613
+ "n_per_split": 100
614
  },
615
  "splits": {
616
  "add_S0": {
617
+ "full_accuracy": 0.98,
618
+ "n_examples": 100,
619
  "per_subtask": {
620
  "SA": {
621
+ "accuracy": 0.996694214876033,
622
+ "count": 605
623
  },
624
  "SS": {
625
  "accuracy": 1.0,
626
+ "count": 95
627
  }
628
  }
629
  },
630
  "add_S1": {
631
+ "full_accuracy": 0.86,
632
+ "n_examples": 100,
633
  "per_subtask": {
634
  "SA": {
635
+ "accuracy": 0.9754901960784313,
636
+ "count": 204
637
  },
638
  "SC": {
639
+ "accuracy": 0.9763313609467456,
640
+ "count": 169
641
  },
642
  "SS": {
643
  "accuracy": 1.0,
644
+ "count": 31
645
  },
646
  "UC": {
647
+ "accuracy": 0.9831081081081081,
648
+ "count": 296
649
  }
650
  }
651
  },
652
  "add_S2": {
653
+ "full_accuracy": 0.84,
654
+ "n_examples": 100,
655
  "per_subtask": {
656
  "SA": {
657
  "accuracy": 1.0,
658
+ "count": 163
659
  },
660
  "SC": {
661
+ "accuracy": 0.9615384615384616,
662
+ "count": 130
663
  },
664
  "SS": {
665
  "accuracy": 1.0,
666
+ "count": 87
667
  },
668
  "UC": {
669
+ "accuracy": 0.9408866995073891,
670
+ "count": 203
671
  },
672
  "US": {
673
+ "accuracy": 0.9914529914529915,
674
+ "count": 117
675
  }
676
  }
677
  },
678
  "add_S3": {
679
+ "full_accuracy": 0.33,
680
+ "n_examples": 100,
681
  "per_subtask": {
682
  "SA": {
683
+ "accuracy": 0.9834710743801653,
684
+ "count": 121
685
  },
686
  "SC": {
687
+ "accuracy": 0.9834710743801653,
688
+ "count": 121
689
  },
690
  "SS": {
691
+ "accuracy": 0.9591836734693877,
692
+ "count": 49
693
  },
694
  "UC": {
695
+ "accuracy": 0.6505376344086021,
696
+ "count": 186
697
  },
698
  "US": {
699
+ "accuracy": 0.9147982062780269,
700
+ "count": 223
701
  }
702
  }
703
  },
704
  "add_S4": {
705
+ "full_accuracy": 0.29,
706
+ "n_examples": 100,
707
  "per_subtask": {
708
  "SA": {
709
+ "accuracy": 0.9903846153846154,
710
+ "count": 104
711
  },
712
  "SC": {
713
+ "accuracy": 0.9716981132075472,
714
+ "count": 106
715
  },
716
  "SS": {
717
  "accuracy": 1.0,
718
+ "count": 23
719
  },
720
  "UC": {
721
+ "accuracy": 0.61875,
722
+ "count": 160
723
  },
724
  "US": {
725
+ "accuracy": 0.7100977198697068,
726
+ "count": 307
727
  }
728
  }
729
  },
730
  "add_S5": {
731
+ "full_accuracy": 0.14,
732
+ "n_examples": 100,
733
  "per_subtask": {
734
  "SA": {
735
  "accuracy": 1.0,
736
+ "count": 100
737
  },
738
  "SC": {
739
  "accuracy": 1.0,
740
+ "count": 100
741
  },
742
  "UC": {
743
+ "accuracy": 0.18,
744
+ "count": 100
745
  },
746
  "US": {
747
+ "accuracy": 0.4975,
748
+ "count": 400
749
  }
750
  }
751
  },
752
  "add_S6": {
753
+ "full_accuracy": 0.28,
754
+ "n_examples": 100,
755
  "per_subtask": {
756
  "SC": {
757
  "accuracy": 1.0,
758
+ "count": 100
759
  },
760
  "UC": {
761
+ "accuracy": 0.42,
762
+ "count": 100
763
  },
764
  "US": {
765
+ "accuracy": 0.526,
766
+ "count": 500
767
  }
768
  }
769
  },
770
  "add_random": {
771
+ "full_accuracy": 0.88,
772
  "n_examples": 200,
773
  "per_subtask": {
774
  "SA": {
775
+ "accuracy": 0.9888143176733781,
776
+ "count": 447
777
  },
778
  "SC": {
779
+ "accuracy": 0.9875,
780
+ "count": 320
781
  },
782
  "SS": {
783
  "accuracy": 1.0,
784
+ "count": 56
785
  },
786
  "UC": {
787
+ "accuracy": 0.9716446124763705,
788
+ "count": 529
789
  },
790
  "US": {
791
+ "accuracy": 0.9791666666666666,
792
+ "count": 48
793
  }
794
  }
795
  },
796
  "add_C3": {
797
+ "full_accuracy": 0.62,
798
+ "n_examples": 100,
799
  "per_subtask": {
800
  "SA": {
801
  "accuracy": 1.0,
802
+ "count": 300
803
  },
804
  "SC": {
805
+ "accuracy": 0.99,
806
+ "count": 100
807
  },
808
  "UC": {
809
+ "accuracy": 0.8082901554404145,
810
+ "count": 193
811
  },
812
  "US": {
813
+ "accuracy": 0.9158878504672897,
814
+ "count": 107
815
  }
816
  }
817
  },
818
  "add_C4": {
819
+ "full_accuracy": 0.65,
820
+ "n_examples": 100,
821
  "per_subtask": {
822
  "SA": {
823
  "accuracy": 1.0,
824
+ "count": 200
825
  },
826
  "SC": {
827
  "accuracy": 1.0,
828
+ "count": 100
829
  },
830
  "UC": {
831
+ "accuracy": 0.875,
832
+ "count": 256
833
  },
834
  "US": {
835
+ "accuracy": 0.8819444444444444,
836
+ "count": 144
837
  }
838
  }
839
  },
840
  "add_C5": {
841
+ "full_accuracy": 0.52,
842
+ "n_examples": 100,
843
  "per_subtask": {
844
  "SA": {
845
+ "accuracy": 0.95,
846
+ "count": 100
847
  },
848
  "SC": {
849
+ "accuracy": 0.99,
850
+ "count": 100
851
  },
852
  "UC": {
853
+ "accuracy": 0.8496732026143791,
854
+ "count": 306
855
  },
856
  "US": {
857
+ "accuracy": 0.8350515463917526,
858
+ "count": 194
859
  }
860
  }
861
  },
862
  "add_C6": {
863
+ "full_accuracy": 0.56,
864
+ "n_examples": 100,
865
  "per_subtask": {
866
  "SC": {
867
  "accuracy": 1.0,
868
+ "count": 100
869
  },
870
  "UC": {
871
+ "accuracy": 0.8852459016393442,
872
+ "count": 366
873
  },
874
  "US": {
875
+ "accuracy": 0.905982905982906,
876
+ "count": 234
877
  }
878
  }
879
  },
880
  "sub_M0": {
881
+ "full_accuracy": 0.91,
882
+ "n_examples": 100,
883
  "per_subtask": {
884
  "MD": {
885
+ "accuracy": 0.9850249584026622,
886
+ "count": 601
887
  },
888
  "ME": {
889
  "accuracy": 1.0,
890
+ "count": 99
891
  }
892
  }
893
  },
894
  "sub_M1": {
895
+ "full_accuracy": 0.94,
896
+ "n_examples": 100,
897
  "per_subtask": {
898
  "MD": {
899
+ "accuracy": 0.996415770609319,
900
+ "count": 279
901
  },
902
  "MB": {
903
+ "accuracy": 0.9862068965517241,
904
+ "count": 145
905
  },
906
  "ME": {
907
+ "accuracy": 0.9583333333333334,
908
+ "count": 24
909
  },
910
  "UB": {
911
+ "accuracy": 0.9920634920634921,
912
+ "count": 252
913
  }
914
  }
915
  },
916
  "sub_M2": {
917
+ "full_accuracy": 0.85,
918
+ "n_examples": 100,
919
  "per_subtask": {
920
  "MD": {
921
+ "accuracy": 0.9906103286384976,
922
+ "count": 213
923
  },
924
  "MB": {
925
+ "accuracy": 0.9823008849557522,
926
+ "count": 113
927
  },
928
  "ME": {
929
  "accuracy": 1.0,
930
+ "count": 85
931
  },
932
  "UB": {
933
+ "accuracy": 0.9281767955801105,
934
+ "count": 181
935
  },
936
  "UD": {
937
  "accuracy": 1.0,
938
+ "count": 108
939
  }
940
  }
941
  },
942
  "sub_M3": {
943
+ "full_accuracy": 0.3,
944
+ "n_examples": 100,
945
  "per_subtask": {
946
  "MD": {
947
  "accuracy": 1.0,
948
+ "count": 179
949
  },
950
  "MB": {
951
+ "accuracy": 0.9805825242718447,
952
+ "count": 103
953
  },
954
  "ME": {
955
  "accuracy": 1.0,
956
+ "count": 56
957
  },
958
  "UB": {
959
+ "accuracy": 0.6040268456375839,
960
+ "count": 149
961
  },
962
  "UD": {
963
+ "accuracy": 0.9061032863849765,
964
+ "count": 213
965
  }
966
  }
967
  },
968
  "sub_M4": {
969
+ "full_accuracy": 0.03,
970
+ "n_examples": 100,
971
  "per_subtask": {
972
  "MD": {
973
  "accuracy": 1.0,
974
+ "count": 200
975
  },
976
  "MB": {
977
  "accuracy": 1.0,
978
+ "count": 100
979
  },
980
  "UB": {
981
+ "accuracy": 0.43,
982
+ "count": 100
983
  },
984
  "UD": {
985
+ "accuracy": 0.49333333333333335,
986
+ "count": 300
987
  }
988
  }
989
  },
990
  "sub_M5": {
991
+ "full_accuracy": 0.06,
992
+ "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
995
  "accuracy": 1.0,
996
+ "count": 100
997
  },
998
  "MB": {
999
  "accuracy": 1.0,
1000
+ "count": 100
1001
  },
1002
  "UB": {
1003
+ "accuracy": 0.41,
1004
+ "count": 100
1005
  },
1006
  "UD": {
1007
+ "accuracy": 0.4075,
1008
+ "count": 400
1009
  }
1010
  }
1011
  },
1012
  "sub_random": {
1013
+ "full_accuracy": 0.875,
1014
  "n_examples": 200,
1015
  "per_subtask": {
1016
  "MD": {
1017
+ "accuracy": 0.99,
1018
+ "count": 600
1019
  },
1020
  "MB": {
1021
+ "accuracy": 0.9887640449438202,
1022
+ "count": 267
1023
  },
1024
  "ME": {
1025
  "accuracy": 1.0,
1026
+ "count": 53
1027
  },
1028
  "UB": {
1029
+ "accuracy": 0.9635535307517085,
1030
+ "count": 439
1031
  },
1032
  "UD": {
1033
+ "accuracy": 0.975609756097561,
1034
+ "count": 41
1035
  }
1036
  }
1037
  },
1038
  "sub_B3": {
1039
+ "full_accuracy": 0.75,
1040
+ "n_examples": 100,
1041
  "per_subtask": {
1042
  "MD": {
1043
  "accuracy": 1.0,
1044
+ "count": 300
1045
  },
1046
  "MB": {
1047
  "accuracy": 1.0,
1048
+ "count": 100
1049
  },
1050
  "UB": {
1051
+ "accuracy": 0.883248730964467,
1052
+ "count": 197
1053
  },
1054
  "UD": {
1055
+ "accuracy": 0.9514563106796117,
1056
+ "count": 103
1057
  }
1058
  }
1059
  },
1060
  "sub_B4": {
1061
+ "full_accuracy": 0.55,
1062
+ "n_examples": 100,
1063
  "per_subtask": {
1064
  "MD": {
1065
  "accuracy": 1.0,
1066
+ "count": 200
1067
  },
1068
  "MB": {
1069
  "accuracy": 1.0,
1070
+ "count": 100
1071
  },
1072
  "UB": {
1073
+ "accuracy": 0.8421052631578947,
1074
+ "count": 247
1075
  },
1076
  "UD": {
1077
+ "accuracy": 0.869281045751634,
1078
+ "count": 153
1079
  }
1080
  }
1081
  },
1082
  "sub_B5": {
1083
+ "full_accuracy": 0.48,
1084
+ "n_examples": 100,
1085
  "per_subtask": {
1086
  "MD": {
1087
  "accuracy": 1.0,
1088
+ "count": 100
1089
  },
1090
  "MB": {
1091
  "accuracy": 1.0,
1092
+ "count": 100
1093
  },
1094
  "UB": {
1095
+ "accuracy": 0.8624161073825504,
1096
+ "count": 298
1097
  },
1098
  "UD": {
1099
+ "accuracy": 0.8118811881188119,
1100
+ "count": 202
1101
  }
1102
  }
1103
  }
1104
  },
1105
  "summary": {
1106
+ "overall_accuracy": 0.6016666666666667,
1107
+ "total_examples": 2400,
1108
  "n_splits": 22
1109
  }
1110
  },
 
1114
  "K": 1,
1115
  "mode": "sorl",
1116
  "n_digits": 6,
1117
+ "n_per_split": 100
1118
  },
1119
  "splits": {
1120
  "add_S0": {
1121
  "full_accuracy": 1.0,
1122
+ "n_examples": 100,
1123
  "per_subtask": {
1124
  "SA": {
1125
  "accuracy": 1.0,
1126
+ "count": 605
1127
  },
1128
  "SS": {
1129
  "accuracy": 1.0,
1130
+ "count": 95
1131
  }
1132
  }
1133
  },
1134
  "add_S1": {
1135
  "full_accuracy": 1.0,
1136
+ "n_examples": 100,
1137
  "per_subtask": {
1138
  "SA": {
1139
  "accuracy": 1.0,
1140
+ "count": 204
1141
  },
1142
  "SC": {
1143
  "accuracy": 1.0,
1144
+ "count": 169
1145
  },
1146
  "SS": {
1147
  "accuracy": 1.0,
1148
+ "count": 31
1149
  },
1150
  "UC": {
1151
  "accuracy": 1.0,
1152
+ "count": 296
1153
  }
1154
  }
1155
  },
1156
  "add_S2": {
1157
+ "full_accuracy": 0.96,
1158
+ "n_examples": 100,
1159
  "per_subtask": {
1160
  "SA": {
1161
+ "accuracy": 0.9938650306748467,
1162
+ "count": 163
1163
  },
1164
  "SC": {
1165
+ "accuracy": 0.9923076923076923,
1166
+ "count": 130
1167
  },
1168
  "SS": {
1169
+ "accuracy": 0.9770114942528736,
1170
+ "count": 87
1171
  },
1172
  "UC": {
1173
+ "accuracy": 0.9901477832512315,
1174
+ "count": 203
1175
  },
1176
  "US": {
1177
  "accuracy": 1.0,
1178
+ "count": 117
1179
  }
1180
  }
1181
  },
1182
  "add_S3": {
1183
+ "full_accuracy": 0.67,
1184
+ "n_examples": 100,
1185
  "per_subtask": {
1186
  "SA": {
1187
  "accuracy": 1.0,
1188
+ "count": 121
1189
  },
1190
  "SC": {
1191
+ "accuracy": 0.9917355371900827,
1192
+ "count": 121
1193
  },
1194
  "SS": {
1195
  "accuracy": 1.0,
1196
+ "count": 49
1197
  },
1198
  "UC": {
1199
+ "accuracy": 0.8279569892473119,
1200
+ "count": 186
1201
  },
1202
  "US": {
1203
+ "accuracy": 0.9955156950672646,
1204
+ "count": 223
1205
  }
1206
  }
1207
  },
1208
  "add_S4": {
1209
+ "full_accuracy": 0.45,
1210
+ "n_examples": 100,
1211
  "per_subtask": {
1212
  "SA": {
1213
  "accuracy": 1.0,
1214
+ "count": 104
1215
  },
1216
  "SC": {
1217
  "accuracy": 1.0,
1218
+ "count": 106
1219
  },
1220
  "SS": {
1221
  "accuracy": 1.0,
1222
+ "count": 23
1223
  },
1224
  "UC": {
1225
+ "accuracy": 0.6875,
1226
+ "count": 160
1227
  },
1228
  "US": {
1229
+ "accuracy": 0.8273615635179153,
1230
+ "count": 307
1231
  }
1232
  }
1233
  },
1234
  "add_S5": {
1235
+ "full_accuracy": 0.21,
1236
+ "n_examples": 100,
1237
  "per_subtask": {
1238
  "SA": {
1239
  "accuracy": 1.0,
1240
+ "count": 100
1241
  },
1242
  "SC": {
1243
  "accuracy": 1.0,
1244
+ "count": 100
1245
  },
1246
  "UC": {
1247
+ "accuracy": 0.33,
1248
+ "count": 100
1249
  },
1250
  "US": {
1251
+ "accuracy": 0.6175,
1252
+ "count": 400
1253
  }
1254
  }
1255
  },
1256
  "add_S6": {
1257
+ "full_accuracy": 0.19,
1258
+ "n_examples": 100,
1259
  "per_subtask": {
1260
  "SC": {
1261
  "accuracy": 1.0,
1262
+ "count": 100
1263
  },
1264
  "UC": {
1265
+ "accuracy": 0.34,
1266
+ "count": 100
1267
  },
1268
  "US": {
1269
+ "accuracy": 0.54,
1270
+ "count": 500
1271
  }
1272
  }
1273
  },
1274
  "add_random": {
1275
+ "full_accuracy": 0.97,
1276
  "n_examples": 200,
1277
  "per_subtask": {
1278
  "SA": {
1279
+ "accuracy": 0.9888143176733781,
1280
+ "count": 447
1281
  },
1282
  "SC": {
1283
  "accuracy": 1.0,
1284
+ "count": 320
1285
  },
1286
  "SS": {
1287
+ "accuracy": 0.9642857142857143,
1288
+ "count": 56
1289
  },
1290
  "UC": {
1291
+ "accuracy": 0.998109640831758,
1292
+ "count": 529
1293
  },
1294
  "US": {
1295
  "accuracy": 1.0,
1296
+ "count": 48
1297
  }
1298
  }
1299
  },
1300
  "add_C3": {
1301
+ "full_accuracy": 0.82,
1302
+ "n_examples": 100,
1303
  "per_subtask": {
1304
  "SA": {
1305
  "accuracy": 1.0,
1306
+ "count": 300
1307
  },
1308
  "SC": {
1309
  "accuracy": 1.0,
1310
+ "count": 100
1311
  },
1312
  "UC": {
1313
+ "accuracy": 0.9067357512953368,
1314
+ "count": 193
1315
  },
1316
  "US": {
1317
+ "accuracy": 0.9906542056074766,
1318
+ "count": 107
1319
  }
1320
  }
1321
  },
1322
  "add_C4": {
1323
+ "full_accuracy": 0.79,
1324
+ "n_examples": 100,
1325
  "per_subtask": {
1326
  "SA": {
1327
  "accuracy": 1.0,
1328
+ "count": 200
1329
  },
1330
  "SC": {
1331
  "accuracy": 1.0,
1332
+ "count": 100
1333
  },
1334
  "UC": {
1335
+ "accuracy": 0.92578125,
1336
+ "count": 256
1337
  },
1338
  "US": {
1339
+ "accuracy": 0.9583333333333334,
1340
+ "count": 144
1341
  }
1342
  }
1343
  },
1344
  "add_C5": {
1345
+ "full_accuracy": 0.78,
1346
+ "n_examples": 100,
1347
  "per_subtask": {
1348
  "SA": {
1349
  "accuracy": 1.0,
1350
+ "count": 100
1351
  },
1352
  "SC": {
1353
  "accuracy": 1.0,
1354
+ "count": 100
1355
  },
1356
  "UC": {
1357
+ "accuracy": 0.9379084967320261,
1358
+ "count": 306
1359
  },
1360
  "US": {
1361
+ "accuracy": 0.865979381443299,
1362
+ "count": 194
1363
  }
1364
  }
1365
  },
1366
  "add_C6": {
1367
+ "full_accuracy": 0.85,
1368
+ "n_examples": 100,
1369
  "per_subtask": {
1370
  "SC": {
1371
  "accuracy": 1.0,
1372
+ "count": 100
1373
  },
1374
  "UC": {
1375
+ "accuracy": 0.9617486338797814,
1376
+ "count": 366
1377
  },
1378
  "US": {
1379
+ "accuracy": 0.9658119658119658,
1380
+ "count": 234
1381
  }
1382
  }
1383
  },
1384
  "sub_M0": {
1385
+ "full_accuracy": 0.96,
1386
+ "n_examples": 100,
1387
  "per_subtask": {
1388
  "MD": {
1389
+ "accuracy": 0.9933444259567388,
1390
+ "count": 601
1391
  },
1392
  "ME": {
1393
  "accuracy": 1.0,
1394
+ "count": 99
1395
  }
1396
  }
1397
  },
1398
  "sub_M1": {
1399
  "full_accuracy": 1.0,
1400
+ "n_examples": 100,
1401
  "per_subtask": {
1402
  "MD": {
1403
  "accuracy": 1.0,
1404
+ "count": 279
1405
  },
1406
  "MB": {
1407
  "accuracy": 1.0,
1408
+ "count": 145
1409
  },
1410
  "ME": {
1411
  "accuracy": 1.0,
1412
+ "count": 24
1413
  },
1414
  "UB": {
1415
  "accuracy": 1.0,
1416
+ "count": 252
1417
  }
1418
  }
1419
  },
1420
  "sub_M2": {
1421
+ "full_accuracy": 0.97,
1422
+ "n_examples": 100,
1423
  "per_subtask": {
1424
  "MD": {
1425
+ "accuracy": 0.9953051643192489,
1426
+ "count": 213
1427
  },
1428
  "MB": {
1429
  "accuracy": 1.0,
1430
+ "count": 113
1431
  },
1432
  "ME": {
1433
  "accuracy": 1.0,
1434
+ "count": 85
1435
  },
1436
  "UB": {
1437
+ "accuracy": 0.988950276243094,
1438
+ "count": 181
1439
  },
1440
  "UD": {
1441
  "accuracy": 1.0,
1442
+ "count": 108
1443
  }
1444
  }
1445
  },
1446
  "sub_M3": {
1447
+ "full_accuracy": 0.73,
1448
+ "n_examples": 100,
1449
  "per_subtask": {
1450
  "MD": {
1451
  "accuracy": 1.0,
1452
+ "count": 179
1453
  },
1454
  "MB": {
1455
  "accuracy": 1.0,
1456
+ "count": 103
1457
  },
1458
  "ME": {
1459
  "accuracy": 1.0,
1460
+ "count": 56
1461
  },
1462
  "UB": {
1463
+ "accuracy": 0.8322147651006712,
1464
+ "count": 149
1465
  },
1466
  "UD": {
1467
+ "accuracy": 0.9906103286384976,
1468
+ "count": 213
1469
  }
1470
  }
1471
  },
1472
  "sub_M4": {
1473
+ "full_accuracy": 0.48,
1474
+ "n_examples": 100,
1475
  "per_subtask": {
1476
  "MD": {
1477
  "accuracy": 1.0,
1478
+ "count": 200
1479
  },
1480
  "MB": {
1481
  "accuracy": 1.0,
1482
+ "count": 100
1483
  },
1484
  "UB": {
1485
+ "accuracy": 0.62,
1486
+ "count": 100
1487
  },
1488
  "UD": {
1489
+ "accuracy": 0.8666666666666667,
1490
+ "count": 300
1491
  }
1492
  }
1493
  },
1494
  "sub_M5": {
1495
+ "full_accuracy": 0.35,
1496
+ "n_examples": 100,
1497
  "per_subtask": {
1498
  "MD": {
1499
  "accuracy": 1.0,
1500
+ "count": 100
1501
  },
1502
  "MB": {
1503
  "accuracy": 1.0,
1504
+ "count": 100
1505
  },
1506
  "UB": {
1507
+ "accuracy": 0.68,
1508
+ "count": 100
1509
  },
1510
  "UD": {
1511
+ "accuracy": 0.755,
1512
+ "count": 400
1513
  }
1514
  }
1515
  },
1516
  "sub_random": {
1517
+ "full_accuracy": 0.985,
1518
  "n_examples": 200,
1519
  "per_subtask": {
1520
  "MD": {
1521
+ "accuracy": 0.9966666666666667,
1522
+ "count": 600
1523
  },
1524
  "MB": {
1525
+ "accuracy": 0.9962546816479401,
1526
+ "count": 267
1527
  },
1528
  "ME": {
1529
  "accuracy": 1.0,
1530
+ "count": 53
1531
  },
1532
  "UB": {
1533
  "accuracy": 1.0,
1534
+ "count": 439
1535
  },
1536
  "UD": {
1537
  "accuracy": 1.0,
1538
+ "count": 41
1539
  }
1540
  }
1541
  },
1542
  "sub_B3": {
1543
+ "full_accuracy": 0.9,
1544
+ "n_examples": 100,
1545
  "per_subtask": {
1546
  "MD": {
1547
+ "accuracy": 0.9966666666666667,
1548
+ "count": 300
1549
  },
1550
  "MB": {
1551
  "accuracy": 1.0,
1552
+ "count": 100
1553
  },
1554
  "UB": {
1555
+ "accuracy": 0.9543147208121827,
1556
+ "count": 197
1557
  },
1558
  "UD": {
1559
+ "accuracy": 0.9902912621359223,
1560
+ "count": 103
1561
  }
1562
  }
1563
  },
1564
  "sub_B4": {
1565
+ "full_accuracy": 0.87,
1566
+ "n_examples": 100,
1567
  "per_subtask": {
1568
  "MD": {
1569
  "accuracy": 1.0,
1570
+ "count": 200
1571
  },
1572
  "MB": {
1573
  "accuracy": 1.0,
1574
+ "count": 100
1575
  },
1576
  "UB": {
1577
+ "accuracy": 0.9554655870445344,
1578
+ "count": 247
1579
  },
1580
  "UD": {
1581
+ "accuracy": 0.954248366013072,
1582
+ "count": 153
1583
  }
1584
  }
1585
  },
1586
  "sub_B5": {
1587
+ "full_accuracy": 0.79,
1588
+ "n_examples": 100,
1589
  "per_subtask": {
1590
  "MD": {
1591
  "accuracy": 1.0,
1592
+ "count": 100
1593
  },
1594
  "MB": {
1595
  "accuracy": 1.0,
1596
+ "count": 100
1597
  },
1598
  "UB": {
1599
+ "accuracy": 0.9328859060402684,
1600
+ "count": 298
1601
  },
1602
  "UD": {
1603
+ "accuracy": 0.9554455445544554,
1604
+ "count": 202
1605
  }
1606
  }
1607
  }
1608
  },
1609
  "summary": {
1610
+ "overall_accuracy": 0.7783333333333333,
1611
+ "total_examples": 2400,
1612
  "n_splits": 22
1613
  }
1614
  },
1615
+ "sorl_overall_accuracy": 0.7783333333333333,
1616
+ "sft_overall_accuracy": 0.6016666666666667
1617
  }
add_sub_sorl_v1_abs10_K1_25K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ccab70b2517264fbfc0cc6c61a4eb7e47555f03c4db8db8c06ccd93ca673eac1
3
  size 650303660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3abff1d3d65f28a467e0107b3021da4659af9733b2521b6687ffa6fbb31e60f
3
  size 650303660
add_sub_sorl_v1_abs10_K1_25K/train_config.json CHANGED
@@ -1,35 +1,84 @@
1
  {
2
- "mode": "sorl",
3
- "ops": "add_sub",
4
- "n_digits": 6,
5
- "n_layer": 2,
6
- "n_head": 3,
7
- "n_embd": 510,
8
- "abs_vocab": 10,
9
  "K": 1,
 
 
 
 
 
 
10
  "alpha_info_gain": 10.0,
11
  "alpha_abs": 0.1,
12
  "alpha_soft_zipf": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  "batch_size": 64,
 
14
  "num_epochs": 10,
15
- "dataset_size": 25000,
16
- "lr": 8e-05,
 
 
 
17
  "output_dir": "ckpt/sweep/as_sorl_abs10_K1_25K",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  "device": "cuda",
19
  "push_to_hub": true,
20
  "no_wandb": false,
21
  "n_params": 162499262,
22
  "run_name": "add_sub_sorl_v1_abs10_K1_25K",
23
- "git_commit": "800625019270114adcda289bbd550c4f1109a514",
24
- "timestamp": "2026-04-12T01:58:11.626522+00:00",
25
  "tokenizer": "Qwen/Qwen3-0.6B",
26
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
27
  "dataset_config": "add_sub_6digit",
28
  "model_repo": "thoughtworks/arithmetic-sorl",
29
  "trainer_version": "v1",
30
- "wandb_run_id": "nv7szjmr",
31
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/nv7szjmr",
32
- "final_accuracy": 0.9707142857142858,
33
- "sft_accuracy": 0.9442857142857143,
34
  "eval_method": "ArithmeticEvaluator"
35
  }
 
1
  {
2
+ "num_rollouts": 4,
 
 
 
 
 
 
3
  "K": 1,
4
+ "max_iterations": 2,
5
+ "memory_span_abs": 1792,
6
+ "memory_span_traj": 1792,
7
+ "temperature": 1.0,
8
+ "ar_search": false,
9
+ "response_only_abs": false,
10
  "alpha_info_gain": 10.0,
11
  "alpha_abs": 0.1,
12
  "alpha_soft_zipf": 1.0,
13
+ "alpha_ortho": 0.0,
14
+ "alpha_anchor": 0.0,
15
+ "alpha_jacobi": 0.0,
16
+ "decay": 0.8,
17
+ "target_vocab_util": 0.8,
18
+ "min_abs_ppl": 0.0,
19
+ "zipf_alpha": 1.0,
20
+ "lr": 4e-05,
21
+ "emb_lr_mult": 1.0,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 117,
24
+ "cooldown_frac": 0.4,
25
+ "max_grad_norm": 1.0,
26
+ "vq_abs_pretrain_steps": 0,
27
+ "vq_abs_pretrain_lr": 0.001,
28
+ "vq_abs_pretrain_layer": -1,
29
+ "vq_abs_pretrain_batch_size": 256,
30
+ "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
+ "gradient_accumulation_steps": 1,
33
  "num_epochs": 10,
34
+ "emb_warmup_steps": 0,
35
+ "log_every": 50,
36
+ "eval_every": 390,
37
+ "save_every": 999999,
38
+ "eval_samples": 100,
39
  "output_dir": "ckpt/sweep/as_sorl_abs10_K1_25K",
40
+ "eval_K": 4,
41
+ "alpha_traj": 0.0,
42
+ "corrupt_method": "shuffle",
43
+ "corrupt_ratio": 0.3,
44
+ "alpha_contrastive": 1.0,
45
+ "gamma_contrastive": 0.5,
46
+ "alpha_masked_traj": 0.0,
47
+ "mask_nl_ratio": 0.3,
48
+ "mask_nl_mode": "fixed",
49
+ "mask_nl_fixed_id": 0,
50
+ "use_ste": true,
51
+ "n_inner": 1,
52
+ "random_K": null,
53
+ "strip_suffix": null,
54
+ "compress_prefix": null,
55
+ "random_mem_span": null,
56
+ "warmup_ratio": 0.03,
57
+ "beta2": 0.999,
58
+ "seed": 42,
59
+ "n_digits": 6,
60
+ "n_layer": 2,
61
+ "n_head": 3,
62
+ "n_embd": 510,
63
+ "ops": "add_sub",
64
+ "abs_vocab": 10,
65
+ "dataset_size": 25000,
66
+ "mode": "sorl",
67
  "device": "cuda",
68
  "push_to_hub": true,
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_25K",
72
+ "git_commit": "78d46f8665a87f4b44bd5894bd34f393f2dea51f",
73
+ "timestamp": "2026-04-12T08:59:10.014670+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
+ "wandb_run_id": "je0pfgat",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/je0pfgat",
81
+ "final_accuracy": 0.7783333333333333,
82
+ "sft_accuracy": 0.6016666666666667,
83
  "eval_method": "ArithmeticEvaluator"
84
  }