amirali1985 commited on
Commit
a811bb8
·
verified ·
1 Parent(s): 166906a

Upload add_sub_sorl_v1_abs10_K1_10K

Browse files
add_sub_sorl_v1_abs10_K1_10K/metrics.json CHANGED
@@ -33,229 +33,229 @@
33
  1563
34
  ],
35
  "loss": [
36
- 4.547272682189941,
37
- 7.030660629272461,
38
- 3.9523563385009766,
39
- 3.047678232192993,
40
- 3.085705280303955,
41
- 2.618412494659424,
42
- 2.579326629638672,
43
- 1.4848873615264893,
44
- -0.47891831398010254,
45
- -5.560091495513916,
46
- -4.520766735076904,
47
- -2.9085822105407715,
48
- -2.024510383605957,
49
- -1.1992690563201904,
50
- -0.9796783924102783,
51
- -0.5373831987380981,
52
- -0.5736587047576904,
53
- -0.7218008041381836,
54
- -0.23317885398864746,
55
- -0.339227557182312,
56
- -0.03128504753112793,
57
- -0.006059408187866211,
58
- 0.16202318668365479,
59
- 0.23520565032958984,
60
- 0.015602707862854004,
61
- 0.4141479730606079,
62
- 0.1808193325996399,
63
- 0.15000015497207642,
64
- 0.025604248046875,
65
- 0.30849093198776245
66
  ],
67
  "base_loss": [
68
- 8.916278839111328,
69
- 6.119595050811768,
70
- 3.673598527908325,
71
- 2.020040273666382,
72
- 1.944067120552063,
73
- 1.8892498016357422,
74
- 1.86012864112854,
75
- 1.8146944046020508,
76
- 1.869857907295227,
77
- 1.7928285598754883,
78
- 1.4957207441329956,
79
- 1.1758774518966675,
80
- 0.93010014295578,
81
- 0.6884778141975403,
82
- 0.6303120255470276,
83
- 0.5103054046630859,
84
- 0.4800644516944885,
85
- 0.431017130613327,
86
- 0.3630550801753998,
87
- 0.3632798194885254,
88
- 0.32823804020881653,
89
- 0.2981429398059845,
90
- 0.24132393300533295,
91
- 0.23764917254447937,
92
- 0.2843627333641052,
93
- 0.19818377494812012,
94
- 0.2125009298324585,
95
- 0.18415263295173645,
96
- 0.19887767732143402,
97
- 0.1734028160572052
98
  ],
99
  "info_loss": [
100
- -1.2568669319152832,
101
- -0.31183719635009766,
102
- -0.1900501251220703,
103
- -0.09034490585327148,
104
- -0.07560265064239502,
105
- -0.11611974239349365,
106
- -0.11614418029785156,
107
- -0.2208923101425171,
108
- -0.42284250259399414,
109
- -0.9240456819534302,
110
- -0.7907073497772217,
111
- -0.5979017615318298,
112
- -0.4831601679325104,
113
- -0.3740352690219879,
114
- -0.34062066674232483,
115
- -0.2785865366458893,
116
- -0.27203837037086487,
117
- -0.27023085951805115,
118
- -0.20767968893051147,
119
- -0.21481892466545105,
120
- -0.16902248561382294,
121
- -0.1613115668296814,
122
- -0.1327231526374817,
123
- -0.12193454056978226,
124
- -0.14406734704971313,
125
- -0.09546159207820892,
126
- -0.1119590476155281,
127
- -0.11057599633932114,
128
- -0.12191784381866455,
129
- -0.09515686333179474
130
  ],
131
  "abs_loss": [
132
- 2.28905987739563,
133
- 2.068211555480957,
134
- 1.8956918716430664,
135
- 1.865671157836914,
136
- 1.8401455879211426,
137
- 1.8589462041854858,
138
- 1.8197526931762695,
139
- 1.8445491790771484,
140
- 1.8481858968734741,
141
- 1.8446826934814453,
142
- 1.8715795278549194,
143
- 1.8394945859909058,
144
- 1.7485918998718262,
145
- 1.6203765869140625,
146
- 1.4851690530776978,
147
- 1.3853811025619507,
148
- 1.1656250953674316,
149
- 1.0630995035171509,
150
- 0.8788065910339355,
151
- 0.8097158074378967,
152
- 0.7307209968566895,
153
- 0.6055489182472229,
154
- 0.6438609957695007,
155
- 0.5721376538276672,
156
- 0.5374541878700256,
157
- 0.5297055840492249,
158
- 0.45829495787620544,
159
- 0.428969144821167,
160
- 0.47001174092292786,
161
- 0.4526701867580414
162
  ],
163
  "zipf_loss": [
164
- 7.970757007598877,
165
- 3.8226165771484375,
166
- 1.9896899461746216,
167
- 1.7445199489593506,
168
- 1.713650107383728,
169
- 1.704465627670288,
170
- 1.6986644268035889,
171
- 1.6946611404418945,
172
- 1.6948304176330566,
173
- 1.7030682563781738,
174
- 1.7034276723861694,
175
- 1.7106086015701294,
176
- 1.702131748199463,
177
- 1.690568208694458,
178
- 1.6476993560791016,
179
- 1.59963858127594,
180
- 1.550097942352295,
181
- 1.443180799484253,
182
- 1.392682433128357,
183
- 1.3647103309631348,
184
- 1.2576297521591187,
185
- 1.2483584880828857,
186
- 1.183544635772705,
187
- 1.159688115119934,
188
- 1.1181681156158447,
189
- 1.1176096200942993,
190
- 1.0420794486999512,
191
- 1.0287106037139893,
192
- 0.9989038109779358,
193
- 1.0413897037506104
194
  ],
195
  "denoise_loss": [],
196
  "ortho_loss": [
197
- 0.5579876899719238,
198
- 0.252128541469574,
199
- 0.21211421489715576,
200
- 0.18933552503585815,
201
- 0.12901675701141357,
202
- 0.13211114704608917,
203
- 0.14735311269760132,
204
- 0.14610400795936584,
205
- 0.1557065099477768,
206
- 0.17582964897155762,
207
- 0.1939730942249298,
208
- 0.20376113057136536,
209
- 0.21460682153701782,
210
- 0.23509620130062103,
211
- 0.24628381431102753,
212
- 0.2590982913970947,
213
- 0.25852254033088684,
214
- 0.2667267620563507,
215
- 0.26971182227134705,
216
- 0.2820512354373932,
217
- 0.2821248769760132,
218
- 0.28232815861701965,
219
- 0.2819784879684448,
220
- 0.2866751551628113,
221
- 0.2907232642173767,
222
- 0.2921256422996521,
223
- 0.2953319251537323,
224
- 0.29460301995277405,
225
- 0.295316606760025,
226
- 0.2968752384185791
227
  ],
228
  "lr": [
229
- 1.9600000000000002e-05,
230
- 3.96e-05,
231
- 4e-05,
232
- 4e-05,
233
- 4e-05,
234
- 4e-05,
235
- 4e-05,
236
- 4e-05,
237
- 4e-05,
238
- 4e-05,
239
- 4e-05,
240
- 4e-05,
241
- 4e-05,
242
- 4e-05,
243
- 4e-05,
244
- 4e-05,
245
- 4e-05,
246
- 4e-05,
247
- 3.944897959183673e-05,
248
- 3.638775510204082e-05,
249
- 3.3326530612244897e-05,
250
- 2.983673469387755e-05,
251
- 2.6775510204081637e-05,
252
- 2.3714285714285713e-05,
253
- 2.022448979591837e-05,
254
- 1.7163265306122454e-05,
255
- 1.4102040816326535e-05,
256
- 1.0612244897959182e-05,
257
- 7.551020408163262e-06,
258
- 4.48979591836735e-06
259
  ],
260
  "emb_lr": [],
261
  "eval_step": [
@@ -283,7 +283,7 @@
283
  0.0
284
  ]
285
  },
286
- "final_accuracy": 0.6116666666666667,
287
  "sft_eval": {
288
  "config": {
289
  "ops": "add_sub",
@@ -294,21 +294,21 @@
294
  },
295
  "splits": {
296
  "add_S0": {
297
- "full_accuracy": 0.89,
298
  "n_examples": 100,
299
  "per_subtask": {
300
  "SA": {
301
- "accuracy": 0.9834710743801653,
302
  "count": 605
303
  },
304
  "SS": {
305
- "accuracy": 0.9473684210526315,
306
  "count": 95
307
  }
308
  }
309
  },
310
  "add_S1": {
311
- "full_accuracy": 0.74,
312
  "n_examples": 100,
313
  "per_subtask": {
314
  "SA": {
@@ -316,7 +316,7 @@
316
  "count": 204
317
  },
318
  "SC": {
319
- "accuracy": 0.9467455621301775,
320
  "count": 169
321
  },
322
  "SS": {
@@ -324,91 +324,91 @@
324
  "count": 31
325
  },
326
  "UC": {
327
- "accuracy": 0.9391891891891891,
328
  "count": 296
329
  }
330
  }
331
  },
332
  "add_S2": {
333
- "full_accuracy": 0.5,
334
  "n_examples": 100,
335
  "per_subtask": {
336
  "SA": {
337
- "accuracy": 0.9631901840490797,
338
  "count": 163
339
  },
340
  "SC": {
341
- "accuracy": 0.9230769230769231,
342
  "count": 130
343
  },
344
  "SS": {
345
- "accuracy": 0.8505747126436781,
346
  "count": 87
347
  },
348
  "UC": {
349
- "accuracy": 0.8325123152709359,
350
  "count": 203
351
  },
352
  "US": {
353
- "accuracy": 0.9743589743589743,
354
  "count": 117
355
  }
356
  }
357
  },
358
  "add_S3": {
359
- "full_accuracy": 0.29,
360
  "n_examples": 100,
361
  "per_subtask": {
362
  "SA": {
363
- "accuracy": 0.9917355371900827,
364
  "count": 121
365
  },
366
  "SC": {
367
- "accuracy": 0.9421487603305785,
368
  "count": 121
369
  },
370
  "SS": {
371
- "accuracy": 0.8979591836734694,
372
  "count": 49
373
  },
374
  "UC": {
375
- "accuracy": 0.7204301075268817,
376
  "count": 186
377
  },
378
  "US": {
379
- "accuracy": 0.8340807174887892,
380
  "count": 223
381
  }
382
  }
383
  },
384
  "add_S4": {
385
- "full_accuracy": 0.19,
386
  "n_examples": 100,
387
  "per_subtask": {
388
  "SA": {
389
- "accuracy": 0.9711538461538461,
390
  "count": 104
391
  },
392
  "SC": {
393
- "accuracy": 0.9622641509433962,
394
  "count": 106
395
  },
396
  "SS": {
397
- "accuracy": 0.9565217391304348,
398
  "count": 23
399
  },
400
  "UC": {
401
- "accuracy": 0.59375,
402
  "count": 160
403
  },
404
  "US": {
405
- "accuracy": 0.6156351791530945,
406
  "count": 307
407
  }
408
  }
409
  },
410
  "add_S5": {
411
- "full_accuracy": 0.19,
412
  "n_examples": 100,
413
  "per_subtask": {
414
  "SA": {
@@ -416,65 +416,65 @@
416
  "count": 100
417
  },
418
  "SC": {
419
- "accuracy": 0.96,
420
  "count": 100
421
  },
422
  "UC": {
423
- "accuracy": 0.31,
424
  "count": 100
425
  },
426
  "US": {
427
- "accuracy": 0.4775,
428
  "count": 400
429
  }
430
  }
431
  },
432
  "add_S6": {
433
- "full_accuracy": 0.53,
434
  "n_examples": 100,
435
  "per_subtask": {
436
  "SC": {
437
- "accuracy": 0.99,
438
  "count": 100
439
  },
440
  "UC": {
441
- "accuracy": 0.65,
442
  "count": 100
443
  },
444
  "US": {
445
- "accuracy": 0.672,
446
  "count": 500
447
  }
448
  }
449
  },
450
  "add_random": {
451
- "full_accuracy": 0.73,
452
  "n_examples": 200,
453
  "per_subtask": {
454
  "SA": {
455
- "accuracy": 0.9776286353467561,
456
  "count": 447
457
  },
458
  "SC": {
459
- "accuracy": 0.975,
460
  "count": 320
461
  },
462
  "SS": {
463
- "accuracy": 0.9821428571428571,
464
  "count": 56
465
  },
466
  "UC": {
467
- "accuracy": 0.9338374291115312,
468
  "count": 529
469
  },
470
  "US": {
471
- "accuracy": 0.9583333333333334,
472
  "count": 48
473
  }
474
  }
475
  },
476
  "add_C3": {
477
- "full_accuracy": 0.42,
478
  "n_examples": 100,
479
  "per_subtask": {
480
  "SA": {
@@ -486,61 +486,61 @@
486
  "count": 100
487
  },
488
  "UC": {
489
- "accuracy": 0.7253886010362695,
490
  "count": 193
491
  },
492
  "US": {
493
- "accuracy": 0.8504672897196262,
494
  "count": 107
495
  }
496
  }
497
  },
498
  "add_C4": {
499
- "full_accuracy": 0.43,
500
  "n_examples": 100,
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 1.0,
504
  "count": 200
505
  },
506
  "SC": {
507
- "accuracy": 0.96,
508
  "count": 100
509
  },
510
  "UC": {
511
- "accuracy": 0.76953125,
512
  "count": 256
513
  },
514
  "US": {
515
- "accuracy": 0.8055555555555556,
516
  "count": 144
517
  }
518
  }
519
  },
520
  "add_C5": {
521
- "full_accuracy": 0.31,
522
  "n_examples": 100,
523
  "per_subtask": {
524
  "SA": {
525
- "accuracy": 1.0,
526
  "count": 100
527
  },
528
  "SC": {
529
- "accuracy": 0.98,
530
  "count": 100
531
  },
532
  "UC": {
533
- "accuracy": 0.7745098039215687,
534
  "count": 306
535
  },
536
  "US": {
537
- "accuracy": 0.7783505154639175,
538
  "count": 194
539
  }
540
  }
541
  },
542
  "add_C6": {
543
- "full_accuracy": 0.44,
544
  "n_examples": 100,
545
  "per_subtask": {
546
  "SC": {
@@ -548,39 +548,39 @@
548
  "count": 100
549
  },
550
  "UC": {
551
- "accuracy": 0.8306010928961749,
552
  "count": 366
553
  },
554
  "US": {
555
- "accuracy": 0.8931623931623932,
556
  "count": 234
557
  }
558
  }
559
  },
560
  "sub_M0": {
561
- "full_accuracy": 0.87,
562
  "n_examples": 100,
563
  "per_subtask": {
564
  "MD": {
565
- "accuracy": 0.9750415973377704,
566
  "count": 601
567
  },
568
  "ME": {
569
- "accuracy": 1.0,
570
  "count": 99
571
  }
572
  }
573
  },
574
  "sub_M1": {
575
- "full_accuracy": 0.82,
576
  "n_examples": 100,
577
  "per_subtask": {
578
  "MD": {
579
- "accuracy": 0.989247311827957,
580
  "count": 279
581
  },
582
  "MB": {
583
- "accuracy": 0.9586206896551724,
584
  "count": 145
585
  },
586
  "ME": {
@@ -588,47 +588,47 @@
588
  "count": 24
589
  },
590
  "UB": {
591
- "accuracy": 0.9563492063492064,
592
  "count": 252
593
  }
594
  }
595
  },
596
  "sub_M2": {
597
- "full_accuracy": 0.47,
598
  "n_examples": 100,
599
  "per_subtask": {
600
  "MD": {
601
- "accuracy": 0.9953051643192489,
602
  "count": 213
603
  },
604
  "MB": {
605
- "accuracy": 0.9380530973451328,
606
  "count": 113
607
  },
608
  "ME": {
609
- "accuracy": 1.0,
610
  "count": 85
611
  },
612
  "UB": {
613
- "accuracy": 0.7237569060773481,
614
  "count": 181
615
  },
616
  "UD": {
617
- "accuracy": 0.9444444444444444,
618
  "count": 108
619
  }
620
  }
621
  },
622
  "sub_M3": {
623
- "full_accuracy": 0.12,
624
  "n_examples": 100,
625
  "per_subtask": {
626
  "MD": {
627
- "accuracy": 0.994413407821229,
628
  "count": 179
629
  },
630
  "MB": {
631
- "accuracy": 0.9805825242718447,
632
  "count": 103
633
  },
634
  "ME": {
@@ -636,17 +636,17 @@
636
  "count": 56
637
  },
638
  "UB": {
639
- "accuracy": 0.5302013422818792,
640
  "count": 149
641
  },
642
  "UD": {
643
- "accuracy": 0.7417840375586855,
644
  "count": 213
645
  }
646
  }
647
  },
648
  "sub_M4": {
649
- "full_accuracy": 0.0,
650
  "n_examples": 100,
651
  "per_subtask": {
652
  "MD": {
@@ -654,21 +654,21 @@
654
  "count": 200
655
  },
656
  "MB": {
657
- "accuracy": 1.0,
658
  "count": 100
659
  },
660
  "UB": {
661
- "accuracy": 0.36,
662
  "count": 100
663
  },
664
  "UD": {
665
- "accuracy": 0.2966666666666667,
666
  "count": 300
667
  }
668
  }
669
  },
670
  "sub_M5": {
671
- "full_accuracy": 0.01,
672
  "n_examples": 100,
673
  "per_subtask": {
674
  "MD": {
@@ -676,29 +676,29 @@
676
  "count": 100
677
  },
678
  "MB": {
679
- "accuracy": 1.0,
680
  "count": 100
681
  },
682
  "UB": {
683
- "accuracy": 0.3,
684
  "count": 100
685
  },
686
  "UD": {
687
- "accuracy": 0.215,
688
  "count": 400
689
  }
690
  }
691
  },
692
  "sub_random": {
693
- "full_accuracy": 0.765,
694
  "n_examples": 200,
695
  "per_subtask": {
696
  "MD": {
697
- "accuracy": 0.9783333333333334,
698
  "count": 600
699
  },
700
  "MB": {
701
- "accuracy": 0.9700374531835206,
702
  "count": 267
703
  },
704
  "ME": {
@@ -706,21 +706,21 @@
706
  "count": 53
707
  },
708
  "UB": {
709
- "accuracy": 0.9384965831435079,
710
  "count": 439
711
  },
712
  "UD": {
713
- "accuracy": 0.9512195121951219,
714
  "count": 41
715
  }
716
  }
717
  },
718
  "sub_B3": {
719
- "full_accuracy": 0.49,
720
  "n_examples": 100,
721
  "per_subtask": {
722
  "MD": {
723
- "accuracy": 0.9966666666666667,
724
  "count": 300
725
  },
726
  "MB": {
@@ -728,17 +728,17 @@
728
  "count": 100
729
  },
730
  "UB": {
731
- "accuracy": 0.766497461928934,
732
  "count": 197
733
  },
734
  "UD": {
735
- "accuracy": 0.7572815533980582,
736
  "count": 103
737
  }
738
  }
739
  },
740
  "sub_B4": {
741
- "full_accuracy": 0.4,
742
  "n_examples": 100,
743
  "per_subtask": {
744
  "MD": {
@@ -746,21 +746,21 @@
746
  "count": 200
747
  },
748
  "MB": {
749
- "accuracy": 1.0,
750
  "count": 100
751
  },
752
  "UB": {
753
- "accuracy": 0.7975708502024291,
754
  "count": 247
755
  },
756
  "UD": {
757
- "accuracy": 0.6928104575163399,
758
  "count": 153
759
  }
760
  }
761
  },
762
  "sub_B5": {
763
- "full_accuracy": 0.2,
764
  "n_examples": 100,
765
  "per_subtask": {
766
  "MD": {
@@ -768,22 +768,22 @@
768
  "count": 100
769
  },
770
  "MB": {
771
- "accuracy": 1.0,
772
  "count": 100
773
  },
774
  "UB": {
775
- "accuracy": 0.7348993288590604,
776
  "count": 298
777
  },
778
  "UD": {
779
- "accuracy": 0.6782178217821783,
780
  "count": 202
781
  }
782
  }
783
  }
784
  },
785
  "summary": {
786
- "overall_accuracy": 0.47041666666666665,
787
  "total_examples": 2400,
788
  "n_splits": 22
789
  }
@@ -798,11 +798,11 @@
798
  },
799
  "splits": {
800
  "add_S0": {
801
- "full_accuracy": 0.97,
802
  "n_examples": 100,
803
  "per_subtask": {
804
  "SA": {
805
- "accuracy": 0.9950413223140496,
806
  "count": 605
807
  },
808
  "SS": {
@@ -812,11 +812,11 @@
812
  }
813
  },
814
  "add_S1": {
815
- "full_accuracy": 0.96,
816
  "n_examples": 100,
817
  "per_subtask": {
818
  "SA": {
819
- "accuracy": 0.9901960784313726,
820
  "count": 204
821
  },
822
  "SC": {
@@ -824,33 +824,33 @@
824
  "count": 169
825
  },
826
  "SS": {
827
- "accuracy": 0.967741935483871,
828
  "count": 31
829
  },
830
  "UC": {
831
- "accuracy": 0.9932432432432432,
832
  "count": 296
833
  }
834
  }
835
  },
836
  "add_S2": {
837
- "full_accuracy": 0.83,
838
  "n_examples": 100,
839
  "per_subtask": {
840
  "SA": {
841
- "accuracy": 0.9815950920245399,
842
  "count": 163
843
  },
844
  "SC": {
845
- "accuracy": 0.9769230769230769,
846
  "count": 130
847
  },
848
  "SS": {
849
- "accuracy": 0.9885057471264368,
850
  "count": 87
851
  },
852
  "UC": {
853
- "accuracy": 0.9408866995073891,
854
  "count": 203
855
  },
856
  "US": {
@@ -860,15 +860,15 @@
860
  }
861
  },
862
  "add_S3": {
863
- "full_accuracy": 0.55,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SA": {
867
- "accuracy": 0.9917355371900827,
868
  "count": 121
869
  },
870
  "SC": {
871
- "accuracy": 0.9917355371900827,
872
  "count": 121
873
  },
874
  "SS": {
@@ -876,17 +876,17 @@
876
  "count": 49
877
  },
878
  "UC": {
879
- "accuracy": 0.7903225806451613,
880
  "count": 186
881
  },
882
  "US": {
883
- "accuracy": 0.9461883408071748,
884
  "count": 223
885
  }
886
  }
887
  },
888
  "add_S4": {
889
- "full_accuracy": 0.4,
890
  "n_examples": 100,
891
  "per_subtask": {
892
  "SA": {
@@ -902,17 +902,17 @@
902
  "count": 23
903
  },
904
  "UC": {
905
- "accuracy": 0.7,
906
  "count": 160
907
  },
908
  "US": {
909
- "accuracy": 0.8045602605863192,
910
  "count": 307
911
  }
912
  }
913
  },
914
  "add_S5": {
915
- "full_accuracy": 0.27,
916
  "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
@@ -924,17 +924,17 @@
924
  "count": 100
925
  },
926
  "UC": {
927
- "accuracy": 0.41,
928
  "count": 100
929
  },
930
  "US": {
931
- "accuracy": 0.59,
932
  "count": 400
933
  }
934
  }
935
  },
936
  "add_S6": {
937
- "full_accuracy": 0.32,
938
  "n_examples": 100,
939
  "per_subtask": {
940
  "SC": {
@@ -942,25 +942,25 @@
942
  "count": 100
943
  },
944
  "UC": {
945
- "accuracy": 0.52,
946
  "count": 100
947
  },
948
  "US": {
949
- "accuracy": 0.586,
950
  "count": 500
951
  }
952
  }
953
  },
954
  "add_random": {
955
- "full_accuracy": 0.955,
956
  "n_examples": 200,
957
  "per_subtask": {
958
  "SA": {
959
- "accuracy": 0.9977628635346756,
960
  "count": 447
961
  },
962
  "SC": {
963
- "accuracy": 0.996875,
964
  "count": 320
965
  },
966
  "SS": {
@@ -968,7 +968,7 @@
968
  "count": 56
969
  },
970
  "UC": {
971
- "accuracy": 0.9867674858223062,
972
  "count": 529
973
  },
974
  "US": {
@@ -978,7 +978,7 @@
978
  }
979
  },
980
  "add_C3": {
981
- "full_accuracy": 0.67,
982
  "n_examples": 100,
983
  "per_subtask": {
984
  "SA": {
@@ -990,17 +990,17 @@
990
  "count": 100
991
  },
992
  "UC": {
993
- "accuracy": 0.8341968911917098,
994
  "count": 193
995
  },
996
  "US": {
997
- "accuracy": 0.9065420560747663,
998
  "count": 107
999
  }
1000
  }
1001
  },
1002
  "add_C4": {
1003
- "full_accuracy": 0.63,
1004
  "n_examples": 100,
1005
  "per_subtask": {
1006
  "SA": {
@@ -1012,17 +1012,17 @@
1012
  "count": 100
1013
  },
1014
  "UC": {
1015
- "accuracy": 0.86328125,
1016
  "count": 256
1017
  },
1018
  "US": {
1019
- "accuracy": 0.9097222222222222,
1020
  "count": 144
1021
  }
1022
  }
1023
  },
1024
  "add_C5": {
1025
- "full_accuracy": 0.57,
1026
  "n_examples": 100,
1027
  "per_subtask": {
1028
  "SA": {
@@ -1034,17 +1034,17 @@
1034
  "count": 100
1035
  },
1036
  "UC": {
1037
- "accuracy": 0.8725490196078431,
1038
  "count": 306
1039
  },
1040
  "US": {
1041
- "accuracy": 0.9020618556701031,
1042
  "count": 194
1043
  }
1044
  }
1045
  },
1046
  "add_C6": {
1047
- "full_accuracy": 0.66,
1048
  "n_examples": 100,
1049
  "per_subtask": {
1050
  "SC": {
@@ -1052,31 +1052,31 @@
1052
  "count": 100
1053
  },
1054
  "UC": {
1055
- "accuracy": 0.9098360655737705,
1056
  "count": 366
1057
  },
1058
  "US": {
1059
- "accuracy": 0.9487179487179487,
1060
  "count": 234
1061
  }
1062
  }
1063
  },
1064
  "sub_M0": {
1065
- "full_accuracy": 0.9,
1066
  "n_examples": 100,
1067
  "per_subtask": {
1068
  "MD": {
1069
- "accuracy": 0.9850249584026622,
1070
  "count": 601
1071
  },
1072
  "ME": {
1073
- "accuracy": 0.98989898989899,
1074
  "count": 99
1075
  }
1076
  }
1077
  },
1078
  "sub_M1": {
1079
- "full_accuracy": 0.99,
1080
  "n_examples": 100,
1081
  "per_subtask": {
1082
  "MD": {
@@ -1084,7 +1084,7 @@
1084
  "count": 279
1085
  },
1086
  "MB": {
1087
- "accuracy": 0.993103448275862,
1088
  "count": 145
1089
  },
1090
  "ME": {
@@ -1098,15 +1098,15 @@
1098
  }
1099
  },
1100
  "sub_M2": {
1101
- "full_accuracy": 0.6,
1102
  "n_examples": 100,
1103
  "per_subtask": {
1104
  "MD": {
1105
- "accuracy": 0.9953051643192489,
1106
  "count": 213
1107
  },
1108
  "MB": {
1109
- "accuracy": 0.9734513274336283,
1110
  "count": 113
1111
  },
1112
  "ME": {
@@ -1114,7 +1114,7 @@
1114
  "count": 85
1115
  },
1116
  "UB": {
1117
- "accuracy": 0.7845303867403315,
1118
  "count": 181
1119
  },
1120
  "UD": {
@@ -1124,7 +1124,7 @@
1124
  }
1125
  },
1126
  "sub_M3": {
1127
- "full_accuracy": 0.07,
1128
  "n_examples": 100,
1129
  "per_subtask": {
1130
  "MD": {
@@ -1132,7 +1132,7 @@
1132
  "count": 179
1133
  },
1134
  "MB": {
1135
- "accuracy": 0.9805825242718447,
1136
  "count": 103
1137
  },
1138
  "ME": {
@@ -1140,17 +1140,17 @@
1140
  "count": 56
1141
  },
1142
  "UB": {
1143
- "accuracy": 0.4563758389261745,
1144
  "count": 149
1145
  },
1146
  "UD": {
1147
- "accuracy": 0.7699530516431925,
1148
  "count": 213
1149
  }
1150
  }
1151
  },
1152
  "sub_M4": {
1153
- "full_accuracy": 0.07,
1154
  "n_examples": 100,
1155
  "per_subtask": {
1156
  "MD": {
@@ -1162,17 +1162,17 @@
1162
  "count": 100
1163
  },
1164
  "UB": {
1165
- "accuracy": 0.38,
1166
  "count": 100
1167
  },
1168
  "UD": {
1169
- "accuracy": 0.48,
1170
  "count": 300
1171
  }
1172
  }
1173
  },
1174
  "sub_M5": {
1175
- "full_accuracy": 0.02,
1176
  "n_examples": 100,
1177
  "per_subtask": {
1178
  "MD": {
@@ -1184,25 +1184,25 @@
1184
  "count": 100
1185
  },
1186
  "UB": {
1187
- "accuracy": 0.26,
1188
  "count": 100
1189
  },
1190
  "UD": {
1191
- "accuracy": 0.38,
1192
  "count": 400
1193
  }
1194
  }
1195
  },
1196
  "sub_random": {
1197
- "full_accuracy": 0.89,
1198
  "n_examples": 200,
1199
  "per_subtask": {
1200
  "MD": {
1201
- "accuracy": 0.9966666666666667,
1202
  "count": 600
1203
  },
1204
  "MB": {
1205
- "accuracy": 0.9962546816479401,
1206
  "count": 267
1207
  },
1208
  "ME": {
@@ -1210,7 +1210,7 @@
1210
  "count": 53
1211
  },
1212
  "UB": {
1213
- "accuracy": 0.9567198177676538,
1214
  "count": 439
1215
  },
1216
  "UD": {
@@ -1220,11 +1220,11 @@
1220
  }
1221
  },
1222
  "sub_B3": {
1223
- "full_accuracy": 0.68,
1224
  "n_examples": 100,
1225
  "per_subtask": {
1226
  "MD": {
1227
- "accuracy": 1.0,
1228
  "count": 300
1229
  },
1230
  "MB": {
@@ -1232,21 +1232,21 @@
1232
  "count": 100
1233
  },
1234
  "UB": {
1235
- "accuracy": 0.8578680203045685,
1236
  "count": 197
1237
  },
1238
  "UD": {
1239
- "accuracy": 0.8932038834951457,
1240
  "count": 103
1241
  }
1242
  }
1243
  },
1244
  "sub_B4": {
1245
- "full_accuracy": 0.51,
1246
  "n_examples": 100,
1247
  "per_subtask": {
1248
  "MD": {
1249
- "accuracy": 1.0,
1250
  "count": 200
1251
  },
1252
  "MB": {
@@ -1254,17 +1254,17 @@
1254
  "count": 100
1255
  },
1256
  "UB": {
1257
- "accuracy": 0.8380566801619433,
1258
  "count": 247
1259
  },
1260
  "UD": {
1261
- "accuracy": 0.7647058823529411,
1262
  "count": 153
1263
  }
1264
  }
1265
  },
1266
  "sub_B5": {
1267
- "full_accuracy": 0.33,
1268
  "n_examples": 100,
1269
  "per_subtask": {
1270
  "MD": {
@@ -1276,22 +1276,22 @@
1276
  "count": 100
1277
  },
1278
  "UB": {
1279
- "accuracy": 0.8187919463087249,
1280
  "count": 298
1281
  },
1282
  "UD": {
1283
- "accuracy": 0.7524752475247525,
1284
  "count": 202
1285
  }
1286
  }
1287
  }
1288
  },
1289
  "summary": {
1290
- "overall_accuracy": 0.6116666666666667,
1291
  "total_examples": 2400,
1292
  "n_splits": 22
1293
  }
1294
  },
1295
- "sorl_overall_accuracy": 0.6116666666666667,
1296
- "sft_overall_accuracy": 0.47041666666666665
1297
  }
 
33
  1563
34
  ],
35
  "loss": [
36
+ 9.340689659118652,
37
+ 4.987771987915039,
38
+ 3.1618072986602783,
39
+ 2.856262445449829,
40
+ 2.5268046855926514,
41
+ 2.1091880798339844,
42
+ -0.44331324100494385,
43
+ -4.092588424682617,
44
+ -4.768612384796143,
45
+ -3.7999463081359863,
46
+ -1.6939505338668823,
47
+ -2.5403966903686523,
48
+ -2.0438146591186523,
49
+ -1.7041206359863281,
50
+ -1.5684587955474854,
51
+ -1.9102389812469482,
52
+ -1.880913257598877,
53
+ -1.9313758611679077,
54
+ -1.818078875541687,
55
+ -1.5362316370010376,
56
+ -2.0820319652557373,
57
+ -1.3230443000793457,
58
+ -1.1353564262390137,
59
+ -1.5558422803878784,
60
+ -1.8902345895767212,
61
+ -1.0403810739517212,
62
+ -1.1732807159423828,
63
+ -0.8931325078010559,
64
+ -1.0903886556625366,
65
+ -0.5779882073402405
66
  ],
67
  "base_loss": [
68
+ 7.390318870544434,
69
+ 3.826241970062256,
70
+ 1.9951651096343994,
71
+ 1.8394466638565063,
72
+ 1.8613300323486328,
73
+ 1.835070252418518,
74
+ 1.8304622173309326,
75
+ 1.7413225173950195,
76
+ 1.6582175493240356,
77
+ 1.3251668214797974,
78
+ 0.9685406684875488,
79
+ 0.9228308796882629,
80
+ 0.7192235589027405,
81
+ 0.5874363780021667,
82
+ 0.5402608513832092,
83
+ 0.5179902911186218,
84
+ 0.47875410318374634,
85
+ 0.4619639217853546,
86
+ 0.3892979025840759,
87
+ 0.4061528742313385,
88
+ 0.4215702414512634,
89
+ 0.30625566840171814,
90
+ 0.26082319021224976,
91
+ 0.30791884660720825,
92
+ 0.3336397111415863,
93
+ 0.22246377170085907,
94
+ 0.2154165357351303,
95
+ 0.1796569526195526,
96
+ 0.1971123218536377,
97
+ 0.13742247223854065
98
  ],
99
  "info_loss": [
100
+ -0.4859275817871094,
101
+ -0.1129457950592041,
102
+ -0.07287442684173584,
103
+ -0.0863027572631836,
104
+ -0.12096810340881348,
105
+ -0.1600182056427002,
106
+ -0.41505467891693115,
107
+ -0.7713239789009094,
108
+ -0.8302392959594727,
109
+ -0.6981891989707947,
110
+ -0.4436832070350647,
111
+ -0.5114461183547974,
112
+ -0.42171621322631836,
113
+ -0.35522836446762085,
114
+ -0.32737311720848083,
115
+ -0.3521292805671692,
116
+ -0.33850210905075073,
117
+ -0.33436745405197144,
118
+ -0.3095327913761139,
119
+ -0.2765413522720337,
120
+ -0.32649245858192444,
121
+ -0.24035842716693878,
122
+ -0.20812112092971802,
123
+ -0.2552123963832855,
124
+ -0.28583821654319763,
125
+ -0.19351282715797424,
126
+ -0.19369123876094818,
127
+ -0.16221696138381958,
128
+ -0.18230006098747253,
129
+ -0.1274920552968979
130
  ],
131
  "abs_loss": [
132
+ 2.2262747287750244,
133
+ 1.8807449340820312,
134
+ 1.833791732788086,
135
+ 1.8724111318588257,
136
+ 1.8476206064224243,
137
+ 1.8559261560440063,
138
+ 1.87335205078125,
139
+ 1.8458162546157837,
140
+ 1.813267707824707,
141
+ 1.6594558954238892,
142
+ 1.4701725244522095,
143
+ 1.2008872032165527,
144
+ 0.8842588067054749,
145
+ 0.7724447250366211,
146
+ 0.6772302985191345,
147
+ 0.6123369336128235,
148
+ 0.49044322967529297,
149
+ 0.4312553107738495,
150
+ 0.38406622409820557,
151
+ 0.3418884575366974,
152
+ 0.2913786768913269,
153
+ 0.3654420077800751,
154
+ 0.28334739804267883,
155
+ 0.25913718342781067,
156
+ 0.2655659019947052,
157
+ 0.2588687837123871,
158
+ 0.21410413086414337,
159
+ 0.21956706047058105,
160
+ 0.2182735651731491,
161
+ 0.23050324618816376
162
  ],
163
  "zipf_loss": [
164
+ 6.587018966674805,
165
+ 2.1029131412506104,
166
+ 1.7120072841644287,
167
+ 1.692602276802063,
168
+ 1.6903936862945557,
169
+ 1.6887073516845703,
170
+ 1.6894363164901733,
171
+ 1.6947468519210815,
172
+ 1.694236397743225,
173
+ 1.690833568572998,
174
+ 1.6273235082626343,
175
+ 1.531144618988037,
176
+ 1.3656980991363525,
177
+ 1.1834821701049805,
178
+ 1.0972886085510254,
179
+ 1.031829595565796,
180
+ 0.9763095378875732,
181
+ 0.9072092771530151,
182
+ 0.8495444059371948,
183
+ 0.7888401746749878,
184
+ 0.7321844100952148,
185
+ 0.7377402782440186,
186
+ 0.6566966772079468,
187
+ 0.6624492406845093,
188
+ 0.6079515218734741,
189
+ 0.6463965177536011,
190
+ 0.5268046855926514,
191
+ 0.5274234414100647,
192
+ 0.5136723518371582,
193
+ 0.5364596247673035
194
  ],
195
  "denoise_loss": [],
196
  "ortho_loss": [
197
+ 0.3918377161026001,
198
+ 0.20118887722492218,
199
+ 0.12065954506397247,
200
+ 0.0842815414071083,
201
+ 0.07851361483335495,
202
+ 0.08886810392141342,
203
+ 0.1118798479437828,
204
+ 0.16310755908489227,
205
+ 0.21554774045944214,
206
+ 0.2462739199399948,
207
+ 0.2753961980342865,
208
+ 0.30527833104133606,
209
+ 0.33816447854042053,
210
+ 0.3525509536266327,
211
+ 0.36425891518592834,
212
+ 0.3624407649040222,
213
+ 0.36967605352401733,
214
+ 0.3742343485355377,
215
+ 0.39829763770103455,
216
+ 0.39412182569503784,
217
+ 0.3936292827129364,
218
+ 0.3937382996082306,
219
+ 0.3880367875099182,
220
+ 0.39024752378463745,
221
+ 0.3874780535697937,
222
+ 0.39575129747390747,
223
+ 0.3984874486923218,
224
+ 0.39981427788734436,
225
+ 0.40026330947875977,
226
+ 0.4032067358493805
227
  ],
228
  "lr": [
229
+ 3.9200000000000004e-05,
230
+ 7.92e-05,
231
+ 8e-05,
232
+ 8e-05,
233
+ 8e-05,
234
+ 8e-05,
235
+ 8e-05,
236
+ 8e-05,
237
+ 8e-05,
238
+ 8e-05,
239
+ 8e-05,
240
+ 8e-05,
241
+ 8e-05,
242
+ 8e-05,
243
+ 8e-05,
244
+ 8e-05,
245
+ 8e-05,
246
+ 8e-05,
247
+ 7.889795918367346e-05,
248
+ 7.277551020408164e-05,
249
+ 6.665306122448979e-05,
250
+ 5.96734693877551e-05,
251
+ 5.3551020408163274e-05,
252
+ 4.7428571428571427e-05,
253
+ 4.044897959183674e-05,
254
+ 3.432653061224491e-05,
255
+ 2.820408163265307e-05,
256
+ 2.1224489795918364e-05,
257
+ 1.5102040816326524e-05,
258
+ 8.9795918367347e-06
259
  ],
260
  "emb_lr": [],
261
  "eval_step": [
 
283
  0.0
284
  ]
285
  },
286
+ "final_accuracy": 0.8704166666666666,
287
  "sft_eval": {
288
  "config": {
289
  "ops": "add_sub",
 
294
  },
295
  "splits": {
296
  "add_S0": {
297
+ "full_accuracy": 0.9,
298
  "n_examples": 100,
299
  "per_subtask": {
300
  "SA": {
301
+ "accuracy": 0.9818181818181818,
302
  "count": 605
303
  },
304
  "SS": {
305
+ "accuracy": 1.0,
306
  "count": 95
307
  }
308
  }
309
  },
310
  "add_S1": {
311
+ "full_accuracy": 0.83,
312
  "n_examples": 100,
313
  "per_subtask": {
314
  "SA": {
 
316
  "count": 204
317
  },
318
  "SC": {
319
+ "accuracy": 0.9940828402366864,
320
  "count": 169
321
  },
322
  "SS": {
 
324
  "count": 31
325
  },
326
  "UC": {
327
+ "accuracy": 0.956081081081081,
328
  "count": 296
329
  }
330
  }
331
  },
332
  "add_S2": {
333
+ "full_accuracy": 0.54,
334
  "n_examples": 100,
335
  "per_subtask": {
336
  "SA": {
337
+ "accuracy": 0.9815950920245399,
338
  "count": 163
339
  },
340
  "SC": {
341
+ "accuracy": 0.9538461538461539,
342
  "count": 130
343
  },
344
  "SS": {
345
+ "accuracy": 0.9770114942528736,
346
  "count": 87
347
  },
348
  "UC": {
349
+ "accuracy": 0.812807881773399,
350
  "count": 203
351
  },
352
  "US": {
353
+ "accuracy": 0.9401709401709402,
354
  "count": 117
355
  }
356
  }
357
  },
358
  "add_S3": {
359
+ "full_accuracy": 0.3,
360
  "n_examples": 100,
361
  "per_subtask": {
362
  "SA": {
363
+ "accuracy": 0.9752066115702479,
364
  "count": 121
365
  },
366
  "SC": {
367
+ "accuracy": 0.9752066115702479,
368
  "count": 121
369
  },
370
  "SS": {
371
+ "accuracy": 0.9795918367346939,
372
  "count": 49
373
  },
374
  "UC": {
375
+ "accuracy": 0.6666666666666666,
376
  "count": 186
377
  },
378
  "US": {
379
+ "accuracy": 0.7668161434977578,
380
  "count": 223
381
  }
382
  }
383
  },
384
  "add_S4": {
385
+ "full_accuracy": 0.23,
386
  "n_examples": 100,
387
  "per_subtask": {
388
  "SA": {
389
+ "accuracy": 0.9807692307692307,
390
  "count": 104
391
  },
392
  "SC": {
393
+ "accuracy": 0.9811320754716981,
394
  "count": 106
395
  },
396
  "SS": {
397
+ "accuracy": 1.0,
398
  "count": 23
399
  },
400
  "UC": {
401
+ "accuracy": 0.58125,
402
  "count": 160
403
  },
404
  "US": {
405
+ "accuracy": 0.5635179153094463,
406
  "count": 307
407
  }
408
  }
409
  },
410
  "add_S5": {
411
+ "full_accuracy": 0.12,
412
  "n_examples": 100,
413
  "per_subtask": {
414
  "SA": {
 
416
  "count": 100
417
  },
418
  "SC": {
419
+ "accuracy": 0.94,
420
  "count": 100
421
  },
422
  "UC": {
423
+ "accuracy": 0.3,
424
  "count": 100
425
  },
426
  "US": {
427
+ "accuracy": 0.325,
428
  "count": 400
429
  }
430
  }
431
  },
432
  "add_S6": {
433
+ "full_accuracy": 0.34,
434
  "n_examples": 100,
435
  "per_subtask": {
436
  "SC": {
437
+ "accuracy": 1.0,
438
  "count": 100
439
  },
440
  "UC": {
441
+ "accuracy": 0.53,
442
  "count": 100
443
  },
444
  "US": {
445
+ "accuracy": 0.514,
446
  "count": 500
447
  }
448
  }
449
  },
450
  "add_random": {
451
+ "full_accuracy": 0.7,
452
  "n_examples": 200,
453
  "per_subtask": {
454
  "SA": {
455
+ "accuracy": 0.9686800894854586,
456
  "count": 447
457
  },
458
  "SC": {
459
+ "accuracy": 0.971875,
460
  "count": 320
461
  },
462
  "SS": {
463
+ "accuracy": 0.9642857142857143,
464
  "count": 56
465
  },
466
  "UC": {
467
+ "accuracy": 0.9243856332703214,
468
  "count": 529
469
  },
470
  "US": {
471
+ "accuracy": 0.8958333333333334,
472
  "count": 48
473
  }
474
  }
475
  },
476
  "add_C3": {
477
+ "full_accuracy": 0.48,
478
  "n_examples": 100,
479
  "per_subtask": {
480
  "SA": {
 
486
  "count": 100
487
  },
488
  "UC": {
489
+ "accuracy": 0.7461139896373057,
490
  "count": 193
491
  },
492
  "US": {
493
+ "accuracy": 0.7663551401869159,
494
  "count": 107
495
  }
496
  }
497
  },
498
  "add_C4": {
499
+ "full_accuracy": 0.38,
500
  "n_examples": 100,
501
  "per_subtask": {
502
  "SA": {
503
+ "accuracy": 0.985,
504
  "count": 200
505
  },
506
  "SC": {
507
+ "accuracy": 0.98,
508
  "count": 100
509
  },
510
  "UC": {
511
+ "accuracy": 0.7734375,
512
  "count": 256
513
  },
514
  "US": {
515
+ "accuracy": 0.8194444444444444,
516
  "count": 144
517
  }
518
  }
519
  },
520
  "add_C5": {
521
+ "full_accuracy": 0.4,
522
  "n_examples": 100,
523
  "per_subtask": {
524
  "SA": {
525
+ "accuracy": 0.99,
526
  "count": 100
527
  },
528
  "SC": {
529
+ "accuracy": 1.0,
530
  "count": 100
531
  },
532
  "UC": {
533
+ "accuracy": 0.7908496732026143,
534
  "count": 306
535
  },
536
  "US": {
537
+ "accuracy": 0.845360824742268,
538
  "count": 194
539
  }
540
  }
541
  },
542
  "add_C6": {
543
+ "full_accuracy": 0.35,
544
  "n_examples": 100,
545
  "per_subtask": {
546
  "SC": {
 
548
  "count": 100
549
  },
550
  "UC": {
551
+ "accuracy": 0.8005464480874317,
552
  "count": 366
553
  },
554
  "US": {
555
+ "accuracy": 0.8290598290598291,
556
  "count": 234
557
  }
558
  }
559
  },
560
  "sub_M0": {
561
+ "full_accuracy": 0.89,
562
  "n_examples": 100,
563
  "per_subtask": {
564
  "MD": {
565
+ "accuracy": 0.9833610648918469,
566
  "count": 601
567
  },
568
  "ME": {
569
+ "accuracy": 0.9797979797979798,
570
  "count": 99
571
  }
572
  }
573
  },
574
  "sub_M1": {
575
+ "full_accuracy": 0.81,
576
  "n_examples": 100,
577
  "per_subtask": {
578
  "MD": {
579
+ "accuracy": 0.992831541218638,
580
  "count": 279
581
  },
582
  "MB": {
583
+ "accuracy": 0.993103448275862,
584
  "count": 145
585
  },
586
  "ME": {
 
588
  "count": 24
589
  },
590
  "UB": {
591
+ "accuracy": 0.9285714285714286,
592
  "count": 252
593
  }
594
  }
595
  },
596
  "sub_M2": {
597
+ "full_accuracy": 0.42,
598
  "n_examples": 100,
599
  "per_subtask": {
600
  "MD": {
601
+ "accuracy": 0.9859154929577465,
602
  "count": 213
603
  },
604
  "MB": {
605
+ "accuracy": 0.9734513274336283,
606
  "count": 113
607
  },
608
  "ME": {
609
+ "accuracy": 0.9882352941176471,
610
  "count": 85
611
  },
612
  "UB": {
613
+ "accuracy": 0.6740331491712708,
614
  "count": 181
615
  },
616
  "UD": {
617
+ "accuracy": 0.9722222222222222,
618
  "count": 108
619
  }
620
  }
621
  },
622
  "sub_M3": {
623
+ "full_accuracy": 0.06,
624
  "n_examples": 100,
625
  "per_subtask": {
626
  "MD": {
627
+ "accuracy": 1.0,
628
  "count": 179
629
  },
630
  "MB": {
631
+ "accuracy": 0.970873786407767,
632
  "count": 103
633
  },
634
  "ME": {
 
636
  "count": 56
637
  },
638
  "UB": {
639
+ "accuracy": 0.3624161073825503,
640
  "count": 149
641
  },
642
  "UD": {
643
+ "accuracy": 0.6384976525821596,
644
  "count": 213
645
  }
646
  }
647
  },
648
  "sub_M4": {
649
+ "full_accuracy": 0.02,
650
  "n_examples": 100,
651
  "per_subtask": {
652
  "MD": {
 
654
  "count": 200
655
  },
656
  "MB": {
657
+ "accuracy": 0.91,
658
  "count": 100
659
  },
660
  "UB": {
661
+ "accuracy": 0.38,
662
  "count": 100
663
  },
664
  "UD": {
665
+ "accuracy": 0.29,
666
  "count": 300
667
  }
668
  }
669
  },
670
  "sub_M5": {
671
+ "full_accuracy": 0.0,
672
  "n_examples": 100,
673
  "per_subtask": {
674
  "MD": {
 
676
  "count": 100
677
  },
678
  "MB": {
679
+ "accuracy": 0.85,
680
  "count": 100
681
  },
682
  "UB": {
683
+ "accuracy": 0.25,
684
  "count": 100
685
  },
686
  "UD": {
687
+ "accuracy": 0.19,
688
  "count": 400
689
  }
690
  }
691
  },
692
  "sub_random": {
693
+ "full_accuracy": 0.71,
694
  "n_examples": 200,
695
  "per_subtask": {
696
  "MD": {
697
+ "accuracy": 0.9816666666666667,
698
  "count": 600
699
  },
700
  "MB": {
701
+ "accuracy": 0.9925093632958801,
702
  "count": 267
703
  },
704
  "ME": {
 
706
  "count": 53
707
  },
708
  "UB": {
709
+ "accuracy": 0.8906605922551253,
710
  "count": 439
711
  },
712
  "UD": {
713
+ "accuracy": 0.8536585365853658,
714
  "count": 41
715
  }
716
  }
717
  },
718
  "sub_B3": {
719
+ "full_accuracy": 0.33,
720
  "n_examples": 100,
721
  "per_subtask": {
722
  "MD": {
723
+ "accuracy": 0.9866666666666667,
724
  "count": 300
725
  },
726
  "MB": {
 
728
  "count": 100
729
  },
730
  "UB": {
731
+ "accuracy": 0.6700507614213198,
732
  "count": 197
733
  },
734
  "UD": {
735
+ "accuracy": 0.7378640776699029,
736
  "count": 103
737
  }
738
  }
739
  },
740
  "sub_B4": {
741
+ "full_accuracy": 0.21,
742
  "n_examples": 100,
743
  "per_subtask": {
744
  "MD": {
 
746
  "count": 200
747
  },
748
  "MB": {
749
+ "accuracy": 0.98,
750
  "count": 100
751
  },
752
  "UB": {
753
+ "accuracy": 0.6761133603238867,
754
  "count": 247
755
  },
756
  "UD": {
757
+ "accuracy": 0.6666666666666666,
758
  "count": 153
759
  }
760
  }
761
  },
762
  "sub_B5": {
763
+ "full_accuracy": 0.15,
764
  "n_examples": 100,
765
  "per_subtask": {
766
  "MD": {
 
768
  "count": 100
769
  },
770
  "MB": {
771
+ "accuracy": 0.97,
772
  "count": 100
773
  },
774
  "UB": {
775
+ "accuracy": 0.6543624161073825,
776
  "count": 298
777
  },
778
  "UD": {
779
+ "accuracy": 0.6089108910891089,
780
  "count": 202
781
  }
782
  }
783
  }
784
  },
785
  "summary": {
786
+ "overall_accuracy": 0.44083333333333335,
787
  "total_examples": 2400,
788
  "n_splits": 22
789
  }
 
798
  },
799
  "splits": {
800
  "add_S0": {
801
+ "full_accuracy": 1.0,
802
  "n_examples": 100,
803
  "per_subtask": {
804
  "SA": {
805
+ "accuracy": 1.0,
806
  "count": 605
807
  },
808
  "SS": {
 
812
  }
813
  },
814
  "add_S1": {
815
+ "full_accuracy": 1.0,
816
  "n_examples": 100,
817
  "per_subtask": {
818
  "SA": {
819
+ "accuracy": 1.0,
820
  "count": 204
821
  },
822
  "SC": {
 
824
  "count": 169
825
  },
826
  "SS": {
827
+ "accuracy": 1.0,
828
  "count": 31
829
  },
830
  "UC": {
831
+ "accuracy": 1.0,
832
  "count": 296
833
  }
834
  }
835
  },
836
  "add_S2": {
837
+ "full_accuracy": 1.0,
838
  "n_examples": 100,
839
  "per_subtask": {
840
  "SA": {
841
+ "accuracy": 1.0,
842
  "count": 163
843
  },
844
  "SC": {
845
+ "accuracy": 1.0,
846
  "count": 130
847
  },
848
  "SS": {
849
+ "accuracy": 1.0,
850
  "count": 87
851
  },
852
  "UC": {
853
+ "accuracy": 1.0,
854
  "count": 203
855
  },
856
  "US": {
 
860
  }
861
  },
862
  "add_S3": {
863
+ "full_accuracy": 1.0,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SA": {
867
+ "accuracy": 1.0,
868
  "count": 121
869
  },
870
  "SC": {
871
+ "accuracy": 1.0,
872
  "count": 121
873
  },
874
  "SS": {
 
876
  "count": 49
877
  },
878
  "UC": {
879
+ "accuracy": 1.0,
880
  "count": 186
881
  },
882
  "US": {
883
+ "accuracy": 1.0,
884
  "count": 223
885
  }
886
  }
887
  },
888
  "add_S4": {
889
+ "full_accuracy": 1.0,
890
  "n_examples": 100,
891
  "per_subtask": {
892
  "SA": {
 
902
  "count": 23
903
  },
904
  "UC": {
905
+ "accuracy": 1.0,
906
  "count": 160
907
  },
908
  "US": {
909
+ "accuracy": 1.0,
910
  "count": 307
911
  }
912
  }
913
  },
914
  "add_S5": {
915
+ "full_accuracy": 0.58,
916
  "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
 
924
  "count": 100
925
  },
926
  "UC": {
927
+ "accuracy": 0.58,
928
  "count": 100
929
  },
930
  "US": {
931
+ "accuracy": 1.0,
932
  "count": 400
933
  }
934
  }
935
  },
936
  "add_S6": {
937
+ "full_accuracy": 0.88,
938
  "n_examples": 100,
939
  "per_subtask": {
940
  "SC": {
 
942
  "count": 100
943
  },
944
  "UC": {
945
+ "accuracy": 0.95,
946
  "count": 100
947
  },
948
  "US": {
949
+ "accuracy": 0.972,
950
  "count": 500
951
  }
952
  }
953
  },
954
  "add_random": {
955
+ "full_accuracy": 1.0,
956
  "n_examples": 200,
957
  "per_subtask": {
958
  "SA": {
959
+ "accuracy": 1.0,
960
  "count": 447
961
  },
962
  "SC": {
963
+ "accuracy": 1.0,
964
  "count": 320
965
  },
966
  "SS": {
 
968
  "count": 56
969
  },
970
  "UC": {
971
+ "accuracy": 1.0,
972
  "count": 529
973
  },
974
  "US": {
 
978
  }
979
  },
980
  "add_C3": {
981
+ "full_accuracy": 0.99,
982
  "n_examples": 100,
983
  "per_subtask": {
984
  "SA": {
 
990
  "count": 100
991
  },
992
  "UC": {
993
+ "accuracy": 0.9948186528497409,
994
  "count": 193
995
  },
996
  "US": {
997
+ "accuracy": 1.0,
998
  "count": 107
999
  }
1000
  }
1001
  },
1002
  "add_C4": {
1003
+ "full_accuracy": 1.0,
1004
  "n_examples": 100,
1005
  "per_subtask": {
1006
  "SA": {
 
1012
  "count": 100
1013
  },
1014
  "UC": {
1015
+ "accuracy": 1.0,
1016
  "count": 256
1017
  },
1018
  "US": {
1019
+ "accuracy": 1.0,
1020
  "count": 144
1021
  }
1022
  }
1023
  },
1024
  "add_C5": {
1025
+ "full_accuracy": 0.97,
1026
  "n_examples": 100,
1027
  "per_subtask": {
1028
  "SA": {
 
1034
  "count": 100
1035
  },
1036
  "UC": {
1037
+ "accuracy": 0.9934640522875817,
1038
  "count": 306
1039
  },
1040
  "US": {
1041
+ "accuracy": 0.9948453608247423,
1042
  "count": 194
1043
  }
1044
  }
1045
  },
1046
  "add_C6": {
1047
+ "full_accuracy": 0.98,
1048
  "n_examples": 100,
1049
  "per_subtask": {
1050
  "SC": {
 
1052
  "count": 100
1053
  },
1054
  "UC": {
1055
+ "accuracy": 0.994535519125683,
1056
  "count": 366
1057
  },
1058
  "US": {
1059
+ "accuracy": 1.0,
1060
  "count": 234
1061
  }
1062
  }
1063
  },
1064
  "sub_M0": {
1065
+ "full_accuracy": 0.99,
1066
  "n_examples": 100,
1067
  "per_subtask": {
1068
  "MD": {
1069
+ "accuracy": 0.9983361064891847,
1070
  "count": 601
1071
  },
1072
  "ME": {
1073
+ "accuracy": 1.0,
1074
  "count": 99
1075
  }
1076
  }
1077
  },
1078
  "sub_M1": {
1079
+ "full_accuracy": 1.0,
1080
  "n_examples": 100,
1081
  "per_subtask": {
1082
  "MD": {
 
1084
  "count": 279
1085
  },
1086
  "MB": {
1087
+ "accuracy": 1.0,
1088
  "count": 145
1089
  },
1090
  "ME": {
 
1098
  }
1099
  },
1100
  "sub_M2": {
1101
+ "full_accuracy": 0.95,
1102
  "n_examples": 100,
1103
  "per_subtask": {
1104
  "MD": {
1105
+ "accuracy": 0.9906103286384976,
1106
  "count": 213
1107
  },
1108
  "MB": {
1109
+ "accuracy": 1.0,
1110
  "count": 113
1111
  },
1112
  "ME": {
 
1114
  "count": 85
1115
  },
1116
  "UB": {
1117
+ "accuracy": 0.9834254143646409,
1118
  "count": 181
1119
  },
1120
  "UD": {
 
1124
  }
1125
  },
1126
  "sub_M3": {
1127
+ "full_accuracy": 0.79,
1128
  "n_examples": 100,
1129
  "per_subtask": {
1130
  "MD": {
 
1132
  "count": 179
1133
  },
1134
  "MB": {
1135
+ "accuracy": 1.0,
1136
  "count": 103
1137
  },
1138
  "ME": {
 
1140
  "count": 56
1141
  },
1142
  "UB": {
1143
+ "accuracy": 0.87248322147651,
1144
  "count": 149
1145
  },
1146
  "UD": {
1147
+ "accuracy": 0.9624413145539906,
1148
  "count": 213
1149
  }
1150
  }
1151
  },
1152
  "sub_M4": {
1153
+ "full_accuracy": 0.3,
1154
  "n_examples": 100,
1155
  "per_subtask": {
1156
  "MD": {
 
1162
  "count": 100
1163
  },
1164
  "UB": {
1165
+ "accuracy": 0.36,
1166
  "count": 100
1167
  },
1168
  "UD": {
1169
+ "accuracy": 0.8133333333333334,
1170
  "count": 300
1171
  }
1172
  }
1173
  },
1174
  "sub_M5": {
1175
+ "full_accuracy": 0.04,
1176
  "n_examples": 100,
1177
  "per_subtask": {
1178
  "MD": {
 
1184
  "count": 100
1185
  },
1186
  "UB": {
1187
+ "accuracy": 0.12,
1188
  "count": 100
1189
  },
1190
  "UD": {
1191
+ "accuracy": 0.6575,
1192
  "count": 400
1193
  }
1194
  }
1195
  },
1196
  "sub_random": {
1197
+ "full_accuracy": 0.985,
1198
  "n_examples": 200,
1199
  "per_subtask": {
1200
  "MD": {
1201
+ "accuracy": 0.995,
1202
  "count": 600
1203
  },
1204
  "MB": {
1205
+ "accuracy": 1.0,
1206
  "count": 267
1207
  },
1208
  "ME": {
 
1210
  "count": 53
1211
  },
1212
  "UB": {
1213
+ "accuracy": 1.0,
1214
  "count": 439
1215
  },
1216
  "UD": {
 
1220
  }
1221
  },
1222
  "sub_B3": {
1223
+ "full_accuracy": 0.93,
1224
  "n_examples": 100,
1225
  "per_subtask": {
1226
  "MD": {
1227
+ "accuracy": 0.9966666666666667,
1228
  "count": 300
1229
  },
1230
  "MB": {
 
1232
  "count": 100
1233
  },
1234
  "UB": {
1235
+ "accuracy": 0.9695431472081218,
1236
  "count": 197
1237
  },
1238
  "UD": {
1239
+ "accuracy": 1.0,
1240
  "count": 103
1241
  }
1242
  }
1243
  },
1244
  "sub_B4": {
1245
+ "full_accuracy": 0.82,
1246
  "n_examples": 100,
1247
  "per_subtask": {
1248
  "MD": {
1249
+ "accuracy": 0.995,
1250
  "count": 200
1251
  },
1252
  "MB": {
 
1254
  "count": 100
1255
  },
1256
  "UB": {
1257
+ "accuracy": 0.9352226720647774,
1258
  "count": 247
1259
  },
1260
  "UD": {
1261
+ "accuracy": 0.9607843137254902,
1262
  "count": 153
1263
  }
1264
  }
1265
  },
1266
  "sub_B5": {
1267
+ "full_accuracy": 0.71,
1268
  "n_examples": 100,
1269
  "per_subtask": {
1270
  "MD": {
 
1276
  "count": 100
1277
  },
1278
  "UB": {
1279
+ "accuracy": 0.9026845637583892,
1280
  "count": 298
1281
  },
1282
  "UD": {
1283
+ "accuracy": 0.9207920792079208,
1284
  "count": 202
1285
  }
1286
  }
1287
  }
1288
  },
1289
  "summary": {
1290
+ "overall_accuracy": 0.8704166666666666,
1291
  "total_examples": 2400,
1292
  "n_splits": 22
1293
  }
1294
  },
1295
+ "sorl_overall_accuracy": 0.8704166666666666,
1296
+ "sft_overall_accuracy": 0.44083333333333335
1297
  }
add_sub_sorl_v1_abs10_K1_10K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b512b0970baed990d0164c674044de1463fd819596795fb6a137261185804ce5
3
  size 650303660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dc104978754dca706830aa83c50e922f112f2961bfe046fb9acb75fc2f4cfb9
3
  size 650303660
add_sub_sorl_v1_abs10_K1_10K/train_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
- "lr": 4e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 100,
@@ -69,16 +69,16 @@
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_10K",
72
- "git_commit": "78d46f8665a87f4b44bd5894bd34f393f2dea51f",
73
- "timestamp": "2026-04-12T08:59:10.916999+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "pdbywwrx",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/pdbywwrx",
81
- "final_accuracy": 0.6116666666666667,
82
- "sft_accuracy": 0.47041666666666665,
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
+ "lr": 8e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 100,
 
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_10K",
72
+ "git_commit": "8d5ee5420119746ef4e2c87570eb250c9718f643",
73
+ "timestamp": "2026-04-12T23:50:30.170809+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
+ "wandb_run_id": "yh5d4it1",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/yh5d4it1",
81
+ "final_accuracy": 0.8704166666666666,
82
+ "sft_accuracy": 0.44083333333333335,
83
  "eval_method": "ArithmeticEvaluator"
84
  }