amirali1985 commited on
Commit
d72298f
·
verified ·
1 Parent(s): e3cbbc5

Upload add_sub_sorl_v1_abs10_K1_10K

Browse files
add_sub_sorl_v1_abs10_K1_10K/metrics.json CHANGED
@@ -30,7 +30,37 @@
30
  1406,
31
  1463,
32
  1513,
33
- 1563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ],
35
  "loss": [
36
  9.340689659118652,
@@ -51,18 +81,48 @@
51
  -1.9102389812469482,
52
  -1.880913257598877,
53
  -1.9313758611679077,
54
- -1.818078875541687,
55
- -1.5362316370010376,
56
- -2.0820319652557373,
57
- -1.3230443000793457,
58
- -1.1353564262390137,
59
- -1.5558422803878784,
60
- -1.8902345895767212,
61
- -1.0403810739517212,
62
- -1.1732807159423828,
63
- -0.8931325078010559,
64
- -1.0903886556625366,
65
- -0.5779882073402405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  ],
67
  "base_loss": [
68
  7.390318870544434,
@@ -83,18 +143,48 @@
83
  0.5179902911186218,
84
  0.47875410318374634,
85
  0.4619639217853546,
86
- 0.3892979025840759,
87
- 0.4061528742313385,
88
- 0.4215702414512634,
89
- 0.30625566840171814,
90
- 0.26082319021224976,
91
- 0.30791884660720825,
92
- 0.3336397111415863,
93
- 0.22246377170085907,
94
- 0.2154165357351303,
95
- 0.1796569526195526,
96
- 0.1971123218536377,
97
- 0.13742247223854065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  ],
99
  "info_loss": [
100
  -0.4859275817871094,
@@ -115,18 +205,48 @@
115
  -0.3521292805671692,
116
  -0.33850210905075073,
117
  -0.33436745405197144,
118
- -0.3095327913761139,
119
- -0.2765413522720337,
120
- -0.32649245858192444,
121
- -0.24035842716693878,
122
- -0.20812112092971802,
123
- -0.2552123963832855,
124
- -0.28583821654319763,
125
- -0.19351282715797424,
126
- -0.19369123876094818,
127
- -0.16221696138381958,
128
- -0.18230006098747253,
129
- -0.1274920552968979
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  ],
131
  "abs_loss": [
132
  2.2262747287750244,
@@ -147,18 +267,48 @@
147
  0.6123369336128235,
148
  0.49044322967529297,
149
  0.4312553107738495,
150
- 0.38406622409820557,
151
- 0.3418884575366974,
152
- 0.2913786768913269,
153
- 0.3654420077800751,
154
- 0.28334739804267883,
155
- 0.25913718342781067,
156
- 0.2655659019947052,
157
- 0.2588687837123871,
158
- 0.21410413086414337,
159
- 0.21956706047058105,
160
- 0.2182735651731491,
161
- 0.23050324618816376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  ],
163
  "zipf_loss": [
164
  6.587018966674805,
@@ -179,18 +329,48 @@
179
  1.031829595565796,
180
  0.9763095378875732,
181
  0.9072092771530151,
182
- 0.8495444059371948,
183
- 0.7888401746749878,
184
- 0.7321844100952148,
185
- 0.7377402782440186,
186
- 0.6566966772079468,
187
- 0.6624492406845093,
188
- 0.6079515218734741,
189
- 0.6463965177536011,
190
- 0.5268046855926514,
191
- 0.5274234414100647,
192
- 0.5136723518371582,
193
- 0.5364596247673035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  ],
195
  "denoise_loss": [],
196
  "ortho_loss": [
@@ -212,18 +392,48 @@
212
  0.3624407649040222,
213
  0.36967605352401733,
214
  0.3742343485355377,
215
- 0.39829763770103455,
216
- 0.39412182569503784,
217
- 0.3936292827129364,
218
- 0.3937382996082306,
219
- 0.3880367875099182,
220
- 0.39024752378463745,
221
- 0.3874780535697937,
222
- 0.39575129747390747,
223
- 0.3984874486923218,
224
- 0.39981427788734436,
225
- 0.40026330947875977,
226
- 0.4032067358493805
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  ],
228
  "lr": [
229
  3.9200000000000004e-05,
@@ -244,18 +454,48 @@
244
  8e-05,
245
  8e-05,
246
  8e-05,
247
- 7.889795918367346e-05,
248
- 7.277551020408164e-05,
249
- 6.665306122448979e-05,
250
- 5.96734693877551e-05,
251
- 5.3551020408163274e-05,
252
- 4.7428571428571427e-05,
253
- 4.044897959183674e-05,
254
- 3.432653061224491e-05,
255
- 2.820408163265307e-05,
256
- 2.1224489795918364e-05,
257
- 1.5102040816326524e-05,
258
- 8.9795918367347e-06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  ],
260
  "emb_lr": [],
261
  "eval_step": [
@@ -268,7 +508,17 @@
268
  1092,
269
  1199,
270
  1356,
271
- 1513
 
 
 
 
 
 
 
 
 
 
272
  ],
273
  "eval_accuracy": [
274
  0.01,
@@ -280,10 +530,20 @@
280
  0.0,
281
  0.0,
282
  0.0,
 
 
 
 
 
 
 
 
 
 
283
  0.0
284
  ]
285
  },
286
- "final_accuracy": 0.8704166666666666,
287
  "sft_eval": {
288
  "config": {
289
  "ops": "add_sub",
@@ -294,29 +554,29 @@
294
  },
295
  "splits": {
296
  "add_S0": {
297
- "full_accuracy": 0.9,
298
  "n_examples": 100,
299
  "per_subtask": {
300
  "SA": {
301
- "accuracy": 0.9818181818181818,
302
  "count": 605
303
  },
304
  "SS": {
305
- "accuracy": 1.0,
306
  "count": 95
307
  }
308
  }
309
  },
310
  "add_S1": {
311
- "full_accuracy": 0.83,
312
  "n_examples": 100,
313
  "per_subtask": {
314
  "SA": {
315
- "accuracy": 0.9803921568627451,
316
  "count": 204
317
  },
318
  "SC": {
319
- "accuracy": 0.9940828402366864,
320
  "count": 169
321
  },
322
  "SS": {
@@ -324,47 +584,47 @@
324
  "count": 31
325
  },
326
  "UC": {
327
- "accuracy": 0.956081081081081,
328
  "count": 296
329
  }
330
  }
331
  },
332
  "add_S2": {
333
- "full_accuracy": 0.54,
334
  "n_examples": 100,
335
  "per_subtask": {
336
  "SA": {
337
- "accuracy": 0.9815950920245399,
338
  "count": 163
339
  },
340
  "SC": {
341
- "accuracy": 0.9538461538461539,
342
  "count": 130
343
  },
344
  "SS": {
345
- "accuracy": 0.9770114942528736,
346
  "count": 87
347
  },
348
  "UC": {
349
- "accuracy": 0.812807881773399,
350
  "count": 203
351
  },
352
  "US": {
353
- "accuracy": 0.9401709401709402,
354
  "count": 117
355
  }
356
  }
357
  },
358
  "add_S3": {
359
- "full_accuracy": 0.3,
360
  "n_examples": 100,
361
  "per_subtask": {
362
  "SA": {
363
- "accuracy": 0.9752066115702479,
364
  "count": 121
365
  },
366
  "SC": {
367
- "accuracy": 0.9752066115702479,
368
  "count": 121
369
  },
370
  "SS": {
@@ -372,25 +632,25 @@
372
  "count": 49
373
  },
374
  "UC": {
375
- "accuracy": 0.6666666666666666,
376
  "count": 186
377
  },
378
  "US": {
379
- "accuracy": 0.7668161434977578,
380
  "count": 223
381
  }
382
  }
383
  },
384
  "add_S4": {
385
- "full_accuracy": 0.23,
386
  "n_examples": 100,
387
  "per_subtask": {
388
  "SA": {
389
- "accuracy": 0.9807692307692307,
390
  "count": 104
391
  },
392
  "SC": {
393
- "accuracy": 0.9811320754716981,
394
  "count": 106
395
  },
396
  "SS": {
@@ -398,17 +658,17 @@
398
  "count": 23
399
  },
400
  "UC": {
401
- "accuracy": 0.58125,
402
  "count": 160
403
  },
404
  "US": {
405
- "accuracy": 0.5635179153094463,
406
  "count": 307
407
  }
408
  }
409
  },
410
  "add_S5": {
411
- "full_accuracy": 0.12,
412
  "n_examples": 100,
413
  "per_subtask": {
414
  "SA": {
@@ -416,21 +676,21 @@
416
  "count": 100
417
  },
418
  "SC": {
419
- "accuracy": 0.94,
420
  "count": 100
421
  },
422
  "UC": {
423
- "accuracy": 0.3,
424
  "count": 100
425
  },
426
  "US": {
427
- "accuracy": 0.325,
428
  "count": 400
429
  }
430
  }
431
  },
432
  "add_S6": {
433
- "full_accuracy": 0.34,
434
  "n_examples": 100,
435
  "per_subtask": {
436
  "SC": {
@@ -438,43 +698,43 @@
438
  "count": 100
439
  },
440
  "UC": {
441
- "accuracy": 0.53,
442
  "count": 100
443
  },
444
  "US": {
445
- "accuracy": 0.514,
446
  "count": 500
447
  }
448
  }
449
  },
450
  "add_random": {
451
- "full_accuracy": 0.7,
452
  "n_examples": 200,
453
  "per_subtask": {
454
  "SA": {
455
- "accuracy": 0.9686800894854586,
456
  "count": 447
457
  },
458
  "SC": {
459
- "accuracy": 0.971875,
460
  "count": 320
461
  },
462
  "SS": {
463
- "accuracy": 0.9642857142857143,
464
  "count": 56
465
  },
466
  "UC": {
467
- "accuracy": 0.9243856332703214,
468
  "count": 529
469
  },
470
  "US": {
471
- "accuracy": 0.8958333333333334,
472
  "count": 48
473
  }
474
  }
475
  },
476
  "add_C3": {
477
- "full_accuracy": 0.48,
478
  "n_examples": 100,
479
  "per_subtask": {
480
  "SA": {
@@ -482,47 +742,47 @@
482
  "count": 300
483
  },
484
  "SC": {
485
- "accuracy": 0.99,
486
  "count": 100
487
  },
488
  "UC": {
489
- "accuracy": 0.7461139896373057,
490
  "count": 193
491
  },
492
  "US": {
493
- "accuracy": 0.7663551401869159,
494
  "count": 107
495
  }
496
  }
497
  },
498
  "add_C4": {
499
- "full_accuracy": 0.38,
500
  "n_examples": 100,
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 0.985,
504
  "count": 200
505
  },
506
  "SC": {
507
- "accuracy": 0.98,
508
  "count": 100
509
  },
510
  "UC": {
511
- "accuracy": 0.7734375,
512
  "count": 256
513
  },
514
  "US": {
515
- "accuracy": 0.8194444444444444,
516
  "count": 144
517
  }
518
  }
519
  },
520
  "add_C5": {
521
- "full_accuracy": 0.4,
522
  "n_examples": 100,
523
  "per_subtask": {
524
  "SA": {
525
- "accuracy": 0.99,
526
  "count": 100
527
  },
528
  "SC": {
@@ -530,17 +790,17 @@
530
  "count": 100
531
  },
532
  "UC": {
533
- "accuracy": 0.7908496732026143,
534
  "count": 306
535
  },
536
  "US": {
537
- "accuracy": 0.845360824742268,
538
  "count": 194
539
  }
540
  }
541
  },
542
  "add_C6": {
543
- "full_accuracy": 0.35,
544
  "n_examples": 100,
545
  "per_subtask": {
546
  "SC": {
@@ -548,39 +808,39 @@
548
  "count": 100
549
  },
550
  "UC": {
551
- "accuracy": 0.8005464480874317,
552
  "count": 366
553
  },
554
  "US": {
555
- "accuracy": 0.8290598290598291,
556
  "count": 234
557
  }
558
  }
559
  },
560
  "sub_M0": {
561
- "full_accuracy": 0.89,
562
  "n_examples": 100,
563
  "per_subtask": {
564
  "MD": {
565
- "accuracy": 0.9833610648918469,
566
  "count": 601
567
  },
568
  "ME": {
569
- "accuracy": 0.9797979797979798,
570
  "count": 99
571
  }
572
  }
573
  },
574
  "sub_M1": {
575
- "full_accuracy": 0.81,
576
  "n_examples": 100,
577
  "per_subtask": {
578
  "MD": {
579
- "accuracy": 0.992831541218638,
580
  "count": 279
581
  },
582
  "MB": {
583
- "accuracy": 0.993103448275862,
584
  "count": 145
585
  },
586
  "ME": {
@@ -588,39 +848,39 @@
588
  "count": 24
589
  },
590
  "UB": {
591
- "accuracy": 0.9285714285714286,
592
  "count": 252
593
  }
594
  }
595
  },
596
  "sub_M2": {
597
- "full_accuracy": 0.42,
598
  "n_examples": 100,
599
  "per_subtask": {
600
  "MD": {
601
- "accuracy": 0.9859154929577465,
602
  "count": 213
603
  },
604
  "MB": {
605
- "accuracy": 0.9734513274336283,
606
  "count": 113
607
  },
608
  "ME": {
609
- "accuracy": 0.9882352941176471,
610
  "count": 85
611
  },
612
  "UB": {
613
- "accuracy": 0.6740331491712708,
614
  "count": 181
615
  },
616
  "UD": {
617
- "accuracy": 0.9722222222222222,
618
  "count": 108
619
  }
620
  }
621
  },
622
  "sub_M3": {
623
- "full_accuracy": 0.06,
624
  "n_examples": 100,
625
  "per_subtask": {
626
  "MD": {
@@ -628,7 +888,7 @@
628
  "count": 179
629
  },
630
  "MB": {
631
- "accuracy": 0.970873786407767,
632
  "count": 103
633
  },
634
  "ME": {
@@ -636,17 +896,17 @@
636
  "count": 56
637
  },
638
  "UB": {
639
- "accuracy": 0.3624161073825503,
640
  "count": 149
641
  },
642
  "UD": {
643
- "accuracy": 0.6384976525821596,
644
  "count": 213
645
  }
646
  }
647
  },
648
  "sub_M4": {
649
- "full_accuracy": 0.02,
650
  "n_examples": 100,
651
  "per_subtask": {
652
  "MD": {
@@ -654,15 +914,15 @@
654
  "count": 200
655
  },
656
  "MB": {
657
- "accuracy": 0.91,
658
  "count": 100
659
  },
660
  "UB": {
661
- "accuracy": 0.38,
662
  "count": 100
663
  },
664
  "UD": {
665
- "accuracy": 0.29,
666
  "count": 300
667
  }
668
  }
@@ -676,29 +936,29 @@
676
  "count": 100
677
  },
678
  "MB": {
679
- "accuracy": 0.85,
680
  "count": 100
681
  },
682
  "UB": {
683
- "accuracy": 0.25,
684
  "count": 100
685
  },
686
  "UD": {
687
- "accuracy": 0.19,
688
  "count": 400
689
  }
690
  }
691
  },
692
  "sub_random": {
693
- "full_accuracy": 0.71,
694
  "n_examples": 200,
695
  "per_subtask": {
696
  "MD": {
697
- "accuracy": 0.9816666666666667,
698
  "count": 600
699
  },
700
  "MB": {
701
- "accuracy": 0.9925093632958801,
702
  "count": 267
703
  },
704
  "ME": {
@@ -706,61 +966,61 @@
706
  "count": 53
707
  },
708
  "UB": {
709
- "accuracy": 0.8906605922551253,
710
  "count": 439
711
  },
712
  "UD": {
713
- "accuracy": 0.8536585365853658,
714
  "count": 41
715
  }
716
  }
717
  },
718
  "sub_B3": {
719
- "full_accuracy": 0.33,
720
  "n_examples": 100,
721
  "per_subtask": {
722
  "MD": {
723
- "accuracy": 0.9866666666666667,
724
  "count": 300
725
  },
726
  "MB": {
727
- "accuracy": 0.98,
728
  "count": 100
729
  },
730
  "UB": {
731
- "accuracy": 0.6700507614213198,
732
  "count": 197
733
  },
734
  "UD": {
735
- "accuracy": 0.7378640776699029,
736
  "count": 103
737
  }
738
  }
739
  },
740
  "sub_B4": {
741
- "full_accuracy": 0.21,
742
  "n_examples": 100,
743
  "per_subtask": {
744
  "MD": {
745
- "accuracy": 0.995,
746
  "count": 200
747
  },
748
  "MB": {
749
- "accuracy": 0.98,
750
  "count": 100
751
  },
752
  "UB": {
753
- "accuracy": 0.6761133603238867,
754
  "count": 247
755
  },
756
  "UD": {
757
- "accuracy": 0.6666666666666666,
758
  "count": 153
759
  }
760
  }
761
  },
762
  "sub_B5": {
763
- "full_accuracy": 0.15,
764
  "n_examples": 100,
765
  "per_subtask": {
766
  "MD": {
@@ -768,22 +1028,22 @@
768
  "count": 100
769
  },
770
  "MB": {
771
- "accuracy": 0.97,
772
  "count": 100
773
  },
774
  "UB": {
775
- "accuracy": 0.6543624161073825,
776
  "count": 298
777
  },
778
  "UD": {
779
- "accuracy": 0.6089108910891089,
780
  "count": 202
781
  }
782
  }
783
  }
784
  },
785
  "summary": {
786
- "overall_accuracy": 0.44083333333333335,
787
  "total_examples": 2400,
788
  "n_splits": 22
789
  }
@@ -912,7 +1172,7 @@
912
  }
913
  },
914
  "add_S5": {
915
- "full_accuracy": 0.58,
916
  "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
@@ -924,7 +1184,7 @@
924
  "count": 100
925
  },
926
  "UC": {
927
- "accuracy": 0.58,
928
  "count": 100
929
  },
930
  "US": {
@@ -934,7 +1194,7 @@
934
  }
935
  },
936
  "add_S6": {
937
- "full_accuracy": 0.88,
938
  "n_examples": 100,
939
  "per_subtask": {
940
  "SC": {
@@ -942,11 +1202,11 @@
942
  "count": 100
943
  },
944
  "UC": {
945
- "accuracy": 0.95,
946
  "count": 100
947
  },
948
  "US": {
949
- "accuracy": 0.972,
950
  "count": 500
951
  }
952
  }
@@ -1022,7 +1282,7 @@
1022
  }
1023
  },
1024
  "add_C5": {
1025
- "full_accuracy": 0.97,
1026
  "n_examples": 100,
1027
  "per_subtask": {
1028
  "SA": {
@@ -1038,13 +1298,13 @@
1038
  "count": 306
1039
  },
1040
  "US": {
1041
- "accuracy": 0.9948453608247423,
1042
  "count": 194
1043
  }
1044
  }
1045
  },
1046
  "add_C6": {
1047
- "full_accuracy": 0.98,
1048
  "n_examples": 100,
1049
  "per_subtask": {
1050
  "SC": {
@@ -1052,7 +1312,7 @@
1052
  "count": 100
1053
  },
1054
  "UC": {
1055
- "accuracy": 0.994535519125683,
1056
  "count": 366
1057
  },
1058
  "US": {
@@ -1062,11 +1322,11 @@
1062
  }
1063
  },
1064
  "sub_M0": {
1065
- "full_accuracy": 0.99,
1066
  "n_examples": 100,
1067
  "per_subtask": {
1068
  "MD": {
1069
- "accuracy": 0.9983361064891847,
1070
  "count": 601
1071
  },
1072
  "ME": {
@@ -1098,11 +1358,11 @@
1098
  }
1099
  },
1100
  "sub_M2": {
1101
- "full_accuracy": 0.95,
1102
  "n_examples": 100,
1103
  "per_subtask": {
1104
  "MD": {
1105
- "accuracy": 0.9906103286384976,
1106
  "count": 213
1107
  },
1108
  "MB": {
@@ -1114,7 +1374,7 @@
1114
  "count": 85
1115
  },
1116
  "UB": {
1117
- "accuracy": 0.9834254143646409,
1118
  "count": 181
1119
  },
1120
  "UD": {
@@ -1124,7 +1384,7 @@
1124
  }
1125
  },
1126
  "sub_M3": {
1127
- "full_accuracy": 0.79,
1128
  "n_examples": 100,
1129
  "per_subtask": {
1130
  "MD": {
@@ -1140,17 +1400,17 @@
1140
  "count": 56
1141
  },
1142
  "UB": {
1143
- "accuracy": 0.87248322147651,
1144
  "count": 149
1145
  },
1146
  "UD": {
1147
- "accuracy": 0.9624413145539906,
1148
  "count": 213
1149
  }
1150
  }
1151
  },
1152
  "sub_M4": {
1153
- "full_accuracy": 0.3,
1154
  "n_examples": 100,
1155
  "per_subtask": {
1156
  "MD": {
@@ -1162,17 +1422,17 @@
1162
  "count": 100
1163
  },
1164
  "UB": {
1165
- "accuracy": 0.36,
1166
  "count": 100
1167
  },
1168
  "UD": {
1169
- "accuracy": 0.8133333333333334,
1170
  "count": 300
1171
  }
1172
  }
1173
  },
1174
  "sub_M5": {
1175
- "full_accuracy": 0.04,
1176
  "n_examples": 100,
1177
  "per_subtask": {
1178
  "MD": {
@@ -1184,21 +1444,21 @@
1184
  "count": 100
1185
  },
1186
  "UB": {
1187
- "accuracy": 0.12,
1188
  "count": 100
1189
  },
1190
  "UD": {
1191
- "accuracy": 0.6575,
1192
  "count": 400
1193
  }
1194
  }
1195
  },
1196
  "sub_random": {
1197
- "full_accuracy": 0.985,
1198
  "n_examples": 200,
1199
  "per_subtask": {
1200
  "MD": {
1201
- "accuracy": 0.995,
1202
  "count": 600
1203
  },
1204
  "MB": {
@@ -1220,11 +1480,11 @@
1220
  }
1221
  },
1222
  "sub_B3": {
1223
- "full_accuracy": 0.93,
1224
  "n_examples": 100,
1225
  "per_subtask": {
1226
  "MD": {
1227
- "accuracy": 0.9966666666666667,
1228
  "count": 300
1229
  },
1230
  "MB": {
@@ -1232,7 +1492,7 @@
1232
  "count": 100
1233
  },
1234
  "UB": {
1235
- "accuracy": 0.9695431472081218,
1236
  "count": 197
1237
  },
1238
  "UD": {
@@ -1242,11 +1502,11 @@
1242
  }
1243
  },
1244
  "sub_B4": {
1245
- "full_accuracy": 0.82,
1246
  "n_examples": 100,
1247
  "per_subtask": {
1248
  "MD": {
1249
- "accuracy": 0.995,
1250
  "count": 200
1251
  },
1252
  "MB": {
@@ -1254,17 +1514,17 @@
1254
  "count": 100
1255
  },
1256
  "UB": {
1257
- "accuracy": 0.9352226720647774,
1258
  "count": 247
1259
  },
1260
  "UD": {
1261
- "accuracy": 0.9607843137254902,
1262
  "count": 153
1263
  }
1264
  }
1265
  },
1266
  "sub_B5": {
1267
- "full_accuracy": 0.71,
1268
  "n_examples": 100,
1269
  "per_subtask": {
1270
  "MD": {
@@ -1276,22 +1536,22 @@
1276
  "count": 100
1277
  },
1278
  "UB": {
1279
- "accuracy": 0.9026845637583892,
1280
  "count": 298
1281
  },
1282
  "UD": {
1283
- "accuracy": 0.9207920792079208,
1284
  "count": 202
1285
  }
1286
  }
1287
  }
1288
  },
1289
  "summary": {
1290
- "overall_accuracy": 0.8704166666666666,
1291
  "total_examples": 2400,
1292
  "n_splits": 22
1293
  }
1294
  },
1295
- "sorl_overall_accuracy": 0.8704166666666666,
1296
- "sft_overall_accuracy": 0.44083333333333335
1297
  }
 
30
  1406,
31
  1463,
32
  1513,
33
+ 1563,
34
+ 1620,
35
+ 1670,
36
+ 1720,
37
+ 1777,
38
+ 1827,
39
+ 1877,
40
+ 1934,
41
+ 1984,
42
+ 2034,
43
+ 2091,
44
+ 2141,
45
+ 2191,
46
+ 2248,
47
+ 2298,
48
+ 2348,
49
+ 2405,
50
+ 2455,
51
+ 2505,
52
+ 2562,
53
+ 2612,
54
+ 2662,
55
+ 2719,
56
+ 2769,
57
+ 2819,
58
+ 2876,
59
+ 2926,
60
+ 2976,
61
+ 3033,
62
+ 3083,
63
+ 3133
64
  ],
65
  "loss": [
66
  9.340689659118652,
 
81
  -1.9102389812469482,
82
  -1.880913257598877,
83
  -1.9313758611679077,
84
+ -1.8197805881500244,
85
+ -1.7383019924163818,
86
+ -2.4177093505859375,
87
+ -1.3264975547790527,
88
+ -1.2404942512512207,
89
+ -1.5992217063903809,
90
+ -1.9741289615631104,
91
+ -1.0616717338562012,
92
+ -1.5642304420471191,
93
+ -1.3654898405075073,
94
+ -1.3151346445083618,
95
+ -0.7584792971611023,
96
+ -1.3367366790771484,
97
+ -1.2703979015350342,
98
+ -1.0298550128936768,
99
+ -1.6327556371688843,
100
+ -1.734535813331604,
101
+ -1.002307653427124,
102
+ -0.9018746018409729,
103
+ -2.051492214202881,
104
+ -0.6267629861831665,
105
+ -0.9340898990631104,
106
+ -0.7012732028961182,
107
+ -0.9501388669013977,
108
+ -0.9147610664367676,
109
+ -0.5015538930892944,
110
+ -1.2906075716018677,
111
+ -0.9741727113723755,
112
+ -0.7373249530792236,
113
+ -0.8238300085067749,
114
+ -0.48194777965545654,
115
+ -0.7015870213508606,
116
+ -0.4147356152534485,
117
+ -0.649371325969696,
118
+ -0.7445440292358398,
119
+ -0.7792887687683105,
120
+ -0.3351507782936096,
121
+ -0.4870011508464813,
122
+ -0.2853846549987793,
123
+ -0.4475783109664917,
124
+ -0.42194119095802307,
125
+ -0.4129505455493927
126
  ],
127
  "base_loss": [
128
  7.390318870544434,
 
143
  0.5179902911186218,
144
  0.47875410318374634,
145
  0.4619639217853546,
146
+ 0.39008235931396484,
147
+ 0.43836668133735657,
148
+ 0.48746830224990845,
149
+ 0.2996414601802826,
150
+ 0.26576855778694153,
151
+ 0.3262496590614319,
152
+ 0.33699464797973633,
153
+ 0.24052418768405914,
154
+ 0.2867306172847748,
155
+ 0.23591551184654236,
156
+ 0.233742356300354,
157
+ 0.15799184143543243,
158
+ 0.2535308301448822,
159
+ 0.22049279510974884,
160
+ 0.1870928704738617,
161
+ 0.2405395358800888,
162
+ 0.25070902705192566,
163
+ 0.16667716205120087,
164
+ 0.15809106826782227,
165
+ 0.31137070059776306,
166
+ 0.11368274688720703,
167
+ 0.14004917442798615,
168
+ 0.13582302629947662,
169
+ 0.14286470413208008,
170
+ 0.13543015718460083,
171
+ 0.08498942852020264,
172
+ 0.18833109736442566,
173
+ 0.13422875106334686,
174
+ 0.10775727778673172,
175
+ 0.12300104647874832,
176
+ 0.07167976349592209,
177
+ 0.11587230116128922,
178
+ 0.0645364299416542,
179
+ 0.09650751203298569,
180
+ 0.09376537799835205,
181
+ 0.09722624719142914,
182
+ 0.045721422880887985,
183
+ 0.06196039915084839,
184
+ 0.05605100467801094,
185
+ 0.06696300953626633,
186
+ 0.053617555648088455,
187
+ 0.052670858800411224
188
  ],
189
  "info_loss": [
190
  -0.4859275817871094,
 
205
  -0.3521292805671692,
206
  -0.33850210905075073,
207
  -0.33436745405197144,
208
+ -0.3098219931125641,
209
+ -0.3004074692726135,
210
+ -0.3650834560394287,
211
+ -0.2377147525548935,
212
+ -0.21807271242141724,
213
+ -0.2605586349964142,
214
+ -0.2963896691799164,
215
+ -0.19201940298080444,
216
+ -0.2403343915939331,
217
+ -0.2125036120414734,
218
+ -0.2005012035369873,
219
+ -0.14176680147647858,
220
+ -0.20001220703125,
221
+ -0.18811163306236267,
222
+ -0.16207079589366913,
223
+ -0.22838185727596283,
224
+ -0.23002460598945618,
225
+ -0.1475900262594223,
226
+ -0.1343834400177002,
227
+ -0.26728555560112,
228
+ -0.10240201652050018,
229
+ -0.1339702010154724,
230
+ -0.11256629973649979,
231
+ -0.13482710719108582,
232
+ -0.12813962996006012,
233
+ -0.08094710111618042,
234
+ -0.1647377759218216,
235
+ -0.13163283467292786,
236
+ -0.1023663803935051,
237
+ -0.11182866990566254,
238
+ -0.06797388941049576,
239
+ -0.09571453928947449,
240
+ -0.06312839686870575,
241
+ -0.0817064419388771,
242
+ -0.09321703016757965,
243
+ -0.09572642296552658,
244
+ -0.04448504000902176,
245
+ -0.06147210672497749,
246
+ -0.040348708629608154,
247
+ -0.05947120487689972,
248
+ -0.05293218418955803,
249
+ -0.05189599096775055
250
  ],
251
  "abs_loss": [
252
  2.2262747287750244,
 
267
  0.6123369336128235,
268
  0.49044322967529297,
269
  0.4312553107738495,
270
+ 0.38433346152305603,
271
+ 0.3575459420681,
272
+ 0.2791888415813446,
273
+ 0.3738294541835785,
274
+ 0.2938316762447357,
275
+ 0.2500404715538025,
276
+ 0.24418945610523224,
277
+ 0.2506295442581177,
278
+ 0.1918429583311081,
279
+ 0.21762271225452423,
280
+ 0.19322265684604645,
281
+ 0.174521803855896,
282
+ 0.1660517454147339,
283
+ 0.1862434595823288,
284
+ 0.14573685824871063,
285
+ 0.13860176503658295,
286
+ 0.13301999866962433,
287
+ 0.12413140386343002,
288
+ 0.0981183648109436,
289
+ 0.1176028847694397,
290
+ 0.12784387171268463,
291
+ 0.08845017105340958,
292
+ 0.10072199255228043,
293
+ 0.08419985324144363,
294
+ 0.10068891197443008,
295
+ 0.08726515620946884,
296
+ 0.08775246143341064,
297
+ 0.06938570737838745,
298
+ 0.07202962785959244,
299
+ 0.07672454416751862,
300
+ 0.06882075220346451,
301
+ 0.0632840245962143,
302
+ 0.06827760487794876,
303
+ 0.05087992176413536,
304
+ 0.04930369183421135,
305
+ 0.05453367158770561,
306
+ 0.04361484572291374,
307
+ 0.05198704078793526,
308
+ 0.042146146297454834,
309
+ 0.0432199127972126,
310
+ 0.042553484439849854,
311
+ 0.054908428341150284
312
  ],
313
  "zipf_loss": [
314
  6.587018966674805,
 
329
  1.031829595565796,
330
  0.9763095378875732,
331
  0.9072092771530151,
332
+ 0.8499235510826111,
333
+ 0.7916512489318848,
334
+ 0.7177382707595825,
335
+ 0.7136257290840149,
336
+ 0.6450810432434082,
337
+ 0.655110776424408,
338
+ 0.6283543109893799,
339
+ 0.5929351449012756,
340
+ 0.5331985950469971,
341
+ 0.5018686056137085,
342
+ 0.43681272864341736,
343
+ 0.48374468088150024,
344
+ 0.39324942231178284,
345
+ 0.37160131335258484,
346
+ 0.3891863524913788,
347
+ 0.3966630697250366,
348
+ 0.3016990125179291,
349
+ 0.2945023775100708,
350
+ 0.2740568518638611,
351
+ 0.29823243618011475,
352
+ 0.27079012989997864,
353
+ 0.2567179501056671,
354
+ 0.278494656085968,
355
+ 0.24684756994247437,
356
+ 0.22113612294197083,
357
+ 0.21420112252235413,
358
+ 0.1596638262271881,
359
+ 0.2009882926940918,
360
+ 0.17137852311134338,
361
+ 0.1637832671403885,
362
+ 0.1192292720079422,
363
+ 0.13335762917995453,
364
+ 0.14518415927886963,
365
+ 0.06609760969877243,
366
+ 0.08893045783042908,
367
+ 0.07529586553573608,
368
+ 0.05961669981479645,
369
+ 0.06056078150868416,
370
+ 0.05783679336309433,
371
+ 0.07584869861602783,
372
+ 0.0495077446103096,
373
+ 0.04784762114286423
374
  ],
375
  "denoise_loss": [],
376
  "ortho_loss": [
 
392
  0.3624407649040222,
393
  0.36967605352401733,
394
  0.3742343485355377,
395
+ 0.3982963562011719,
396
+ 0.3951874375343323,
397
+ 0.3906378149986267,
398
+ 0.3858691453933716,
399
+ 0.390056312084198,
400
+ 0.3822699785232544,
401
+ 0.3838372230529785,
402
+ 0.3822362422943115,
403
+ 0.375870019197464,
404
+ 0.38709157705307007,
405
+ 0.3905230164527893,
406
+ 0.39558085799217224,
407
+ 0.3952634334564209,
408
+ 0.39085277915000916,
409
+ 0.39921504259109497,
410
+ 0.39666083455085754,
411
+ 0.3942829966545105,
412
+ 0.3991093337535858,
413
+ 0.39491263031959534,
414
+ 0.3742454946041107,
415
+ 0.3708445429801941,
416
+ 0.37462660670280457,
417
+ 0.3777347207069397,
418
+ 0.36790767312049866,
419
+ 0.36087462306022644,
420
+ 0.3528926968574524,
421
+ 0.3517568111419678,
422
+ 0.3517394959926605,
423
+ 0.3531302809715271,
424
+ 0.35252636671066284,
425
+ 0.35542812943458557,
426
+ 0.35618269443511963,
427
+ 0.3573307693004608,
428
+ 0.3604697287082672,
429
+ 0.36059707403182983,
430
+ 0.36255255341529846,
431
+ 0.362486869096756,
432
+ 0.3620503544807434,
433
+ 0.363882839679718,
434
+ 0.36408552527427673,
435
+ 0.3636517822742462,
436
+ 0.3643637001514435
437
  ],
438
  "lr": [
439
  3.9200000000000004e-05,
 
454
  8e-05,
455
  8e-05,
456
  8e-05,
457
+ 8e-05,
458
+ 8e-05,
459
+ 8e-05,
460
+ 8e-05,
461
+ 8e-05,
462
+ 8e-05,
463
+ 8e-05,
464
+ 8e-05,
465
+ 8e-05,
466
+ 8e-05,
467
+ 8e-05,
468
+ 8e-05,
469
+ 8e-05,
470
+ 8e-05,
471
+ 8e-05,
472
+ 8e-05,
473
+ 8e-05,
474
+ 8e-05,
475
+ 7.946710526315791e-05,
476
+ 7.650657894736843e-05,
477
+ 7.354605263157895e-05,
478
+ 7.017105263157896e-05,
479
+ 6.721052631578948e-05,
480
+ 6.425e-05,
481
+ 6.0875e-05,
482
+ 5.791447368421054e-05,
483
+ 5.495394736842105e-05,
484
+ 5.157894736842105e-05,
485
+ 4.861842105263157e-05,
486
+ 4.565789473684212e-05,
487
+ 4.2282894736842104e-05,
488
+ 3.9322368421052625e-05,
489
+ 3.636184210526315e-05,
490
+ 3.2986842105263165e-05,
491
+ 3.0026315789473686e-05,
492
+ 2.7065789473684206e-05,
493
+ 2.3690789473684223e-05,
494
+ 2.0730263157894743e-05,
495
+ 1.7769736842105264e-05,
496
+ 1.4394736842105275e-05,
497
+ 1.1434210526315796e-05,
498
+ 8.473684210526318e-06
499
  ],
500
  "emb_lr": [],
501
  "eval_step": [
 
508
  1092,
509
  1199,
510
  1356,
511
+ 1513,
512
+ 1670,
513
+ 1827,
514
+ 1984,
515
+ 2141,
516
+ 2298,
517
+ 2455,
518
+ 2612,
519
+ 2769,
520
+ 2926,
521
+ 3083
522
  ],
523
  "eval_accuracy": [
524
  0.01,
 
530
  0.0,
531
  0.0,
532
  0.0,
533
+ 0.0,
534
+ 0.0,
535
+ 0.0,
536
+ 0.0,
537
+ 0.0,
538
+ 0.0,
539
+ 0.0,
540
+ 0.0,
541
+ 0.0,
542
+ 0.0,
543
  0.0
544
  ]
545
  },
546
+ "final_accuracy": 0.9470833333333334,
547
  "sft_eval": {
548
  "config": {
549
  "ops": "add_sub",
 
554
  },
555
  "splits": {
556
  "add_S0": {
557
+ "full_accuracy": 0.94,
558
  "n_examples": 100,
559
  "per_subtask": {
560
  "SA": {
561
+ "accuracy": 0.9917355371900827,
562
  "count": 605
563
  },
564
  "SS": {
565
+ "accuracy": 0.9894736842105263,
566
  "count": 95
567
  }
568
  }
569
  },
570
  "add_S1": {
571
+ "full_accuracy": 0.99,
572
  "n_examples": 100,
573
  "per_subtask": {
574
  "SA": {
575
+ "accuracy": 0.9950980392156863,
576
  "count": 204
577
  },
578
  "SC": {
579
+ "accuracy": 1.0,
580
  "count": 169
581
  },
582
  "SS": {
 
584
  "count": 31
585
  },
586
  "UC": {
587
+ "accuracy": 1.0,
588
  "count": 296
589
  }
590
  }
591
  },
592
  "add_S2": {
593
+ "full_accuracy": 0.91,
594
  "n_examples": 100,
595
  "per_subtask": {
596
  "SA": {
597
+ "accuracy": 1.0,
598
  "count": 163
599
  },
600
  "SC": {
601
+ "accuracy": 0.9615384615384616,
602
  "count": 130
603
  },
604
  "SS": {
605
+ "accuracy": 0.9885057471264368,
606
  "count": 87
607
  },
608
  "UC": {
609
+ "accuracy": 0.9852216748768473,
610
  "count": 203
611
  },
612
  "US": {
613
+ "accuracy": 1.0,
614
  "count": 117
615
  }
616
  }
617
  },
618
  "add_S3": {
619
+ "full_accuracy": 0.67,
620
  "n_examples": 100,
621
  "per_subtask": {
622
  "SA": {
623
+ "accuracy": 0.9917355371900827,
624
  "count": 121
625
  },
626
  "SC": {
627
+ "accuracy": 1.0,
628
  "count": 121
629
  },
630
  "SS": {
 
632
  "count": 49
633
  },
634
  "UC": {
635
+ "accuracy": 0.8333333333333334,
636
  "count": 186
637
  },
638
  "US": {
639
+ "accuracy": 0.9910313901345291,
640
  "count": 223
641
  }
642
  }
643
  },
644
  "add_S4": {
645
+ "full_accuracy": 0.58,
646
  "n_examples": 100,
647
  "per_subtask": {
648
  "SA": {
649
+ "accuracy": 1.0,
650
  "count": 104
651
  },
652
  "SC": {
653
+ "accuracy": 1.0,
654
  "count": 106
655
  },
656
  "SS": {
 
658
  "count": 23
659
  },
660
  "UC": {
661
+ "accuracy": 0.7875,
662
  "count": 160
663
  },
664
  "US": {
665
+ "accuracy": 0.9218241042345277,
666
  "count": 307
667
  }
668
  }
669
  },
670
  "add_S5": {
671
+ "full_accuracy": 0.43,
672
  "n_examples": 100,
673
  "per_subtask": {
674
  "SA": {
 
676
  "count": 100
677
  },
678
  "SC": {
679
+ "accuracy": 1.0,
680
  "count": 100
681
  },
682
  "UC": {
683
+ "accuracy": 0.5,
684
  "count": 100
685
  },
686
  "US": {
687
+ "accuracy": 0.775,
688
  "count": 400
689
  }
690
  }
691
  },
692
  "add_S6": {
693
+ "full_accuracy": 0.85,
694
  "n_examples": 100,
695
  "per_subtask": {
696
  "SC": {
 
698
  "count": 100
699
  },
700
  "UC": {
701
+ "accuracy": 0.85,
702
  "count": 100
703
  },
704
  "US": {
705
+ "accuracy": 0.91,
706
  "count": 500
707
  }
708
  }
709
  },
710
  "add_random": {
711
+ "full_accuracy": 0.975,
712
  "n_examples": 200,
713
  "per_subtask": {
714
  "SA": {
715
+ "accuracy": 0.9977628635346756,
716
  "count": 447
717
  },
718
  "SC": {
719
+ "accuracy": 0.996875,
720
  "count": 320
721
  },
722
  "SS": {
723
+ "accuracy": 0.9821428571428571,
724
  "count": 56
725
  },
726
  "UC": {
727
+ "accuracy": 0.994328922495274,
728
  "count": 529
729
  },
730
  "US": {
731
+ "accuracy": 1.0,
732
  "count": 48
733
  }
734
  }
735
  },
736
  "add_C3": {
737
+ "full_accuracy": 0.84,
738
  "n_examples": 100,
739
  "per_subtask": {
740
  "SA": {
 
742
  "count": 300
743
  },
744
  "SC": {
745
+ "accuracy": 1.0,
746
  "count": 100
747
  },
748
  "UC": {
749
+ "accuracy": 0.9222797927461139,
750
  "count": 193
751
  },
752
  "US": {
753
+ "accuracy": 0.9906542056074766,
754
  "count": 107
755
  }
756
  }
757
  },
758
  "add_C4": {
759
+ "full_accuracy": 0.83,
760
  "n_examples": 100,
761
  "per_subtask": {
762
  "SA": {
763
+ "accuracy": 1.0,
764
  "count": 200
765
  },
766
  "SC": {
767
+ "accuracy": 1.0,
768
  "count": 100
769
  },
770
  "UC": {
771
+ "accuracy": 0.9375,
772
  "count": 256
773
  },
774
  "US": {
775
+ "accuracy": 0.9722222222222222,
776
  "count": 144
777
  }
778
  }
779
  },
780
  "add_C5": {
781
+ "full_accuracy": 0.76,
782
  "n_examples": 100,
783
  "per_subtask": {
784
  "SA": {
785
+ "accuracy": 1.0,
786
  "count": 100
787
  },
788
  "SC": {
 
790
  "count": 100
791
  },
792
  "UC": {
793
+ "accuracy": 0.9215686274509803,
794
  "count": 306
795
  },
796
  "US": {
797
+ "accuracy": 0.9329896907216495,
798
  "count": 194
799
  }
800
  }
801
  },
802
  "add_C6": {
803
+ "full_accuracy": 0.81,
804
  "n_examples": 100,
805
  "per_subtask": {
806
  "SC": {
 
808
  "count": 100
809
  },
810
  "UC": {
811
+ "accuracy": 0.9590163934426229,
812
  "count": 366
813
  },
814
  "US": {
815
+ "accuracy": 0.9786324786324786,
816
  "count": 234
817
  }
818
  }
819
  },
820
  "sub_M0": {
821
+ "full_accuracy": 0.98,
822
  "n_examples": 100,
823
  "per_subtask": {
824
  "MD": {
825
+ "accuracy": 0.9966722129783694,
826
  "count": 601
827
  },
828
  "ME": {
829
+ "accuracy": 1.0,
830
  "count": 99
831
  }
832
  }
833
  },
834
  "sub_M1": {
835
+ "full_accuracy": 0.98,
836
  "n_examples": 100,
837
  "per_subtask": {
838
  "MD": {
839
+ "accuracy": 0.996415770609319,
840
  "count": 279
841
  },
842
  "MB": {
843
+ "accuracy": 1.0,
844
  "count": 145
845
  },
846
  "ME": {
 
848
  "count": 24
849
  },
850
  "UB": {
851
+ "accuracy": 0.996031746031746,
852
  "count": 252
853
  }
854
  }
855
  },
856
  "sub_M2": {
857
+ "full_accuracy": 0.95,
858
  "n_examples": 100,
859
  "per_subtask": {
860
  "MD": {
861
+ "accuracy": 1.0,
862
  "count": 213
863
  },
864
  "MB": {
865
+ "accuracy": 1.0,
866
  "count": 113
867
  },
868
  "ME": {
869
+ "accuracy": 1.0,
870
  "count": 85
871
  },
872
  "UB": {
873
+ "accuracy": 0.9723756906077348,
874
  "count": 181
875
  },
876
  "UD": {
877
+ "accuracy": 1.0,
878
  "count": 108
879
  }
880
  }
881
  },
882
  "sub_M3": {
883
+ "full_accuracy": 0.19,
884
  "n_examples": 100,
885
  "per_subtask": {
886
  "MD": {
 
888
  "count": 179
889
  },
890
  "MB": {
891
+ "accuracy": 1.0,
892
  "count": 103
893
  },
894
  "ME": {
 
896
  "count": 56
897
  },
898
  "UB": {
899
+ "accuracy": 0.48322147651006714,
900
  "count": 149
901
  },
902
  "UD": {
903
+ "accuracy": 0.9530516431924883,
904
  "count": 213
905
  }
906
  }
907
  },
908
  "sub_M4": {
909
+ "full_accuracy": 0.01,
910
  "n_examples": 100,
911
  "per_subtask": {
912
  "MD": {
 
914
  "count": 200
915
  },
916
  "MB": {
917
+ "accuracy": 1.0,
918
  "count": 100
919
  },
920
  "UB": {
921
+ "accuracy": 0.13,
922
  "count": 100
923
  },
924
  "UD": {
925
+ "accuracy": 0.5666666666666667,
926
  "count": 300
927
  }
928
  }
 
936
  "count": 100
937
  },
938
  "MB": {
939
+ "accuracy": 1.0,
940
  "count": 100
941
  },
942
  "UB": {
943
+ "accuracy": 0.09,
944
  "count": 100
945
  },
946
  "UD": {
947
+ "accuracy": 0.405,
948
  "count": 400
949
  }
950
  }
951
  },
952
  "sub_random": {
953
+ "full_accuracy": 0.97,
954
  "n_examples": 200,
955
  "per_subtask": {
956
  "MD": {
957
+ "accuracy": 1.0,
958
  "count": 600
959
  },
960
  "MB": {
961
+ "accuracy": 1.0,
962
  "count": 267
963
  },
964
  "ME": {
 
966
  "count": 53
967
  },
968
  "UB": {
969
+ "accuracy": 0.9863325740318907,
970
  "count": 439
971
  },
972
  "UD": {
973
+ "accuracy": 1.0,
974
  "count": 41
975
  }
976
  }
977
  },
978
  "sub_B3": {
979
+ "full_accuracy": 0.74,
980
  "n_examples": 100,
981
  "per_subtask": {
982
  "MD": {
983
+ "accuracy": 0.9966666666666667,
984
  "count": 300
985
  },
986
  "MB": {
987
+ "accuracy": 1.0,
988
  "count": 100
989
  },
990
  "UB": {
991
+ "accuracy": 0.8730964467005076,
992
  "count": 197
993
  },
994
  "UD": {
995
+ "accuracy": 1.0,
996
  "count": 103
997
  }
998
  }
999
  },
1000
  "sub_B4": {
1001
+ "full_accuracy": 0.62,
1002
  "n_examples": 100,
1003
  "per_subtask": {
1004
  "MD": {
1005
+ "accuracy": 1.0,
1006
  "count": 200
1007
  },
1008
  "MB": {
1009
+ "accuracy": 1.0,
1010
  "count": 100
1011
  },
1012
  "UB": {
1013
+ "accuracy": 0.854251012145749,
1014
  "count": 247
1015
  },
1016
  "UD": {
1017
+ "accuracy": 0.8888888888888888,
1018
  "count": 153
1019
  }
1020
  }
1021
  },
1022
  "sub_B5": {
1023
+ "full_accuracy": 0.46,
1024
  "n_examples": 100,
1025
  "per_subtask": {
1026
  "MD": {
 
1028
  "count": 100
1029
  },
1030
  "MB": {
1031
+ "accuracy": 1.0,
1032
  "count": 100
1033
  },
1034
  "UB": {
1035
+ "accuracy": 0.8288590604026845,
1036
  "count": 298
1037
  },
1038
  "UD": {
1039
+ "accuracy": 0.8811881188118812,
1040
  "count": 202
1041
  }
1042
  }
1043
  }
1044
  },
1045
  "summary": {
1046
+ "overall_accuracy": 0.7258333333333333,
1047
  "total_examples": 2400,
1048
  "n_splits": 22
1049
  }
 
1172
  }
1173
  },
1174
  "add_S5": {
1175
+ "full_accuracy": 0.6,
1176
  "n_examples": 100,
1177
  "per_subtask": {
1178
  "SA": {
 
1184
  "count": 100
1185
  },
1186
  "UC": {
1187
+ "accuracy": 0.6,
1188
  "count": 100
1189
  },
1190
  "US": {
 
1194
  }
1195
  },
1196
  "add_S6": {
1197
+ "full_accuracy": 1.0,
1198
  "n_examples": 100,
1199
  "per_subtask": {
1200
  "SC": {
 
1202
  "count": 100
1203
  },
1204
  "UC": {
1205
+ "accuracy": 1.0,
1206
  "count": 100
1207
  },
1208
  "US": {
1209
+ "accuracy": 1.0,
1210
  "count": 500
1211
  }
1212
  }
 
1282
  }
1283
  },
1284
  "add_C5": {
1285
+ "full_accuracy": 0.98,
1286
  "n_examples": 100,
1287
  "per_subtask": {
1288
  "SA": {
 
1298
  "count": 306
1299
  },
1300
  "US": {
1301
+ "accuracy": 1.0,
1302
  "count": 194
1303
  }
1304
  }
1305
  },
1306
  "add_C6": {
1307
+ "full_accuracy": 1.0,
1308
  "n_examples": 100,
1309
  "per_subtask": {
1310
  "SC": {
 
1312
  "count": 100
1313
  },
1314
  "UC": {
1315
+ "accuracy": 1.0,
1316
  "count": 366
1317
  },
1318
  "US": {
 
1322
  }
1323
  },
1324
  "sub_M0": {
1325
+ "full_accuracy": 1.0,
1326
  "n_examples": 100,
1327
  "per_subtask": {
1328
  "MD": {
1329
+ "accuracy": 1.0,
1330
  "count": 601
1331
  },
1332
  "ME": {
 
1358
  }
1359
  },
1360
  "sub_M2": {
1361
+ "full_accuracy": 1.0,
1362
  "n_examples": 100,
1363
  "per_subtask": {
1364
  "MD": {
1365
+ "accuracy": 1.0,
1366
  "count": 213
1367
  },
1368
  "MB": {
 
1374
  "count": 85
1375
  },
1376
  "UB": {
1377
+ "accuracy": 1.0,
1378
  "count": 181
1379
  },
1380
  "UD": {
 
1384
  }
1385
  },
1386
  "sub_M3": {
1387
+ "full_accuracy": 1.0,
1388
  "n_examples": 100,
1389
  "per_subtask": {
1390
  "MD": {
 
1400
  "count": 56
1401
  },
1402
  "UB": {
1403
+ "accuracy": 1.0,
1404
  "count": 149
1405
  },
1406
  "UD": {
1407
+ "accuracy": 1.0,
1408
  "count": 213
1409
  }
1410
  }
1411
  },
1412
  "sub_M4": {
1413
+ "full_accuracy": 0.98,
1414
  "n_examples": 100,
1415
  "per_subtask": {
1416
  "MD": {
 
1422
  "count": 100
1423
  },
1424
  "UB": {
1425
+ "accuracy": 1.0,
1426
  "count": 100
1427
  },
1428
  "UD": {
1429
+ "accuracy": 0.9933333333333333,
1430
  "count": 300
1431
  }
1432
  }
1433
  },
1434
  "sub_M5": {
1435
+ "full_accuracy": 0.22,
1436
  "n_examples": 100,
1437
  "per_subtask": {
1438
  "MD": {
 
1444
  "count": 100
1445
  },
1446
  "UB": {
1447
+ "accuracy": 0.22,
1448
  "count": 100
1449
  },
1450
  "UD": {
1451
+ "accuracy": 0.9975,
1452
  "count": 400
1453
  }
1454
  }
1455
  },
1456
  "sub_random": {
1457
+ "full_accuracy": 1.0,
1458
  "n_examples": 200,
1459
  "per_subtask": {
1460
  "MD": {
1461
+ "accuracy": 1.0,
1462
  "count": 600
1463
  },
1464
  "MB": {
 
1480
  }
1481
  },
1482
  "sub_B3": {
1483
+ "full_accuracy": 1.0,
1484
  "n_examples": 100,
1485
  "per_subtask": {
1486
  "MD": {
1487
+ "accuracy": 1.0,
1488
  "count": 300
1489
  },
1490
  "MB": {
 
1492
  "count": 100
1493
  },
1494
  "UB": {
1495
+ "accuracy": 1.0,
1496
  "count": 197
1497
  },
1498
  "UD": {
 
1502
  }
1503
  },
1504
  "sub_B4": {
1505
+ "full_accuracy": 1.0,
1506
  "n_examples": 100,
1507
  "per_subtask": {
1508
  "MD": {
1509
+ "accuracy": 1.0,
1510
  "count": 200
1511
  },
1512
  "MB": {
 
1514
  "count": 100
1515
  },
1516
  "UB": {
1517
+ "accuracy": 1.0,
1518
  "count": 247
1519
  },
1520
  "UD": {
1521
+ "accuracy": 1.0,
1522
  "count": 153
1523
  }
1524
  }
1525
  },
1526
  "sub_B5": {
1527
+ "full_accuracy": 0.96,
1528
  "n_examples": 100,
1529
  "per_subtask": {
1530
  "MD": {
 
1536
  "count": 100
1537
  },
1538
  "UB": {
1539
+ "accuracy": 0.9865771812080537,
1540
  "count": 298
1541
  },
1542
  "UD": {
1543
+ "accuracy": 1.0,
1544
  "count": 202
1545
  }
1546
  }
1547
  }
1548
  },
1549
  "summary": {
1550
+ "overall_accuracy": 0.9470833333333334,
1551
  "total_examples": 2400,
1552
  "n_splits": 22
1553
  }
1554
  },
1555
+ "sorl_overall_accuracy": 0.9470833333333334,
1556
+ "sft_overall_accuracy": 0.7258333333333333
1557
  }
add_sub_sorl_v1_abs10_K1_10K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4dc104978754dca706830aa83c50e922f112f2961bfe046fb9acb75fc2f4cfb9
3
  size 650303660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f05b0b5afcd6ea5dc437d2363128885fcfd080a9b715032b42b76a2c61eb2d
3
  size 650303660
add_sub_sorl_v1_abs10_K1_10K/train_config.json CHANGED
@@ -30,7 +30,7 @@
30
  "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
  "gradient_accumulation_steps": 1,
33
- "num_epochs": 10,
34
  "emb_warmup_steps": 0,
35
  "log_every": 50,
36
  "eval_every": 156,
@@ -69,16 +69,16 @@
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_10K",
72
- "git_commit": "8d5ee5420119746ef4e2c87570eb250c9718f643",
73
- "timestamp": "2026-04-12T23:50:30.170809+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "yh5d4it1",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/yh5d4it1",
81
- "final_accuracy": 0.8704166666666666,
82
- "sft_accuracy": 0.44083333333333335,
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
30
  "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
  "gradient_accumulation_steps": 1,
33
+ "num_epochs": 20,
34
  "emb_warmup_steps": 0,
35
  "log_every": 50,
36
  "eval_every": 156,
 
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_10K",
72
+ "git_commit": "57deaa28d9c21e39ddac5ef448d6e1be992fba91",
73
+ "timestamp": "2026-04-13T10:55:46.960375+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
+ "wandb_run_id": "0r6snjo4",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/0r6snjo4",
81
+ "final_accuracy": 0.9470833333333334,
82
+ "sft_accuracy": 0.7258333333333333,
83
  "eval_method": "ArithmeticEvaluator"
84
  }