amirali1985 commited on
Commit
13a6d03
·
verified ·
1 Parent(s): 15c9e50

Upload add_sub_sorl_v1_abs30_10K

Browse files
add_sub_sorl_v1_abs30_10K/metrics.json CHANGED
@@ -33,229 +33,229 @@
33
  1563
34
  ],
35
  "loss": [
36
- 12.37744426727295,
37
- 7.511226177215576,
38
- 4.2184038162231445,
39
- 2.3564629554748535,
40
- 2.5140318870544434,
41
- 1.9689207077026367,
42
- 2.304285764694214,
43
- 2.3624253273010254,
44
- 1.6747889518737793,
45
- 1.6878020763397217,
46
- 1.355285882949829,
47
- 1.1105128526687622,
48
- 1.1249760389328003,
49
- 1.014577865600586,
50
- 0.7836594581604004,
51
- 0.9053444266319275,
52
- 0.783288836479187,
53
- 0.7557217478752136,
54
- 0.6148194074630737,
55
- 0.27221277356147766,
56
- 0.543100893497467,
57
- -0.13505232334136963,
58
- -0.5669998526573181,
59
- -0.4266250431537628,
60
- -1.53482186794281,
61
- -2.418619155883789,
62
- -2.2528090476989746,
63
- -3.1418380737304688,
64
- -3.4164059162139893,
65
- -4.483319282531738
66
  ],
67
  "base_loss": [
68
- 7.905151844024658,
69
- 5.81351375579834,
70
- 3.7999191284179688,
71
- 2.091179609298706,
72
- 1.9467359781265259,
73
- 1.8935550451278687,
74
- 1.8686275482177734,
75
- 1.8094427585601807,
76
- 1.896533727645874,
77
- 1.813745379447937,
78
- 1.8012498617172241,
79
- 1.861536979675293,
80
- 1.8485734462738037,
81
- 1.8326823711395264,
82
- 1.8518121242523193,
83
- 1.7422752380371094,
84
- 1.865788459777832,
85
- 1.840556025505066,
86
- 1.8511160612106323,
87
- 1.8595082759857178,
88
- 1.7606910467147827,
89
- 1.7480539083480835,
90
- 1.7231824398040771,
91
- 1.7938270568847656,
92
- 1.8038008213043213,
93
- 1.7478877305984497,
94
- 1.7873140573501587,
95
- 1.8223601579666138,
96
- 1.8396693468093872,
97
- 1.8911950588226318
98
  ],
99
  "info_loss": [
100
- -0.1752934455871582,
101
- -0.12497377395629883,
102
- -0.10779404640197754,
103
- -0.09296655654907227,
104
- -0.0572664737701416,
105
- -0.10382282733917236,
106
- -0.06734299659729004,
107
- -0.055501341819763184,
108
- -0.13261711597442627,
109
- -0.12388193607330322,
110
- -0.15424871444702148,
111
- -0.1853572130203247,
112
- -0.1832747459411621,
113
- -0.19157755374908447,
114
- -0.2160094976425171,
115
- -0.19262421131134033,
116
- -0.21582400798797607,
117
- -0.2133394479751587,
118
- -0.22457683086395264,
119
- -0.2426363229751587,
120
- -0.1686021089553833,
121
- -0.2219860553741455,
122
- -0.26084792613983154,
123
- -0.25028252601623535,
124
- -0.36128807067871094,
125
- -0.44187653064727783,
126
- -0.4287785291671753,
127
- -0.5215833187103271,
128
- -0.5496792793273926,
129
- -0.6622849702835083
130
  ],
131
  "abs_loss": [
132
- 3.363813877105713,
133
- 3.225320816040039,
134
- 2.8528130054473877,
135
- 2.6181869506835938,
136
- 2.622387409210205,
137
- 2.5087389945983887,
138
- 2.5558464527130127,
139
- 2.5691847801208496,
140
- 2.4980592727661133,
141
- 2.6305322647094727,
142
- 2.508139133453369,
143
- 2.5618228912353516,
144
- 2.598161220550537,
145
- 2.5265064239501953,
146
- 2.4846131801605225,
147
- 2.5860702991485596,
148
- 2.4830150604248047,
149
- 2.454457998275757,
150
- 2.4279370307922363,
151
- 2.3325886726379395,
152
- 1.648212194442749,
153
- 1.3609596490859985,
154
- 1.3366903066635132,
155
- 1.2300528287887573,
156
- 1.2981481552124023,
157
- 1.1689049005508423,
158
- 1.1186243295669556,
159
- 1.182748556137085,
160
- 1.1046546697616577,
161
- 1.1569410562515259
162
  ],
163
  "zipf_loss": [
164
- 5.888845443725586,
165
- 2.624917984008789,
166
- 1.2111440896987915,
167
- 0.9331302046775818,
168
- 0.8777217864990234,
169
- 0.8627199530601501,
170
- 0.8535035252571106,
171
- 0.851077675819397,
172
- 0.8546203970909119,
173
- 0.8498228788375854,
174
- 0.8457091450691223,
175
- 0.8463656902313232,
176
- 0.8493338823318481,
177
- 0.8450204133987427,
178
- 0.8434810042381287,
179
- 0.8307042717933655,
180
- 0.8274389505386353,
181
- 0.8031144142150879,
182
- 0.7666779160499573,
183
- 0.6058088541030884,
184
- 0.3036097288131714,
185
- 0.20065835118293762,
186
- 0.184627965092659,
187
- 0.1593678891658783,
188
- 0.14444319903850555,
189
- 0.13536792993545532,
190
- 0.13580000400543213,
191
- 0.13336023688316345,
192
- 0.13025209307670593,
193
- 0.13264060020446777
194
  ],
195
  "denoise_loss": [],
196
  "ortho_loss": [
197
- 0.44732314348220825,
198
- 0.1550220400094986,
199
- 0.08860894292593002,
200
- 0.06823601573705673,
201
- 0.05723876133561134,
202
- 0.05476710945367813,
203
- 0.050798289477825165,
204
- 0.04427957907319069,
205
- 0.04448474571108818,
206
- 0.04900604858994484,
207
- 0.055344294756650925,
208
- 0.057683270424604416,
209
- 0.06224667280912399,
210
- 0.0660625472664833,
211
- 0.0682113990187645,
212
- 0.07531450688838959,
213
- 0.07825914025306702,
214
- 0.07614783942699432,
215
- 0.08581066131591797,
216
- 0.0885457992553711,
217
- 0.0975886732339859,
218
- 0.10165664553642273,
219
- 0.10155351459980011,
220
- 0.1034129410982132,
221
- 0.10390264540910721,
222
- 0.10435115545988083,
223
- 0.10648810118436813,
224
- 0.10846319049596786,
225
- 0.11003367602825165,
226
- 0.11091622710227966
227
  ],
228
  "lr": [
229
- 1.9600000000000002e-05,
230
- 3.96e-05,
231
- 4e-05,
232
- 4e-05,
233
- 4e-05,
234
- 4e-05,
235
- 4e-05,
236
- 4e-05,
237
- 4e-05,
238
- 4e-05,
239
- 4e-05,
240
- 4e-05,
241
- 4e-05,
242
- 4e-05,
243
- 4e-05,
244
- 4e-05,
245
- 4e-05,
246
- 4e-05,
247
- 3.944897959183673e-05,
248
- 3.638775510204082e-05,
249
- 3.3326530612244897e-05,
250
- 2.983673469387755e-05,
251
- 2.6775510204081637e-05,
252
- 2.3714285714285713e-05,
253
- 2.022448979591837e-05,
254
- 1.7163265306122454e-05,
255
- 1.4102040816326535e-05,
256
- 1.0612244897959182e-05,
257
- 7.551020408163262e-06,
258
- 4.48979591836735e-06
259
  ],
260
  "emb_lr": [],
261
  "eval_step": [
@@ -280,10 +280,10 @@
280
  0.0,
281
  0.02,
282
  0.01,
283
- 0.02
284
  ]
285
  },
286
- "final_accuracy": 0.012083333333333333,
287
  "sft_eval": {
288
  "config": {
289
  "ops": "add_sub",
@@ -298,11 +298,11 @@
298
  "n_examples": 100,
299
  "per_subtask": {
300
  "SA": {
301
- "accuracy": 0.22148760330578512,
302
  "count": 605
303
  },
304
  "SS": {
305
- "accuracy": 0.6736842105263158,
306
  "count": 95
307
  }
308
  }
@@ -312,19 +312,19 @@
312
  "n_examples": 100,
313
  "per_subtask": {
314
  "SA": {
315
- "accuracy": 0.27450980392156865,
316
  "count": 204
317
  },
318
  "SC": {
319
- "accuracy": 0.1242603550295858,
320
  "count": 169
321
  },
322
  "SS": {
323
- "accuracy": 0.6129032258064516,
324
  "count": 31
325
  },
326
  "UC": {
327
- "accuracy": 0.2533783783783784,
328
  "count": 296
329
  }
330
  }
@@ -334,23 +334,23 @@
334
  "n_examples": 100,
335
  "per_subtask": {
336
  "SA": {
337
- "accuracy": 0.3803680981595092,
338
  "count": 163
339
  },
340
  "SC": {
341
- "accuracy": 0.046153846153846156,
342
  "count": 130
343
  },
344
  "SS": {
345
- "accuracy": 0.632183908045977,
346
  "count": 87
347
  },
348
  "UC": {
349
- "accuracy": 0.3448275862068966,
350
  "count": 203
351
  },
352
  "US": {
353
- "accuracy": 0.3247863247863248,
354
  "count": 117
355
  }
356
  }
@@ -360,23 +360,23 @@
360
  "n_examples": 100,
361
  "per_subtask": {
362
  "SA": {
363
- "accuracy": 0.4297520661157025,
364
  "count": 121
365
  },
366
  "SC": {
367
- "accuracy": 0.05785123966942149,
368
  "count": 121
369
  },
370
  "SS": {
371
- "accuracy": 0.5714285714285714,
372
  "count": 49
373
  },
374
  "UC": {
375
- "accuracy": 0.3870967741935484,
376
  "count": 186
377
  },
378
  "US": {
379
- "accuracy": 0.4349775784753363,
380
  "count": 223
381
  }
382
  }
@@ -386,63 +386,63 @@
386
  "n_examples": 100,
387
  "per_subtask": {
388
  "SA": {
389
- "accuracy": 0.46153846153846156,
390
  "count": 104
391
  },
392
  "SC": {
393
- "accuracy": 0.05660377358490566,
394
  "count": 106
395
  },
396
  "SS": {
397
- "accuracy": 0.6956521739130435,
398
  "count": 23
399
  },
400
  "UC": {
401
- "accuracy": 0.4625,
402
  "count": 160
403
  },
404
  "US": {
405
- "accuracy": 0.4169381107491857,
406
  "count": 307
407
  }
408
  }
409
  },
410
  "add_S5": {
411
- "full_accuracy": 0.0,
412
  "n_examples": 100,
413
  "per_subtask": {
414
  "SA": {
415
- "accuracy": 0.43,
416
  "count": 100
417
  },
418
  "SC": {
419
- "accuracy": 0.02,
420
  "count": 100
421
  },
422
  "UC": {
423
- "accuracy": 0.33,
424
  "count": 100
425
  },
426
  "US": {
427
- "accuracy": 0.245,
428
  "count": 400
429
  }
430
  }
431
  },
432
  "add_S6": {
433
- "full_accuracy": 0.0,
434
  "n_examples": 100,
435
  "per_subtask": {
436
  "SC": {
437
- "accuracy": 0.01,
438
  "count": 100
439
  },
440
  "UC": {
441
- "accuracy": 0.62,
442
  "count": 100
443
  },
444
  "US": {
445
- "accuracy": 0.472,
446
  "count": 500
447
  }
448
  }
@@ -452,23 +452,23 @@
452
  "n_examples": 200,
453
  "per_subtask": {
454
  "SA": {
455
- "accuracy": 0.2751677852348993,
456
  "count": 447
457
  },
458
  "SC": {
459
- "accuracy": 0.109375,
460
  "count": 320
461
  },
462
  "SS": {
463
- "accuracy": 0.5714285714285714,
464
  "count": 56
465
  },
466
  "UC": {
467
- "accuracy": 0.2684310018903592,
468
  "count": 529
469
  },
470
  "US": {
471
- "accuracy": 0.2708333333333333,
472
  "count": 48
473
  }
474
  }
@@ -478,19 +478,19 @@
478
  "n_examples": 100,
479
  "per_subtask": {
480
  "SA": {
481
- "accuracy": 0.25,
482
  "count": 300
483
  },
484
  "SC": {
485
- "accuracy": 0.08,
486
  "count": 100
487
  },
488
  "UC": {
489
- "accuracy": 0.17616580310880828,
490
  "count": 193
491
  },
492
  "US": {
493
- "accuracy": 0.19626168224299065,
494
  "count": 107
495
  }
496
  }
@@ -500,41 +500,41 @@
500
  "n_examples": 100,
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 0.36,
504
  "count": 200
505
  },
506
  "SC": {
507
- "accuracy": 0.01,
508
  "count": 100
509
  },
510
  "UC": {
511
- "accuracy": 0.09765625,
512
  "count": 256
513
  },
514
  "US": {
515
- "accuracy": 0.2013888888888889,
516
  "count": 144
517
  }
518
  }
519
  },
520
  "add_C5": {
521
- "full_accuracy": 0.0,
522
  "n_examples": 100,
523
  "per_subtask": {
524
  "SA": {
525
- "accuracy": 0.47,
526
  "count": 100
527
  },
528
  "SC": {
529
- "accuracy": 0.03,
530
  "count": 100
531
  },
532
  "UC": {
533
- "accuracy": 0.16993464052287582,
534
  "count": 306
535
  },
536
  "US": {
537
- "accuracy": 0.29896907216494845,
538
  "count": 194
539
  }
540
  }
@@ -544,15 +544,15 @@
544
  "n_examples": 100,
545
  "per_subtask": {
546
  "SC": {
547
- "accuracy": 0.03,
548
  "count": 100
549
  },
550
  "UC": {
551
- "accuracy": 0.30327868852459017,
552
  "count": 366
553
  },
554
  "US": {
555
- "accuracy": 0.5256410256410257,
556
  "count": 234
557
  }
558
  }
@@ -562,11 +562,11 @@
562
  "n_examples": 100,
563
  "per_subtask": {
564
  "MD": {
565
- "accuracy": 0.22462562396006655,
566
  "count": 601
567
  },
568
  "ME": {
569
- "accuracy": 0.7171717171717171,
570
  "count": 99
571
  }
572
  }
@@ -576,19 +576,19 @@
576
  "n_examples": 100,
577
  "per_subtask": {
578
  "MD": {
579
- "accuracy": 0.4050179211469534,
580
  "count": 279
581
  },
582
  "MB": {
583
- "accuracy": 0.034482758620689655,
584
  "count": 145
585
  },
586
  "ME": {
587
- "accuracy": 0.75,
588
  "count": 24
589
  },
590
  "UB": {
591
- "accuracy": 0.0992063492063492,
592
  "count": 252
593
  }
594
  }
@@ -598,23 +598,23 @@
598
  "n_examples": 100,
599
  "per_subtask": {
600
  "MD": {
601
- "accuracy": 0.6103286384976526,
602
  "count": 213
603
  },
604
  "MB": {
605
- "accuracy": 0.008849557522123894,
606
  "count": 113
607
  },
608
  "ME": {
609
- "accuracy": 0.8352941176470589,
610
  "count": 85
611
  },
612
  "UB": {
613
- "accuracy": 0.15469613259668508,
614
  "count": 181
615
  },
616
  "UD": {
617
- "accuracy": 0.07407407407407407,
618
  "count": 108
619
  }
620
  }
@@ -624,23 +624,23 @@
624
  "n_examples": 100,
625
  "per_subtask": {
626
  "MD": {
627
- "accuracy": 0.7206703910614525,
628
  "count": 179
629
  },
630
  "MB": {
631
- "accuracy": 0.009708737864077669,
632
  "count": 103
633
  },
634
  "ME": {
635
- "accuracy": 0.8571428571428571,
636
  "count": 56
637
  },
638
  "UB": {
639
- "accuracy": 0.15436241610738255,
640
  "count": 149
641
  },
642
  "UD": {
643
- "accuracy": 0.028169014084507043,
644
  "count": 213
645
  }
646
  }
@@ -650,25 +650,25 @@
650
  "n_examples": 100,
651
  "per_subtask": {
652
  "MD": {
653
- "accuracy": 0.525,
654
  "count": 200
655
  },
656
  "MB": {
657
- "accuracy": 0.04,
658
  "count": 100
659
  },
660
  "UB": {
661
- "accuracy": 0.3,
662
  "count": 100
663
  },
664
  "UD": {
665
- "accuracy": 0.07,
666
  "count": 300
667
  }
668
  }
669
  },
670
  "sub_M5": {
671
- "full_accuracy": 0.0,
672
  "n_examples": 100,
673
  "per_subtask": {
674
  "MD": {
@@ -676,15 +676,15 @@
676
  "count": 100
677
  },
678
  "MB": {
679
- "accuracy": 0.07,
680
  "count": 100
681
  },
682
  "UB": {
683
- "accuracy": 0.32,
684
  "count": 100
685
  },
686
  "UD": {
687
- "accuracy": 0.0675,
688
  "count": 400
689
  }
690
  }
@@ -694,23 +694,23 @@
694
  "n_examples": 200,
695
  "per_subtask": {
696
  "MD": {
697
- "accuracy": 0.37833333333333335,
698
  "count": 600
699
  },
700
  "MB": {
701
- "accuracy": 0.018726591760299626,
702
  "count": 267
703
  },
704
  "ME": {
705
- "accuracy": 0.6981132075471698,
706
  "count": 53
707
  },
708
  "UB": {
709
- "accuracy": 0.11845102505694761,
710
  "count": 439
711
  },
712
  "UD": {
713
- "accuracy": 0.04878048780487805,
714
  "count": 41
715
  }
716
  }
@@ -720,19 +720,19 @@
720
  "n_examples": 100,
721
  "per_subtask": {
722
  "MD": {
723
- "accuracy": 0.36,
724
  "count": 300
725
  },
726
  "MB": {
727
- "accuracy": 0.08,
728
  "count": 100
729
  },
730
  "UB": {
731
- "accuracy": 0.14720812182741116,
732
  "count": 197
733
  },
734
  "UD": {
735
- "accuracy": 0.06796116504854369,
736
  "count": 103
737
  }
738
  }
@@ -742,7 +742,7 @@
742
  "n_examples": 100,
743
  "per_subtask": {
744
  "MD": {
745
- "accuracy": 0.51,
746
  "count": 200
747
  },
748
  "MB": {
@@ -750,11 +750,11 @@
750
  "count": 100
751
  },
752
  "UB": {
753
- "accuracy": 0.15384615384615385,
754
  "count": 247
755
  },
756
  "UD": {
757
- "accuracy": 0.06535947712418301,
758
  "count": 153
759
  }
760
  }
@@ -768,22 +768,22 @@
768
  "count": 100
769
  },
770
  "MB": {
771
- "accuracy": 0.05,
772
  "count": 100
773
  },
774
  "UB": {
775
- "accuracy": 0.10738255033557047,
776
  "count": 298
777
  },
778
  "UD": {
779
- "accuracy": 0.15346534653465346,
780
  "count": 202
781
  }
782
  }
783
  }
784
  },
785
  "summary": {
786
- "overall_accuracy": 0.0,
787
  "total_examples": 2400,
788
  "n_splits": 22
789
  }
@@ -798,15 +798,15 @@
798
  },
799
  "splits": {
800
  "add_S0": {
801
- "full_accuracy": 0.02,
802
  "n_examples": 100,
803
  "per_subtask": {
804
  "SA": {
805
- "accuracy": 0.5553719008264463,
806
  "count": 605
807
  },
808
  "SS": {
809
- "accuracy": 0.9052631578947369,
810
  "count": 95
811
  }
812
  }
@@ -816,163 +816,163 @@
816
  "n_examples": 100,
817
  "per_subtask": {
818
  "SA": {
819
- "accuracy": 0.5294117647058824,
820
  "count": 204
821
  },
822
  "SC": {
823
- "accuracy": 0.33727810650887574,
824
  "count": 169
825
  },
826
  "SS": {
827
- "accuracy": 0.7419354838709677,
828
  "count": 31
829
  },
830
  "UC": {
831
- "accuracy": 0.44594594594594594,
832
  "count": 296
833
  }
834
  }
835
  },
836
  "add_S2": {
837
- "full_accuracy": 0.01,
838
  "n_examples": 100,
839
  "per_subtask": {
840
  "SA": {
841
- "accuracy": 0.6441717791411042,
842
  "count": 163
843
  },
844
  "SC": {
845
- "accuracy": 0.2923076923076923,
846
  "count": 130
847
  },
848
  "SS": {
849
- "accuracy": 0.7701149425287356,
850
  "count": 87
851
  },
852
  "UC": {
853
- "accuracy": 0.4729064039408867,
854
  "count": 203
855
  },
856
  "US": {
857
- "accuracy": 0.5128205128205128,
858
  "count": 117
859
  }
860
  }
861
  },
862
  "add_S3": {
863
- "full_accuracy": 0.03,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SA": {
867
- "accuracy": 0.6694214876033058,
868
  "count": 121
869
  },
870
  "SC": {
871
- "accuracy": 0.256198347107438,
872
  "count": 121
873
  },
874
  "SS": {
875
- "accuracy": 0.8367346938775511,
876
  "count": 49
877
  },
878
  "UC": {
879
- "accuracy": 0.5268817204301075,
880
  "count": 186
881
  },
882
  "US": {
883
- "accuracy": 0.45739910313901344,
884
  "count": 223
885
  }
886
  }
887
  },
888
  "add_S4": {
889
- "full_accuracy": 0.04,
890
  "n_examples": 100,
891
  "per_subtask": {
892
  "SA": {
893
- "accuracy": 0.7019230769230769,
894
  "count": 104
895
  },
896
  "SC": {
897
- "accuracy": 0.36792452830188677,
898
  "count": 106
899
  },
900
  "SS": {
901
- "accuracy": 0.9130434782608695,
902
  "count": 23
903
  },
904
  "UC": {
905
- "accuracy": 0.525,
906
  "count": 160
907
  },
908
  "US": {
909
- "accuracy": 0.40716612377850164,
910
  "count": 307
911
  }
912
  }
913
  },
914
  "add_S5": {
915
- "full_accuracy": 0.05,
916
  "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
919
- "accuracy": 0.67,
920
  "count": 100
921
  },
922
  "SC": {
923
- "accuracy": 0.37,
924
  "count": 100
925
  },
926
  "UC": {
927
- "accuracy": 0.38,
928
  "count": 100
929
  },
930
  "US": {
931
- "accuracy": 0.27,
932
  "count": 400
933
  }
934
  }
935
  },
936
  "add_S6": {
937
- "full_accuracy": 0.11,
938
  "n_examples": 100,
939
  "per_subtask": {
940
  "SC": {
941
- "accuracy": 0.23,
942
  "count": 100
943
  },
944
  "UC": {
945
- "accuracy": 0.39,
946
  "count": 100
947
  },
948
  "US": {
949
- "accuracy": 0.32,
950
  "count": 500
951
  }
952
  }
953
  },
954
  "add_random": {
955
- "full_accuracy": 0.005,
956
  "n_examples": 200,
957
  "per_subtask": {
958
  "SA": {
959
- "accuracy": 0.5391498881431768,
960
  "count": 447
961
  },
962
  "SC": {
963
- "accuracy": 0.328125,
964
  "count": 320
965
  },
966
  "SS": {
967
- "accuracy": 0.75,
968
  "count": 56
969
  },
970
  "UC": {
971
- "accuracy": 0.4612476370510397,
972
  "count": 529
973
  },
974
  "US": {
975
- "accuracy": 0.3541666666666667,
976
  "count": 48
977
  }
978
  }
@@ -982,19 +982,19 @@
982
  "n_examples": 100,
983
  "per_subtask": {
984
  "SA": {
985
- "accuracy": 0.67,
986
  "count": 300
987
  },
988
  "SC": {
989
- "accuracy": 0.22,
990
  "count": 100
991
  },
992
  "UC": {
993
- "accuracy": 0.3626943005181347,
994
  "count": 193
995
  },
996
  "US": {
997
- "accuracy": 0.29906542056074764,
998
  "count": 107
999
  }
1000
  }
@@ -1004,19 +1004,19 @@
1004
  "n_examples": 100,
1005
  "per_subtask": {
1006
  "SA": {
1007
- "accuracy": 0.755,
1008
  "count": 200
1009
  },
1010
  "SC": {
1011
- "accuracy": 0.32,
1012
  "count": 100
1013
  },
1014
  "UC": {
1015
- "accuracy": 0.33984375,
1016
  "count": 256
1017
  },
1018
  "US": {
1019
- "accuracy": 0.3125,
1020
  "count": 144
1021
  }
1022
  }
@@ -1026,19 +1026,19 @@
1026
  "n_examples": 100,
1027
  "per_subtask": {
1028
  "SA": {
1029
- "accuracy": 0.67,
1030
  "count": 100
1031
  },
1032
  "SC": {
1033
- "accuracy": 0.42,
1034
  "count": 100
1035
  },
1036
  "UC": {
1037
- "accuracy": 0.45098039215686275,
1038
  "count": 306
1039
  },
1040
  "US": {
1041
- "accuracy": 0.37628865979381443,
1042
  "count": 194
1043
  }
1044
  }
@@ -1048,15 +1048,15 @@
1048
  "n_examples": 100,
1049
  "per_subtask": {
1050
  "SC": {
1051
- "accuracy": 0.31,
1052
  "count": 100
1053
  },
1054
  "UC": {
1055
- "accuracy": 0.5081967213114754,
1056
  "count": 366
1057
  },
1058
  "US": {
1059
- "accuracy": 0.6495726495726496,
1060
  "count": 234
1061
  }
1062
  }
@@ -1066,11 +1066,11 @@
1066
  "n_examples": 100,
1067
  "per_subtask": {
1068
  "MD": {
1069
- "accuracy": 0.540765391014975,
1070
  "count": 601
1071
  },
1072
  "ME": {
1073
- "accuracy": 0.7575757575757576,
1074
  "count": 99
1075
  }
1076
  }
@@ -1080,19 +1080,19 @@
1080
  "n_examples": 100,
1081
  "per_subtask": {
1082
  "MD": {
1083
- "accuracy": 0.6523297491039427,
1084
  "count": 279
1085
  },
1086
  "MB": {
1087
- "accuracy": 0.10344827586206896,
1088
  "count": 145
1089
  },
1090
  "ME": {
1091
- "accuracy": 0.7916666666666666,
1092
  "count": 24
1093
  },
1094
  "UB": {
1095
- "accuracy": 0.3333333333333333,
1096
  "count": 252
1097
  }
1098
  }
@@ -1102,23 +1102,23 @@
1102
  "n_examples": 100,
1103
  "per_subtask": {
1104
  "MD": {
1105
- "accuracy": 0.8169014084507042,
1106
  "count": 213
1107
  },
1108
  "MB": {
1109
- "accuracy": 0.12389380530973451,
1110
  "count": 113
1111
  },
1112
  "ME": {
1113
- "accuracy": 0.9529411764705882,
1114
  "count": 85
1115
  },
1116
  "UB": {
1117
- "accuracy": 0.27071823204419887,
1118
  "count": 181
1119
  },
1120
  "UD": {
1121
- "accuracy": 0.1111111111111111,
1122
  "count": 108
1123
  }
1124
  }
@@ -1128,23 +1128,23 @@
1128
  "n_examples": 100,
1129
  "per_subtask": {
1130
  "MD": {
1131
- "accuracy": 0.8659217877094972,
1132
  "count": 179
1133
  },
1134
  "MB": {
1135
- "accuracy": 0.10679611650485436,
1136
  "count": 103
1137
  },
1138
  "ME": {
1139
- "accuracy": 0.9642857142857143,
1140
  "count": 56
1141
  },
1142
  "UB": {
1143
- "accuracy": 0.30201342281879195,
1144
  "count": 149
1145
  },
1146
  "UD": {
1147
- "accuracy": 0.09859154929577464,
1148
  "count": 213
1149
  }
1150
  }
@@ -1154,25 +1154,25 @@
1154
  "n_examples": 100,
1155
  "per_subtask": {
1156
  "MD": {
1157
- "accuracy": 0.79,
1158
  "count": 200
1159
  },
1160
  "MB": {
1161
- "accuracy": 0.15,
1162
  "count": 100
1163
  },
1164
  "UB": {
1165
- "accuracy": 0.4,
1166
  "count": 100
1167
  },
1168
  "UD": {
1169
- "accuracy": 0.11,
1170
  "count": 300
1171
  }
1172
  }
1173
  },
1174
  "sub_M5": {
1175
- "full_accuracy": 0.02,
1176
  "n_examples": 100,
1177
  "per_subtask": {
1178
  "MD": {
@@ -1184,11 +1184,11 @@
1184
  "count": 100
1185
  },
1186
  "UB": {
1187
- "accuracy": 0.56,
1188
  "count": 100
1189
  },
1190
  "UD": {
1191
- "accuracy": 0.095,
1192
  "count": 400
1193
  }
1194
  }
@@ -1198,23 +1198,23 @@
1198
  "n_examples": 200,
1199
  "per_subtask": {
1200
  "MD": {
1201
- "accuracy": 0.6883333333333334,
1202
  "count": 600
1203
  },
1204
  "MB": {
1205
- "accuracy": 0.12359550561797752,
1206
  "count": 267
1207
  },
1208
  "ME": {
1209
- "accuracy": 0.7924528301886793,
1210
  "count": 53
1211
  },
1212
  "UB": {
1213
- "accuracy": 0.3120728929384966,
1214
  "count": 439
1215
  },
1216
  "UD": {
1217
- "accuracy": 0.17073170731707318,
1218
  "count": 41
1219
  }
1220
  }
@@ -1224,19 +1224,19 @@
1224
  "n_examples": 100,
1225
  "per_subtask": {
1226
  "MD": {
1227
- "accuracy": 0.6766666666666666,
1228
  "count": 300
1229
  },
1230
  "MB": {
1231
- "accuracy": 0.12,
1232
  "count": 100
1233
  },
1234
  "UB": {
1235
- "accuracy": 0.24873096446700507,
1236
  "count": 197
1237
  },
1238
  "UD": {
1239
- "accuracy": 0.11650485436893204,
1240
  "count": 103
1241
  }
1242
  }
@@ -1246,19 +1246,19 @@
1246
  "n_examples": 100,
1247
  "per_subtask": {
1248
  "MD": {
1249
- "accuracy": 0.775,
1250
  "count": 200
1251
  },
1252
  "MB": {
1253
- "accuracy": 0.18,
1254
  "count": 100
1255
  },
1256
  "UB": {
1257
- "accuracy": 0.2874493927125506,
1258
  "count": 247
1259
  },
1260
  "UD": {
1261
- "accuracy": 0.1895424836601307,
1262
  "count": 153
1263
  }
1264
  }
@@ -1272,26 +1272,26 @@
1272
  "count": 100
1273
  },
1274
  "MB": {
1275
- "accuracy": 0.08,
1276
  "count": 100
1277
  },
1278
  "UB": {
1279
- "accuracy": 0.30201342281879195,
1280
  "count": 298
1281
  },
1282
  "UD": {
1283
- "accuracy": 0.06930693069306931,
1284
  "count": 202
1285
  }
1286
  }
1287
  }
1288
  },
1289
  "summary": {
1290
- "overall_accuracy": 0.012083333333333333,
1291
  "total_examples": 2400,
1292
  "n_splits": 22
1293
  }
1294
  },
1295
- "sorl_overall_accuracy": 0.012083333333333333,
1296
- "sft_overall_accuracy": 0.0
1297
  }
 
33
  1563
34
  ],
35
  "loss": [
36
+ 9.84412956237793,
37
+ 5.1068925857543945,
38
+ 2.6231350898742676,
39
+ 2.307675838470459,
40
+ 2.025418281555176,
41
+ 1.0131561756134033,
42
+ 1.7522368431091309,
43
+ 1.42581307888031,
44
+ 1.1289782524108887,
45
+ 0.9756244421005249,
46
+ 0.5843766927719116,
47
+ 0.12120179086923599,
48
+ 0.038071952760219574,
49
+ -0.09827154129743576,
50
+ -0.11171114444732666,
51
+ -0.15556588768959045,
52
+ -0.08670347929000854,
53
+ 0.03128141164779663,
54
+ -0.3896399140357971,
55
+ -0.008183799684047699,
56
+ -0.1007656455039978,
57
+ 0.3051470220088959,
58
+ -0.1488836109638214,
59
+ 0.28637364506721497,
60
+ -0.23199722170829773,
61
+ 0.12318609654903412,
62
+ -0.2954173684120178,
63
+ -0.3596929609775543,
64
+ -0.22620351612567902,
65
+ -0.7009453177452087
66
  ],
67
  "base_loss": [
68
+ 6.9705986976623535,
69
+ 3.7352306842803955,
70
+ 1.9767621755599976,
71
+ 1.8775386810302734,
72
+ 1.9166576862335205,
73
+ 1.8984237909317017,
74
+ 1.8574403524398804,
75
+ 1.801119089126587,
76
+ 1.865805983543396,
77
+ 1.7841968536376953,
78
+ 1.8087198734283447,
79
+ 1.8459980487823486,
80
+ 1.8696975708007812,
81
+ 1.8336594104766846,
82
+ 1.8405402898788452,
83
+ 1.760048747062683,
84
+ 1.8620561361312866,
85
+ 1.8630913496017456,
86
+ 1.8629968166351318,
87
+ 1.8340078592300415,
88
+ 1.7706114053726196,
89
+ 1.712676763534546,
90
+ 1.6838618516921997,
91
+ 1.7604615688323975,
92
+ 1.7645069360733032,
93
+ 1.6746128797531128,
94
+ 1.7230627536773682,
95
+ 1.7214601039886475,
96
+ 1.7510899305343628,
97
+ 1.6944631338119507
98
  ],
99
  "info_loss": [
100
+ -0.23186302185058594,
101
+ -0.016382217407226562,
102
+ -0.04995226860046387,
103
+ -0.06860637664794922,
104
+ -0.09940493106842041,
105
+ -0.19964098930358887,
106
+ -0.12048065662384033,
107
+ -0.14689147472381592,
108
+ -0.18259012699127197,
109
+ -0.18501496315002441,
110
+ -0.16905105113983154,
111
+ -0.19646573066711426,
112
+ -0.20380914211273193,
113
+ -0.21209895610809326,
114
+ -0.21376097202301025,
115
+ -0.20788681507110596,
116
+ -0.21054041385650635,
117
+ -0.19847512245178223,
118
+ -0.2401341199874878,
119
+ -0.19826948642730713,
120
+ -0.20111238956451416,
121
+ -0.15373647212982178,
122
+ -0.1952725648880005,
123
+ -0.16008174419403076,
124
+ -0.21182727813720703,
125
+ -0.16642332077026367,
126
+ -0.21538293361663818,
127
+ -0.21970915794372559,
128
+ -0.2088148593902588,
129
+ -0.25221550464630127
130
  ],
131
  "abs_loss": [
132
+ 3.347839832305908,
133
+ 2.7904562950134277,
134
+ 2.566986083984375,
135
+ 2.5303356647491455,
136
+ 2.5691452026367188,
137
+ 2.56864595413208,
138
+ 2.542234420776367,
139
+ 2.518596649169922,
140
+ 2.5303497314453125,
141
+ 2.397298812866211,
142
+ 1.649463415145874,
143
+ 0.9939525127410889,
144
+ 0.9239900708198547,
145
+ 0.7970927953720093,
146
+ 0.9154700040817261,
147
+ 0.7975423336029053,
148
+ 0.9060695171356201,
149
+ 0.8606487512588501,
150
+ 0.8730352520942688,
151
+ 0.8025760650634766,
152
+ 0.7727009057998657,
153
+ 0.7014546394348145,
154
+ 0.6866945624351501,
155
+ 0.681542158126831,
156
+ 0.7381680607795715,
157
+ 0.6557687520980835,
158
+ 0.7275815606117249,
159
+ 0.6469742655754089,
160
+ 0.6168563365936279,
161
+ 0.6632050275802612
162
  ],
163
  "zipf_loss": [
164
+ 4.857377529144287,
165
+ 1.2564382553100586,
166
+ 0.8891969919204712,
167
+ 0.8631672859191895,
168
+ 0.8458954691886902,
169
+ 0.8542776107788086,
170
+ 0.8453795909881592,
171
+ 0.8417490720748901,
172
+ 0.8360385894775391,
173
+ 0.8018473386764526,
174
+ 0.30122095346450806,
175
+ 0.14046579599380493,
176
+ 0.11406679451465607,
177
+ 0.1093493178486824,
178
+ 0.0938112735748291,
179
+ 0.08349927514791489,
180
+ 0.06603756546974182,
181
+ 0.06687641143798828,
182
+ 0.06140096113085747,
183
+ 0.06024559587240219,
184
+ 0.0624767541885376,
185
+ 0.05968953296542168,
186
+ 0.0513107106089592,
187
+ 0.05857530236244202,
188
+ 0.047951824963092804,
189
+ 0.047229547053575516,
190
+ 0.06259104609489441,
191
+ 0.051241062581539154,
192
+ 0.049169525504112244,
193
+ 0.06042611971497536
194
  ],
195
  "denoise_loss": [],
196
  "ortho_loss": [
197
+ 0.25015148520469666,
198
+ 0.06454251706600189,
199
+ 0.03540093079209328,
200
+ 0.02958545833826065,
201
+ 0.02504236064851284,
202
+ 0.02280796691775322,
203
+ 0.027570093050599098,
204
+ 0.031304821372032166,
205
+ 0.03224661201238632,
206
+ 0.04218565672636032,
207
+ 0.05549744889140129,
208
+ 0.06068537011742592,
209
+ 0.062182605266571045,
210
+ 0.06644897162914276,
211
+ 0.07051082700490952,
212
+ 0.06743474304676056,
213
+ 0.06548100709915161,
214
+ 0.06360627710819244,
215
+ 0.06534555554389954,
216
+ 0.06200597807765007,
217
+ 0.06266725808382034,
218
+ 0.06213154271245003,
219
+ 0.06070336326956749,
220
+ 0.0609060600399971,
221
+ 0.06040602922439575,
222
+ 0.059318866580724716,
223
+ 0.06125802546739578,
224
+ 0.06153956055641174,
225
+ 0.06167338043451309,
226
+ 0.06198073923587799
227
  ],
228
  "lr": [
229
+ 3.9200000000000004e-05,
230
+ 7.92e-05,
231
+ 8e-05,
232
+ 8e-05,
233
+ 8e-05,
234
+ 8e-05,
235
+ 8e-05,
236
+ 8e-05,
237
+ 8e-05,
238
+ 8e-05,
239
+ 8e-05,
240
+ 8e-05,
241
+ 8e-05,
242
+ 8e-05,
243
+ 8e-05,
244
+ 8e-05,
245
+ 8e-05,
246
+ 8e-05,
247
+ 7.889795918367346e-05,
248
+ 7.277551020408164e-05,
249
+ 6.665306122448979e-05,
250
+ 5.96734693877551e-05,
251
+ 5.3551020408163274e-05,
252
+ 4.7428571428571427e-05,
253
+ 4.044897959183674e-05,
254
+ 3.432653061224491e-05,
255
+ 2.820408163265307e-05,
256
+ 2.1224489795918364e-05,
257
+ 1.5102040816326524e-05,
258
+ 8.9795918367347e-06
259
  ],
260
  "emb_lr": [],
261
  "eval_step": [
 
280
  0.0,
281
  0.02,
282
  0.01,
283
+ 0.01
284
  ]
285
  },
286
+ "final_accuracy": 0.004583333333333333,
287
  "sft_eval": {
288
  "config": {
289
  "ops": "add_sub",
 
298
  "n_examples": 100,
299
  "per_subtask": {
300
  "SA": {
301
+ "accuracy": 0.2528925619834711,
302
  "count": 605
303
  },
304
  "SS": {
305
+ "accuracy": 0.8315789473684211,
306
  "count": 95
307
  }
308
  }
 
312
  "n_examples": 100,
313
  "per_subtask": {
314
  "SA": {
315
+ "accuracy": 0.30392156862745096,
316
  "count": 204
317
  },
318
  "SC": {
319
+ "accuracy": 0.21893491124260356,
320
  "count": 169
321
  },
322
  "SS": {
323
+ "accuracy": 0.7096774193548387,
324
  "count": 31
325
  },
326
  "UC": {
327
+ "accuracy": 0.27702702702702703,
328
  "count": 296
329
  }
330
  }
 
334
  "n_examples": 100,
335
  "per_subtask": {
336
  "SA": {
337
+ "accuracy": 0.3312883435582822,
338
  "count": 163
339
  },
340
  "SC": {
341
+ "accuracy": 0.13846153846153847,
342
  "count": 130
343
  },
344
  "SS": {
345
+ "accuracy": 0.4482758620689655,
346
  "count": 87
347
  },
348
  "UC": {
349
+ "accuracy": 0.3842364532019704,
350
  "count": 203
351
  },
352
  "US": {
353
+ "accuracy": 0.5299145299145299,
354
  "count": 117
355
  }
356
  }
 
360
  "n_examples": 100,
361
  "per_subtask": {
362
  "SA": {
363
+ "accuracy": 0.36363636363636365,
364
  "count": 121
365
  },
366
  "SC": {
367
+ "accuracy": 0.06611570247933884,
368
  "count": 121
369
  },
370
  "SS": {
371
+ "accuracy": 0.42857142857142855,
372
  "count": 49
373
  },
374
  "UC": {
375
+ "accuracy": 0.4032258064516129,
376
  "count": 186
377
  },
378
  "US": {
379
+ "accuracy": 0.5919282511210763,
380
  "count": 223
381
  }
382
  }
 
386
  "n_examples": 100,
387
  "per_subtask": {
388
  "SA": {
389
+ "accuracy": 0.38461538461538464,
390
  "count": 104
391
  },
392
  "SC": {
393
+ "accuracy": 0.11320754716981132,
394
  "count": 106
395
  },
396
  "SS": {
397
+ "accuracy": 0.4782608695652174,
398
  "count": 23
399
  },
400
  "UC": {
401
+ "accuracy": 0.41875,
402
  "count": 160
403
  },
404
  "US": {
405
+ "accuracy": 0.49185667752442996,
406
  "count": 307
407
  }
408
  }
409
  },
410
  "add_S5": {
411
+ "full_accuracy": 0.03,
412
  "n_examples": 100,
413
  "per_subtask": {
414
  "SA": {
415
+ "accuracy": 0.47,
416
  "count": 100
417
  },
418
  "SC": {
419
+ "accuracy": 0.09,
420
  "count": 100
421
  },
422
  "UC": {
423
+ "accuracy": 0.43,
424
  "count": 100
425
  },
426
  "US": {
427
+ "accuracy": 0.3375,
428
  "count": 400
429
  }
430
  }
431
  },
432
  "add_S6": {
433
+ "full_accuracy": 0.12,
434
  "n_examples": 100,
435
  "per_subtask": {
436
  "SC": {
437
+ "accuracy": 0.12,
438
  "count": 100
439
  },
440
  "UC": {
441
+ "accuracy": 0.64,
442
  "count": 100
443
  },
444
  "US": {
445
+ "accuracy": 0.636,
446
  "count": 500
447
  }
448
  }
 
452
  "n_examples": 200,
453
  "per_subtask": {
454
  "SA": {
455
+ "accuracy": 0.3087248322147651,
456
  "count": 447
457
  },
458
  "SC": {
459
+ "accuracy": 0.165625,
460
  "count": 320
461
  },
462
  "SS": {
463
+ "accuracy": 0.5892857142857143,
464
  "count": 56
465
  },
466
  "UC": {
467
+ "accuracy": 0.29300567107750475,
468
  "count": 529
469
  },
470
  "US": {
471
+ "accuracy": 0.4791666666666667,
472
  "count": 48
473
  }
474
  }
 
478
  "n_examples": 100,
479
  "per_subtask": {
480
  "SA": {
481
+ "accuracy": 0.35,
482
  "count": 300
483
  },
484
  "SC": {
485
+ "accuracy": 0.1,
486
  "count": 100
487
  },
488
  "UC": {
489
+ "accuracy": 0.21243523316062177,
490
  "count": 193
491
  },
492
  "US": {
493
+ "accuracy": 0.35514018691588783,
494
  "count": 107
495
  }
496
  }
 
500
  "n_examples": 100,
501
  "per_subtask": {
502
  "SA": {
503
+ "accuracy": 0.46,
504
  "count": 200
505
  },
506
  "SC": {
507
+ "accuracy": 0.07,
508
  "count": 100
509
  },
510
  "UC": {
511
+ "accuracy": 0.15234375,
512
  "count": 256
513
  },
514
  "US": {
515
+ "accuracy": 0.2916666666666667,
516
  "count": 144
517
  }
518
  }
519
  },
520
  "add_C5": {
521
+ "full_accuracy": 0.02,
522
  "n_examples": 100,
523
  "per_subtask": {
524
  "SA": {
525
+ "accuracy": 0.57,
526
  "count": 100
527
  },
528
  "SC": {
529
+ "accuracy": 0.15,
530
  "count": 100
531
  },
532
  "UC": {
533
+ "accuracy": 0.2647058823529412,
534
  "count": 306
535
  },
536
  "US": {
537
+ "accuracy": 0.4793814432989691,
538
  "count": 194
539
  }
540
  }
 
544
  "n_examples": 100,
545
  "per_subtask": {
546
  "SC": {
547
+ "accuracy": 0.16,
548
  "count": 100
549
  },
550
  "UC": {
551
+ "accuracy": 0.28688524590163933,
552
  "count": 366
553
  },
554
  "US": {
555
+ "accuracy": 0.6794871794871795,
556
  "count": 234
557
  }
558
  }
 
562
  "n_examples": 100,
563
  "per_subtask": {
564
  "MD": {
565
+ "accuracy": 0.2579034941763727,
566
  "count": 601
567
  },
568
  "ME": {
569
+ "accuracy": 0.898989898989899,
570
  "count": 99
571
  }
572
  }
 
576
  "n_examples": 100,
577
  "per_subtask": {
578
  "MD": {
579
+ "accuracy": 0.44086021505376344,
580
  "count": 279
581
  },
582
  "MB": {
583
+ "accuracy": 0.013793103448275862,
584
  "count": 145
585
  },
586
  "ME": {
587
+ "accuracy": 0.875,
588
  "count": 24
589
  },
590
  "UB": {
591
+ "accuracy": 0.15873015873015872,
592
  "count": 252
593
  }
594
  }
 
598
  "n_examples": 100,
599
  "per_subtask": {
600
  "MD": {
601
+ "accuracy": 0.6525821596244131,
602
  "count": 213
603
  },
604
  "MB": {
605
+ "accuracy": 0.02654867256637168,
606
  "count": 113
607
  },
608
  "ME": {
609
+ "accuracy": 0.8941176470588236,
610
  "count": 85
611
  },
612
  "UB": {
613
+ "accuracy": 0.19889502762430938,
614
  "count": 181
615
  },
616
  "UD": {
617
+ "accuracy": 0.1111111111111111,
618
  "count": 108
619
  }
620
  }
 
624
  "n_examples": 100,
625
  "per_subtask": {
626
  "MD": {
627
+ "accuracy": 0.7821229050279329,
628
  "count": 179
629
  },
630
  "MB": {
631
+ "accuracy": 0.02912621359223301,
632
  "count": 103
633
  },
634
  "ME": {
635
+ "accuracy": 0.8392857142857143,
636
  "count": 56
637
  },
638
  "UB": {
639
+ "accuracy": 0.2550335570469799,
640
  "count": 149
641
  },
642
  "UD": {
643
+ "accuracy": 0.1267605633802817,
644
  "count": 213
645
  }
646
  }
 
650
  "n_examples": 100,
651
  "per_subtask": {
652
  "MD": {
653
+ "accuracy": 0.59,
654
  "count": 200
655
  },
656
  "MB": {
657
+ "accuracy": 0.03,
658
  "count": 100
659
  },
660
  "UB": {
661
+ "accuracy": 0.35,
662
  "count": 100
663
  },
664
  "UD": {
665
+ "accuracy": 0.14333333333333334,
666
  "count": 300
667
  }
668
  }
669
  },
670
  "sub_M5": {
671
+ "full_accuracy": 0.02,
672
  "n_examples": 100,
673
  "per_subtask": {
674
  "MD": {
 
676
  "count": 100
677
  },
678
  "MB": {
679
+ "accuracy": 0.09,
680
  "count": 100
681
  },
682
  "UB": {
683
+ "accuracy": 0.52,
684
  "count": 100
685
  },
686
  "UD": {
687
+ "accuracy": 0.17,
688
  "count": 400
689
  }
690
  }
 
694
  "n_examples": 200,
695
  "per_subtask": {
696
  "MD": {
697
+ "accuracy": 0.4116666666666667,
698
  "count": 600
699
  },
700
  "MB": {
701
+ "accuracy": 0.02247191011235955,
702
  "count": 267
703
  },
704
  "ME": {
705
+ "accuracy": 0.8679245283018868,
706
  "count": 53
707
  },
708
  "UB": {
709
+ "accuracy": 0.1867881548974943,
710
  "count": 439
711
  },
712
  "UD": {
713
+ "accuracy": 0.1951219512195122,
714
  "count": 41
715
  }
716
  }
 
720
  "n_examples": 100,
721
  "per_subtask": {
722
  "MD": {
723
+ "accuracy": 0.38333333333333336,
724
  "count": 300
725
  },
726
  "MB": {
727
+ "accuracy": 0.01,
728
  "count": 100
729
  },
730
  "UB": {
731
+ "accuracy": 0.2131979695431472,
732
  "count": 197
733
  },
734
  "UD": {
735
+ "accuracy": 0.038834951456310676,
736
  "count": 103
737
  }
738
  }
 
742
  "n_examples": 100,
743
  "per_subtask": {
744
  "MD": {
745
+ "accuracy": 0.555,
746
  "count": 200
747
  },
748
  "MB": {
 
750
  "count": 100
751
  },
752
  "UB": {
753
+ "accuracy": 0.21862348178137653,
754
  "count": 247
755
  },
756
  "UD": {
757
+ "accuracy": 0.0784313725490196,
758
  "count": 153
759
  }
760
  }
 
768
  "count": 100
769
  },
770
  "MB": {
771
+ "accuracy": 0.03,
772
  "count": 100
773
  },
774
  "UB": {
775
+ "accuracy": 0.174496644295302,
776
  "count": 298
777
  },
778
  "UD": {
779
+ "accuracy": 0.04455445544554455,
780
  "count": 202
781
  }
782
  }
783
  }
784
  },
785
  "summary": {
786
+ "overall_accuracy": 0.007916666666666667,
787
  "total_examples": 2400,
788
  "n_splits": 22
789
  }
 
798
  },
799
  "splits": {
800
  "add_S0": {
801
+ "full_accuracy": 0.0,
802
  "n_examples": 100,
803
  "per_subtask": {
804
  "SA": {
805
+ "accuracy": 0.39669421487603307,
806
  "count": 605
807
  },
808
  "SS": {
809
+ "accuracy": 0.9263157894736842,
810
  "count": 95
811
  }
812
  }
 
816
  "n_examples": 100,
817
  "per_subtask": {
818
  "SA": {
819
+ "accuracy": 0.45098039215686275,
820
  "count": 204
821
  },
822
  "SC": {
823
+ "accuracy": 0.2603550295857988,
824
  "count": 169
825
  },
826
  "SS": {
827
+ "accuracy": 0.8709677419354839,
828
  "count": 31
829
  },
830
  "UC": {
831
+ "accuracy": 0.3141891891891892,
832
  "count": 296
833
  }
834
  }
835
  },
836
  "add_S2": {
837
+ "full_accuracy": 0.0,
838
  "n_examples": 100,
839
  "per_subtask": {
840
  "SA": {
841
+ "accuracy": 0.4785276073619632,
842
  "count": 163
843
  },
844
  "SC": {
845
+ "accuracy": 0.12307692307692308,
846
  "count": 130
847
  },
848
  "SS": {
849
+ "accuracy": 0.3563218390804598,
850
  "count": 87
851
  },
852
  "UC": {
853
+ "accuracy": 0.4187192118226601,
854
  "count": 203
855
  },
856
  "US": {
857
+ "accuracy": 0.5641025641025641,
858
  "count": 117
859
  }
860
  }
861
  },
862
  "add_S3": {
863
+ "full_accuracy": 0.0,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SA": {
867
+ "accuracy": 0.512396694214876,
868
  "count": 121
869
  },
870
  "SC": {
871
+ "accuracy": 0.09917355371900827,
872
  "count": 121
873
  },
874
  "SS": {
875
+ "accuracy": 0.5102040816326531,
876
  "count": 49
877
  },
878
  "UC": {
879
+ "accuracy": 0.4032258064516129,
880
  "count": 186
881
  },
882
  "US": {
883
+ "accuracy": 0.5560538116591929,
884
  "count": 223
885
  }
886
  }
887
  },
888
  "add_S4": {
889
+ "full_accuracy": 0.0,
890
  "n_examples": 100,
891
  "per_subtask": {
892
  "SA": {
893
+ "accuracy": 0.5865384615384616,
894
  "count": 104
895
  },
896
  "SC": {
897
+ "accuracy": 0.11320754716981132,
898
  "count": 106
899
  },
900
  "SS": {
901
+ "accuracy": 0.6521739130434783,
902
  "count": 23
903
  },
904
  "UC": {
905
+ "accuracy": 0.43125,
906
  "count": 160
907
  },
908
  "US": {
909
+ "accuracy": 0.48534201954397393,
910
  "count": 307
911
  }
912
  }
913
  },
914
  "add_S5": {
915
+ "full_accuracy": 0.0,
916
  "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
919
+ "accuracy": 0.63,
920
  "count": 100
921
  },
922
  "SC": {
923
+ "accuracy": 0.06,
924
  "count": 100
925
  },
926
  "UC": {
927
+ "accuracy": 0.34,
928
  "count": 100
929
  },
930
  "US": {
931
+ "accuracy": 0.2575,
932
  "count": 400
933
  }
934
  }
935
  },
936
  "add_S6": {
937
+ "full_accuracy": 0.1,
938
  "n_examples": 100,
939
  "per_subtask": {
940
  "SC": {
941
+ "accuracy": 0.1,
942
  "count": 100
943
  },
944
  "UC": {
945
+ "accuracy": 0.53,
946
  "count": 100
947
  },
948
  "US": {
949
+ "accuracy": 0.506,
950
  "count": 500
951
  }
952
  }
953
  },
954
  "add_random": {
955
+ "full_accuracy": 0.0,
956
  "n_examples": 200,
957
  "per_subtask": {
958
  "SA": {
959
+ "accuracy": 0.46308724832214765,
960
  "count": 447
961
  },
962
  "SC": {
963
+ "accuracy": 0.1875,
964
  "count": 320
965
  },
966
  "SS": {
967
+ "accuracy": 0.6785714285714286,
968
  "count": 56
969
  },
970
  "UC": {
971
+ "accuracy": 0.3856332703213611,
972
  "count": 529
973
  },
974
  "US": {
975
+ "accuracy": 0.4791666666666667,
976
  "count": 48
977
  }
978
  }
 
982
  "n_examples": 100,
983
  "per_subtask": {
984
  "SA": {
985
+ "accuracy": 0.47333333333333333,
986
  "count": 300
987
  },
988
  "SC": {
989
+ "accuracy": 0.04,
990
  "count": 100
991
  },
992
  "UC": {
993
+ "accuracy": 0.21761658031088082,
994
  "count": 193
995
  },
996
  "US": {
997
+ "accuracy": 0.2336448598130841,
998
  "count": 107
999
  }
1000
  }
 
1004
  "n_examples": 100,
1005
  "per_subtask": {
1006
  "SA": {
1007
+ "accuracy": 0.625,
1008
  "count": 200
1009
  },
1010
  "SC": {
1011
+ "accuracy": 0.04,
1012
  "count": 100
1013
  },
1014
  "UC": {
1015
+ "accuracy": 0.19140625,
1016
  "count": 256
1017
  },
1018
  "US": {
1019
+ "accuracy": 0.2986111111111111,
1020
  "count": 144
1021
  }
1022
  }
 
1026
  "n_examples": 100,
1027
  "per_subtask": {
1028
  "SA": {
1029
+ "accuracy": 0.62,
1030
  "count": 100
1031
  },
1032
  "SC": {
1033
+ "accuracy": 0.08,
1034
  "count": 100
1035
  },
1036
  "UC": {
1037
+ "accuracy": 0.25163398692810457,
1038
  "count": 306
1039
  },
1040
  "US": {
1041
+ "accuracy": 0.38144329896907214,
1042
  "count": 194
1043
  }
1044
  }
 
1048
  "n_examples": 100,
1049
  "per_subtask": {
1050
  "SC": {
1051
+ "accuracy": 0.14,
1052
  "count": 100
1053
  },
1054
  "UC": {
1055
+ "accuracy": 0.32786885245901637,
1056
  "count": 366
1057
  },
1058
  "US": {
1059
+ "accuracy": 0.7307692307692307,
1060
  "count": 234
1061
  }
1062
  }
 
1066
  "n_examples": 100,
1067
  "per_subtask": {
1068
  "MD": {
1069
+ "accuracy": 0.36605657237936773,
1070
  "count": 601
1071
  },
1072
  "ME": {
1073
+ "accuracy": 0.8888888888888888,
1074
  "count": 99
1075
  }
1076
  }
 
1080
  "n_examples": 100,
1081
  "per_subtask": {
1082
  "MD": {
1083
+ "accuracy": 0.5913978494623656,
1084
  "count": 279
1085
  },
1086
  "MB": {
1087
+ "accuracy": 0.034482758620689655,
1088
  "count": 145
1089
  },
1090
  "ME": {
1091
+ "accuracy": 0.75,
1092
  "count": 24
1093
  },
1094
  "UB": {
1095
+ "accuracy": 0.2222222222222222,
1096
  "count": 252
1097
  }
1098
  }
 
1102
  "n_examples": 100,
1103
  "per_subtask": {
1104
  "MD": {
1105
+ "accuracy": 0.704225352112676,
1106
  "count": 213
1107
  },
1108
  "MB": {
1109
+ "accuracy": 0.04424778761061947,
1110
  "count": 113
1111
  },
1112
  "ME": {
1113
+ "accuracy": 0.9647058823529412,
1114
  "count": 85
1115
  },
1116
  "UB": {
1117
+ "accuracy": 0.281767955801105,
1118
  "count": 181
1119
  },
1120
  "UD": {
1121
+ "accuracy": 0.12037037037037036,
1122
  "count": 108
1123
  }
1124
  }
 
1128
  "n_examples": 100,
1129
  "per_subtask": {
1130
  "MD": {
1131
+ "accuracy": 0.8156424581005587,
1132
  "count": 179
1133
  },
1134
  "MB": {
1135
+ "accuracy": 0.0,
1136
  "count": 103
1137
  },
1138
  "ME": {
1139
+ "accuracy": 0.9285714285714286,
1140
  "count": 56
1141
  },
1142
  "UB": {
1143
+ "accuracy": 0.348993288590604,
1144
  "count": 149
1145
  },
1146
  "UD": {
1147
+ "accuracy": 0.0892018779342723,
1148
  "count": 213
1149
  }
1150
  }
 
1154
  "n_examples": 100,
1155
  "per_subtask": {
1156
  "MD": {
1157
+ "accuracy": 0.76,
1158
  "count": 200
1159
  },
1160
  "MB": {
1161
+ "accuracy": 0.01,
1162
  "count": 100
1163
  },
1164
  "UB": {
1165
+ "accuracy": 0.44,
1166
  "count": 100
1167
  },
1168
  "UD": {
1169
+ "accuracy": 0.08333333333333333,
1170
  "count": 300
1171
  }
1172
  }
1173
  },
1174
  "sub_M5": {
1175
+ "full_accuracy": 0.01,
1176
  "n_examples": 100,
1177
  "per_subtask": {
1178
  "MD": {
 
1184
  "count": 100
1185
  },
1186
  "UB": {
1187
+ "accuracy": 0.7,
1188
  "count": 100
1189
  },
1190
  "UD": {
1191
+ "accuracy": 0.065,
1192
  "count": 400
1193
  }
1194
  }
 
1198
  "n_examples": 200,
1199
  "per_subtask": {
1200
  "MD": {
1201
+ "accuracy": 0.555,
1202
  "count": 600
1203
  },
1204
  "MB": {
1205
+ "accuracy": 0.03745318352059925,
1206
  "count": 267
1207
  },
1208
  "ME": {
1209
+ "accuracy": 0.9245283018867925,
1210
  "count": 53
1211
  },
1212
  "UB": {
1213
+ "accuracy": 0.275626423690205,
1214
  "count": 439
1215
  },
1216
  "UD": {
1217
+ "accuracy": 0.1951219512195122,
1218
  "count": 41
1219
  }
1220
  }
 
1224
  "n_examples": 100,
1225
  "per_subtask": {
1226
  "MD": {
1227
+ "accuracy": 0.55,
1228
  "count": 300
1229
  },
1230
  "MB": {
1231
+ "accuracy": 0.07,
1232
  "count": 100
1233
  },
1234
  "UB": {
1235
+ "accuracy": 0.2233502538071066,
1236
  "count": 197
1237
  },
1238
  "UD": {
1239
+ "accuracy": 0.1941747572815534,
1240
  "count": 103
1241
  }
1242
  }
 
1246
  "n_examples": 100,
1247
  "per_subtask": {
1248
  "MD": {
1249
+ "accuracy": 0.68,
1250
  "count": 200
1251
  },
1252
  "MB": {
1253
+ "accuracy": 0.03,
1254
  "count": 100
1255
  },
1256
  "UB": {
1257
+ "accuracy": 0.2550607287449393,
1258
  "count": 247
1259
  },
1260
  "UD": {
1261
+ "accuracy": 0.1568627450980392,
1262
  "count": 153
1263
  }
1264
  }
 
1272
  "count": 100
1273
  },
1274
  "MB": {
1275
+ "accuracy": 0.06,
1276
  "count": 100
1277
  },
1278
  "UB": {
1279
+ "accuracy": 0.31208053691275167,
1280
  "count": 298
1281
  },
1282
  "UD": {
1283
+ "accuracy": 0.19801980198019803,
1284
  "count": 202
1285
  }
1286
  }
1287
  }
1288
  },
1289
  "summary": {
1290
+ "overall_accuracy": 0.004583333333333333,
1291
  "total_examples": 2400,
1292
  "n_splits": 22
1293
  }
1294
  },
1295
+ "sorl_overall_accuracy": 0.004583333333333333,
1296
+ "sft_overall_accuracy": 0.007916666666666667
1297
  }
add_sub_sorl_v1_abs30_10K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f6e5e1e62f7dc6b60e3669e47c1843dcad77c3bdb610639bf38311f70d6d063
3
  size 650385300
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:688cc5ac307149019958195a1925a9ceaf23aacc9763b9e0e85256c740254636
3
  size 650385300
add_sub_sorl_v1_abs30_10K/train_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
- "lr": 4e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 100,
@@ -36,7 +36,7 @@
36
  "eval_every": 156,
37
  "save_every": 999999,
38
  "eval_samples": 100,
39
- "output_dir": "ckpt/sweep/as_sorl_abs30_K4_10K",
40
  "eval_K": 4,
41
  "alpha_traj": 0.0,
42
  "corrupt_method": "shuffle",
@@ -69,16 +69,16 @@
69
  "no_wandb": false,
70
  "n_params": 162519662,
71
  "run_name": "add_sub_sorl_v1_abs30_10K",
72
- "git_commit": "f447da529caceac8c7d256cbb2cd185cbc50feac",
73
- "timestamp": "2026-04-12T09:21:17.547661+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "owo1a52f",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/owo1a52f",
81
- "final_accuracy": 0.012083333333333333,
82
- "sft_accuracy": 0.0,
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
+ "lr": 8e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 100,
 
36
  "eval_every": 156,
37
  "save_every": 999999,
38
  "eval_samples": 100,
39
+ "output_dir": "ckpt/sweep/as_sorl_abs30_10K",
40
  "eval_K": 4,
41
  "alpha_traj": 0.0,
42
  "corrupt_method": "shuffle",
 
69
  "no_wandb": false,
70
  "n_params": 162519662,
71
  "run_name": "add_sub_sorl_v1_abs30_10K",
72
+ "git_commit": "dc8dd776fb0c30a4c9073052dcc5e943e0fd80c6",
73
+ "timestamp": "2026-04-13T07:34:34.157977+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
+ "wandb_run_id": "4pjkd149",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/4pjkd149",
81
+ "final_accuracy": 0.004583333333333333,
82
+ "sft_accuracy": 0.007916666666666667,
83
  "eval_method": "ArithmeticEvaluator"
84
  }