amirali1985 commited on
Commit
264fa26
·
verified ·
1 Parent(s): 989a190

Upload add_sub_sorl_v1_abs10_10K

Browse files
add_sub_sorl_v1_abs10_10K/metrics.json CHANGED
@@ -33,229 +33,229 @@
33
  1563
34
  ],
35
  "loss": [
36
- 13.724945068359375,
37
- 7.864343166351318,
38
- 5.013725280761719,
39
- 2.741554021835327,
40
- 2.641360282897949,
41
- 2.458710193634033,
42
- 2.5750374794006348,
43
- 2.7018885612487793,
44
- 2.5789809226989746,
45
- 2.3690404891967773,
46
- 1.7872730493545532,
47
- 1.8722014427185059,
48
- 1.4984050989151,
49
- 1.2242059707641602,
50
- 1.2751460075378418,
51
- 1.0107839107513428,
52
- 1.6164339780807495,
53
- 0.9860085248947144,
54
- 0.9987770915031433,
55
- 1.071738600730896,
56
- 0.49998074769973755,
57
- 0.004859551787376404,
58
- -0.1695592701435089,
59
- 0.0747871994972229,
60
- -0.27059921622276306,
61
- -0.13733237981796265,
62
- 0.056721221655607224,
63
- 0.051335956901311874,
64
- -0.45392414927482605,
65
- -0.3348205089569092
66
  ],
67
  "base_loss": [
68
- 7.9022040367126465,
69
- 5.812614917755127,
70
- 3.858829975128174,
71
- 2.114943742752075,
72
- 1.96530020236969,
73
- 1.9036047458648682,
74
- 1.893314242362976,
75
- 1.8308947086334229,
76
- 1.8680362701416016,
77
- 1.7989755868911743,
78
- 1.8180350065231323,
79
- 1.8677423000335693,
80
- 1.862571120262146,
81
- 1.8560221195220947,
82
- 1.838614583015442,
83
- 1.7637040615081787,
84
- 1.8746064901351929,
85
- 1.8687989711761475,
86
- 1.8472496271133423,
87
- 1.8745392560958862,
88
- 1.8059179782867432,
89
- 1.755523920059204,
90
- 1.7115023136138916,
91
- 1.7793734073638916,
92
- 1.759653091430664,
93
- 1.7252968549728394,
94
- 1.7201368808746338,
95
- 1.7096773386001587,
96
- 1.7525498867034912,
97
- 1.710257887840271
98
  ],
99
  "info_loss": [
100
- -0.2345585823059082,
101
- -0.18186569213867188,
102
- -0.11677074432373047,
103
- -0.13591337203979492,
104
- -0.12461328506469727,
105
- -0.13464367389678955,
106
- -0.12099599838256836,
107
- -0.10128498077392578,
108
- -0.11677730083465576,
109
- -0.13061606884002686,
110
- -0.1908276081085205,
111
- -0.18726766109466553,
112
- -0.2237793207168579,
113
- -0.25037193298339844,
114
- -0.24307358264923096,
115
- -0.26141083240509033,
116
- -0.2112140655517578,
117
- -0.2714040279388428,
118
- -0.2562897205352783,
119
- -0.21821486949920654,
120
- -0.1999720335006714,
121
- -0.19893121719360352,
122
- -0.20633161067962646,
123
- -0.1859527826309204,
124
- -0.21592140197753906,
125
- -0.19767189025878906,
126
- -0.17773842811584473,
127
- -0.17674016952514648,
128
- -0.2308502197265625,
129
- -0.21450459957122803
130
  ],
131
  "abs_loss": [
132
- 2.2656421661376953,
133
- 2.060791254043579,
134
- 1.8972232341766357,
135
- 1.8288743495941162,
136
- 1.8518760204315186,
137
- 1.8478550910949707,
138
- 1.8548732995986938,
139
- 1.8174316883087158,
140
- 1.8104970455169678,
141
- 1.8124628067016602,
142
- 1.8303263187408447,
143
- 1.860059380531311,
144
- 1.8033771514892578,
145
- 1.8484278917312622,
146
- 1.7977077960968018,
147
- 1.7986536026000977,
148
- 1.8080449104309082,
149
- 1.7756034135818481,
150
- 1.678572177886963,
151
- 1.2864962816238403,
152
- 0.798673152923584,
153
- 0.611975371837616,
154
- 0.5902649164199829,
155
- 0.584165632724762,
156
- 0.5656200647354126,
157
- 0.5475164651870728,
158
- 0.5673137903213501,
159
- 0.5301952362060547,
160
- 0.4829615652561188,
161
- 0.5067222118377686
162
  ],
163
  "zipf_loss": [
164
- 7.941762447357178,
165
- 3.664306163787842,
166
- 2.132880449295044,
167
- 1.8028565645217896,
168
- 1.7370052337646484,
169
- 1.7167567014694214,
170
- 1.7061958312988281,
171
- 1.7021006345748901,
172
- 1.6976680755615234,
173
- 1.694979190826416,
174
- 1.6944814920425415,
175
- 1.6911298036575317,
176
- 1.6932895183563232,
177
- 1.6870603561401367,
178
- 1.6874964237213135,
179
- 1.6813228130340576,
180
- 1.673163652420044,
181
- 1.6536895036697388,
182
- 1.546567440032959,
183
- 1.250698447227478,
184
- 0.6139158010482788,
185
- 0.1774502694606781,
186
- 0.1232280433177948,
187
- 0.09652505815029144,
188
- 0.07239970564842224,
189
- 0.059338025748729706,
190
- 0.05723724141716957,
191
- 0.05604078993201256,
192
- 0.053732018917798996,
193
- 0.04929535463452339
194
  ],
195
  "denoise_loss": [],
196
  "ortho_loss": [
197
- 0.5284707546234131,
198
- 0.21797625720500946,
199
- 0.12806634604930878,
200
- 0.09031304717063904,
201
- 0.08532247692346573,
202
- 0.07537004351615906,
203
- 0.07645349949598312,
204
- 0.07695373892784119,
205
- 0.07142108678817749,
206
- 0.08406423032283783,
207
- 0.08843859285116196,
208
- 0.07815887778997421,
209
- 0.07982385903596878,
210
- 0.07472201436758041,
211
- 0.07963292300701141,
212
- 0.07144557684659958,
213
- 0.07161661982536316,
214
- 0.07070355117321014,
215
- 0.06902579218149185,
216
- 0.06851960718631744,
217
- 0.06775054335594177,
218
- 0.0655888020992279,
219
- 0.059131793677806854,
220
- 0.05900496616959572,
221
- 0.056693185120821,
222
- 0.05708722025156021,
223
- 0.058303531259298325,
224
- 0.059081800282001495,
225
- 0.059238508343696594,
226
- 0.059114448726177216
227
  ],
228
  "lr": [
229
- 1.9600000000000002e-05,
230
- 3.96e-05,
231
- 4e-05,
232
- 4e-05,
233
- 4e-05,
234
- 4e-05,
235
- 4e-05,
236
- 4e-05,
237
- 4e-05,
238
- 4e-05,
239
- 4e-05,
240
- 4e-05,
241
- 4e-05,
242
- 4e-05,
243
- 4e-05,
244
- 4e-05,
245
- 4e-05,
246
- 4e-05,
247
- 3.944897959183673e-05,
248
- 3.638775510204082e-05,
249
- 3.3326530612244897e-05,
250
- 2.983673469387755e-05,
251
- 2.6775510204081637e-05,
252
- 2.3714285714285713e-05,
253
- 2.022448979591837e-05,
254
- 1.7163265306122454e-05,
255
- 1.4102040816326535e-05,
256
- 1.0612244897959182e-05,
257
- 7.551020408163262e-06,
258
- 4.48979591836735e-06
259
  ],
260
  "emb_lr": [],
261
  "eval_step": [
@@ -280,10 +280,10 @@
280
  0.0,
281
  0.02,
282
  0.01,
283
- 0.01
284
  ]
285
  },
286
- "final_accuracy": 0.008333333333333333,
287
  "sft_eval": {
288
  "config": {
289
  "ops": "add_sub",
@@ -298,11 +298,11 @@
298
  "n_examples": 100,
299
  "per_subtask": {
300
  "SA": {
301
- "accuracy": 0.24297520661157024,
302
  "count": 605
303
  },
304
  "SS": {
305
- "accuracy": 0.8736842105263158,
306
  "count": 95
307
  }
308
  }
@@ -312,19 +312,19 @@
312
  "n_examples": 100,
313
  "per_subtask": {
314
  "SA": {
315
- "accuracy": 0.28921568627450983,
316
  "count": 204
317
  },
318
  "SC": {
319
- "accuracy": 0.14792899408284024,
320
  "count": 169
321
  },
322
  "SS": {
323
- "accuracy": 0.8064516129032258,
324
  "count": 31
325
  },
326
  "UC": {
327
- "accuracy": 0.23648648648648649,
328
  "count": 296
329
  }
330
  }
@@ -334,23 +334,23 @@
334
  "n_examples": 100,
335
  "per_subtask": {
336
  "SA": {
337
- "accuracy": 0.3619631901840491,
338
  "count": 163
339
  },
340
  "SC": {
341
- "accuracy": 0.1,
342
  "count": 130
343
  },
344
  "SS": {
345
- "accuracy": 0.4827586206896552,
346
  "count": 87
347
  },
348
  "UC": {
349
- "accuracy": 0.33497536945812806,
350
  "count": 203
351
  },
352
  "US": {
353
- "accuracy": 0.5384615384615384,
354
  "count": 117
355
  }
356
  }
@@ -360,23 +360,23 @@
360
  "n_examples": 100,
361
  "per_subtask": {
362
  "SA": {
363
- "accuracy": 0.4132231404958678,
364
  "count": 121
365
  },
366
  "SC": {
367
- "accuracy": 0.06611570247933884,
368
  "count": 121
369
  },
370
  "SS": {
371
- "accuracy": 0.46938775510204084,
372
  "count": 49
373
  },
374
  "UC": {
375
- "accuracy": 0.34946236559139787,
376
  "count": 186
377
  },
378
  "US": {
379
- "accuracy": 0.5650224215246636,
380
  "count": 223
381
  }
382
  }
@@ -386,33 +386,33 @@
386
  "n_examples": 100,
387
  "per_subtask": {
388
  "SA": {
389
- "accuracy": 0.4519230769230769,
390
  "count": 104
391
  },
392
  "SC": {
393
- "accuracy": 0.08490566037735849,
394
  "count": 106
395
  },
396
  "SS": {
397
- "accuracy": 0.6086956521739131,
398
  "count": 23
399
  },
400
  "UC": {
401
- "accuracy": 0.40625,
402
  "count": 160
403
  },
404
  "US": {
405
- "accuracy": 0.4560260586319218,
406
  "count": 307
407
  }
408
  }
409
  },
410
  "add_S5": {
411
- "full_accuracy": 0.02,
412
  "n_examples": 100,
413
  "per_subtask": {
414
  "SA": {
415
- "accuracy": 0.43,
416
  "count": 100
417
  },
418
  "SC": {
@@ -420,29 +420,29 @@
420
  "count": 100
421
  },
422
  "UC": {
423
- "accuracy": 0.51,
424
  "count": 100
425
  },
426
  "US": {
427
- "accuracy": 0.36,
428
  "count": 400
429
  }
430
  }
431
  },
432
  "add_S6": {
433
- "full_accuracy": 0.05,
434
  "n_examples": 100,
435
  "per_subtask": {
436
  "SC": {
437
- "accuracy": 0.05,
438
  "count": 100
439
  },
440
  "UC": {
441
- "accuracy": 0.53,
442
  "count": 100
443
  },
444
  "US": {
445
- "accuracy": 0.502,
446
  "count": 500
447
  }
448
  }
@@ -452,23 +452,23 @@
452
  "n_examples": 200,
453
  "per_subtask": {
454
  "SA": {
455
- "accuracy": 0.27293064876957496,
456
  "count": 447
457
  },
458
  "SC": {
459
- "accuracy": 0.1125,
460
  "count": 320
461
  },
462
  "SS": {
463
- "accuracy": 0.6964285714285714,
464
  "count": 56
465
  },
466
  "UC": {
467
- "accuracy": 0.2665406427221172,
468
  "count": 529
469
  },
470
  "US": {
471
- "accuracy": 0.5,
472
  "count": 48
473
  }
474
  }
@@ -478,19 +478,19 @@
478
  "n_examples": 100,
479
  "per_subtask": {
480
  "SA": {
481
- "accuracy": 0.31666666666666665,
482
  "count": 300
483
  },
484
  "SC": {
485
- "accuracy": 0.07,
486
  "count": 100
487
  },
488
  "UC": {
489
- "accuracy": 0.19689119170984457,
490
  "count": 193
491
  },
492
  "US": {
493
- "accuracy": 0.2523364485981308,
494
  "count": 107
495
  }
496
  }
@@ -500,41 +500,41 @@
500
  "n_examples": 100,
501
  "per_subtask": {
502
  "SA": {
503
- "accuracy": 0.405,
504
  "count": 200
505
  },
506
  "SC": {
507
- "accuracy": 0.05,
508
  "count": 100
509
  },
510
  "UC": {
511
- "accuracy": 0.1484375,
512
  "count": 256
513
  },
514
  "US": {
515
- "accuracy": 0.2986111111111111,
516
  "count": 144
517
  }
518
  }
519
  },
520
  "add_C5": {
521
- "full_accuracy": 0.02,
522
  "n_examples": 100,
523
  "per_subtask": {
524
  "SA": {
525
- "accuracy": 0.54,
526
  "count": 100
527
  },
528
  "SC": {
529
- "accuracy": 0.1,
530
  "count": 100
531
  },
532
  "UC": {
533
- "accuracy": 0.24836601307189543,
534
  "count": 306
535
  },
536
  "US": {
537
- "accuracy": 0.5,
538
  "count": 194
539
  }
540
  }
@@ -548,11 +548,11 @@
548
  "count": 100
549
  },
550
  "UC": {
551
- "accuracy": 0.2459016393442623,
552
  "count": 366
553
  },
554
  "US": {
555
- "accuracy": 0.6709401709401709,
556
  "count": 234
557
  }
558
  }
@@ -562,11 +562,11 @@
562
  "n_examples": 100,
563
  "per_subtask": {
564
  "MD": {
565
- "accuracy": 0.23960066555740434,
566
  "count": 601
567
  },
568
  "ME": {
569
- "accuracy": 0.8282828282828283,
570
  "count": 99
571
  }
572
  }
@@ -576,19 +576,19 @@
576
  "n_examples": 100,
577
  "per_subtask": {
578
  "MD": {
579
- "accuracy": 0.4444444444444444,
580
  "count": 279
581
  },
582
  "MB": {
583
- "accuracy": 0.034482758620689655,
584
  "count": 145
585
  },
586
  "ME": {
587
- "accuracy": 0.875,
588
  "count": 24
589
  },
590
  "UB": {
591
- "accuracy": 0.1388888888888889,
592
  "count": 252
593
  }
594
  }
@@ -598,23 +598,23 @@
598
  "n_examples": 100,
599
  "per_subtask": {
600
  "MD": {
601
- "accuracy": 0.6291079812206573,
602
  "count": 213
603
  },
604
  "MB": {
605
- "accuracy": 0.035398230088495575,
606
  "count": 113
607
  },
608
  "ME": {
609
- "accuracy": 0.8705882352941177,
610
  "count": 85
611
  },
612
  "UB": {
613
- "accuracy": 0.19337016574585636,
614
  "count": 181
615
  },
616
  "UD": {
617
- "accuracy": 0.1388888888888889,
618
  "count": 108
619
  }
620
  }
@@ -624,23 +624,23 @@
624
  "n_examples": 100,
625
  "per_subtask": {
626
  "MD": {
627
- "accuracy": 0.7374301675977654,
628
  "count": 179
629
  },
630
  "MB": {
631
- "accuracy": 0.02912621359223301,
632
  "count": 103
633
  },
634
  "ME": {
635
- "accuracy": 0.8928571428571429,
636
  "count": 56
637
  },
638
  "UB": {
639
- "accuracy": 0.2214765100671141,
640
  "count": 149
641
  },
642
  "UD": {
643
- "accuracy": 0.13145539906103287,
644
  "count": 213
645
  }
646
  }
@@ -650,19 +650,19 @@
650
  "n_examples": 100,
651
  "per_subtask": {
652
  "MD": {
653
- "accuracy": 0.55,
654
  "count": 200
655
  },
656
  "MB": {
657
- "accuracy": 0.06,
658
  "count": 100
659
  },
660
  "UB": {
661
- "accuracy": 0.35,
662
  "count": 100
663
  },
664
  "UD": {
665
- "accuracy": 0.16666666666666666,
666
  "count": 300
667
  }
668
  }
@@ -676,15 +676,15 @@
676
  "count": 100
677
  },
678
  "MB": {
679
- "accuracy": 0.04,
680
  "count": 100
681
  },
682
  "UB": {
683
- "accuracy": 0.47,
684
  "count": 100
685
  },
686
  "UD": {
687
- "accuracy": 0.16,
688
  "count": 400
689
  }
690
  }
@@ -694,23 +694,23 @@
694
  "n_examples": 200,
695
  "per_subtask": {
696
  "MD": {
697
- "accuracy": 0.4066666666666667,
698
  "count": 600
699
  },
700
  "MB": {
701
- "accuracy": 0.011235955056179775,
702
  "count": 267
703
  },
704
  "ME": {
705
- "accuracy": 0.7547169811320755,
706
  "count": 53
707
  },
708
  "UB": {
709
- "accuracy": 0.1662870159453303,
710
  "count": 439
711
  },
712
  "UD": {
713
- "accuracy": 0.21951219512195122,
714
  "count": 41
715
  }
716
  }
@@ -720,19 +720,19 @@
720
  "n_examples": 100,
721
  "per_subtask": {
722
  "MD": {
723
- "accuracy": 0.38,
724
  "count": 300
725
  },
726
  "MB": {
727
- "accuracy": 0.09,
728
  "count": 100
729
  },
730
  "UB": {
731
- "accuracy": 0.16243654822335024,
732
  "count": 197
733
  },
734
  "UD": {
735
- "accuracy": 0.20388349514563106,
736
  "count": 103
737
  }
738
  }
@@ -742,19 +742,19 @@
742
  "n_examples": 100,
743
  "per_subtask": {
744
  "MD": {
745
- "accuracy": 0.55,
746
  "count": 200
747
  },
748
  "MB": {
749
- "accuracy": 0.07,
750
  "count": 100
751
  },
752
  "UB": {
753
- "accuracy": 0.17408906882591094,
754
  "count": 247
755
  },
756
  "UD": {
757
- "accuracy": 0.1568627450980392,
758
  "count": 153
759
  }
760
  }
@@ -768,22 +768,22 @@
768
  "count": 100
769
  },
770
  "MB": {
771
- "accuracy": 0.04,
772
  "count": 100
773
  },
774
  "UB": {
775
- "accuracy": 0.1644295302013423,
776
  "count": 298
777
  },
778
  "UD": {
779
- "accuracy": 0.1188118811881188,
780
  "count": 202
781
  }
782
  }
783
  }
784
  },
785
  "summary": {
786
- "overall_accuracy": 0.004166666666666667,
787
  "total_examples": 2400,
788
  "n_splits": 22
789
  }
@@ -802,11 +802,11 @@
802
  "n_examples": 100,
803
  "per_subtask": {
804
  "SA": {
805
- "accuracy": 0.3256198347107438,
806
  "count": 605
807
  },
808
  "SS": {
809
- "accuracy": 0.968421052631579,
810
  "count": 95
811
  }
812
  }
@@ -816,19 +816,19 @@
816
  "n_examples": 100,
817
  "per_subtask": {
818
  "SA": {
819
- "accuracy": 0.3431372549019608,
820
  "count": 204
821
  },
822
  "SC": {
823
- "accuracy": 0.23076923076923078,
824
  "count": 169
825
  },
826
  "SS": {
827
- "accuracy": 0.8387096774193549,
828
  "count": 31
829
  },
830
  "UC": {
831
- "accuracy": 0.32094594594594594,
832
  "count": 296
833
  }
834
  }
@@ -838,11 +838,11 @@
838
  "n_examples": 100,
839
  "per_subtask": {
840
  "SA": {
841
- "accuracy": 0.4785276073619632,
842
  "count": 163
843
  },
844
  "SC": {
845
- "accuracy": 0.13076923076923078,
846
  "count": 130
847
  },
848
  "SS": {
@@ -850,7 +850,7 @@
850
  "count": 87
851
  },
852
  "UC": {
853
- "accuracy": 0.4039408866995074,
854
  "count": 203
855
  },
856
  "US": {
@@ -860,27 +860,27 @@
860
  }
861
  },
862
  "add_S3": {
863
- "full_accuracy": 0.0,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SA": {
867
- "accuracy": 0.5289256198347108,
868
  "count": 121
869
  },
870
  "SC": {
871
- "accuracy": 0.0743801652892562,
872
  "count": 121
873
  },
874
  "SS": {
875
- "accuracy": 0.4897959183673469,
876
  "count": 49
877
  },
878
  "UC": {
879
- "accuracy": 0.41935483870967744,
880
  "count": 186
881
  },
882
  "US": {
883
- "accuracy": 0.5964125560538116,
884
  "count": 223
885
  }
886
  }
@@ -890,29 +890,29 @@
890
  "n_examples": 100,
891
  "per_subtask": {
892
  "SA": {
893
- "accuracy": 0.5480769230769231,
894
  "count": 104
895
  },
896
  "SC": {
897
- "accuracy": 0.08490566037735849,
898
  "count": 106
899
  },
900
  "SS": {
901
- "accuracy": 0.6521739130434783,
902
  "count": 23
903
  },
904
  "UC": {
905
- "accuracy": 0.41875,
906
  "count": 160
907
  },
908
  "US": {
909
- "accuracy": 0.5374592833876222,
910
  "count": 307
911
  }
912
  }
913
  },
914
  "add_S5": {
915
- "full_accuracy": 0.02,
916
  "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
@@ -920,33 +920,33 @@
920
  "count": 100
921
  },
922
  "SC": {
923
- "accuracy": 0.05,
924
  "count": 100
925
  },
926
  "UC": {
927
- "accuracy": 0.46,
928
  "count": 100
929
  },
930
  "US": {
931
- "accuracy": 0.3175,
932
  "count": 400
933
  }
934
  }
935
  },
936
  "add_S6": {
937
- "full_accuracy": 0.12,
938
  "n_examples": 100,
939
  "per_subtask": {
940
  "SC": {
941
- "accuracy": 0.12,
942
  "count": 100
943
  },
944
  "UC": {
945
- "accuracy": 0.48,
946
  "count": 100
947
  },
948
  "US": {
949
- "accuracy": 0.448,
950
  "count": 500
951
  }
952
  }
@@ -956,23 +956,23 @@
956
  "n_examples": 200,
957
  "per_subtask": {
958
  "SA": {
959
- "accuracy": 0.36017897091722595,
960
  "count": 447
961
  },
962
  "SC": {
963
- "accuracy": 0.18125,
964
  "count": 320
965
  },
966
  "SS": {
967
- "accuracy": 0.625,
968
  "count": 56
969
  },
970
  "UC": {
971
- "accuracy": 0.3534971644612476,
972
  "count": 529
973
  },
974
  "US": {
975
- "accuracy": 0.5,
976
  "count": 48
977
  }
978
  }
@@ -982,7 +982,7 @@
982
  "n_examples": 100,
983
  "per_subtask": {
984
  "SA": {
985
- "accuracy": 0.4,
986
  "count": 300
987
  },
988
  "SC": {
@@ -990,7 +990,7 @@
990
  "count": 100
991
  },
992
  "UC": {
993
- "accuracy": 0.22279792746113988,
994
  "count": 193
995
  },
996
  "US": {
@@ -1004,41 +1004,41 @@
1004
  "n_examples": 100,
1005
  "per_subtask": {
1006
  "SA": {
1007
- "accuracy": 0.53,
1008
  "count": 200
1009
  },
1010
  "SC": {
1011
- "accuracy": 0.04,
1012
  "count": 100
1013
  },
1014
  "UC": {
1015
- "accuracy": 0.19921875,
1016
  "count": 256
1017
  },
1018
  "US": {
1019
- "accuracy": 0.4513888888888889,
1020
  "count": 144
1021
  }
1022
  }
1023
  },
1024
  "add_C5": {
1025
- "full_accuracy": 0.01,
1026
  "n_examples": 100,
1027
  "per_subtask": {
1028
  "SA": {
1029
- "accuracy": 0.57,
1030
  "count": 100
1031
  },
1032
  "SC": {
1033
- "accuracy": 0.09,
1034
  "count": 100
1035
  },
1036
  "UC": {
1037
- "accuracy": 0.30392156862745096,
1038
  "count": 306
1039
  },
1040
  "US": {
1041
- "accuracy": 0.4948453608247423,
1042
  "count": 194
1043
  }
1044
  }
@@ -1048,15 +1048,15 @@
1048
  "n_examples": 100,
1049
  "per_subtask": {
1050
  "SC": {
1051
- "accuracy": 0.09,
1052
  "count": 100
1053
  },
1054
  "UC": {
1055
- "accuracy": 0.31693989071038253,
1056
  "count": 366
1057
  },
1058
  "US": {
1059
- "accuracy": 0.7521367521367521,
1060
  "count": 234
1061
  }
1062
  }
@@ -1066,11 +1066,11 @@
1066
  "n_examples": 100,
1067
  "per_subtask": {
1068
  "MD": {
1069
- "accuracy": 0.3277870216306156,
1070
  "count": 601
1071
  },
1072
  "ME": {
1073
- "accuracy": 0.9090909090909091,
1074
  "count": 99
1075
  }
1076
  }
@@ -1080,19 +1080,19 @@
1080
  "n_examples": 100,
1081
  "per_subtask": {
1082
  "MD": {
1083
- "accuracy": 0.5483870967741935,
1084
  "count": 279
1085
  },
1086
  "MB": {
1087
- "accuracy": 0.05517241379310345,
1088
  "count": 145
1089
  },
1090
  "ME": {
1091
- "accuracy": 0.6666666666666666,
1092
  "count": 24
1093
  },
1094
  "UB": {
1095
- "accuracy": 0.2222222222222222,
1096
  "count": 252
1097
  }
1098
  }
@@ -1102,23 +1102,23 @@
1102
  "n_examples": 100,
1103
  "per_subtask": {
1104
  "MD": {
1105
- "accuracy": 0.7183098591549296,
1106
  "count": 213
1107
  },
1108
  "MB": {
1109
- "accuracy": 0.017699115044247787,
1110
  "count": 113
1111
  },
1112
  "ME": {
1113
- "accuracy": 0.9058823529411765,
1114
  "count": 85
1115
  },
1116
  "UB": {
1117
- "accuracy": 0.23204419889502761,
1118
  "count": 181
1119
  },
1120
  "UD": {
1121
- "accuracy": 0.2037037037037037,
1122
  "count": 108
1123
  }
1124
  }
@@ -1128,23 +1128,23 @@
1128
  "n_examples": 100,
1129
  "per_subtask": {
1130
  "MD": {
1131
- "accuracy": 0.7932960893854749,
1132
  "count": 179
1133
  },
1134
  "MB": {
1135
- "accuracy": 0.02912621359223301,
1136
  "count": 103
1137
  },
1138
  "ME": {
1139
- "accuracy": 0.9285714285714286,
1140
  "count": 56
1141
  },
1142
  "UB": {
1143
- "accuracy": 0.33557046979865773,
1144
  "count": 149
1145
  },
1146
  "UD": {
1147
- "accuracy": 0.2112676056338028,
1148
  "count": 213
1149
  }
1150
  }
@@ -1154,25 +1154,25 @@
1154
  "n_examples": 100,
1155
  "per_subtask": {
1156
  "MD": {
1157
- "accuracy": 0.725,
1158
  "count": 200
1159
  },
1160
  "MB": {
1161
- "accuracy": 0.1,
1162
  "count": 100
1163
  },
1164
  "UB": {
1165
- "accuracy": 0.42,
1166
  "count": 100
1167
  },
1168
  "UD": {
1169
- "accuracy": 0.31,
1170
  "count": 300
1171
  }
1172
  }
1173
  },
1174
  "sub_M5": {
1175
- "full_accuracy": 0.03,
1176
  "n_examples": 100,
1177
  "per_subtask": {
1178
  "MD": {
@@ -1180,15 +1180,15 @@
1180
  "count": 100
1181
  },
1182
  "MB": {
1183
- "accuracy": 0.03,
1184
  "count": 100
1185
  },
1186
  "UB": {
1187
- "accuracy": 0.71,
1188
  "count": 100
1189
  },
1190
  "UD": {
1191
- "accuracy": 0.285,
1192
  "count": 400
1193
  }
1194
  }
@@ -1198,23 +1198,23 @@
1198
  "n_examples": 200,
1199
  "per_subtask": {
1200
  "MD": {
1201
- "accuracy": 0.53,
1202
  "count": 600
1203
  },
1204
  "MB": {
1205
- "accuracy": 0.03745318352059925,
1206
  "count": 267
1207
  },
1208
  "ME": {
1209
- "accuracy": 0.7358490566037735,
1210
  "count": 53
1211
  },
1212
  "UB": {
1213
- "accuracy": 0.23917995444191345,
1214
  "count": 439
1215
  },
1216
  "UD": {
1217
- "accuracy": 0.17073170731707318,
1218
  "count": 41
1219
  }
1220
  }
@@ -1224,19 +1224,19 @@
1224
  "n_examples": 100,
1225
  "per_subtask": {
1226
  "MD": {
1227
- "accuracy": 0.51,
1228
  "count": 300
1229
  },
1230
  "MB": {
1231
- "accuracy": 0.13,
1232
  "count": 100
1233
  },
1234
  "UB": {
1235
- "accuracy": 0.18274111675126903,
1236
  "count": 197
1237
  },
1238
  "UD": {
1239
- "accuracy": 0.39805825242718446,
1240
  "count": 103
1241
  }
1242
  }
@@ -1246,25 +1246,25 @@
1246
  "n_examples": 100,
1247
  "per_subtask": {
1248
  "MD": {
1249
- "accuracy": 0.665,
1250
  "count": 200
1251
  },
1252
  "MB": {
1253
- "accuracy": 0.1,
1254
  "count": 100
1255
  },
1256
  "UB": {
1257
- "accuracy": 0.21862348178137653,
1258
  "count": 247
1259
  },
1260
  "UD": {
1261
- "accuracy": 0.38562091503267976,
1262
  "count": 153
1263
  }
1264
  }
1265
  },
1266
  "sub_B5": {
1267
- "full_accuracy": 0.02,
1268
  "n_examples": 100,
1269
  "per_subtask": {
1270
  "MD": {
@@ -1272,26 +1272,26 @@
1272
  "count": 100
1273
  },
1274
  "MB": {
1275
- "accuracy": 0.07,
1276
  "count": 100
1277
  },
1278
  "UB": {
1279
- "accuracy": 0.2953020134228188,
1280
  "count": 298
1281
  },
1282
  "UD": {
1283
- "accuracy": 0.33663366336633666,
1284
  "count": 202
1285
  }
1286
  }
1287
  }
1288
  },
1289
  "summary": {
1290
- "overall_accuracy": 0.008333333333333333,
1291
  "total_examples": 2400,
1292
  "n_splits": 22
1293
  }
1294
  },
1295
- "sorl_overall_accuracy": 0.008333333333333333,
1296
- "sft_overall_accuracy": 0.004166666666666667
1297
  }
 
33
  1563
34
  ],
35
  "loss": [
36
+ 11.961856842041016,
37
+ 5.24287223815918,
38
+ 3.2151432037353516,
39
+ 2.6320042610168457,
40
+ 2.6821465492248535,
41
+ 2.0658371448516846,
42
+ 1.942716360092163,
43
+ 1.8816739320755005,
44
+ 1.884223461151123,
45
+ 0.5458273887634277,
46
+ -0.02370402216911316,
47
+ -0.10736316442489624,
48
+ -0.0433916300535202,
49
+ -0.08627352118492126,
50
+ 0.19944323599338531,
51
+ -0.05154527723789215,
52
+ 0.18322159349918365,
53
+ 0.12385116517543793,
54
+ 0.17138411104679108,
55
+ -0.045592449605464935,
56
+ 0.21150481700897217,
57
+ 0.0073037222027778625,
58
+ -0.3383682370185852,
59
+ 0.12239806354045868,
60
+ -0.13985984027385712,
61
+ 0.046681616455316544,
62
+ 0.25705066323280334,
63
+ 0.1217120885848999,
64
+ 0.1391371488571167,
65
+ -0.13681045174598694
66
  ],
67
  "base_loss": [
68
+ 6.969983100891113,
69
+ 3.7504212856292725,
70
+ 1.9797016382217407,
71
+ 1.8771779537200928,
72
+ 1.8847401142120361,
73
+ 1.8795526027679443,
74
+ 1.8393981456756592,
75
+ 1.8044995069503784,
76
+ 1.8537063598632812,
77
+ 1.7935826778411865,
78
+ 1.8110519647598267,
79
+ 1.8323622941970825,
80
+ 1.8425616025924683,
81
+ 1.8131767511367798,
82
+ 1.8040305376052856,
83
+ 1.721121907234192,
84
+ 1.8190782070159912,
85
+ 1.812418818473816,
86
+ 1.7917753458023071,
87
+ 1.793439507484436,
88
+ 1.725421667098999,
89
+ 1.7205660343170166,
90
+ 1.6890180110931396,
91
+ 1.746482014656067,
92
+ 1.7506285905838013,
93
+ 1.6954907178878784,
94
+ 1.7080529928207397,
95
+ 1.6827865839004517,
96
+ 1.688535451889038,
97
+ 1.6724849939346313
98
  ],
99
  "info_loss": [
100
+ -0.18472051620483398,
101
+ -0.08357405662536621,
102
+ -0.06814658641815186,
103
+ -0.11294722557067871,
104
+ -0.10811793804168701,
105
+ -0.16878092288970947,
106
+ -0.17634761333465576,
107
+ -0.17674624919891357,
108
+ -0.16523611545562744,
109
+ -0.19186913967132568,
110
+ -0.2089073657989502,
111
+ -0.20613396167755127,
112
+ -0.1997373104095459,
113
+ -0.19908201694488525,
114
+ -0.16838586330413818,
115
+ -0.1851973533630371,
116
+ -0.1715489625930786,
117
+ -0.17607414722442627,
118
+ -0.16980135440826416,
119
+ -0.1923583745956421,
120
+ -0.15899121761322021,
121
+ -0.1772533655166626,
122
+ -0.20877563953399658,
123
+ -0.16812872886657715,
124
+ -0.19421494007110596,
125
+ -0.1697760820388794,
126
+ -0.1513657569885254,
127
+ -0.16188645362854004,
128
+ -0.15964651107788086,
129
+ -0.18581724166870117
130
  ],
131
  "abs_loss": [
132
+ 2.1909255981445312,
133
+ 1.8499255180358887,
134
+ 1.8133636713027954,
135
+ 1.8108993768692017,
136
+ 1.8413399457931519,
137
+ 1.8359529972076416,
138
+ 1.8084602355957031,
139
+ 1.7734503746032715,
140
+ 1.6312320232391357,
141
+ 1.0209534168243408,
142
+ 0.7137269377708435,
143
+ 0.5659414529800415,
144
+ 0.4661961793899536,
145
+ 0.4323907196521759,
146
+ 0.36104658246040344,
147
+ 0.4403992295265198,
148
+ 0.4302736222743988,
149
+ 0.42298078536987305,
150
+ 0.34669893980026245,
151
+ 0.45547735691070557,
152
+ 0.4821462035179138,
153
+ 0.40443599224090576,
154
+ 0.44047102332115173,
155
+ 0.37605512142181396,
156
+ 0.35405462980270386,
157
+ 0.33734890818595886,
158
+ 0.3338230848312378,
159
+ 0.30671998858451843,
160
+ 0.26178568601608276,
161
+ 0.33180034160614014
162
  ],
163
  "zipf_loss": [
164
+ 6.619986534118652,
165
+ 2.1431989669799805,
166
+ 1.7355711460113525,
167
+ 1.7032086849212646,
168
+ 1.6944518089294434,
169
+ 1.6904984712600708,
170
+ 1.685948371887207,
171
+ 1.6672918796539307,
172
+ 1.5197550058364868,
173
+ 0.568840742111206,
174
+ 0.18294498324394226,
175
+ 0.06502000987529755,
176
+ 0.06480024755001068,
177
+ 0.0481308251619339,
178
+ 0.04316667094826698,
179
+ 0.03526642918586731,
180
+ 0.03660564124584198,
181
+ 0.0298757404088974,
182
+ 0.042952414602041245,
183
+ 0.03900405019521713,
184
+ 0.027780719101428986,
185
+ 0.01882774382829666,
186
+ 0.016323039308190346,
187
+ 0.01959783211350441,
188
+ 0.016255516558885574,
189
+ 0.015216827392578125,
190
+ 0.029272940009832382,
191
+ 0.02711804211139679,
192
+ 0.020888233557343483,
193
+ 0.0156969353556633
194
  ],
195
  "denoise_loss": [],
196
  "ortho_loss": [
197
+ 0.3513818085193634,
198
+ 0.12336498498916626,
199
+ 0.07400115579366684,
200
+ 0.04318969324231148,
201
+ 0.03977782279253006,
202
+ 0.031715553253889084,
203
+ 0.03395743668079376,
204
+ 0.037701401859521866,
205
+ 0.03583124652504921,
206
+ 0.0376955084502697,
207
+ 0.03476428985595703,
208
+ 0.03290867805480957,
209
+ 0.03230033069849014,
210
+ 0.028476815670728683,
211
+ 0.029924070462584496,
212
+ 0.025573937222361565,
213
+ 0.023195529356598854,
214
+ 0.024144373834133148,
215
+ 0.026353251188993454,
216
+ 0.02235715091228485,
217
+ 0.025194751098752022,
218
+ 0.025079840794205666,
219
+ 0.028410209342837334,
220
+ 0.03212076798081398,
221
+ 0.032003507018089294,
222
+ 0.03370780497789383,
223
+ 0.03460211679339409,
224
+ 0.03515707328915596,
225
+ 0.03469262272119522,
226
+ 0.03475438058376312
227
  ],
228
  "lr": [
229
+ 3.9200000000000004e-05,
230
+ 7.92e-05,
231
+ 8e-05,
232
+ 8e-05,
233
+ 8e-05,
234
+ 8e-05,
235
+ 8e-05,
236
+ 8e-05,
237
+ 8e-05,
238
+ 8e-05,
239
+ 8e-05,
240
+ 8e-05,
241
+ 8e-05,
242
+ 8e-05,
243
+ 8e-05,
244
+ 8e-05,
245
+ 8e-05,
246
+ 8e-05,
247
+ 7.889795918367346e-05,
248
+ 7.277551020408164e-05,
249
+ 6.665306122448979e-05,
250
+ 5.96734693877551e-05,
251
+ 5.3551020408163274e-05,
252
+ 4.7428571428571427e-05,
253
+ 4.044897959183674e-05,
254
+ 3.432653061224491e-05,
255
+ 2.820408163265307e-05,
256
+ 2.1224489795918364e-05,
257
+ 1.5102040816326524e-05,
258
+ 8.9795918367347e-06
259
  ],
260
  "emb_lr": [],
261
  "eval_step": [
 
280
  0.0,
281
  0.02,
282
  0.01,
283
+ 0.0
284
  ]
285
  },
286
+ "final_accuracy": 0.007083333333333333,
287
  "sft_eval": {
288
  "config": {
289
  "ops": "add_sub",
 
298
  "n_examples": 100,
299
  "per_subtask": {
300
  "SA": {
301
+ "accuracy": 0.30247933884297523,
302
  "count": 605
303
  },
304
  "SS": {
305
+ "accuracy": 0.8,
306
  "count": 95
307
  }
308
  }
 
312
  "n_examples": 100,
313
  "per_subtask": {
314
  "SA": {
315
+ "accuracy": 0.29411764705882354,
316
  "count": 204
317
  },
318
  "SC": {
319
+ "accuracy": 0.20118343195266272,
320
  "count": 169
321
  },
322
  "SS": {
323
+ "accuracy": 0.5483870967741935,
324
  "count": 31
325
  },
326
  "UC": {
327
+ "accuracy": 0.2905405405405405,
328
  "count": 296
329
  }
330
  }
 
334
  "n_examples": 100,
335
  "per_subtask": {
336
  "SA": {
337
+ "accuracy": 0.4110429447852761,
338
  "count": 163
339
  },
340
  "SC": {
341
+ "accuracy": 0.14615384615384616,
342
  "count": 130
343
  },
344
  "SS": {
345
+ "accuracy": 0.41379310344827586,
346
  "count": 87
347
  },
348
  "UC": {
349
+ "accuracy": 0.4187192118226601,
350
  "count": 203
351
  },
352
  "US": {
353
+ "accuracy": 0.5726495726495726,
354
  "count": 117
355
  }
356
  }
 
360
  "n_examples": 100,
361
  "per_subtask": {
362
  "SA": {
363
+ "accuracy": 0.4214876033057851,
364
  "count": 121
365
  },
366
  "SC": {
367
+ "accuracy": 0.09090909090909091,
368
  "count": 121
369
  },
370
  "SS": {
371
+ "accuracy": 0.3673469387755102,
372
  "count": 49
373
  },
374
  "UC": {
375
+ "accuracy": 0.41935483870967744,
376
  "count": 186
377
  },
378
  "US": {
379
+ "accuracy": 0.6278026905829597,
380
  "count": 223
381
  }
382
  }
 
386
  "n_examples": 100,
387
  "per_subtask": {
388
  "SA": {
389
+ "accuracy": 0.41346153846153844,
390
  "count": 104
391
  },
392
  "SC": {
393
+ "accuracy": 0.1320754716981132,
394
  "count": 106
395
  },
396
  "SS": {
397
+ "accuracy": 0.4782608695652174,
398
  "count": 23
399
  },
400
  "UC": {
401
+ "accuracy": 0.5125,
402
  "count": 160
403
  },
404
  "US": {
405
+ "accuracy": 0.6319218241042345,
406
  "count": 307
407
  }
408
  }
409
  },
410
  "add_S5": {
411
+ "full_accuracy": 0.01,
412
  "n_examples": 100,
413
  "per_subtask": {
414
  "SA": {
415
+ "accuracy": 0.49,
416
  "count": 100
417
  },
418
  "SC": {
 
420
  "count": 100
421
  },
422
  "UC": {
423
+ "accuracy": 0.67,
424
  "count": 100
425
  },
426
  "US": {
427
+ "accuracy": 0.44,
428
  "count": 400
429
  }
430
  }
431
  },
432
  "add_S6": {
433
+ "full_accuracy": 0.14,
434
  "n_examples": 100,
435
  "per_subtask": {
436
  "SC": {
437
+ "accuracy": 0.14,
438
  "count": 100
439
  },
440
  "UC": {
441
+ "accuracy": 0.77,
442
  "count": 100
443
  },
444
  "US": {
445
+ "accuracy": 0.746,
446
  "count": 500
447
  }
448
  }
 
452
  "n_examples": 200,
453
  "per_subtask": {
454
  "SA": {
455
+ "accuracy": 0.3243847874720358,
456
  "count": 447
457
  },
458
  "SC": {
459
+ "accuracy": 0.14375,
460
  "count": 320
461
  },
462
  "SS": {
463
+ "accuracy": 0.6428571428571429,
464
  "count": 56
465
  },
466
  "UC": {
467
+ "accuracy": 0.32514177693761814,
468
  "count": 529
469
  },
470
  "US": {
471
+ "accuracy": 0.5208333333333334,
472
  "count": 48
473
  }
474
  }
 
478
  "n_examples": 100,
479
  "per_subtask": {
480
  "SA": {
481
+ "accuracy": 0.37333333333333335,
482
  "count": 300
483
  },
484
  "SC": {
485
+ "accuracy": 0.1,
486
  "count": 100
487
  },
488
  "UC": {
489
+ "accuracy": 0.24870466321243523,
490
  "count": 193
491
  },
492
  "US": {
493
+ "accuracy": 0.29906542056074764,
494
  "count": 107
495
  }
496
  }
 
500
  "n_examples": 100,
501
  "per_subtask": {
502
  "SA": {
503
+ "accuracy": 0.46,
504
  "count": 200
505
  },
506
  "SC": {
507
+ "accuracy": 0.02,
508
  "count": 100
509
  },
510
  "UC": {
511
+ "accuracy": 0.203125,
512
  "count": 256
513
  },
514
  "US": {
515
+ "accuracy": 0.2916666666666667,
516
  "count": 144
517
  }
518
  }
519
  },
520
  "add_C5": {
521
+ "full_accuracy": 0.01,
522
  "n_examples": 100,
523
  "per_subtask": {
524
  "SA": {
525
+ "accuracy": 0.53,
526
  "count": 100
527
  },
528
  "SC": {
529
+ "accuracy": 0.09,
530
  "count": 100
531
  },
532
  "UC": {
533
+ "accuracy": 0.2875816993464052,
534
  "count": 306
535
  },
536
  "US": {
537
+ "accuracy": 0.5670103092783505,
538
  "count": 194
539
  }
540
  }
 
548
  "count": 100
549
  },
550
  "UC": {
551
+ "accuracy": 0.29508196721311475,
552
  "count": 366
553
  },
554
  "US": {
555
+ "accuracy": 0.7948717948717948,
556
  "count": 234
557
  }
558
  }
 
562
  "n_examples": 100,
563
  "per_subtask": {
564
  "MD": {
565
+ "accuracy": 0.22462562396006655,
566
  "count": 601
567
  },
568
  "ME": {
569
+ "accuracy": 0.98989898989899,
570
  "count": 99
571
  }
572
  }
 
576
  "n_examples": 100,
577
  "per_subtask": {
578
  "MD": {
579
+ "accuracy": 0.4050179211469534,
580
  "count": 279
581
  },
582
  "MB": {
583
+ "accuracy": 0.013793103448275862,
584
  "count": 145
585
  },
586
  "ME": {
587
+ "accuracy": 0.9583333333333334,
588
  "count": 24
589
  },
590
  "UB": {
591
+ "accuracy": 0.11507936507936507,
592
  "count": 252
593
  }
594
  }
 
598
  "n_examples": 100,
599
  "per_subtask": {
600
  "MD": {
601
+ "accuracy": 0.6244131455399061,
602
  "count": 213
603
  },
604
  "MB": {
605
+ "accuracy": 0.017699115044247787,
606
  "count": 113
607
  },
608
  "ME": {
609
+ "accuracy": 0.9529411764705882,
610
  "count": 85
611
  },
612
  "UB": {
613
+ "accuracy": 0.15469613259668508,
614
  "count": 181
615
  },
616
  "UD": {
617
+ "accuracy": 0.037037037037037035,
618
  "count": 108
619
  }
620
  }
 
624
  "n_examples": 100,
625
  "per_subtask": {
626
  "MD": {
627
+ "accuracy": 0.7597765363128491,
628
  "count": 179
629
  },
630
  "MB": {
631
+ "accuracy": 0.009708737864077669,
632
  "count": 103
633
  },
634
  "ME": {
635
+ "accuracy": 0.9464285714285714,
636
  "count": 56
637
  },
638
  "UB": {
639
+ "accuracy": 0.19463087248322147,
640
  "count": 149
641
  },
642
  "UD": {
643
+ "accuracy": 0.06572769953051644,
644
  "count": 213
645
  }
646
  }
 
650
  "n_examples": 100,
651
  "per_subtask": {
652
  "MD": {
653
+ "accuracy": 0.515,
654
  "count": 200
655
  },
656
  "MB": {
657
+ "accuracy": 0.03,
658
  "count": 100
659
  },
660
  "UB": {
661
+ "accuracy": 0.33,
662
  "count": 100
663
  },
664
  "UD": {
665
+ "accuracy": 0.1,
666
  "count": 300
667
  }
668
  }
 
676
  "count": 100
677
  },
678
  "MB": {
679
+ "accuracy": 0.02,
680
  "count": 100
681
  },
682
  "UB": {
683
+ "accuracy": 0.38,
684
  "count": 100
685
  },
686
  "UD": {
687
+ "accuracy": 0.05,
688
  "count": 400
689
  }
690
  }
 
694
  "n_examples": 200,
695
  "per_subtask": {
696
  "MD": {
697
+ "accuracy": 0.36666666666666664,
698
  "count": 600
699
  },
700
  "MB": {
701
+ "accuracy": 0.0149812734082397,
702
  "count": 267
703
  },
704
  "ME": {
705
+ "accuracy": 0.9811320754716981,
706
  "count": 53
707
  },
708
  "UB": {
709
+ "accuracy": 0.13439635535307518,
710
  "count": 439
711
  },
712
  "UD": {
713
+ "accuracy": 0.0975609756097561,
714
  "count": 41
715
  }
716
  }
 
720
  "n_examples": 100,
721
  "per_subtask": {
722
  "MD": {
723
+ "accuracy": 0.36,
724
  "count": 300
725
  },
726
  "MB": {
727
+ "accuracy": 0.05,
728
  "count": 100
729
  },
730
  "UB": {
731
+ "accuracy": 0.18781725888324874,
732
  "count": 197
733
  },
734
  "UD": {
735
+ "accuracy": 0.0970873786407767,
736
  "count": 103
737
  }
738
  }
 
742
  "n_examples": 100,
743
  "per_subtask": {
744
  "MD": {
745
+ "accuracy": 0.52,
746
  "count": 200
747
  },
748
  "MB": {
749
+ "accuracy": 0.02,
750
  "count": 100
751
  },
752
  "UB": {
753
+ "accuracy": 0.16194331983805668,
754
  "count": 247
755
  },
756
  "UD": {
757
+ "accuracy": 0.0784313725490196,
758
  "count": 153
759
  }
760
  }
 
768
  "count": 100
769
  },
770
  "MB": {
771
+ "accuracy": 0.02,
772
  "count": 100
773
  },
774
  "UB": {
775
+ "accuracy": 0.13087248322147652,
776
  "count": 298
777
  },
778
  "UD": {
779
+ "accuracy": 0.10396039603960396,
780
  "count": 202
781
  }
782
  }
783
  }
784
  },
785
  "summary": {
786
+ "overall_accuracy": 0.007083333333333333,
787
  "total_examples": 2400,
788
  "n_splits": 22
789
  }
 
802
  "n_examples": 100,
803
  "per_subtask": {
804
  "SA": {
805
+ "accuracy": 0.33884297520661155,
806
  "count": 605
807
  },
808
  "SS": {
809
+ "accuracy": 0.7578947368421053,
810
  "count": 95
811
  }
812
  }
 
816
  "n_examples": 100,
817
  "per_subtask": {
818
  "SA": {
819
+ "accuracy": 0.3333333333333333,
820
  "count": 204
821
  },
822
  "SC": {
823
+ "accuracy": 0.2781065088757396,
824
  "count": 169
825
  },
826
  "SS": {
827
+ "accuracy": 0.6129032258064516,
828
  "count": 31
829
  },
830
  "UC": {
831
+ "accuracy": 0.3310810810810811,
832
  "count": 296
833
  }
834
  }
 
838
  "n_examples": 100,
839
  "per_subtask": {
840
  "SA": {
841
+ "accuracy": 0.34355828220858897,
842
  "count": 163
843
  },
844
  "SC": {
845
+ "accuracy": 0.15384615384615385,
846
  "count": 130
847
  },
848
  "SS": {
 
850
  "count": 87
851
  },
852
  "UC": {
853
+ "accuracy": 0.4236453201970443,
854
  "count": 203
855
  },
856
  "US": {
 
860
  }
861
  },
862
  "add_S3": {
863
+ "full_accuracy": 0.01,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SA": {
867
+ "accuracy": 0.4628099173553719,
868
  "count": 121
869
  },
870
  "SC": {
871
+ "accuracy": 0.17355371900826447,
872
  "count": 121
873
  },
874
  "SS": {
875
+ "accuracy": 0.5306122448979592,
876
  "count": 49
877
  },
878
  "UC": {
879
+ "accuracy": 0.42473118279569894,
880
  "count": 186
881
  },
882
  "US": {
883
+ "accuracy": 0.726457399103139,
884
  "count": 223
885
  }
886
  }
 
890
  "n_examples": 100,
891
  "per_subtask": {
892
  "SA": {
893
+ "accuracy": 0.40384615384615385,
894
  "count": 104
895
  },
896
  "SC": {
897
+ "accuracy": 0.16037735849056603,
898
  "count": 106
899
  },
900
  "SS": {
901
+ "accuracy": 0.5652173913043478,
902
  "count": 23
903
  },
904
  "UC": {
905
+ "accuracy": 0.46875,
906
  "count": 160
907
  },
908
  "US": {
909
+ "accuracy": 0.6775244299674267,
910
  "count": 307
911
  }
912
  }
913
  },
914
  "add_S5": {
915
+ "full_accuracy": 0.0,
916
  "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
 
920
  "count": 100
921
  },
922
  "SC": {
923
+ "accuracy": 0.1,
924
  "count": 100
925
  },
926
  "UC": {
927
+ "accuracy": 0.57,
928
  "count": 100
929
  },
930
  "US": {
931
+ "accuracy": 0.4925,
932
  "count": 400
933
  }
934
  }
935
  },
936
  "add_S6": {
937
+ "full_accuracy": 0.15,
938
  "n_examples": 100,
939
  "per_subtask": {
940
  "SC": {
941
+ "accuracy": 0.15,
942
  "count": 100
943
  },
944
  "UC": {
945
+ "accuracy": 0.7,
946
  "count": 100
947
  },
948
  "US": {
949
+ "accuracy": 0.682,
950
  "count": 500
951
  }
952
  }
 
956
  "n_examples": 200,
957
  "per_subtask": {
958
  "SA": {
959
+ "accuracy": 0.38478747203579416,
960
  "count": 447
961
  },
962
  "SC": {
963
+ "accuracy": 0.20625,
964
  "count": 320
965
  },
966
  "SS": {
967
+ "accuracy": 0.5357142857142857,
968
  "count": 56
969
  },
970
  "UC": {
971
+ "accuracy": 0.3572778827977316,
972
  "count": 529
973
  },
974
  "US": {
975
+ "accuracy": 0.6458333333333334,
976
  "count": 48
977
  }
978
  }
 
982
  "n_examples": 100,
983
  "per_subtask": {
984
  "SA": {
985
+ "accuracy": 0.3933333333333333,
986
  "count": 300
987
  },
988
  "SC": {
 
990
  "count": 100
991
  },
992
  "UC": {
993
+ "accuracy": 0.23316062176165803,
994
  "count": 193
995
  },
996
  "US": {
 
1004
  "n_examples": 100,
1005
  "per_subtask": {
1006
  "SA": {
1007
+ "accuracy": 0.52,
1008
  "count": 200
1009
  },
1010
  "SC": {
1011
+ "accuracy": 0.05,
1012
  "count": 100
1013
  },
1014
  "UC": {
1015
+ "accuracy": 0.21484375,
1016
  "count": 256
1017
  },
1018
  "US": {
1019
+ "accuracy": 0.4097222222222222,
1020
  "count": 144
1021
  }
1022
  }
1023
  },
1024
  "add_C5": {
1025
+ "full_accuracy": 0.0,
1026
  "n_examples": 100,
1027
  "per_subtask": {
1028
  "SA": {
1029
+ "accuracy": 0.53,
1030
  "count": 100
1031
  },
1032
  "SC": {
1033
+ "accuracy": 0.06,
1034
  "count": 100
1035
  },
1036
  "UC": {
1037
+ "accuracy": 0.2777777777777778,
1038
  "count": 306
1039
  },
1040
  "US": {
1041
+ "accuracy": 0.5154639175257731,
1042
  "count": 194
1043
  }
1044
  }
 
1048
  "n_examples": 100,
1049
  "per_subtask": {
1050
  "SC": {
1051
+ "accuracy": 0.18,
1052
  "count": 100
1053
  },
1054
  "UC": {
1055
+ "accuracy": 0.33879781420765026,
1056
  "count": 366
1057
  },
1058
  "US": {
1059
+ "accuracy": 0.8632478632478633,
1060
  "count": 234
1061
  }
1062
  }
 
1066
  "n_examples": 100,
1067
  "per_subtask": {
1068
  "MD": {
1069
+ "accuracy": 0.33111480865224624,
1070
  "count": 601
1071
  },
1072
  "ME": {
1073
+ "accuracy": 0.9696969696969697,
1074
  "count": 99
1075
  }
1076
  }
 
1080
  "n_examples": 100,
1081
  "per_subtask": {
1082
  "MD": {
1083
+ "accuracy": 0.5376344086021505,
1084
  "count": 279
1085
  },
1086
  "MB": {
1087
+ "accuracy": 0.020689655172413793,
1088
  "count": 145
1089
  },
1090
  "ME": {
1091
+ "accuracy": 0.9166666666666666,
1092
  "count": 24
1093
  },
1094
  "UB": {
1095
+ "accuracy": 0.21428571428571427,
1096
  "count": 252
1097
  }
1098
  }
 
1102
  "n_examples": 100,
1103
  "per_subtask": {
1104
  "MD": {
1105
+ "accuracy": 0.7136150234741784,
1106
  "count": 213
1107
  },
1108
  "MB": {
1109
+ "accuracy": 0.02654867256637168,
1110
  "count": 113
1111
  },
1112
  "ME": {
1113
+ "accuracy": 0.9411764705882353,
1114
  "count": 85
1115
  },
1116
  "UB": {
1117
+ "accuracy": 0.2265193370165746,
1118
  "count": 181
1119
  },
1120
  "UD": {
1121
+ "accuracy": 0.10185185185185185,
1122
  "count": 108
1123
  }
1124
  }
 
1128
  "n_examples": 100,
1129
  "per_subtask": {
1130
  "MD": {
1131
+ "accuracy": 0.8156424581005587,
1132
  "count": 179
1133
  },
1134
  "MB": {
1135
+ "accuracy": 0.0,
1136
  "count": 103
1137
  },
1138
  "ME": {
1139
+ "accuracy": 0.9821428571428571,
1140
  "count": 56
1141
  },
1142
  "UB": {
1143
+ "accuracy": 0.26174496644295303,
1144
  "count": 149
1145
  },
1146
  "UD": {
1147
+ "accuracy": 0.07981220657276995,
1148
  "count": 213
1149
  }
1150
  }
 
1154
  "n_examples": 100,
1155
  "per_subtask": {
1156
  "MD": {
1157
+ "accuracy": 0.755,
1158
  "count": 200
1159
  },
1160
  "MB": {
1161
+ "accuracy": 0.01,
1162
  "count": 100
1163
  },
1164
  "UB": {
1165
+ "accuracy": 0.38,
1166
  "count": 100
1167
  },
1168
  "UD": {
1169
+ "accuracy": 0.09666666666666666,
1170
  "count": 300
1171
  }
1172
  }
1173
  },
1174
  "sub_M5": {
1175
+ "full_accuracy": 0.01,
1176
  "n_examples": 100,
1177
  "per_subtask": {
1178
  "MD": {
 
1180
  "count": 100
1181
  },
1182
  "MB": {
1183
+ "accuracy": 0.02,
1184
  "count": 100
1185
  },
1186
  "UB": {
1187
+ "accuracy": 0.53,
1188
  "count": 100
1189
  },
1190
  "UD": {
1191
+ "accuracy": 0.11,
1192
  "count": 400
1193
  }
1194
  }
 
1198
  "n_examples": 200,
1199
  "per_subtask": {
1200
  "MD": {
1201
+ "accuracy": 0.5216666666666666,
1202
  "count": 600
1203
  },
1204
  "MB": {
1205
+ "accuracy": 0.0149812734082397,
1206
  "count": 267
1207
  },
1208
  "ME": {
1209
+ "accuracy": 0.9056603773584906,
1210
  "count": 53
1211
  },
1212
  "UB": {
1213
+ "accuracy": 0.2460136674259681,
1214
  "count": 439
1215
  },
1216
  "UD": {
1217
+ "accuracy": 0.07317073170731707,
1218
  "count": 41
1219
  }
1220
  }
 
1224
  "n_examples": 100,
1225
  "per_subtask": {
1226
  "MD": {
1227
+ "accuracy": 0.5266666666666666,
1228
  "count": 300
1229
  },
1230
  "MB": {
1231
+ "accuracy": 0.03,
1232
  "count": 100
1233
  },
1234
  "UB": {
1235
+ "accuracy": 0.2233502538071066,
1236
  "count": 197
1237
  },
1238
  "UD": {
1239
+ "accuracy": 0.10679611650485436,
1240
  "count": 103
1241
  }
1242
  }
 
1246
  "n_examples": 100,
1247
  "per_subtask": {
1248
  "MD": {
1249
+ "accuracy": 0.655,
1250
  "count": 200
1251
  },
1252
  "MB": {
1253
+ "accuracy": 0.03,
1254
  "count": 100
1255
  },
1256
  "UB": {
1257
+ "accuracy": 0.24696356275303644,
1258
  "count": 247
1259
  },
1260
  "UD": {
1261
+ "accuracy": 0.11764705882352941,
1262
  "count": 153
1263
  }
1264
  }
1265
  },
1266
  "sub_B5": {
1267
+ "full_accuracy": 0.0,
1268
  "n_examples": 100,
1269
  "per_subtask": {
1270
  "MD": {
 
1272
  "count": 100
1273
  },
1274
  "MB": {
1275
+ "accuracy": 0.01,
1276
  "count": 100
1277
  },
1278
  "UB": {
1279
+ "accuracy": 0.2651006711409396,
1280
  "count": 298
1281
  },
1282
  "UD": {
1283
+ "accuracy": 0.16336633663366337,
1284
  "count": 202
1285
  }
1286
  }
1287
  }
1288
  },
1289
  "summary": {
1290
+ "overall_accuracy": 0.007083333333333333,
1291
  "total_examples": 2400,
1292
  "n_splits": 22
1293
  }
1294
  },
1295
+ "sorl_overall_accuracy": 0.007083333333333333,
1296
+ "sft_overall_accuracy": 0.007083333333333333
1297
  }
add_sub_sorl_v1_abs10_10K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5539570ac41c2434a03e526e01a1a87e8aa66fcfadd6151a66949f140ded418d
3
  size 650303660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:324ce6c56d7206c9ce03c2a04c06b931963e334e14fddc37c56898ab88915205
3
  size 650303660
add_sub_sorl_v1_abs10_10K/train_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
- "lr": 4e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 100,
@@ -69,16 +69,16 @@
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_10K",
72
- "git_commit": "78d46f8665a87f4b44bd5894bd34f393f2dea51f",
73
- "timestamp": "2026-04-12T08:59:09.036996+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "f2ajfc34",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/f2ajfc34",
81
- "final_accuracy": 0.008333333333333333,
82
- "sft_accuracy": 0.004166666666666667,
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
+ "lr": 8e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 100,
 
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_10K",
72
+ "git_commit": "8d5ee5420119746ef4e2c87570eb250c9718f643",
73
+ "timestamp": "2026-04-12T22:18:34.743639+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
+ "wandb_run_id": "hd7zspyv",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/hd7zspyv",
81
+ "final_accuracy": 0.007083333333333333,
82
+ "sft_accuracy": 0.007083333333333333,
83
  "eval_method": "ArithmeticEvaluator"
84
  }