amirali1985 commited on
Commit
af94939
·
verified ·
1 Parent(s): f7a9e93

Upload add_sub_baseline_10K

Browse files
add_sub_baseline_10K/metrics.json CHANGED
@@ -65,196 +65,196 @@
65
  3100
66
  ],
67
  "loss": [
68
- 10.090532302856445,
69
- 7.41793155670166,
70
- 6.670701503753662,
71
- 5.4355902671813965,
72
- 4.486183166503906,
73
- 3.106839418411255,
74
- 2.181303024291992,
75
- 1.9472637176513672,
76
- 1.6879997253417969,
77
- 1.7242902517318726,
78
- 1.3950544595718384,
79
- 0.9311097860336304,
80
- 0.593478798866272,
81
- 0.4981410801410675,
82
- 0.24719169735908508,
83
- 0.10993476957082748,
84
- 0.09566618502140045,
85
- 0.08340698480606079,
86
- 0.07292894273996353,
87
- 0.05232112854719162,
88
- 0.04302288219332695,
89
- 0.04004356637597084,
90
- 0.04495292156934738,
91
- 0.048470307141542435,
92
- 0.033653225749731064,
93
- 0.030521346256136894,
94
- 0.06012653931975365,
95
- 0.05565373972058296,
96
- 0.04759713634848595,
97
- 0.054727476090192795,
98
- 0.034593965858221054,
99
- 0.027987422421574593,
100
- 0.034813184291124344,
101
- 0.03215349465608597,
102
- 0.02478446625173092,
103
- 0.022883405908942223,
104
- 0.02932637929916382,
105
- 0.017739301547408104,
106
- 0.024523159489035606,
107
- 0.02478833682835102,
108
- 0.01739186979830265,
109
- 0.01981291174888611,
110
- 0.038495369255542755,
111
- 0.02020339109003544,
112
- 0.035459987819194794,
113
- 0.033973682671785355,
114
- 0.03820496052503586,
115
- 0.01859247498214245,
116
- 0.02102138102054596,
117
- 0.021527009084820747,
118
- 0.02982836402952671,
119
- 0.026523862034082413,
120
- 0.0283384770154953,
121
- 0.023189542815089226,
122
- 0.022154804319143295,
123
- 0.02203175611793995,
124
- 0.01161875855177641,
125
- 0.01782594434916973,
126
- 0.014171228744089603,
127
- 0.014905942603945732,
128
- 0.019558599218726158,
129
- 0.012723379768431187
130
  ],
131
  "base_loss": [
132
- 10.090532302856445,
133
- 7.41793155670166,
134
- 6.670701503753662,
135
- 5.4355902671813965,
136
- 4.486183166503906,
137
- 3.106839418411255,
138
- 2.181303024291992,
139
- 1.9472637176513672,
140
- 1.6879997253417969,
141
- 1.7242902517318726,
142
- 1.3950544595718384,
143
- 0.9311097860336304,
144
- 0.593478798866272,
145
- 0.4981410801410675,
146
- 0.24719169735908508,
147
- 0.10993476957082748,
148
- 0.09566618502140045,
149
- 0.08340698480606079,
150
- 0.07292894273996353,
151
- 0.05232112854719162,
152
- 0.04302288219332695,
153
- 0.04004356637597084,
154
- 0.04495292156934738,
155
- 0.048470307141542435,
156
- 0.033653225749731064,
157
- 0.030521346256136894,
158
- 0.06012653931975365,
159
- 0.05565373972058296,
160
- 0.04759713634848595,
161
- 0.054727476090192795,
162
- 0.034593965858221054,
163
- 0.027987422421574593,
164
- 0.034813184291124344,
165
- 0.03215349465608597,
166
- 0.02478446625173092,
167
- 0.022883405908942223,
168
- 0.02932637929916382,
169
- 0.017739301547408104,
170
- 0.024523159489035606,
171
- 0.02478833682835102,
172
- 0.01739186979830265,
173
- 0.01981291174888611,
174
- 0.038495369255542755,
175
- 0.02020339109003544,
176
- 0.035459987819194794,
177
- 0.033973682671785355,
178
- 0.03820496052503586,
179
- 0.01859247498214245,
180
- 0.02102138102054596,
181
- 0.021527009084820747,
182
- 0.02982836402952671,
183
- 0.026523862034082413,
184
- 0.0283384770154953,
185
- 0.023189542815089226,
186
- 0.022154804319143295,
187
- 0.02203175611793995,
188
- 0.01161875855177641,
189
- 0.01782594434916973,
190
- 0.014171228744089603,
191
- 0.014905942603945732,
192
- 0.019558599218726158,
193
- 0.012723379768431187
194
  ],
195
  "lr": [
196
- 6.242038216560511e-06,
197
- 1.2611464968152866e-05,
198
- 1.8980891719745225e-05,
199
- 2.5350318471337578e-05,
200
- 3.1719745222929934e-05,
201
- 3.808917197452229e-05,
202
- 4.445859872611465e-05,
203
- 5.082802547770701e-05,
204
- 5.7197452229299365e-05,
205
- 6.356687898089173e-05,
206
- 6.993630573248408e-05,
207
- 7.630573248407644e-05,
208
- 7.99862055592881e-05,
209
- 7.984241248831029e-05,
210
- 7.954287783192742e-05,
211
- 7.90887724530305e-05,
212
- 7.848187142213441e-05,
213
- 7.772454707873448e-05,
214
- 7.681975975797462e-05,
215
- 7.57710462188759e-05,
216
- 7.458250581935905e-05,
217
- 7.325878449210182e-05,
218
- 7.180505658386849e-05,
219
- 7.022700462930083e-05,
220
- 6.853079713823312e-05,
221
- 6.672306448335957e-05,
222
- 6.481087298250779e-05,
223
- 6.280169727682872e-05,
224
- 6.070339111287581e-05,
225
- 5.8524156642783655e-05,
226
- 5.627251236255051e-05,
227
- 5.3957259813751526e-05,
228
- 5.1587449178844164e-05,
229
- 4.917234390455111e-05,
230
- 4.672138449160635e-05,
231
- 4.424415159240753e-05,
232
- 4.175032856082417e-05,
233
- 3.924966360055181e-05,
234
- 3.675193165997228e-05,
235
- 3.42668962224704e-05,
236
- 3.180427114156694e-05,
237
- 2.9373682670051437e-05,
238
- 2.6984631831541183e-05,
239
- 2.4646457281553407e-05,
240
- 2.2368298803264487e-05,
241
- 2.0159061580649347e-05,
242
- 1.8027381388654794e-05,
243
- 1.5981590836476463e-05,
244
- 1.4029686795892575e-05,
245
- 1.2179299141974771e-05,
246
- 1.0437660928367057e-05,
247
- 8.811580113715755e-06,
248
- 7.307412949770034e-06,
249
- 5.9310391351775455e-06,
250
- 4.687838832097362e-06,
251
- 3.5826716354707645e-06,
252
- 2.6198575771580583e-06,
253
- 1.8031602391947344e-06,
254
- 1.1357720421765062e-06,
255
- 6.203017662798872e-07,
256
- 2.5876435369797334e-07,
257
- 5.257303235302935e-08
258
  ],
259
  "eval_step": [
260
  156,
@@ -301,530 +301,530 @@
301
  20
302
  ],
303
  "eval_accuracy": [
304
- 0.0,
305
- 0.0064285714285714285,
306
- 0.007142857142857143,
307
- 0.04,
308
- 0.6357142857142857,
309
- 0.7271428571428571,
310
- 0.7485714285714286,
311
- 0.7471428571428571,
312
- 0.75,
313
- 0.7878571428571428,
314
- 0.7842857142857143,
315
- 0.7714285714285715,
316
- 0.8028571428571428,
317
- 0.8085714285714286,
318
- 0.8228571428571428,
319
- 0.8342857142857143,
320
- 0.8307142857142857,
321
- 0.8635714285714285,
322
- 0.8814285714285715,
323
- 0.8628571428571429
324
  ]
325
  },
326
- "final_accuracy": 0.8628571428571429,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
- "n_per_split": 50
334
  },
335
  "splits": {
336
  "add_S0": {
337
- "full_accuracy": 1.0,
338
- "n_examples": 50,
339
  "per_subtask": {
340
  "SA": {
341
- "accuracy": 1.0,
342
- "count": 295
343
  },
344
  "SS": {
345
  "accuracy": 1.0,
346
- "count": 55
347
  }
348
  }
349
  },
350
  "add_S1": {
351
- "full_accuracy": 1.0,
352
- "n_examples": 50,
353
  "per_subtask": {
354
  "SA": {
355
- "accuracy": 1.0,
356
- "count": 126
357
  },
358
  "SC": {
359
  "accuracy": 1.0,
360
- "count": 79
361
  },
362
  "SS": {
363
  "accuracy": 1.0,
364
- "count": 21
365
  },
366
  "UC": {
367
- "accuracy": 1.0,
368
- "count": 124
369
  }
370
  }
371
  },
372
  "add_S2": {
373
- "full_accuracy": 0.96,
374
- "n_examples": 50,
375
  "per_subtask": {
376
  "SA": {
377
- "accuracy": 1.0,
378
- "count": 75
379
  },
380
  "SC": {
381
- "accuracy": 0.967741935483871,
382
- "count": 62
383
  },
384
  "SS": {
385
- "accuracy": 1.0,
386
- "count": 39
387
  },
388
  "UC": {
389
- "accuracy": 1.0,
390
- "count": 111
391
  },
392
  "US": {
393
  "accuracy": 1.0,
394
- "count": 63
395
  }
396
  }
397
  },
398
  "add_S3": {
399
- "full_accuracy": 0.88,
400
- "n_examples": 50,
401
  "per_subtask": {
402
  "SA": {
403
  "accuracy": 1.0,
404
- "count": 60
405
  },
406
  "SC": {
407
- "accuracy": 1.0,
408
- "count": 57
409
  },
410
  "SS": {
411
  "accuracy": 1.0,
412
- "count": 19
413
  },
414
  "UC": {
415
- "accuracy": 0.9423076923076923,
416
- "count": 104
417
  },
418
  "US": {
419
- "accuracy": 1.0,
420
- "count": 110
421
  }
422
  }
423
  },
424
  "add_S4": {
425
- "full_accuracy": 0.7,
426
- "n_examples": 50,
427
  "per_subtask": {
428
  "SA": {
429
  "accuracy": 1.0,
430
- "count": 48
431
  },
432
  "SC": {
433
  "accuracy": 1.0,
434
- "count": 52
435
  },
436
  "SS": {
437
  "accuracy": 1.0,
438
- "count": 7
439
  },
440
  "UC": {
441
- "accuracy": 0.8314606741573034,
442
- "count": 89
443
  },
444
  "US": {
445
- "accuracy": 0.9025974025974026,
446
- "count": 154
447
  }
448
  }
449
  },
450
  "add_S5": {
451
- "full_accuracy": 0.58,
452
- "n_examples": 50,
453
  "per_subtask": {
454
  "SA": {
455
  "accuracy": 1.0,
456
- "count": 50
457
  },
458
  "SC": {
459
  "accuracy": 1.0,
460
- "count": 50
461
  },
462
  "UC": {
463
- "accuracy": 0.6,
464
- "count": 50
465
  },
466
  "US": {
467
- "accuracy": 0.79,
468
- "count": 200
469
  }
470
  }
471
  },
472
  "add_S6": {
473
- "full_accuracy": 1.0,
474
- "n_examples": 50,
475
  "per_subtask": {
476
  "SC": {
477
  "accuracy": 1.0,
478
- "count": 50
479
  },
480
  "UC": {
481
- "accuracy": 1.0,
482
- "count": 50
483
  },
484
  "US": {
485
- "accuracy": 1.0,
486
- "count": 250
487
  }
488
  }
489
  },
490
  "add_random": {
491
- "full_accuracy": 0.99,
492
  "n_examples": 200,
493
  "per_subtask": {
494
  "SA": {
495
- "accuracy": 1.0,
496
- "count": 431
497
  },
498
  "SC": {
499
- "accuracy": 0.9968354430379747,
500
- "count": 316
501
  },
502
  "SS": {
503
  "accuracy": 1.0,
504
- "count": 39
505
  },
506
  "UC": {
507
- "accuracy": 0.9982142857142857,
508
- "count": 560
509
  },
510
  "US": {
511
- "accuracy": 1.0,
512
- "count": 54
513
  }
514
  }
515
  },
516
  "add_C3": {
517
- "full_accuracy": 0.88,
518
- "n_examples": 50,
519
  "per_subtask": {
520
  "SA": {
521
  "accuracy": 1.0,
522
- "count": 150
523
  },
524
  "SC": {
525
  "accuracy": 1.0,
526
- "count": 50
527
  },
528
  "UC": {
529
- "accuracy": 0.9423076923076923,
530
- "count": 104
531
  },
532
  "US": {
533
- "accuracy": 1.0,
534
- "count": 46
535
  }
536
  }
537
  },
538
  "add_C4": {
539
- "full_accuracy": 0.86,
540
- "n_examples": 50,
541
  "per_subtask": {
542
  "SA": {
543
  "accuracy": 1.0,
544
- "count": 100
545
  },
546
  "SC": {
547
  "accuracy": 1.0,
548
- "count": 50
549
  },
550
  "UC": {
551
- "accuracy": 0.943089430894309,
552
- "count": 123
553
  },
554
  "US": {
555
- "accuracy": 0.948051948051948,
556
- "count": 77
557
  }
558
  }
559
  },
560
  "add_C5": {
561
- "full_accuracy": 0.86,
562
- "n_examples": 50,
563
  "per_subtask": {
564
  "SA": {
565
  "accuracy": 1.0,
566
- "count": 50
567
  },
568
  "SC": {
569
  "accuracy": 1.0,
570
- "count": 50
571
  },
572
  "UC": {
573
- "accuracy": 0.9545454545454546,
574
- "count": 154
575
  },
576
  "US": {
577
- "accuracy": 0.9895833333333334,
578
- "count": 96
579
  }
580
  }
581
  },
582
  "add_C6": {
583
- "full_accuracy": 0.88,
584
- "n_examples": 50,
585
  "per_subtask": {
586
  "SC": {
587
  "accuracy": 1.0,
588
- "count": 50
589
  },
590
  "UC": {
591
- "accuracy": 0.967032967032967,
592
- "count": 182
593
  },
594
  "US": {
595
- "accuracy": 0.940677966101695,
596
- "count": 118
597
  }
598
  }
599
  },
600
  "sub_M0": {
601
- "full_accuracy": 1.0,
602
- "n_examples": 50,
603
  "per_subtask": {
604
  "MD": {
605
- "accuracy": 1.0,
606
- "count": 294
607
  },
608
  "ME": {
609
- "accuracy": 1.0,
610
- "count": 56
611
  }
612
  }
613
  },
614
  "sub_M1": {
615
- "full_accuracy": 1.0,
616
- "n_examples": 50,
617
  "per_subtask": {
618
  "MD": {
619
- "accuracy": 1.0,
620
- "count": 143
621
  },
622
  "MB": {
623
- "accuracy": 1.0,
624
- "count": 69
625
  },
626
  "ME": {
627
  "accuracy": 1.0,
628
- "count": 15
629
  },
630
  "UB": {
631
- "accuracy": 1.0,
632
- "count": 123
633
  }
634
  }
635
  },
636
  "sub_M2": {
637
- "full_accuracy": 1.0,
638
- "n_examples": 50,
639
  "per_subtask": {
640
  "MD": {
641
- "accuracy": 1.0,
642
- "count": 108
643
  },
644
  "MB": {
645
  "accuracy": 1.0,
646
- "count": 52
647
  },
648
  "ME": {
649
  "accuracy": 1.0,
650
- "count": 52
651
  },
652
  "UB": {
653
- "accuracy": 1.0,
654
- "count": 87
655
  },
656
  "UD": {
657
  "accuracy": 1.0,
658
- "count": 51
659
  }
660
  }
661
  },
662
  "sub_M3": {
663
- "full_accuracy": 0.46,
664
- "n_examples": 50,
665
  "per_subtask": {
666
  "MD": {
667
  "accuracy": 1.0,
668
- "count": 94
669
  },
670
  "MB": {
671
- "accuracy": 1.0,
672
- "count": 51
673
  },
674
  "ME": {
675
  "accuracy": 1.0,
676
- "count": 25
677
  },
678
  "UB": {
679
- "accuracy": 0.6538461538461539,
680
- "count": 78
681
  },
682
  "UD": {
683
- "accuracy": 1.0,
684
- "count": 102
685
  }
686
  }
687
  },
688
  "sub_M4": {
689
- "full_accuracy": 0.48,
690
- "n_examples": 50,
691
  "per_subtask": {
692
  "MD": {
693
  "accuracy": 1.0,
694
- "count": 100
695
  },
696
  "MB": {
697
  "accuracy": 1.0,
698
- "count": 50
699
  },
700
  "UB": {
701
- "accuracy": 0.48,
702
- "count": 50
703
  },
704
  "UD": {
705
- "accuracy": 0.8333333333333334,
706
- "count": 150
707
  }
708
  }
709
  },
710
  "sub_M5": {
711
- "full_accuracy": 0.36,
712
- "n_examples": 50,
713
  "per_subtask": {
714
  "MD": {
715
  "accuracy": 1.0,
716
- "count": 50
717
  },
718
  "MB": {
719
  "accuracy": 1.0,
720
- "count": 50
721
  },
722
  "UB": {
723
- "accuracy": 0.44,
724
- "count": 50
725
  },
726
  "UD": {
727
- "accuracy": 0.69,
728
- "count": 200
729
  }
730
  }
731
  },
732
  "sub_random": {
733
- "full_accuracy": 0.995,
734
  "n_examples": 200,
735
  "per_subtask": {
736
  "MD": {
737
- "accuracy": 1.0,
738
- "count": 588
739
  },
740
  "MB": {
741
- "accuracy": 1.0,
742
- "count": 268
743
  },
744
  "ME": {
745
  "accuracy": 1.0,
746
- "count": 60
747
  },
748
  "UB": {
749
- "accuracy": 0.9977628635346756,
750
- "count": 447
751
  },
752
  "UD": {
753
  "accuracy": 1.0,
754
- "count": 37
755
  }
756
  }
757
  },
758
  "sub_B3": {
759
- "full_accuracy": 0.92,
760
- "n_examples": 50,
761
  "per_subtask": {
762
  "MD": {
763
- "accuracy": 1.0,
764
- "count": 150
765
  },
766
  "MB": {
767
  "accuracy": 1.0,
768
- "count": 50
769
  },
770
  "UB": {
771
- "accuracy": 0.9626168224299065,
772
- "count": 107
773
  },
774
  "UD": {
775
  "accuracy": 1.0,
776
- "count": 43
777
  }
778
  }
779
  },
780
  "sub_B4": {
781
- "full_accuracy": 0.72,
782
- "n_examples": 50,
783
  "per_subtask": {
784
  "MD": {
785
- "accuracy": 1.0,
786
- "count": 100
787
  },
788
  "MB": {
789
  "accuracy": 1.0,
790
- "count": 50
791
  },
792
  "UB": {
793
- "accuracy": 0.8771929824561403,
794
- "count": 114
795
  },
796
  "UD": {
797
- "accuracy": 0.9302325581395349,
798
- "count": 86
799
  }
800
  }
801
  },
802
  "sub_B5": {
803
- "full_accuracy": 0.7,
804
- "n_examples": 50,
805
  "per_subtask": {
806
  "MD": {
807
  "accuracy": 1.0,
808
- "count": 50
809
  },
810
  "MB": {
811
  "accuracy": 1.0,
812
- "count": 50
813
  },
814
  "UB": {
815
- "accuracy": 0.9019607843137255,
816
- "count": 153
817
  },
818
  "UD": {
819
- "accuracy": 0.865979381443299,
820
- "count": 97
821
  }
822
  }
823
  }
824
  },
825
  "summary": {
826
- "overall_accuracy": 0.8628571428571429,
827
- "total_examples": 1400,
828
  "n_splits": 22
829
  }
830
  }
 
65
  3100
66
  ],
67
  "loss": [
68
+ 7.930843830108643,
69
+ 5.878052711486816,
70
+ 4.171169757843018,
71
+ 2.4461560249328613,
72
+ 1.9941270351409912,
73
+ 1.885154366493225,
74
+ 1.7704766988754272,
75
+ 1.7200186252593994,
76
+ 1.6930021047592163,
77
+ 1.659714698791504,
78
+ 1.6491907835006714,
79
+ 1.440868616104126,
80
+ 1.2277355194091797,
81
+ 0.9296635389328003,
82
+ 0.727922797203064,
83
+ 0.6531419157981873,
84
+ 0.6338068842887878,
85
+ 0.5315085053443909,
86
+ 0.471783846616745,
87
+ 0.44315391778945923,
88
+ 0.37063688039779663,
89
+ 0.31710734963417053,
90
+ 0.32954543828964233,
91
+ 0.2792213559150696,
92
+ 0.28969308733940125,
93
+ 0.27333250641822815,
94
+ 0.2600690424442291,
95
+ 0.22665680944919586,
96
+ 0.22485388815402985,
97
+ 0.18426764011383057,
98
+ 0.20698437094688416,
99
+ 0.20130981504917145,
100
+ 0.16024114191532135,
101
+ 0.20614077150821686,
102
+ 0.16621388494968414,
103
+ 0.1674460619688034,
104
+ 0.13519535958766937,
105
+ 0.17641237378120422,
106
+ 0.14274020493030548,
107
+ 0.15076635777950287,
108
+ 0.14852766692638397,
109
+ 0.12220268696546555,
110
+ 0.13161125779151917,
111
+ 0.12244851887226105,
112
+ 0.14852531254291534,
113
+ 0.15561267733573914,
114
+ 0.10794571787118912,
115
+ 0.14725656807422638,
116
+ 0.14284299314022064,
117
+ 0.10321654379367828,
118
+ 0.10347577184438705,
119
+ 0.1123846247792244,
120
+ 0.11255372315645218,
121
+ 0.11303780227899551,
122
+ 0.07373049110174179,
123
+ 0.1012827679514885,
124
+ 0.08089525252580643,
125
+ 0.10804768651723862,
126
+ 0.08694573491811752,
127
+ 0.11614620685577393,
128
+ 0.090724878013134,
129
+ 0.11524895578622818
130
  ],
131
  "base_loss": [
132
+ 7.930843830108643,
133
+ 5.878052711486816,
134
+ 4.171169757843018,
135
+ 2.4461560249328613,
136
+ 1.9941270351409912,
137
+ 1.885154366493225,
138
+ 1.7704766988754272,
139
+ 1.7200186252593994,
140
+ 1.6930021047592163,
141
+ 1.659714698791504,
142
+ 1.6491907835006714,
143
+ 1.440868616104126,
144
+ 1.2277355194091797,
145
+ 0.9296635389328003,
146
+ 0.727922797203064,
147
+ 0.6531419157981873,
148
+ 0.6338068842887878,
149
+ 0.5315085053443909,
150
+ 0.471783846616745,
151
+ 0.44315391778945923,
152
+ 0.37063688039779663,
153
+ 0.31710734963417053,
154
+ 0.32954543828964233,
155
+ 0.2792213559150696,
156
+ 0.28969308733940125,
157
+ 0.27333250641822815,
158
+ 0.2600690424442291,
159
+ 0.22665680944919586,
160
+ 0.22485388815402985,
161
+ 0.18426764011383057,
162
+ 0.20698437094688416,
163
+ 0.20130981504917145,
164
+ 0.16024114191532135,
165
+ 0.20614077150821686,
166
+ 0.16621388494968414,
167
+ 0.1674460619688034,
168
+ 0.13519535958766937,
169
+ 0.17641237378120422,
170
+ 0.14274020493030548,
171
+ 0.15076635777950287,
172
+ 0.14852766692638397,
173
+ 0.12220268696546555,
174
+ 0.13161125779151917,
175
+ 0.12244851887226105,
176
+ 0.14852531254291534,
177
+ 0.15561267733573914,
178
+ 0.10794571787118912,
179
+ 0.14725656807422638,
180
+ 0.14284299314022064,
181
+ 0.10321654379367828,
182
+ 0.10347577184438705,
183
+ 0.1123846247792244,
184
+ 0.11255372315645218,
185
+ 0.11303780227899551,
186
+ 0.07373049110174179,
187
+ 0.1012827679514885,
188
+ 0.08089525252580643,
189
+ 0.10804768651723862,
190
+ 0.08694573491811752,
191
+ 0.11614620685577393,
192
+ 0.090724878013134,
193
+ 0.11524895578622818
194
  ],
195
  "lr": [
196
+ 1.9600000000000002e-05,
197
+ 3.96e-05,
198
+ 3.9974363901222355e-05,
199
+ 3.9895421087752256e-05,
200
+ 3.976337160140893e-05,
201
+ 3.957856792072718e-05,
202
+ 3.9341503340549716e-05,
203
+ 3.9052810655279495e-05,
204
+ 3.8713260469767256e-05,
205
+ 3.832375914234272e-05,
206
+ 3.788534636548025e-05,
207
+ 3.739919239055685e-05,
208
+ 3.6866594904110466e-05,
209
+ 3.62889755639367e-05,
210
+ 3.566787620427007e-05,
211
+ 3.5004954720179526e-05,
212
+ 3.4301980642163606e-05,
213
+ 3.3560830412758255e-05,
214
+ 3.2783482377765215e-05,
215
+ 3.197201150547096e-05,
216
+ 3.112858384795204e-05,
217
+ 3.0255450759251313e-05,
218
+ 2.9354942885858323e-05,
219
+ 2.842946394553513e-05,
220
+ 2.7481484311093542e-05,
221
+ 2.6513534416250682e-05,
222
+ 2.5528198001164462e-05,
223
+ 2.4528105215678678e-05,
224
+ 2.3515925598687097e-05,
225
+ 2.249436095235672e-05,
226
+ 2.146613813023101e-05,
227
+ 2.043400175846362e-05,
228
+ 1.9400706909611906e-05,
229
+ 1.8369011748545936e-05,
230
+ 1.734167017010322e-05,
231
+ 1.6321424448141397e-05,
232
+ 1.5310997915610664e-05,
233
+ 1.4313087695185074e-05,
234
+ 1.3330357499856637e-05,
235
+ 1.2365430522709616e-05,
236
+ 1.1420882434854255e-05,
237
+ 1.049923451021051e-05,
238
+ 9.602946895493652e-06,
239
+ 8.734412043366293e-06,
240
+ 7.89594832628547e-06,
241
+ 7.089793848091389e-06,
242
+ 6.318100469856501e-06,
243
+ 5.582928065941624e-06,
244
+ 4.886239025591397e-06,
245
+ 4.229893014745887e-06,
246
+ 3.6156420120506306e-06,
247
+ 3.045125632315402e-06,
248
+ 2.5198667499047936e-06,
249
+ 2.0412674337430526e-06,
250
+ 1.610605204783806e-06,
251
+ 1.2290296259345835e-06,
252
+ 8.975592335386451e-07,
253
+ 6.170788186048593e-07,
254
+ 3.883370650428364e-07,
255
+ 2.11944551207528e-07,
256
+ 8.837212008778961e-08,
257
+ 1.79496224893283e-08
258
  ],
259
  "eval_step": [
260
  156,
 
301
  20
302
  ],
303
  "eval_accuracy": [
304
+ 0.006666666666666667,
305
+ 0.0022222222222222222,
306
+ 0.01,
307
+ 0.014444444444444444,
308
+ 0.051111111111111114,
309
+ 0.13444444444444445,
310
+ 0.26555555555555554,
311
+ 0.39,
312
+ 0.49666666666666665,
313
+ 0.56,
314
+ 0.6044444444444445,
315
+ 0.6322222222222222,
316
+ 0.6344444444444445,
317
+ 0.6344444444444445,
318
+ 0.6866666666666666,
319
+ 0.6888888888888889,
320
+ 0.7033333333333334,
321
+ 0.7033333333333334,
322
+ 0.7144444444444444,
323
+ 0.71
324
  ]
325
  },
326
+ "final_accuracy": 0.6270833333333333,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
330
  "K": null,
331
  "mode": "sft",
332
  "n_digits": 6,
333
+ "n_per_split": 100
334
  },
335
  "splits": {
336
  "add_S0": {
337
+ "full_accuracy": 0.87,
338
+ "n_examples": 100,
339
  "per_subtask": {
340
  "SA": {
341
+ "accuracy": 0.9785123966942149,
342
+ "count": 605
343
  },
344
  "SS": {
345
  "accuracy": 1.0,
346
+ "count": 95
347
  }
348
  }
349
  },
350
  "add_S1": {
351
+ "full_accuracy": 0.98,
352
+ "n_examples": 100,
353
  "per_subtask": {
354
  "SA": {
355
+ "accuracy": 0.9950980392156863,
356
+ "count": 204
357
  },
358
  "SC": {
359
  "accuracy": 1.0,
360
+ "count": 169
361
  },
362
  "SS": {
363
  "accuracy": 1.0,
364
+ "count": 31
365
  },
366
  "UC": {
367
+ "accuracy": 0.9966216216216216,
368
+ "count": 296
369
  }
370
  }
371
  },
372
  "add_S2": {
373
+ "full_accuracy": 0.7,
374
+ "n_examples": 100,
375
  "per_subtask": {
376
  "SA": {
377
+ "accuracy": 0.9877300613496932,
378
+ "count": 163
379
  },
380
  "SC": {
381
+ "accuracy": 0.9692307692307692,
382
+ "count": 130
383
  },
384
  "SS": {
385
+ "accuracy": 0.9655172413793104,
386
+ "count": 87
387
  },
388
  "UC": {
389
+ "accuracy": 0.8669950738916257,
390
+ "count": 203
391
  },
392
  "US": {
393
  "accuracy": 1.0,
394
+ "count": 117
395
  }
396
  }
397
  },
398
  "add_S3": {
399
+ "full_accuracy": 0.52,
400
+ "n_examples": 100,
401
  "per_subtask": {
402
  "SA": {
403
  "accuracy": 1.0,
404
+ "count": 121
405
  },
406
  "SC": {
407
+ "accuracy": 0.9917355371900827,
408
+ "count": 121
409
  },
410
  "SS": {
411
  "accuracy": 1.0,
412
+ "count": 49
413
  },
414
  "UC": {
415
+ "accuracy": 0.7795698924731183,
416
+ "count": 186
417
  },
418
  "US": {
419
+ "accuracy": 0.8923766816143498,
420
+ "count": 223
421
  }
422
  }
423
  },
424
  "add_S4": {
425
+ "full_accuracy": 0.58,
426
+ "n_examples": 100,
427
  "per_subtask": {
428
  "SA": {
429
  "accuracy": 1.0,
430
+ "count": 104
431
  },
432
  "SC": {
433
  "accuracy": 1.0,
434
+ "count": 106
435
  },
436
  "SS": {
437
  "accuracy": 1.0,
438
+ "count": 23
439
  },
440
  "UC": {
441
+ "accuracy": 0.8125,
442
+ "count": 160
443
  },
444
  "US": {
445
+ "accuracy": 0.8273615635179153,
446
+ "count": 307
447
  }
448
  }
449
  },
450
  "add_S5": {
451
+ "full_accuracy": 0.29,
452
+ "n_examples": 100,
453
  "per_subtask": {
454
  "SA": {
455
  "accuracy": 1.0,
456
+ "count": 100
457
  },
458
  "SC": {
459
  "accuracy": 1.0,
460
+ "count": 100
461
  },
462
  "UC": {
463
+ "accuracy": 0.41,
464
+ "count": 100
465
  },
466
  "US": {
467
+ "accuracy": 0.58,
468
+ "count": 400
469
  }
470
  }
471
  },
472
  "add_S6": {
473
+ "full_accuracy": 0.56,
474
+ "n_examples": 100,
475
  "per_subtask": {
476
  "SC": {
477
  "accuracy": 1.0,
478
+ "count": 100
479
  },
480
  "UC": {
481
+ "accuracy": 0.64,
482
+ "count": 100
483
  },
484
  "US": {
485
+ "accuracy": 0.688,
486
+ "count": 500
487
  }
488
  }
489
  },
490
  "add_random": {
491
+ "full_accuracy": 0.89,
492
  "n_examples": 200,
493
  "per_subtask": {
494
  "SA": {
495
+ "accuracy": 0.9910514541387024,
496
+ "count": 447
497
  },
498
  "SC": {
499
+ "accuracy": 0.99375,
500
+ "count": 320
501
  },
502
  "SS": {
503
  "accuracy": 1.0,
504
+ "count": 56
505
  },
506
  "UC": {
507
+ "accuracy": 0.9716446124763705,
508
+ "count": 529
509
  },
510
  "US": {
511
+ "accuracy": 0.9791666666666666,
512
+ "count": 48
513
  }
514
  }
515
  },
516
  "add_C3": {
517
+ "full_accuracy": 0.76,
518
+ "n_examples": 100,
519
  "per_subtask": {
520
  "SA": {
521
  "accuracy": 1.0,
522
+ "count": 300
523
  },
524
  "SC": {
525
  "accuracy": 1.0,
526
+ "count": 100
527
  },
528
  "UC": {
529
+ "accuracy": 0.8808290155440415,
530
+ "count": 193
531
  },
532
  "US": {
533
+ "accuracy": 0.9532710280373832,
534
+ "count": 107
535
  }
536
  }
537
  },
538
  "add_C4": {
539
+ "full_accuracy": 0.58,
540
+ "n_examples": 100,
541
  "per_subtask": {
542
  "SA": {
543
  "accuracy": 1.0,
544
+ "count": 200
545
  },
546
  "SC": {
547
  "accuracy": 1.0,
548
+ "count": 100
549
  },
550
  "UC": {
551
+ "accuracy": 0.84375,
552
+ "count": 256
553
  },
554
  "US": {
555
+ "accuracy": 0.9236111111111112,
556
+ "count": 144
557
  }
558
  }
559
  },
560
  "add_C5": {
561
+ "full_accuracy": 0.6,
562
+ "n_examples": 100,
563
  "per_subtask": {
564
  "SA": {
565
  "accuracy": 1.0,
566
+ "count": 100
567
  },
568
  "SC": {
569
  "accuracy": 1.0,
570
+ "count": 100
571
  },
572
  "UC": {
573
+ "accuracy": 0.8823529411764706,
574
+ "count": 306
575
  },
576
  "US": {
577
+ "accuracy": 0.865979381443299,
578
+ "count": 194
579
  }
580
  }
581
  },
582
  "add_C6": {
583
+ "full_accuracy": 0.52,
584
+ "n_examples": 100,
585
  "per_subtask": {
586
  "SC": {
587
  "accuracy": 1.0,
588
+ "count": 100
589
  },
590
  "UC": {
591
+ "accuracy": 0.8797814207650273,
592
+ "count": 366
593
  },
594
  "US": {
595
+ "accuracy": 0.9102564102564102,
596
+ "count": 234
597
  }
598
  }
599
  },
600
  "sub_M0": {
601
+ "full_accuracy": 0.96,
602
+ "n_examples": 100,
603
  "per_subtask": {
604
  "MD": {
605
+ "accuracy": 0.9933444259567388,
606
+ "count": 601
607
  },
608
  "ME": {
609
+ "accuracy": 0.98989898989899,
610
+ "count": 99
611
  }
612
  }
613
  },
614
  "sub_M1": {
615
+ "full_accuracy": 0.93,
616
+ "n_examples": 100,
617
  "per_subtask": {
618
  "MD": {
619
+ "accuracy": 0.989247311827957,
620
+ "count": 279
621
  },
622
  "MB": {
623
+ "accuracy": 0.9793103448275862,
624
+ "count": 145
625
  },
626
  "ME": {
627
  "accuracy": 1.0,
628
+ "count": 24
629
  },
630
  "UB": {
631
+ "accuracy": 0.996031746031746,
632
+ "count": 252
633
  }
634
  }
635
  },
636
  "sub_M2": {
637
+ "full_accuracy": 0.72,
638
+ "n_examples": 100,
639
  "per_subtask": {
640
  "MD": {
641
+ "accuracy": 0.9859154929577465,
642
+ "count": 213
643
  },
644
  "MB": {
645
  "accuracy": 1.0,
646
+ "count": 113
647
  },
648
  "ME": {
649
  "accuracy": 1.0,
650
+ "count": 85
651
  },
652
  "UB": {
653
+ "accuracy": 0.861878453038674,
654
+ "count": 181
655
  },
656
  "UD": {
657
  "accuracy": 1.0,
658
+ "count": 108
659
  }
660
  }
661
  },
662
  "sub_M3": {
663
+ "full_accuracy": 0.26,
664
+ "n_examples": 100,
665
  "per_subtask": {
666
  "MD": {
667
  "accuracy": 1.0,
668
+ "count": 179
669
  },
670
  "MB": {
671
+ "accuracy": 0.9805825242718447,
672
+ "count": 103
673
  },
674
  "ME": {
675
  "accuracy": 1.0,
676
+ "count": 56
677
  },
678
  "UB": {
679
+ "accuracy": 0.46308724832214765,
680
+ "count": 149
681
  },
682
  "UD": {
683
+ "accuracy": 0.9859154929577465,
684
+ "count": 213
685
  }
686
  }
687
  },
688
  "sub_M4": {
689
+ "full_accuracy": 0.0,
690
+ "n_examples": 100,
691
  "per_subtask": {
692
  "MD": {
693
  "accuracy": 1.0,
694
+ "count": 200
695
  },
696
  "MB": {
697
  "accuracy": 1.0,
698
+ "count": 100
699
  },
700
  "UB": {
701
+ "accuracy": 0.26,
702
+ "count": 100
703
  },
704
  "UD": {
705
+ "accuracy": 0.55,
706
+ "count": 300
707
  }
708
  }
709
  },
710
  "sub_M5": {
711
+ "full_accuracy": 0.0,
712
+ "n_examples": 100,
713
  "per_subtask": {
714
  "MD": {
715
  "accuracy": 1.0,
716
+ "count": 100
717
  },
718
  "MB": {
719
  "accuracy": 1.0,
720
+ "count": 100
721
  },
722
  "UB": {
723
+ "accuracy": 0.25,
724
+ "count": 100
725
  },
726
  "UD": {
727
+ "accuracy": 0.3975,
728
+ "count": 400
729
  }
730
  }
731
  },
732
  "sub_random": {
733
+ "full_accuracy": 0.905,
734
  "n_examples": 200,
735
  "per_subtask": {
736
  "MD": {
737
+ "accuracy": 0.995,
738
+ "count": 600
739
  },
740
  "MB": {
741
+ "accuracy": 0.9775280898876404,
742
+ "count": 267
743
  },
744
  "ME": {
745
  "accuracy": 1.0,
746
+ "count": 53
747
  },
748
  "UB": {
749
+ "accuracy": 0.9749430523917996,
750
+ "count": 439
751
  },
752
  "UD": {
753
  "accuracy": 1.0,
754
+ "count": 41
755
  }
756
  }
757
  },
758
  "sub_B3": {
759
+ "full_accuracy": 0.67,
760
+ "n_examples": 100,
761
  "per_subtask": {
762
  "MD": {
763
+ "accuracy": 0.98,
764
+ "count": 300
765
  },
766
  "MB": {
767
  "accuracy": 1.0,
768
+ "count": 100
769
  },
770
  "UB": {
771
+ "accuracy": 0.8527918781725888,
772
+ "count": 197
773
  },
774
  "UD": {
775
  "accuracy": 1.0,
776
+ "count": 103
777
  }
778
  }
779
  },
780
  "sub_B4": {
781
+ "full_accuracy": 0.58,
782
+ "n_examples": 100,
783
  "per_subtask": {
784
  "MD": {
785
+ "accuracy": 0.985,
786
+ "count": 200
787
  },
788
  "MB": {
789
  "accuracy": 1.0,
790
+ "count": 100
791
  },
792
  "UB": {
793
+ "accuracy": 0.854251012145749,
794
+ "count": 247
795
  },
796
  "UD": {
797
+ "accuracy": 0.8758169934640523,
798
+ "count": 153
799
  }
800
  }
801
  },
802
  "sub_B5": {
803
+ "full_accuracy": 0.42,
804
+ "n_examples": 100,
805
  "per_subtask": {
806
  "MD": {
807
  "accuracy": 1.0,
808
+ "count": 100
809
  },
810
  "MB": {
811
  "accuracy": 1.0,
812
+ "count": 100
813
  },
814
  "UB": {
815
+ "accuracy": 0.8154362416107382,
816
+ "count": 298
817
  },
818
  "UD": {
819
+ "accuracy": 0.8564356435643564,
820
+ "count": 202
821
  }
822
  }
823
  }
824
  },
825
  "summary": {
826
+ "overall_accuracy": 0.6270833333333333,
827
+ "total_examples": 2400,
828
  "n_splits": 22
829
  }
830
  }
add_sub_baseline_10K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc2ceca0901e7a6c1a4fdcedf14bf615dda905c45631f688fe16182a3dabbdcd
3
  size 650266922
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83183edc6b1fa903ee491f2ac4fae76e8abb9d76035b9ce79aec6265c267c8ec
3
  size 650266922
add_sub_baseline_10K/train_config.json CHANGED
@@ -1,35 +1,84 @@
1
  {
2
- "mode": "baseline",
3
- "ops": "add_sub",
4
- "n_digits": 6,
5
- "n_layer": 2,
6
- "n_head": 3,
7
- "n_embd": 510,
8
- "abs_vocab": 0,
9
  "K": 4,
 
 
 
 
 
 
10
  "alpha_info_gain": 10.0,
11
  "alpha_abs": 0.1,
12
  "alpha_soft_zipf": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  "batch_size": 64,
 
14
  "num_epochs": 20,
15
- "dataset_size": 10000,
16
- "lr": 8e-05,
 
 
 
17
  "output_dir": "ckpt/sweep/add_sub_baseline_10K",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  "device": "cuda",
19
  "push_to_hub": true,
20
  "no_wandb": false,
21
  "n_params": 162490082,
22
  "run_name": "add_sub_baseline_10K",
23
- "git_commit": "800625019270114adcda289bbd550c4f1109a514",
24
- "timestamp": "2026-04-12T01:58:10.796905+00:00",
25
  "tokenizer": "Qwen/Qwen3-0.6B",
26
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
27
  "dataset_config": "add_sub_6digit",
28
  "model_repo": "thoughtworks/arithmetic-sorl",
29
  "trainer_version": "sft",
30
- "wandb_run_id": "04vy23fy",
31
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/04vy23fy",
32
- "final_accuracy": 0.8628571428571429,
33
- "sft_accuracy": 0.8628571428571429,
34
  "eval_method": "ArithmeticEvaluator"
35
  }
 
1
  {
2
+ "num_rollouts": 4,
 
 
 
 
 
 
3
  "K": 4,
4
+ "max_iterations": 2,
5
+ "memory_span_abs": 1792,
6
+ "memory_span_traj": 1792,
7
+ "temperature": 1.0,
8
+ "ar_search": false,
9
+ "response_only_abs": false,
10
  "alpha_info_gain": 10.0,
11
  "alpha_abs": 0.1,
12
  "alpha_soft_zipf": 1.0,
13
+ "alpha_ortho": 0.0,
14
+ "alpha_anchor": 0.0,
15
+ "alpha_jacobi": 0.0,
16
+ "decay": 0.8,
17
+ "target_vocab_util": 0.8,
18
+ "min_abs_ppl": 0.0,
19
+ "zipf_alpha": 1.0,
20
+ "lr": 4e-05,
21
+ "emb_lr_mult": 1.0,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 100,
24
+ "cooldown_frac": 0.4,
25
+ "max_grad_norm": 1.0,
26
+ "vq_abs_pretrain_steps": 0,
27
+ "vq_abs_pretrain_lr": 0.001,
28
+ "vq_abs_pretrain_layer": -1,
29
+ "vq_abs_pretrain_batch_size": 256,
30
+ "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
+ "gradient_accumulation_steps": 1,
33
  "num_epochs": 20,
34
+ "emb_warmup_steps": 0,
35
+ "log_every": 50,
36
+ "eval_every": 156,
37
+ "save_every": 999999,
38
+ "eval_samples": 100,
39
  "output_dir": "ckpt/sweep/add_sub_baseline_10K",
40
+ "eval_K": 4,
41
+ "alpha_traj": 0.0,
42
+ "corrupt_method": "shuffle",
43
+ "corrupt_ratio": 0.3,
44
+ "alpha_contrastive": 1.0,
45
+ "gamma_contrastive": 0.5,
46
+ "alpha_masked_traj": 0.0,
47
+ "mask_nl_ratio": 0.3,
48
+ "mask_nl_mode": "fixed",
49
+ "mask_nl_fixed_id": 0,
50
+ "use_ste": true,
51
+ "n_inner": 1,
52
+ "random_K": null,
53
+ "strip_suffix": null,
54
+ "compress_prefix": null,
55
+ "random_mem_span": null,
56
+ "warmup_ratio": 0.03,
57
+ "beta2": 0.999,
58
+ "seed": 42,
59
+ "n_digits": 6,
60
+ "n_layer": 2,
61
+ "n_head": 3,
62
+ "n_embd": 510,
63
+ "ops": "add_sub",
64
+ "abs_vocab": 0,
65
+ "dataset_size": 10000,
66
+ "mode": "baseline",
67
  "device": "cuda",
68
  "push_to_hub": true,
69
  "no_wandb": false,
70
  "n_params": 162490082,
71
  "run_name": "add_sub_baseline_10K",
72
+ "git_commit": "78d46f8665a87f4b44bd5894bd34f393f2dea51f",
73
+ "timestamp": "2026-04-12T08:59:11.045620+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
+ "wandb_run_id": "7dqmkyo7",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/7dqmkyo7",
81
+ "final_accuracy": 0.6270833333333333,
82
+ "sft_accuracy": 0.6270833333333333,
83
  "eval_method": "ArithmeticEvaluator"
84
  }