amirali1985 commited on
Commit
693f9d2
·
verified ·
1 Parent(s): e5cab5c

Upload add_sub_baseline_10K

Browse files
add_sub_baseline_10K/metrics.json CHANGED
@@ -65,196 +65,196 @@
65
  3100
66
  ],
67
  "loss": [
68
- 7.930843830108643,
69
- 5.878052711486816,
70
- 4.171169757843018,
71
- 2.4461560249328613,
72
- 1.9941270351409912,
73
- 1.885154366493225,
74
- 1.7704766988754272,
75
- 1.7200186252593994,
76
- 1.6930021047592163,
77
- 1.659714698791504,
78
- 1.6491907835006714,
79
- 1.440868616104126,
80
- 1.2277355194091797,
81
- 0.9296635389328003,
82
- 0.727922797203064,
83
- 0.6531419157981873,
84
- 0.6338068842887878,
85
- 0.5315085053443909,
86
- 0.471783846616745,
87
- 0.44315391778945923,
88
- 0.37063688039779663,
89
- 0.31710734963417053,
90
- 0.32954543828964233,
91
- 0.2792213559150696,
92
- 0.28969308733940125,
93
- 0.27333250641822815,
94
- 0.2600690424442291,
95
- 0.22665680944919586,
96
- 0.22485388815402985,
97
- 0.18426764011383057,
98
- 0.20698437094688416,
99
- 0.20130981504917145,
100
- 0.16024114191532135,
101
- 0.20614077150821686,
102
- 0.16621388494968414,
103
- 0.1674460619688034,
104
- 0.13519535958766937,
105
- 0.17641237378120422,
106
- 0.14274020493030548,
107
- 0.15076635777950287,
108
- 0.14852766692638397,
109
- 0.12220268696546555,
110
- 0.13161125779151917,
111
- 0.12244851887226105,
112
- 0.14852531254291534,
113
- 0.15561267733573914,
114
- 0.10794571787118912,
115
- 0.14725656807422638,
116
- 0.14284299314022064,
117
- 0.10321654379367828,
118
- 0.10347577184438705,
119
- 0.1123846247792244,
120
- 0.11255372315645218,
121
- 0.11303780227899551,
122
- 0.07373049110174179,
123
- 0.1012827679514885,
124
- 0.08089525252580643,
125
- 0.10804768651723862,
126
- 0.08694573491811752,
127
- 0.11614620685577393,
128
- 0.090724878013134,
129
- 0.11524895578622818
130
  ],
131
  "base_loss": [
132
- 7.930843830108643,
133
- 5.878052711486816,
134
- 4.171169757843018,
135
- 2.4461560249328613,
136
- 1.9941270351409912,
137
- 1.885154366493225,
138
- 1.7704766988754272,
139
- 1.7200186252593994,
140
- 1.6930021047592163,
141
- 1.659714698791504,
142
- 1.6491907835006714,
143
- 1.440868616104126,
144
- 1.2277355194091797,
145
- 0.9296635389328003,
146
- 0.727922797203064,
147
- 0.6531419157981873,
148
- 0.6338068842887878,
149
- 0.5315085053443909,
150
- 0.471783846616745,
151
- 0.44315391778945923,
152
- 0.37063688039779663,
153
- 0.31710734963417053,
154
- 0.32954543828964233,
155
- 0.2792213559150696,
156
- 0.28969308733940125,
157
- 0.27333250641822815,
158
- 0.2600690424442291,
159
- 0.22665680944919586,
160
- 0.22485388815402985,
161
- 0.18426764011383057,
162
- 0.20698437094688416,
163
- 0.20130981504917145,
164
- 0.16024114191532135,
165
- 0.20614077150821686,
166
- 0.16621388494968414,
167
- 0.1674460619688034,
168
- 0.13519535958766937,
169
- 0.17641237378120422,
170
- 0.14274020493030548,
171
- 0.15076635777950287,
172
- 0.14852766692638397,
173
- 0.12220268696546555,
174
- 0.13161125779151917,
175
- 0.12244851887226105,
176
- 0.14852531254291534,
177
- 0.15561267733573914,
178
- 0.10794571787118912,
179
- 0.14725656807422638,
180
- 0.14284299314022064,
181
- 0.10321654379367828,
182
- 0.10347577184438705,
183
- 0.1123846247792244,
184
- 0.11255372315645218,
185
- 0.11303780227899551,
186
- 0.07373049110174179,
187
- 0.1012827679514885,
188
- 0.08089525252580643,
189
- 0.10804768651723862,
190
- 0.08694573491811752,
191
- 0.11614620685577393,
192
- 0.090724878013134,
193
- 0.11524895578622818
194
  ],
195
  "lr": [
196
- 1.9600000000000002e-05,
197
- 3.96e-05,
198
- 3.9974363901222355e-05,
199
- 3.9895421087752256e-05,
200
- 3.976337160140893e-05,
201
- 3.957856792072718e-05,
202
- 3.9341503340549716e-05,
203
- 3.9052810655279495e-05,
204
- 3.8713260469767256e-05,
205
- 3.832375914234272e-05,
206
- 3.788534636548025e-05,
207
- 3.739919239055685e-05,
208
- 3.6866594904110466e-05,
209
- 3.62889755639367e-05,
210
- 3.566787620427007e-05,
211
- 3.5004954720179526e-05,
212
- 3.4301980642163606e-05,
213
- 3.3560830412758255e-05,
214
- 3.2783482377765215e-05,
215
- 3.197201150547096e-05,
216
- 3.112858384795204e-05,
217
- 3.0255450759251313e-05,
218
- 2.9354942885858323e-05,
219
- 2.842946394553513e-05,
220
- 2.7481484311093542e-05,
221
- 2.6513534416250682e-05,
222
- 2.5528198001164462e-05,
223
- 2.4528105215678678e-05,
224
- 2.3515925598687097e-05,
225
- 2.249436095235672e-05,
226
- 2.146613813023101e-05,
227
- 2.043400175846362e-05,
228
- 1.9400706909611906e-05,
229
- 1.8369011748545936e-05,
230
- 1.734167017010322e-05,
231
- 1.6321424448141397e-05,
232
- 1.5310997915610664e-05,
233
- 1.4313087695185074e-05,
234
- 1.3330357499856637e-05,
235
- 1.2365430522709616e-05,
236
- 1.1420882434854255e-05,
237
- 1.049923451021051e-05,
238
- 9.602946895493652e-06,
239
- 8.734412043366293e-06,
240
- 7.89594832628547e-06,
241
- 7.089793848091389e-06,
242
- 6.318100469856501e-06,
243
- 5.582928065941624e-06,
244
- 4.886239025591397e-06,
245
- 4.229893014745887e-06,
246
- 3.6156420120506306e-06,
247
- 3.045125632315402e-06,
248
- 2.5198667499047936e-06,
249
- 2.0412674337430526e-06,
250
- 1.610605204783806e-06,
251
- 1.2290296259345835e-06,
252
- 8.975592335386451e-07,
253
- 6.170788186048593e-07,
254
- 3.883370650428364e-07,
255
- 2.11944551207528e-07,
256
- 8.837212008778961e-08,
257
- 1.79496224893283e-08
258
  ],
259
  "eval_step": [
260
  156,
@@ -301,29 +301,29 @@
301
  20
302
  ],
303
  "eval_accuracy": [
304
- 0.006666666666666667,
305
- 0.0022222222222222222,
306
- 0.01,
307
- 0.014444444444444444,
308
- 0.051111111111111114,
309
- 0.13444444444444445,
310
- 0.26555555555555554,
311
- 0.39,
312
- 0.49666666666666665,
313
- 0.56,
314
- 0.6044444444444445,
315
- 0.6322222222222222,
316
- 0.6344444444444445,
317
- 0.6344444444444445,
318
- 0.6866666666666666,
319
- 0.6888888888888889,
320
- 0.7033333333333334,
321
- 0.7033333333333334,
322
- 0.7144444444444444,
323
- 0.71
324
  ]
325
  },
326
- "final_accuracy": 0.6270833333333333,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
@@ -334,11 +334,11 @@
334
  },
335
  "splits": {
336
  "add_S0": {
337
- "full_accuracy": 0.87,
338
  "n_examples": 100,
339
  "per_subtask": {
340
  "SA": {
341
- "accuracy": 0.9785123966942149,
342
  "count": 605
343
  },
344
  "SS": {
@@ -348,7 +348,7 @@
348
  }
349
  },
350
  "add_S1": {
351
- "full_accuracy": 0.98,
352
  "n_examples": 100,
353
  "per_subtask": {
354
  "SA": {
@@ -364,13 +364,13 @@
364
  "count": 31
365
  },
366
  "UC": {
367
- "accuracy": 0.9966216216216216,
368
  "count": 296
369
  }
370
  }
371
  },
372
  "add_S2": {
373
- "full_accuracy": 0.7,
374
  "n_examples": 100,
375
  "per_subtask": {
376
  "SA": {
@@ -378,15 +378,15 @@
378
  "count": 163
379
  },
380
  "SC": {
381
- "accuracy": 0.9692307692307692,
382
  "count": 130
383
  },
384
  "SS": {
385
- "accuracy": 0.9655172413793104,
386
  "count": 87
387
  },
388
  "UC": {
389
- "accuracy": 0.8669950738916257,
390
  "count": 203
391
  },
392
  "US": {
@@ -396,7 +396,7 @@
396
  }
397
  },
398
  "add_S3": {
399
- "full_accuracy": 0.52,
400
  "n_examples": 100,
401
  "per_subtask": {
402
  "SA": {
@@ -412,17 +412,17 @@
412
  "count": 49
413
  },
414
  "UC": {
415
- "accuracy": 0.7795698924731183,
416
  "count": 186
417
  },
418
  "US": {
419
- "accuracy": 0.8923766816143498,
420
  "count": 223
421
  }
422
  }
423
  },
424
  "add_S4": {
425
- "full_accuracy": 0.58,
426
  "n_examples": 100,
427
  "per_subtask": {
428
  "SA": {
@@ -438,17 +438,17 @@
438
  "count": 23
439
  },
440
  "UC": {
441
- "accuracy": 0.8125,
442
  "count": 160
443
  },
444
  "US": {
445
- "accuracy": 0.8273615635179153,
446
  "count": 307
447
  }
448
  }
449
  },
450
  "add_S5": {
451
- "full_accuracy": 0.29,
452
  "n_examples": 100,
453
  "per_subtask": {
454
  "SA": {
@@ -460,17 +460,17 @@
460
  "count": 100
461
  },
462
  "UC": {
463
- "accuracy": 0.41,
464
  "count": 100
465
  },
466
  "US": {
467
- "accuracy": 0.58,
468
  "count": 400
469
  }
470
  }
471
  },
472
  "add_S6": {
473
- "full_accuracy": 0.56,
474
  "n_examples": 100,
475
  "per_subtask": {
476
  "SC": {
@@ -478,25 +478,25 @@
478
  "count": 100
479
  },
480
  "UC": {
481
- "accuracy": 0.64,
482
  "count": 100
483
  },
484
  "US": {
485
- "accuracy": 0.688,
486
  "count": 500
487
  }
488
  }
489
  },
490
  "add_random": {
491
- "full_accuracy": 0.89,
492
  "n_examples": 200,
493
  "per_subtask": {
494
  "SA": {
495
- "accuracy": 0.9910514541387024,
496
  "count": 447
497
  },
498
  "SC": {
499
- "accuracy": 0.99375,
500
  "count": 320
501
  },
502
  "SS": {
@@ -504,17 +504,17 @@
504
  "count": 56
505
  },
506
  "UC": {
507
- "accuracy": 0.9716446124763705,
508
  "count": 529
509
  },
510
  "US": {
511
- "accuracy": 0.9791666666666666,
512
  "count": 48
513
  }
514
  }
515
  },
516
  "add_C3": {
517
- "full_accuracy": 0.76,
518
  "n_examples": 100,
519
  "per_subtask": {
520
  "SA": {
@@ -526,21 +526,21 @@
526
  "count": 100
527
  },
528
  "UC": {
529
- "accuracy": 0.8808290155440415,
530
  "count": 193
531
  },
532
  "US": {
533
- "accuracy": 0.9532710280373832,
534
  "count": 107
535
  }
536
  }
537
  },
538
  "add_C4": {
539
- "full_accuracy": 0.58,
540
  "n_examples": 100,
541
  "per_subtask": {
542
  "SA": {
543
- "accuracy": 1.0,
544
  "count": 200
545
  },
546
  "SC": {
@@ -548,17 +548,17 @@
548
  "count": 100
549
  },
550
  "UC": {
551
- "accuracy": 0.84375,
552
  "count": 256
553
  },
554
  "US": {
555
- "accuracy": 0.9236111111111112,
556
  "count": 144
557
  }
558
  }
559
  },
560
  "add_C5": {
561
- "full_accuracy": 0.6,
562
  "n_examples": 100,
563
  "per_subtask": {
564
  "SA": {
@@ -570,17 +570,17 @@
570
  "count": 100
571
  },
572
  "UC": {
573
- "accuracy": 0.8823529411764706,
574
  "count": 306
575
  },
576
  "US": {
577
- "accuracy": 0.865979381443299,
578
  "count": 194
579
  }
580
  }
581
  },
582
  "add_C6": {
583
- "full_accuracy": 0.52,
584
  "n_examples": 100,
585
  "per_subtask": {
586
  "SC": {
@@ -588,11 +588,11 @@
588
  "count": 100
589
  },
590
  "UC": {
591
- "accuracy": 0.8797814207650273,
592
  "count": 366
593
  },
594
  "US": {
595
- "accuracy": 0.9102564102564102,
596
  "count": 234
597
  }
598
  }
@@ -606,21 +606,21 @@
606
  "count": 601
607
  },
608
  "ME": {
609
- "accuracy": 0.98989898989899,
610
  "count": 99
611
  }
612
  }
613
  },
614
  "sub_M1": {
615
- "full_accuracy": 0.93,
616
  "n_examples": 100,
617
  "per_subtask": {
618
  "MD": {
619
- "accuracy": 0.989247311827957,
620
  "count": 279
621
  },
622
  "MB": {
623
- "accuracy": 0.9793103448275862,
624
  "count": 145
625
  },
626
  "ME": {
@@ -628,21 +628,21 @@
628
  "count": 24
629
  },
630
  "UB": {
631
- "accuracy": 0.996031746031746,
632
  "count": 252
633
  }
634
  }
635
  },
636
  "sub_M2": {
637
- "full_accuracy": 0.72,
638
  "n_examples": 100,
639
  "per_subtask": {
640
  "MD": {
641
- "accuracy": 0.9859154929577465,
642
  "count": 213
643
  },
644
  "MB": {
645
- "accuracy": 1.0,
646
  "count": 113
647
  },
648
  "ME": {
@@ -650,7 +650,7 @@
650
  "count": 85
651
  },
652
  "UB": {
653
- "accuracy": 0.861878453038674,
654
  "count": 181
655
  },
656
  "UD": {
@@ -660,7 +660,7 @@
660
  }
661
  },
662
  "sub_M3": {
663
- "full_accuracy": 0.26,
664
  "n_examples": 100,
665
  "per_subtask": {
666
  "MD": {
@@ -668,7 +668,7 @@
668
  "count": 179
669
  },
670
  "MB": {
671
- "accuracy": 0.9805825242718447,
672
  "count": 103
673
  },
674
  "ME": {
@@ -676,17 +676,17 @@
676
  "count": 56
677
  },
678
  "UB": {
679
- "accuracy": 0.46308724832214765,
680
  "count": 149
681
  },
682
  "UD": {
683
- "accuracy": 0.9859154929577465,
684
  "count": 213
685
  }
686
  }
687
  },
688
  "sub_M4": {
689
- "full_accuracy": 0.0,
690
  "n_examples": 100,
691
  "per_subtask": {
692
  "MD": {
@@ -698,17 +698,17 @@
698
  "count": 100
699
  },
700
  "UB": {
701
- "accuracy": 0.26,
702
  "count": 100
703
  },
704
  "UD": {
705
- "accuracy": 0.55,
706
  "count": 300
707
  }
708
  }
709
  },
710
  "sub_M5": {
711
- "full_accuracy": 0.0,
712
  "n_examples": 100,
713
  "per_subtask": {
714
  "MD": {
@@ -720,25 +720,25 @@
720
  "count": 100
721
  },
722
  "UB": {
723
- "accuracy": 0.25,
724
  "count": 100
725
  },
726
  "UD": {
727
- "accuracy": 0.3975,
728
  "count": 400
729
  }
730
  }
731
  },
732
  "sub_random": {
733
- "full_accuracy": 0.905,
734
  "n_examples": 200,
735
  "per_subtask": {
736
  "MD": {
737
- "accuracy": 0.995,
738
  "count": 600
739
  },
740
  "MB": {
741
- "accuracy": 0.9775280898876404,
742
  "count": 267
743
  },
744
  "ME": {
@@ -746,7 +746,7 @@
746
  "count": 53
747
  },
748
  "UB": {
749
- "accuracy": 0.9749430523917996,
750
  "count": 439
751
  },
752
  "UD": {
@@ -756,11 +756,11 @@
756
  }
757
  },
758
  "sub_B3": {
759
- "full_accuracy": 0.67,
760
  "n_examples": 100,
761
  "per_subtask": {
762
  "MD": {
763
- "accuracy": 0.98,
764
  "count": 300
765
  },
766
  "MB": {
@@ -768,7 +768,7 @@
768
  "count": 100
769
  },
770
  "UB": {
771
- "accuracy": 0.8527918781725888,
772
  "count": 197
773
  },
774
  "UD": {
@@ -778,11 +778,11 @@
778
  }
779
  },
780
  "sub_B4": {
781
- "full_accuracy": 0.58,
782
  "n_examples": 100,
783
  "per_subtask": {
784
  "MD": {
785
- "accuracy": 0.985,
786
  "count": 200
787
  },
788
  "MB": {
@@ -790,17 +790,17 @@
790
  "count": 100
791
  },
792
  "UB": {
793
- "accuracy": 0.854251012145749,
794
  "count": 247
795
  },
796
  "UD": {
797
- "accuracy": 0.8758169934640523,
798
  "count": 153
799
  }
800
  }
801
  },
802
  "sub_B5": {
803
- "full_accuracy": 0.42,
804
  "n_examples": 100,
805
  "per_subtask": {
806
  "MD": {
@@ -812,18 +812,18 @@
812
  "count": 100
813
  },
814
  "UB": {
815
- "accuracy": 0.8154362416107382,
816
  "count": 298
817
  },
818
  "UD": {
819
- "accuracy": 0.8564356435643564,
820
  "count": 202
821
  }
822
  }
823
  }
824
  },
825
  "summary": {
826
- "overall_accuracy": 0.6270833333333333,
827
  "total_examples": 2400,
828
  "n_splits": 22
829
  }
 
65
  3100
66
  ],
67
  "loss": [
68
+ 7.068239688873291,
69
+ 4.189955234527588,
70
+ 2.0407068729400635,
71
+ 1.8267366886138916,
72
+ 1.7952507734298706,
73
+ 1.7645628452301025,
74
+ 1.595243215560913,
75
+ 1.4900548458099365,
76
+ 1.0821096897125244,
77
+ 0.7350229024887085,
78
+ 0.586064875125885,
79
+ 0.43922391533851624,
80
+ 0.40184348821640015,
81
+ 0.31044527888298035,
82
+ 0.26080775260925293,
83
+ 0.30810627341270447,
84
+ 0.2941185534000397,
85
+ 0.2382466048002243,
86
+ 0.19741371273994446,
87
+ 0.24084995687007904,
88
+ 0.16109716892242432,
89
+ 0.16342215240001678,
90
+ 0.15979401767253876,
91
+ 0.15030500292778015,
92
+ 0.1553926020860672,
93
+ 0.1616949737071991,
94
+ 0.15501153469085693,
95
+ 0.15730805695056915,
96
+ 0.1374218612909317,
97
+ 0.12141235917806625,
98
+ 0.15107305347919464,
99
+ 0.12318699806928635,
100
+ 0.08949179947376251,
101
+ 0.11493054777383804,
102
+ 0.09638842195272446,
103
+ 0.1008308082818985,
104
+ 0.11183486878871918,
105
+ 0.08597197383642197,
106
+ 0.10642959177494049,
107
+ 0.07882422208786011,
108
+ 0.09636445343494415,
109
+ 0.06600546836853027,
110
+ 0.09191834181547165,
111
+ 0.058375708758831024,
112
+ 0.08556245267391205,
113
+ 0.09554620832204819,
114
+ 0.05065993592143059,
115
+ 0.06083018705248833,
116
+ 0.06561274826526642,
117
+ 0.05150684341788292,
118
+ 0.04865796118974686,
119
+ 0.05840981379151344,
120
+ 0.06329210847616196,
121
+ 0.05198041722178459,
122
+ 0.041428614407777786,
123
+ 0.04235445708036423,
124
+ 0.0411263071000576,
125
+ 0.0459698922932148,
126
+ 0.04036566987633705,
127
+ 0.042875390499830246,
128
+ 0.05301470309495926,
129
+ 0.05982867628335953
130
  ],
131
  "base_loss": [
132
+ 7.068239688873291,
133
+ 4.189955234527588,
134
+ 2.0407068729400635,
135
+ 1.8267366886138916,
136
+ 1.7952507734298706,
137
+ 1.7645628452301025,
138
+ 1.595243215560913,
139
+ 1.4900548458099365,
140
+ 1.0821096897125244,
141
+ 0.7350229024887085,
142
+ 0.586064875125885,
143
+ 0.43922391533851624,
144
+ 0.40184348821640015,
145
+ 0.31044527888298035,
146
+ 0.26080775260925293,
147
+ 0.30810627341270447,
148
+ 0.2941185534000397,
149
+ 0.2382466048002243,
150
+ 0.19741371273994446,
151
+ 0.24084995687007904,
152
+ 0.16109716892242432,
153
+ 0.16342215240001678,
154
+ 0.15979401767253876,
155
+ 0.15030500292778015,
156
+ 0.1553926020860672,
157
+ 0.1616949737071991,
158
+ 0.15501153469085693,
159
+ 0.15730805695056915,
160
+ 0.1374218612909317,
161
+ 0.12141235917806625,
162
+ 0.15107305347919464,
163
+ 0.12318699806928635,
164
+ 0.08949179947376251,
165
+ 0.11493054777383804,
166
+ 0.09638842195272446,
167
+ 0.1008308082818985,
168
+ 0.11183486878871918,
169
+ 0.08597197383642197,
170
+ 0.10642959177494049,
171
+ 0.07882422208786011,
172
+ 0.09636445343494415,
173
+ 0.06600546836853027,
174
+ 0.09191834181547165,
175
+ 0.058375708758831024,
176
+ 0.08556245267391205,
177
+ 0.09554620832204819,
178
+ 0.05065993592143059,
179
+ 0.06083018705248833,
180
+ 0.06561274826526642,
181
+ 0.05150684341788292,
182
+ 0.04865796118974686,
183
+ 0.05840981379151344,
184
+ 0.06329210847616196,
185
+ 0.05198041722178459,
186
+ 0.041428614407777786,
187
+ 0.04235445708036423,
188
+ 0.0411263071000576,
189
+ 0.0459698922932148,
190
+ 0.04036566987633705,
191
+ 0.042875390499830246,
192
+ 0.05301470309495926,
193
+ 0.05982867628335953
194
  ],
195
  "lr": [
196
+ 3.9200000000000004e-05,
197
+ 7.92e-05,
198
+ 7.994872780244471e-05,
199
+ 7.979084217550451e-05,
200
+ 7.952674320281786e-05,
201
+ 7.915713584145437e-05,
202
+ 7.868300668109943e-05,
203
+ 7.810562131055899e-05,
204
+ 7.742652093953451e-05,
205
+ 7.664751828468545e-05,
206
+ 7.57706927309605e-05,
207
+ 7.47983847811137e-05,
208
+ 7.373318980822093e-05,
209
+ 7.25779511278734e-05,
210
+ 7.133575240854014e-05,
211
+ 7.000990944035905e-05,
212
+ 6.860396128432721e-05,
213
+ 6.712166082551651e-05,
214
+ 6.556696475553043e-05,
215
+ 6.394402301094192e-05,
216
+ 6.225716769590408e-05,
217
+ 6.0510901518502626e-05,
218
+ 5.8709885771716645e-05,
219
+ 5.685892789107026e-05,
220
+ 5.4962968622187084e-05,
221
+ 5.3027068832501364e-05,
222
+ 5.1056396002328924e-05,
223
+ 4.9056210431357356e-05,
224
+ 4.703185119737419e-05,
225
+ 4.498872190471344e-05,
226
+ 4.293227626046202e-05,
227
+ 4.086800351692724e-05,
228
+ 3.880141381922381e-05,
229
+ 3.673802349709187e-05,
230
+ 3.468334034020644e-05,
231
+ 3.2642848896282794e-05,
232
+ 3.062199583122133e-05,
233
+ 2.862617539037015e-05,
234
+ 2.6660714999713274e-05,
235
+ 2.4730861045419232e-05,
236
+ 2.284176486970851e-05,
237
+ 2.099846902042102e-05,
238
+ 1.9205893790987304e-05,
239
+ 1.7468824086732586e-05,
240
+ 1.579189665257094e-05,
241
+ 1.4179587696182778e-05,
242
+ 1.2636200939713001e-05,
243
+ 1.1165856131883247e-05,
244
+ 9.772478051182794e-06,
245
+ 8.459786029491775e-06,
246
+ 7.231284024101261e-06,
247
+ 6.090251264630804e-06,
248
+ 5.039733499809587e-06,
249
+ 4.082534867486105e-06,
250
+ 3.221210409567612e-06,
251
+ 2.458059251869167e-06,
252
+ 1.7951184670772902e-06,
253
+ 1.2341576372097185e-06,
254
+ 7.766741300856728e-07,
255
+ 4.23889102415056e-07,
256
+ 1.7674424017557922e-07,
257
+ 3.58992449786566e-08
258
  ],
259
  "eval_step": [
260
  156,
 
301
  20
302
  ],
303
  "eval_accuracy": [
304
+ 0.008888888888888889,
305
+ 0.005555555555555556,
306
+ 0.024444444444444446,
307
+ 0.21333333333333335,
308
+ 0.3522222222222222,
309
+ 0.4311111111111111,
310
+ 0.57,
311
+ 0.5322222222222223,
312
+ 0.6111111111111112,
313
+ 0.6211111111111111,
314
+ 0.6477777777777778,
315
+ 0.7,
316
+ 0.7322222222222222,
317
+ 0.7133333333333334,
318
+ 0.7388888888888889,
319
+ 0.7644444444444445,
320
+ 0.7766666666666666,
321
+ 0.7833333333333333,
322
+ 0.7866666666666666,
323
+ 0.7855555555555556
324
  ]
325
  },
326
+ "final_accuracy": 0.7241666666666666,
327
  "sft_eval": {
328
  "config": {
329
  "ops": "add_sub",
 
334
  },
335
  "splits": {
336
  "add_S0": {
337
+ "full_accuracy": 0.94,
338
  "n_examples": 100,
339
  "per_subtask": {
340
  "SA": {
341
+ "accuracy": 0.9900826446280991,
342
  "count": 605
343
  },
344
  "SS": {
 
348
  }
349
  },
350
  "add_S1": {
351
+ "full_accuracy": 0.99,
352
  "n_examples": 100,
353
  "per_subtask": {
354
  "SA": {
 
364
  "count": 31
365
  },
366
  "UC": {
367
+ "accuracy": 1.0,
368
  "count": 296
369
  }
370
  }
371
  },
372
  "add_S2": {
373
+ "full_accuracy": 0.87,
374
  "n_examples": 100,
375
  "per_subtask": {
376
  "SA": {
 
378
  "count": 163
379
  },
380
  "SC": {
381
+ "accuracy": 1.0,
382
  "count": 130
383
  },
384
  "SS": {
385
+ "accuracy": 0.9885057471264368,
386
  "count": 87
387
  },
388
  "UC": {
389
+ "accuracy": 0.9408866995073891,
390
  "count": 203
391
  },
392
  "US": {
 
396
  }
397
  },
398
  "add_S3": {
399
+ "full_accuracy": 0.69,
400
  "n_examples": 100,
401
  "per_subtask": {
402
  "SA": {
 
412
  "count": 49
413
  },
414
  "UC": {
415
+ "accuracy": 0.8387096774193549,
416
  "count": 186
417
  },
418
  "US": {
419
+ "accuracy": 1.0,
420
  "count": 223
421
  }
422
  }
423
  },
424
  "add_S4": {
425
+ "full_accuracy": 0.57,
426
  "n_examples": 100,
427
  "per_subtask": {
428
  "SA": {
 
438
  "count": 23
439
  },
440
  "UC": {
441
+ "accuracy": 0.74375,
442
  "count": 160
443
  },
444
  "US": {
445
+ "accuracy": 0.9511400651465798,
446
  "count": 307
447
  }
448
  }
449
  },
450
  "add_S5": {
451
+ "full_accuracy": 0.32,
452
  "n_examples": 100,
453
  "per_subtask": {
454
  "SA": {
 
460
  "count": 100
461
  },
462
  "UC": {
463
+ "accuracy": 0.39,
464
  "count": 100
465
  },
466
  "US": {
467
+ "accuracy": 0.7075,
468
  "count": 400
469
  }
470
  }
471
  },
472
  "add_S6": {
473
+ "full_accuracy": 0.46,
474
  "n_examples": 100,
475
  "per_subtask": {
476
  "SC": {
 
478
  "count": 100
479
  },
480
  "UC": {
481
+ "accuracy": 0.55,
482
  "count": 100
483
  },
484
  "US": {
485
+ "accuracy": 0.718,
486
  "count": 500
487
  }
488
  }
489
  },
490
  "add_random": {
491
+ "full_accuracy": 0.945,
492
  "n_examples": 200,
493
  "per_subtask": {
494
  "SA": {
495
+ "accuracy": 0.9932885906040269,
496
  "count": 447
497
  },
498
  "SC": {
499
+ "accuracy": 1.0,
500
  "count": 320
501
  },
502
  "SS": {
 
504
  "count": 56
505
  },
506
  "UC": {
507
+ "accuracy": 0.9848771266540642,
508
  "count": 529
509
  },
510
  "US": {
511
+ "accuracy": 1.0,
512
  "count": 48
513
  }
514
  }
515
  },
516
  "add_C3": {
517
+ "full_accuracy": 0.83,
518
  "n_examples": 100,
519
  "per_subtask": {
520
  "SA": {
 
526
  "count": 100
527
  },
528
  "UC": {
529
+ "accuracy": 0.9119170984455959,
530
  "count": 193
531
  },
532
  "US": {
533
+ "accuracy": 1.0,
534
  "count": 107
535
  }
536
  }
537
  },
538
  "add_C4": {
539
+ "full_accuracy": 0.88,
540
  "n_examples": 100,
541
  "per_subtask": {
542
  "SA": {
543
+ "accuracy": 0.995,
544
  "count": 200
545
  },
546
  "SC": {
 
548
  "count": 100
549
  },
550
  "UC": {
551
+ "accuracy": 0.95703125,
552
  "count": 256
553
  },
554
  "US": {
555
+ "accuracy": 1.0,
556
  "count": 144
557
  }
558
  }
559
  },
560
  "add_C5": {
561
+ "full_accuracy": 0.8,
562
  "n_examples": 100,
563
  "per_subtask": {
564
  "SA": {
 
570
  "count": 100
571
  },
572
  "UC": {
573
+ "accuracy": 0.934640522875817,
574
  "count": 306
575
  },
576
  "US": {
577
+ "accuracy": 0.9690721649484536,
578
  "count": 194
579
  }
580
  }
581
  },
582
  "add_C6": {
583
+ "full_accuracy": 0.78,
584
  "n_examples": 100,
585
  "per_subtask": {
586
  "SC": {
 
588
  "count": 100
589
  },
590
  "UC": {
591
+ "accuracy": 0.9398907103825137,
592
  "count": 366
593
  },
594
  "US": {
595
+ "accuracy": 0.9871794871794872,
596
  "count": 234
597
  }
598
  }
 
606
  "count": 601
607
  },
608
  "ME": {
609
+ "accuracy": 1.0,
610
  "count": 99
611
  }
612
  }
613
  },
614
  "sub_M1": {
615
+ "full_accuracy": 1.0,
616
  "n_examples": 100,
617
  "per_subtask": {
618
  "MD": {
619
+ "accuracy": 1.0,
620
  "count": 279
621
  },
622
  "MB": {
623
+ "accuracy": 1.0,
624
  "count": 145
625
  },
626
  "ME": {
 
628
  "count": 24
629
  },
630
  "UB": {
631
+ "accuracy": 1.0,
632
  "count": 252
633
  }
634
  }
635
  },
636
  "sub_M2": {
637
+ "full_accuracy": 0.9,
638
  "n_examples": 100,
639
  "per_subtask": {
640
  "MD": {
641
+ "accuracy": 1.0,
642
  "count": 213
643
  },
644
  "MB": {
645
+ "accuracy": 0.9911504424778761,
646
  "count": 113
647
  },
648
  "ME": {
 
650
  "count": 85
651
  },
652
  "UB": {
653
+ "accuracy": 0.9447513812154696,
654
  "count": 181
655
  },
656
  "UD": {
 
660
  }
661
  },
662
  "sub_M3": {
663
+ "full_accuracy": 0.47,
664
  "n_examples": 100,
665
  "per_subtask": {
666
  "MD": {
 
668
  "count": 179
669
  },
670
  "MB": {
671
+ "accuracy": 0.9902912621359223,
672
  "count": 103
673
  },
674
  "ME": {
 
676
  "count": 56
677
  },
678
  "UB": {
679
+ "accuracy": 0.6375838926174496,
680
  "count": 149
681
  },
682
  "UD": {
683
+ "accuracy": 1.0,
684
  "count": 213
685
  }
686
  }
687
  },
688
  "sub_M4": {
689
+ "full_accuracy": 0.08,
690
  "n_examples": 100,
691
  "per_subtask": {
692
  "MD": {
 
698
  "count": 100
699
  },
700
  "UB": {
701
+ "accuracy": 0.3,
702
  "count": 100
703
  },
704
  "UD": {
705
+ "accuracy": 0.7433333333333333,
706
  "count": 300
707
  }
708
  }
709
  },
710
  "sub_M5": {
711
+ "full_accuracy": 0.02,
712
  "n_examples": 100,
713
  "per_subtask": {
714
  "MD": {
 
720
  "count": 100
721
  },
722
  "UB": {
723
+ "accuracy": 0.38,
724
  "count": 100
725
  },
726
  "UD": {
727
+ "accuracy": 0.5575,
728
  "count": 400
729
  }
730
  }
731
  },
732
  "sub_random": {
733
+ "full_accuracy": 0.96,
734
  "n_examples": 200,
735
  "per_subtask": {
736
  "MD": {
737
+ "accuracy": 0.9983333333333333,
738
  "count": 600
739
  },
740
  "MB": {
741
+ "accuracy": 0.9925093632958801,
742
  "count": 267
743
  },
744
  "ME": {
 
746
  "count": 53
747
  },
748
  "UB": {
749
+ "accuracy": 0.9886104783599089,
750
  "count": 439
751
  },
752
  "UD": {
 
756
  }
757
  },
758
  "sub_B3": {
759
+ "full_accuracy": 0.75,
760
  "n_examples": 100,
761
  "per_subtask": {
762
  "MD": {
763
+ "accuracy": 1.0,
764
  "count": 300
765
  },
766
  "MB": {
 
768
  "count": 100
769
  },
770
  "UB": {
771
+ "accuracy": 0.8730964467005076,
772
  "count": 197
773
  },
774
  "UD": {
 
778
  }
779
  },
780
  "sub_B4": {
781
+ "full_accuracy": 0.68,
782
  "n_examples": 100,
783
  "per_subtask": {
784
  "MD": {
785
+ "accuracy": 1.0,
786
  "count": 200
787
  },
788
  "MB": {
 
790
  "count": 100
791
  },
792
  "UB": {
793
+ "accuracy": 0.8866396761133604,
794
  "count": 247
795
  },
796
  "UD": {
797
+ "accuracy": 0.934640522875817,
798
  "count": 153
799
  }
800
  }
801
  },
802
  "sub_B5": {
803
+ "full_accuracy": 0.59,
804
  "n_examples": 100,
805
  "per_subtask": {
806
  "MD": {
 
812
  "count": 100
813
  },
814
  "UB": {
815
+ "accuracy": 0.87248322147651,
816
  "count": 298
817
  },
818
  "UD": {
819
+ "accuracy": 0.9158415841584159,
820
  "count": 202
821
  }
822
  }
823
  }
824
  },
825
  "summary": {
826
+ "overall_accuracy": 0.7241666666666666,
827
  "total_examples": 2400,
828
  "n_splits": 22
829
  }
add_sub_baseline_10K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83183edc6b1fa903ee491f2ac4fae76e8abb9d76035b9ce79aec6265c267c8ec
3
  size 650266922
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99090d5772338ac21e3d890eee50dfa2204eeafe4c53123e495571b22267bec9
3
  size 650266922
add_sub_baseline_10K/train_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
- "lr": 4e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 100,
@@ -69,16 +69,16 @@
69
  "no_wandb": false,
70
  "n_params": 162490082,
71
  "run_name": "add_sub_baseline_10K",
72
- "git_commit": "78d46f8665a87f4b44bd5894bd34f393f2dea51f",
73
- "timestamp": "2026-04-12T08:59:11.045620+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "7dqmkyo7",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/7dqmkyo7",
81
- "final_accuracy": 0.6270833333333333,
82
- "sft_accuracy": 0.6270833333333333,
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
+ "lr": 8e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 100,
 
69
  "no_wandb": false,
70
  "n_params": 162490082,
71
  "run_name": "add_sub_baseline_10K",
72
+ "git_commit": "8d5ee5420119746ef4e2c87570eb250c9718f643",
73
+ "timestamp": "2026-04-12T22:03:24.880179+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
+ "wandb_run_id": "j0499k85",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/j0499k85",
81
+ "final_accuracy": 0.7241666666666666,
82
+ "sft_accuracy": 0.7241666666666666,
83
  "eval_method": "ArithmeticEvaluator"
84
  }