amirali1985 commited on
Commit
8021cf7
·
verified ·
1 Parent(s): af94939

Upload add_sub_baseline_25K

Browse files
add_sub_baseline_25K/metrics.json CHANGED
@@ -159,478 +159,478 @@
159
  7800
160
  ],
161
  "loss": [
162
- 11.06562614440918,
163
- 9.070802688598633,
164
- 7.564438819885254,
165
- 7.00793981552124,
166
- 6.226614952087402,
167
- 5.854918479919434,
168
- 4.9685893058776855,
169
- 4.255834102630615,
170
- 3.3195507526397705,
171
- 2.7683334350585938,
172
- 2.22994065284729,
173
- 1.9851258993148804,
174
- 1.8262479305267334,
175
- 1.8066816329956055,
176
- 1.725418210029602,
177
- 1.7170584201812744,
178
- 1.760280728340149,
179
- 1.7368391752243042,
180
- 1.631116509437561,
181
- 1.628551721572876,
182
- 1.4732780456542969,
183
- 1.0680259466171265,
184
- 0.8227473497390747,
185
- 0.6230937242507935,
186
- 0.5450628995895386,
187
- 0.41445615887641907,
188
- 0.37112170457839966,
189
- 0.2718469500541687,
190
- 0.25747212767601013,
191
- 0.2155759632587433,
192
- 0.250948965549469,
193
- 0.16987545788288116,
194
- 0.17013859748840332,
195
- 0.13095267117023468,
196
- 0.1641116440296173,
197
- 0.15072695910930634,
198
- 0.14167438447475433,
199
- 0.11960385739803314,
200
- 0.08400659263134003,
201
- 0.1277170479297638,
202
- 0.10943807661533356,
203
- 0.10479181259870529,
204
- 0.11750847846269608,
205
- 0.09830878674983978,
206
- 0.09297787398099899,
207
- 0.08759058266878128,
208
- 0.09592296183109283,
209
- 0.056994277983903885,
210
- 0.08595649152994156,
211
- 0.09432648122310638,
212
- 0.06118571758270264,
213
- 0.05625522881746292,
214
- 0.0965944156050682,
215
- 0.08749499171972275,
216
- 0.08019021898508072,
217
- 0.05185790732502937,
218
- 0.07028406113386154,
219
- 0.053000323474407196,
220
- 0.06424856185913086,
221
- 0.07609923928976059,
222
- 0.08353769779205322,
223
- 0.0732322707772255,
224
- 0.08481856435537338,
225
- 0.05480723828077316,
226
- 0.04933084920048714,
227
- 0.03942330554127693,
228
- 0.03680780529975891,
229
- 0.0338483564555645,
230
- 0.04500473663210869,
231
- 0.05092630162835121,
232
- 0.06185285374522209,
233
- 0.05888064578175545,
234
- 0.06084391847252846,
235
- 0.047385044395923615,
236
- 0.05746296048164368,
237
- 0.038728680461645126,
238
- 0.041216280311346054,
239
- 0.04032302647829056,
240
- 0.03952217847108841,
241
- 0.04412711411714554,
242
- 0.037145402282476425,
243
- 0.03596339002251625,
244
- 0.03715207800269127,
245
- 0.02417995221912861,
246
- 0.05104166269302368,
247
- 0.028953881934285164,
248
- 0.0322723388671875,
249
- 0.05351385846734047,
250
- 0.04065471515059471,
251
- 0.035228464752435684,
252
- 0.0399099700152874,
253
- 0.04321296140551567,
254
- 0.023695966228842735,
255
- 0.03870895132422447,
256
- 0.023070303723216057,
257
- 0.02995055727660656,
258
- 0.034223996102809906,
259
- 0.03115018829703331,
260
- 0.047400325536727905,
261
- 0.033650610595941544,
262
- 0.020459134131669998,
263
- 0.031521618366241455,
264
- 0.010215552523732185,
265
- 0.009668267332017422,
266
- 0.009839163161814213,
267
- 0.01371306087821722,
268
- 0.0191287100315094,
269
- 0.02995881251990795,
270
- 0.010273347608745098,
271
- 0.013487428426742554,
272
- 0.006214354187250137,
273
- 0.028749624267220497,
274
- 0.004388689063489437,
275
- 0.01441959012299776,
276
- 0.010049402713775635,
277
- 0.006188374478369951,
278
- 0.006518733222037554,
279
- 0.012474223971366882,
280
- 0.0018296894850209355,
281
- 0.009708991274237633,
282
- 0.012771486304700375,
283
- 0.006753657478839159,
284
- 0.006519939284771681,
285
- 0.0015541197499260306,
286
- 0.0044125900603830814,
287
- 0.0019812153186649084,
288
- 0.009496470913290977,
289
- 0.014241503551602364,
290
- 0.004722653888165951,
291
- 0.003749982686713338,
292
- 0.009157851338386536,
293
- 0.002904724795371294,
294
- 0.002242226619273424,
295
- 0.012742781080305576,
296
- 0.002705535152927041,
297
- 0.0015757112996652722,
298
- 0.004974766168743372,
299
- 0.007047231774777174,
300
- 0.004940888378769159,
301
- 0.007126574404537678,
302
- 0.004077407065778971,
303
- 0.009843333624303341,
304
- 0.008745082654058933,
305
- 0.0023113691713660955,
306
- 0.005770131945610046,
307
- 0.006909515243023634,
308
- 0.005357807967811823,
309
- 0.022677551954984665,
310
- 0.0067167701199650764,
311
- 0.002013612538576126,
312
- 0.00514591159299016,
313
- 0.004963079001754522,
314
- 0.0025343645829707384,
315
- 0.012118958868086338,
316
- 0.012999330647289753,
317
- 0.0052215722389519215
318
  ],
319
  "base_loss": [
320
- 11.06562614440918,
321
- 9.070802688598633,
322
- 7.564438819885254,
323
- 7.00793981552124,
324
- 6.226614952087402,
325
- 5.854918479919434,
326
- 4.9685893058776855,
327
- 4.255834102630615,
328
- 3.3195507526397705,
329
- 2.7683334350585938,
330
- 2.22994065284729,
331
- 1.9851258993148804,
332
- 1.8262479305267334,
333
- 1.8066816329956055,
334
- 1.725418210029602,
335
- 1.7170584201812744,
336
- 1.760280728340149,
337
- 1.7368391752243042,
338
- 1.631116509437561,
339
- 1.628551721572876,
340
- 1.4732780456542969,
341
- 1.0680259466171265,
342
- 0.8227473497390747,
343
- 0.6230937242507935,
344
- 0.5450628995895386,
345
- 0.41445615887641907,
346
- 0.37112170457839966,
347
- 0.2718469500541687,
348
- 0.25747212767601013,
349
- 0.2155759632587433,
350
- 0.250948965549469,
351
- 0.16987545788288116,
352
- 0.17013859748840332,
353
- 0.13095267117023468,
354
- 0.1641116440296173,
355
- 0.15072695910930634,
356
- 0.14167438447475433,
357
- 0.11960385739803314,
358
- 0.08400659263134003,
359
- 0.1277170479297638,
360
- 0.10943807661533356,
361
- 0.10479181259870529,
362
- 0.11750847846269608,
363
- 0.09830878674983978,
364
- 0.09297787398099899,
365
- 0.08759058266878128,
366
- 0.09592296183109283,
367
- 0.056994277983903885,
368
- 0.08595649152994156,
369
- 0.09432648122310638,
370
- 0.06118571758270264,
371
- 0.05625522881746292,
372
- 0.0965944156050682,
373
- 0.08749499171972275,
374
- 0.08019021898508072,
375
- 0.05185790732502937,
376
- 0.07028406113386154,
377
- 0.053000323474407196,
378
- 0.06424856185913086,
379
- 0.07609923928976059,
380
- 0.08353769779205322,
381
- 0.0732322707772255,
382
- 0.08481856435537338,
383
- 0.05480723828077316,
384
- 0.04933084920048714,
385
- 0.03942330554127693,
386
- 0.03680780529975891,
387
- 0.0338483564555645,
388
- 0.04500473663210869,
389
- 0.05092630162835121,
390
- 0.06185285374522209,
391
- 0.05888064578175545,
392
- 0.06084391847252846,
393
- 0.047385044395923615,
394
- 0.05746296048164368,
395
- 0.038728680461645126,
396
- 0.041216280311346054,
397
- 0.04032302647829056,
398
- 0.03952217847108841,
399
- 0.04412711411714554,
400
- 0.037145402282476425,
401
- 0.03596339002251625,
402
- 0.03715207800269127,
403
- 0.02417995221912861,
404
- 0.05104166269302368,
405
- 0.028953881934285164,
406
- 0.0322723388671875,
407
- 0.05351385846734047,
408
- 0.04065471515059471,
409
- 0.035228464752435684,
410
- 0.0399099700152874,
411
- 0.04321296140551567,
412
- 0.023695966228842735,
413
- 0.03870895132422447,
414
- 0.023070303723216057,
415
- 0.02995055727660656,
416
- 0.034223996102809906,
417
- 0.03115018829703331,
418
- 0.047400325536727905,
419
- 0.033650610595941544,
420
- 0.020459134131669998,
421
- 0.031521618366241455,
422
- 0.010215552523732185,
423
- 0.009668267332017422,
424
- 0.009839163161814213,
425
- 0.01371306087821722,
426
- 0.0191287100315094,
427
- 0.02995881251990795,
428
- 0.010273347608745098,
429
- 0.013487428426742554,
430
- 0.006214354187250137,
431
- 0.028749624267220497,
432
- 0.004388689063489437,
433
- 0.01441959012299776,
434
- 0.010049402713775635,
435
- 0.006188374478369951,
436
- 0.006518733222037554,
437
- 0.012474223971366882,
438
- 0.0018296894850209355,
439
- 0.009708991274237633,
440
- 0.012771486304700375,
441
- 0.006753657478839159,
442
- 0.006519939284771681,
443
- 0.0015541197499260306,
444
- 0.0044125900603830814,
445
- 0.0019812153186649084,
446
- 0.009496470913290977,
447
- 0.014241503551602364,
448
- 0.004722653888165951,
449
- 0.003749982686713338,
450
- 0.009157851338386536,
451
- 0.002904724795371294,
452
- 0.002242226619273424,
453
- 0.012742781080305576,
454
- 0.002705535152927041,
455
- 0.0015757112996652722,
456
- 0.004974766168743372,
457
- 0.007047231774777174,
458
- 0.004940888378769159,
459
- 0.007126574404537678,
460
- 0.004077407065778971,
461
- 0.009843333624303341,
462
- 0.008745082654058933,
463
- 0.0023113691713660955,
464
- 0.005770131945610046,
465
- 0.006909515243023634,
466
- 0.005357807967811823,
467
- 0.022677551954984665,
468
- 0.0067167701199650764,
469
- 0.002013612538576126,
470
- 0.00514591159299016,
471
- 0.004963079001754522,
472
- 0.0025343645829707384,
473
- 0.012118958868086338,
474
- 0.012999330647289753,
475
- 0.0052215722389519215
476
  ],
477
  "lr": [
478
- 2.5063938618925837e-06,
479
- 5.063938618925831e-06,
480
- 7.62148337595908e-06,
481
- 1.0179028132992328e-05,
482
- 1.2736572890025576e-05,
483
- 1.5294117647058822e-05,
484
- 1.7851662404092073e-05,
485
- 2.040920716112532e-05,
486
- 2.296675191815857e-05,
487
- 2.5524296675191817e-05,
488
- 2.8081841432225065e-05,
489
- 3.0639386189258316e-05,
490
- 3.3196930946291564e-05,
491
- 3.575447570332481e-05,
492
- 3.831202046035806e-05,
493
- 4.086956521739131e-05,
494
- 4.3427109974424555e-05,
495
- 4.598465473145781e-05,
496
- 4.854219948849105e-05,
497
- 5.10997442455243e-05,
498
- 5.365728900255755e-05,
499
- 5.62148337595908e-05,
500
- 5.877237851662404e-05,
501
- 6.13299232736573e-05,
502
- 6.388746803069055e-05,
503
- 6.644501278772379e-05,
504
- 6.900255754475704e-05,
505
- 7.156010230179029e-05,
506
- 7.411764705882354e-05,
507
- 7.667519181585678e-05,
508
- 7.923273657289003e-05,
509
- 7.999382181128958e-05,
510
- 7.996356588945887e-05,
511
- 7.990811651495726e-05,
512
- 7.982750864365423e-05,
513
- 7.97217930916005e-05,
514
- 7.9591036502993e-05,
515
- 7.943532130816183e-05,
516
- 7.925474567160515e-05,
517
- 7.904942343010533e-05,
518
- 7.881948402096506e-05,
519
- 7.856507240040864e-05,
520
- 7.828634895220009e-05,
521
- 7.798348938653556e-05,
522
- 7.765668462927371e-05,
523
- 7.730614070157413e-05,
524
- 7.693207859001933e-05,
525
- 7.653473410730253e-05,
526
- 7.611435774356888e-05,
527
- 7.567121450850376e-05,
528
- 7.520558376426795e-05,
529
- 7.471775904938474e-05,
530
- 7.420804789369019e-05,
531
- 7.367677162446306e-05,
532
- 7.312426516385672e-05,
533
- 7.255087681776069e-05,
534
- 7.195696805622496e-05,
535
- 7.13429132855854e-05,
536
- 7.070909961243422e-05,
537
- 7.005592659958366e-05,
538
- 6.938380601417765e-05,
539
- 6.869316156810923e-05,
540
- 6.798442865090831e-05,
541
- 6.725805405526735e-05,
542
- 6.651449569537871e-05,
543
- 6.575422231826058e-05,
544
- 6.497771320825402e-05,
545
- 6.418545788487704e-05,
546
- 6.337795579422628e-05,
547
- 6.255571599412105e-05,
548
- 6.171925683318781e-05,
549
- 6.086910562408781e-05,
550
- 6.0005798311093635e-05,
551
- 5.912987913222422e-05,
552
- 5.824190027615158e-05,
553
- 5.734242153409514e-05,
554
- 5.643200994692358e-05,
555
- 5.55112394476862e-05,
556
- 5.458069049979956e-05,
557
- 5.364094973111714e-05,
558
- 5.269260956411309e-05,
559
- 5.1736267842412726e-05,
560
- 5.077252745390575e-05,
561
- 4.980199595067928e-05,
562
- 4.882528516601063e-05,
563
- 4.784301082866123e-05,
564
- 4.685579217471466e-05,
565
- 4.586425155720376e-05,
566
- 4.4869014053772686e-05,
567
- 4.387070707262142e-05,
568
- 4.286995995698098e-05,
569
- 4.186740358836888e-05,
570
- 4.0863669988874755e-05,
571
- 3.985939192272697e-05,
572
- 3.885520249739142e-05,
573
- 3.785173476445388e-05,
574
- 3.684962132053763e-05,
575
- 3.584949390850793e-05,
576
- 3.485198301921461e-05,
577
- 3.385771749402399e-05,
578
- 3.2867324128390756e-05,
579
- 3.188142727671938e-05,
580
- 3.090064845876465e-05,
581
- 2.9925605967818972e-05,
582
- 2.895691448093382e-05,
583
- 2.799518467142088e-05,
584
- 2.7041022823877087e-05,
585
- 2.6095030451976512e-05,
586
- 2.515780391926975e-05,
587
- 2.4229934063230064e-05,
588
- 2.3312005822783295e-05,
589
- 2.240459786955611e-05,
590
- 2.150828224307534e-05,
591
- 2.0623623990148315e-05,
592
- 1.9751180808651272e-05,
593
- 1.8891502695950898e-05,
594
- 1.8045131602180072e-05,
595
- 1.7212601088586823e-05,
596
- 1.63944359911718e-05,
597
- 1.559115208982597e-05,
598
- 1.48032557831777e-05,
599
- 1.4031243769353617e-05,
600
- 1.3275602732854923e-05,
601
- 1.2536809037746398e-05,
602
- 1.1815328427351398e-05,
603
- 1.1111615730642416e-05,
604
- 1.042611457551213e-05,
605
- 9.759257109105627e-06,
606
- 9.111463725390388e-06,
607
- 8.483142800135428e-06,
608
- 7.87469043346695e-06,
609
- 7.286490200162668e-06,
610
- 6.718912907842181e-06,
611
- 6.1723163632060055e-06,
612
- 5.647045146470409e-06,
613
- 5.143430394140439e-06,
614
- 4.661789590258008e-06,
615
- 4.202426366256558e-06,
616
- 3.7656303095486623e-06,
617
- 3.351676780967026e-06,
618
- 2.960826741174141e-06,
619
- 2.5933265861499514e-06,
620
- 2.2494079918611923e-06,
621
- 1.929287768210473e-06,
622
- 1.6331677223569853e-06,
623
- 1.3612345314951615e-06,
624
- 1.1136596251714304e-06,
625
- 8.905990772131879e-07,
626
- 6.921935073382368e-07,
627
- 5.185679925066245e-07,
628
- 3.698319880708301e-07,
629
- 2.4607925877392135e-07,
630
- 1.4738781963932191e-07,
631
- 7.381988678927255e-08,
632
- 2.5421838223160798e-08,
633
- 2.2241845803394615e-09
634
  ],
635
  "eval_step": [
636
  390,
@@ -677,530 +677,530 @@
677
  20
678
  ],
679
  "eval_accuracy": [
680
- 0.0064285714285714285,
681
- 0.007142857142857143,
682
- 0.085,
683
- 0.52,
684
- 0.5935714285714285,
685
- 0.6385714285714286,
686
- 0.6514285714285715,
687
- 0.7364285714285714,
688
- 0.7307142857142858,
689
- 0.7328571428571429,
690
- 0.755,
691
- 0.7964285714285714,
692
- 0.8135714285714286,
693
- 0.89,
694
- 0.8621428571428571,
695
- 0.8892857142857142,
696
- 0.9235714285714286,
697
- 0.9035714285714286,
698
- 0.9135714285714286,
699
- 0.9207142857142857
700
  ]
701
  },
702
- "final_accuracy": 0.9207142857142857,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
- "n_per_split": 50
710
  },
711
  "splits": {
712
  "add_S0": {
713
- "full_accuracy": 1.0,
714
- "n_examples": 50,
715
  "per_subtask": {
716
  "SA": {
717
- "accuracy": 1.0,
718
- "count": 295
719
  },
720
  "SS": {
721
  "accuracy": 1.0,
722
- "count": 55
723
  }
724
  }
725
  },
726
  "add_S1": {
727
  "full_accuracy": 1.0,
728
- "n_examples": 50,
729
  "per_subtask": {
730
  "SA": {
731
  "accuracy": 1.0,
732
- "count": 126
733
  },
734
  "SC": {
735
  "accuracy": 1.0,
736
- "count": 79
737
  },
738
  "SS": {
739
  "accuracy": 1.0,
740
- "count": 21
741
  },
742
  "UC": {
743
  "accuracy": 1.0,
744
- "count": 124
745
  }
746
  }
747
  },
748
  "add_S2": {
749
- "full_accuracy": 1.0,
750
- "n_examples": 50,
751
  "per_subtask": {
752
  "SA": {
753
- "accuracy": 1.0,
754
- "count": 75
755
  },
756
  "SC": {
757
- "accuracy": 1.0,
758
- "count": 62
759
  },
760
  "SS": {
761
  "accuracy": 1.0,
762
- "count": 39
763
  },
764
  "UC": {
765
- "accuracy": 1.0,
766
- "count": 111
767
  },
768
  "US": {
769
  "accuracy": 1.0,
770
- "count": 63
771
  }
772
  }
773
  },
774
  "add_S3": {
775
- "full_accuracy": 1.0,
776
- "n_examples": 50,
777
  "per_subtask": {
778
  "SA": {
779
  "accuracy": 1.0,
780
- "count": 60
781
  },
782
  "SC": {
783
  "accuracy": 1.0,
784
- "count": 57
785
  },
786
  "SS": {
787
  "accuracy": 1.0,
788
- "count": 19
789
  },
790
  "UC": {
791
- "accuracy": 1.0,
792
- "count": 104
793
  },
794
  "US": {
795
- "accuracy": 1.0,
796
- "count": 110
797
  }
798
  }
799
  },
800
  "add_S4": {
801
- "full_accuracy": 0.92,
802
- "n_examples": 50,
803
  "per_subtask": {
804
  "SA": {
805
  "accuracy": 1.0,
806
- "count": 48
807
  },
808
  "SC": {
809
  "accuracy": 1.0,
810
- "count": 52
811
  },
812
  "SS": {
813
  "accuracy": 1.0,
814
- "count": 7
815
  },
816
  "UC": {
817
- "accuracy": 0.9550561797752809,
818
- "count": 89
819
  },
820
  "US": {
821
- "accuracy": 0.9935064935064936,
822
- "count": 154
823
  }
824
  }
825
  },
826
  "add_S5": {
827
- "full_accuracy": 0.54,
828
- "n_examples": 50,
829
  "per_subtask": {
830
  "SA": {
831
  "accuracy": 1.0,
832
- "count": 50
833
  },
834
  "SC": {
835
  "accuracy": 1.0,
836
- "count": 50
837
  },
838
  "UC": {
839
- "accuracy": 0.64,
840
- "count": 50
841
  },
842
  "US": {
843
- "accuracy": 0.91,
844
- "count": 200
845
  }
846
  }
847
  },
848
  "add_S6": {
849
- "full_accuracy": 0.92,
850
- "n_examples": 50,
851
  "per_subtask": {
852
  "SC": {
853
  "accuracy": 1.0,
854
- "count": 50
855
  },
856
  "UC": {
857
- "accuracy": 0.94,
858
- "count": 50
859
  },
860
  "US": {
861
- "accuracy": 0.98,
862
- "count": 250
863
  }
864
  }
865
  },
866
  "add_random": {
867
- "full_accuracy": 0.995,
868
  "n_examples": 200,
869
  "per_subtask": {
870
  "SA": {
871
- "accuracy": 1.0,
872
- "count": 431
873
  },
874
  "SC": {
875
- "accuracy": 1.0,
876
- "count": 316
877
  },
878
  "SS": {
879
  "accuracy": 1.0,
880
- "count": 39
881
  },
882
  "UC": {
883
- "accuracy": 0.9982142857142857,
884
- "count": 560
885
  },
886
  "US": {
887
  "accuracy": 1.0,
888
- "count": 54
889
  }
890
  }
891
  },
892
  "add_C3": {
893
- "full_accuracy": 0.98,
894
- "n_examples": 50,
895
  "per_subtask": {
896
  "SA": {
897
  "accuracy": 1.0,
898
- "count": 150
899
  },
900
  "SC": {
901
  "accuracy": 1.0,
902
- "count": 50
903
  },
904
  "UC": {
905
- "accuracy": 0.9903846153846154,
906
- "count": 104
907
  },
908
  "US": {
909
  "accuracy": 1.0,
910
- "count": 46
911
  }
912
  }
913
  },
914
  "add_C4": {
915
  "full_accuracy": 0.9,
916
- "n_examples": 50,
917
  "per_subtask": {
918
  "SA": {
919
  "accuracy": 1.0,
920
- "count": 100
921
  },
922
  "SC": {
923
  "accuracy": 1.0,
924
- "count": 50
925
  },
926
  "UC": {
927
- "accuracy": 0.967479674796748,
928
- "count": 123
929
  },
930
  "US": {
931
- "accuracy": 0.974025974025974,
932
- "count": 77
933
  }
934
  }
935
  },
936
  "add_C5": {
937
- "full_accuracy": 0.98,
938
- "n_examples": 50,
939
  "per_subtask": {
940
  "SA": {
941
  "accuracy": 1.0,
942
- "count": 50
943
  },
944
  "SC": {
945
  "accuracy": 1.0,
946
- "count": 50
947
  },
948
  "UC": {
949
- "accuracy": 0.9935064935064936,
950
- "count": 154
951
  },
952
  "US": {
953
  "accuracy": 1.0,
954
- "count": 96
955
  }
956
  }
957
  },
958
  "add_C6": {
959
- "full_accuracy": 0.96,
960
- "n_examples": 50,
961
  "per_subtask": {
962
  "SC": {
963
  "accuracy": 1.0,
964
- "count": 50
965
  },
966
  "UC": {
967
- "accuracy": 0.989010989010989,
968
- "count": 182
969
  },
970
  "US": {
971
- "accuracy": 0.9915254237288136,
972
- "count": 118
973
  }
974
  }
975
  },
976
  "sub_M0": {
977
- "full_accuracy": 1.0,
978
- "n_examples": 50,
979
  "per_subtask": {
980
  "MD": {
981
- "accuracy": 1.0,
982
- "count": 294
983
  },
984
  "ME": {
985
  "accuracy": 1.0,
986
- "count": 56
987
  }
988
  }
989
  },
990
  "sub_M1": {
991
- "full_accuracy": 1.0,
992
- "n_examples": 50,
993
  "per_subtask": {
994
  "MD": {
995
- "accuracy": 1.0,
996
- "count": 143
997
  },
998
  "MB": {
999
- "accuracy": 1.0,
1000
- "count": 69
1001
  },
1002
  "ME": {
1003
  "accuracy": 1.0,
1004
- "count": 15
1005
  },
1006
  "UB": {
1007
  "accuracy": 1.0,
1008
- "count": 123
1009
  }
1010
  }
1011
  },
1012
  "sub_M2": {
1013
- "full_accuracy": 1.0,
1014
- "n_examples": 50,
1015
  "per_subtask": {
1016
  "MD": {
1017
- "accuracy": 1.0,
1018
- "count": 108
1019
  },
1020
  "MB": {
1021
- "accuracy": 1.0,
1022
- "count": 52
1023
  },
1024
  "ME": {
1025
  "accuracy": 1.0,
1026
- "count": 52
1027
  },
1028
  "UB": {
1029
- "accuracy": 1.0,
1030
- "count": 87
1031
  },
1032
  "UD": {
1033
  "accuracy": 1.0,
1034
- "count": 51
1035
  }
1036
  }
1037
  },
1038
  "sub_M3": {
1039
- "full_accuracy": 1.0,
1040
- "n_examples": 50,
1041
  "per_subtask": {
1042
  "MD": {
1043
  "accuracy": 1.0,
1044
- "count": 94
1045
  },
1046
  "MB": {
1047
  "accuracy": 1.0,
1048
- "count": 51
1049
  },
1050
  "ME": {
1051
  "accuracy": 1.0,
1052
- "count": 25
1053
  },
1054
  "UB": {
1055
- "accuracy": 1.0,
1056
- "count": 78
1057
  },
1058
  "UD": {
1059
  "accuracy": 1.0,
1060
- "count": 102
1061
  }
1062
  }
1063
  },
1064
  "sub_M4": {
1065
- "full_accuracy": 0.64,
1066
- "n_examples": 50,
1067
  "per_subtask": {
1068
  "MD": {
1069
  "accuracy": 1.0,
1070
- "count": 100
1071
  },
1072
  "MB": {
1073
  "accuracy": 1.0,
1074
- "count": 50
1075
  },
1076
  "UB": {
1077
- "accuracy": 0.68,
1078
- "count": 50
1079
  },
1080
  "UD": {
1081
- "accuracy": 0.9866666666666667,
1082
- "count": 150
1083
  }
1084
  }
1085
  },
1086
  "sub_M5": {
1087
- "full_accuracy": 0.24,
1088
- "n_examples": 50,
1089
  "per_subtask": {
1090
  "MD": {
1091
  "accuracy": 1.0,
1092
- "count": 50
1093
  },
1094
  "MB": {
1095
  "accuracy": 1.0,
1096
- "count": 50
1097
  },
1098
  "UB": {
1099
- "accuracy": 0.52,
1100
- "count": 50
1101
  },
1102
  "UD": {
1103
- "accuracy": 0.885,
1104
- "count": 200
1105
  }
1106
  }
1107
  },
1108
  "sub_random": {
1109
- "full_accuracy": 1.0,
1110
  "n_examples": 200,
1111
  "per_subtask": {
1112
  "MD": {
1113
- "accuracy": 1.0,
1114
- "count": 588
1115
  },
1116
  "MB": {
1117
- "accuracy": 1.0,
1118
- "count": 268
1119
  },
1120
  "ME": {
1121
  "accuracy": 1.0,
1122
- "count": 60
1123
  },
1124
  "UB": {
1125
- "accuracy": 1.0,
1126
- "count": 447
1127
  },
1128
  "UD": {
1129
  "accuracy": 1.0,
1130
- "count": 37
1131
  }
1132
  }
1133
  },
1134
  "sub_B3": {
1135
- "full_accuracy": 1.0,
1136
- "n_examples": 50,
1137
  "per_subtask": {
1138
  "MD": {
1139
- "accuracy": 1.0,
1140
- "count": 150
1141
  },
1142
  "MB": {
1143
  "accuracy": 1.0,
1144
- "count": 50
1145
  },
1146
  "UB": {
1147
- "accuracy": 1.0,
1148
- "count": 107
1149
  },
1150
  "UD": {
1151
  "accuracy": 1.0,
1152
- "count": 43
1153
  }
1154
  }
1155
  },
1156
  "sub_B4": {
1157
- "full_accuracy": 0.9,
1158
- "n_examples": 50,
1159
  "per_subtask": {
1160
  "MD": {
1161
  "accuracy": 1.0,
1162
- "count": 100
1163
  },
1164
  "MB": {
1165
  "accuracy": 1.0,
1166
- "count": 50
1167
  },
1168
  "UB": {
1169
- "accuracy": 0.956140350877193,
1170
- "count": 114
1171
  },
1172
  "UD": {
1173
- "accuracy": 1.0,
1174
- "count": 86
1175
  }
1176
  }
1177
  },
1178
  "sub_B5": {
1179
- "full_accuracy": 0.82,
1180
- "n_examples": 50,
1181
  "per_subtask": {
1182
  "MD": {
1183
  "accuracy": 1.0,
1184
- "count": 50
1185
  },
1186
  "MB": {
1187
  "accuracy": 1.0,
1188
- "count": 50
1189
  },
1190
  "UB": {
1191
- "accuracy": 0.9477124183006536,
1192
- "count": 153
1193
  },
1194
  "UD": {
1195
- "accuracy": 0.979381443298969,
1196
- "count": 97
1197
  }
1198
  }
1199
  }
1200
  },
1201
  "summary": {
1202
- "overall_accuracy": 0.9207142857142857,
1203
- "total_examples": 1400,
1204
  "n_splits": 22
1205
  }
1206
  }
 
159
  7800
160
  ],
161
  "loss": [
162
+ 9.4371337890625,
163
+ 7.074652671813965,
164
+ 6.122837543487549,
165
+ 4.6825385093688965,
166
+ 3.0381083488464355,
167
+ 2.0917437076568604,
168
+ 1.882021188735962,
169
+ 1.9029165506362915,
170
+ 1.7700093984603882,
171
+ 1.6935831308364868,
172
+ 1.5817426443099976,
173
+ 1.4419691562652588,
174
+ 1.0461561679840088,
175
+ 0.7932643294334412,
176
+ 0.6623935699462891,
177
+ 0.5456612706184387,
178
+ 0.5513021349906921,
179
+ 0.4767663776874542,
180
+ 0.38498836755752563,
181
+ 0.4133641719818115,
182
+ 0.31893685460090637,
183
+ 0.3751819133758545,
184
+ 0.31682610511779785,
185
+ 0.3113398253917694,
186
+ 0.2859819829463959,
187
+ 0.30192962288856506,
188
+ 0.2594987452030182,
189
+ 0.19506025314331055,
190
+ 0.23191870748996735,
191
+ 0.2392965406179428,
192
+ 0.2253739833831787,
193
+ 0.17677058279514313,
194
+ 0.21856749057769775,
195
+ 0.1249561682343483,
196
+ 0.17712180316448212,
197
+ 0.1823340654373169,
198
+ 0.19931195676326752,
199
+ 0.17808902263641357,
200
+ 0.11950694024562836,
201
+ 0.17866145074367523,
202
+ 0.17124225199222565,
203
+ 0.112669937312603,
204
+ 0.19379594922065735,
205
+ 0.1307276040315628,
206
+ 0.17944271862506866,
207
+ 0.14239387214183807,
208
+ 0.15844760835170746,
209
+ 0.13379694521427155,
210
+ 0.09916692227125168,
211
+ 0.1193651482462883,
212
+ 0.12891770899295807,
213
+ 0.17134933173656464,
214
+ 0.11273209750652313,
215
+ 0.14026163518428802,
216
+ 0.1875033676624298,
217
+ 0.10574714094400406,
218
+ 0.1429772675037384,
219
+ 0.12849411368370056,
220
+ 0.14711573719978333,
221
+ 0.13475127518177032,
222
+ 0.09672373533248901,
223
+ 0.10411882400512695,
224
+ 0.11657597869634628,
225
+ 0.10133056342601776,
226
+ 0.11380661278963089,
227
+ 0.10492556542158127,
228
+ 0.07867716252803802,
229
+ 0.11208102852106094,
230
+ 0.12357942014932632,
231
+ 0.09030698239803314,
232
+ 0.11589111387729645,
233
+ 0.07594970613718033,
234
+ 0.08090318739414215,
235
+ 0.08280433714389801,
236
+ 0.06660598516464233,
237
+ 0.11626514047384262,
238
+ 0.08782225102186203,
239
+ 0.07667553424835205,
240
+ 0.0659191906452179,
241
+ 0.06984346359968185,
242
+ 0.07098687440156937,
243
+ 0.10126902908086777,
244
+ 0.09939603507518768,
245
+ 0.08991016447544098,
246
+ 0.07744383811950684,
247
+ 0.06926203519105911,
248
+ 0.06146344915032387,
249
+ 0.09036505222320557,
250
+ 0.08489241451025009,
251
+ 0.062018830329179764,
252
+ 0.05275828391313553,
253
+ 0.07086634635925293,
254
+ 0.06931327283382416,
255
+ 0.09127948433160782,
256
+ 0.07429345697164536,
257
+ 0.08549024164676666,
258
+ 0.06286706030368805,
259
+ 0.05378304794430733,
260
+ 0.044635117053985596,
261
+ 0.05906063690781593,
262
+ 0.053794749081134796,
263
+ 0.0547708198428154,
264
+ 0.02951294183731079,
265
+ 0.049380701035261154,
266
+ 0.06013629585504532,
267
+ 0.042153794318437576,
268
+ 0.059404365718364716,
269
+ 0.057506877928972244,
270
+ 0.0508505143225193,
271
+ 0.046686530113220215,
272
+ 0.07300496101379395,
273
+ 0.035853564739227295,
274
+ 0.04073333367705345,
275
+ 0.04065943509340286,
276
+ 0.045939598232507706,
277
+ 0.052995141595602036,
278
+ 0.049060072749853134,
279
+ 0.03872013837099075,
280
+ 0.043473295867443085,
281
+ 0.03402497619390488,
282
+ 0.05528164654970169,
283
+ 0.0507243312895298,
284
+ 0.05915408954024315,
285
+ 0.04674220830202103,
286
+ 0.051885757595300674,
287
+ 0.040509335696697235,
288
+ 0.03638586029410362,
289
+ 0.025626471266150475,
290
+ 0.02101641520857811,
291
+ 0.033242858946323395,
292
+ 0.027668897062540054,
293
+ 0.03064912185072899,
294
+ 0.035983264446258545,
295
+ 0.04090739041566849,
296
+ 0.02467496506869793,
297
+ 0.03506144881248474,
298
+ 0.023822205141186714,
299
+ 0.024419376626610756,
300
+ 0.029216334223747253,
301
+ 0.03664829209446907,
302
+ 0.033825285732746124,
303
+ 0.0254127848893404,
304
+ 0.029008716344833374,
305
+ 0.032890621572732925,
306
+ 0.036917805671691895,
307
+ 0.03223109617829323,
308
+ 0.02562890388071537,
309
+ 0.029643472284078598,
310
+ 0.03162688761949539,
311
+ 0.04186885803937912,
312
+ 0.028349589556455612,
313
+ 0.03455352038145065,
314
+ 0.027812888845801353,
315
+ 0.024674389511346817,
316
+ 0.03943091258406639,
317
+ 0.022551996633410454
318
  ],
319
  "base_loss": [
320
+ 9.4371337890625,
321
+ 7.074652671813965,
322
+ 6.122837543487549,
323
+ 4.6825385093688965,
324
+ 3.0381083488464355,
325
+ 2.0917437076568604,
326
+ 1.882021188735962,
327
+ 1.9029165506362915,
328
+ 1.7700093984603882,
329
+ 1.6935831308364868,
330
+ 1.5817426443099976,
331
+ 1.4419691562652588,
332
+ 1.0461561679840088,
333
+ 0.7932643294334412,
334
+ 0.6623935699462891,
335
+ 0.5456612706184387,
336
+ 0.5513021349906921,
337
+ 0.4767663776874542,
338
+ 0.38498836755752563,
339
+ 0.4133641719818115,
340
+ 0.31893685460090637,
341
+ 0.3751819133758545,
342
+ 0.31682610511779785,
343
+ 0.3113398253917694,
344
+ 0.2859819829463959,
345
+ 0.30192962288856506,
346
+ 0.2594987452030182,
347
+ 0.19506025314331055,
348
+ 0.23191870748996735,
349
+ 0.2392965406179428,
350
+ 0.2253739833831787,
351
+ 0.17677058279514313,
352
+ 0.21856749057769775,
353
+ 0.1249561682343483,
354
+ 0.17712180316448212,
355
+ 0.1823340654373169,
356
+ 0.19931195676326752,
357
+ 0.17808902263641357,
358
+ 0.11950694024562836,
359
+ 0.17866145074367523,
360
+ 0.17124225199222565,
361
+ 0.112669937312603,
362
+ 0.19379594922065735,
363
+ 0.1307276040315628,
364
+ 0.17944271862506866,
365
+ 0.14239387214183807,
366
+ 0.15844760835170746,
367
+ 0.13379694521427155,
368
+ 0.09916692227125168,
369
+ 0.1193651482462883,
370
+ 0.12891770899295807,
371
+ 0.17134933173656464,
372
+ 0.11273209750652313,
373
+ 0.14026163518428802,
374
+ 0.1875033676624298,
375
+ 0.10574714094400406,
376
+ 0.1429772675037384,
377
+ 0.12849411368370056,
378
+ 0.14711573719978333,
379
+ 0.13475127518177032,
380
+ 0.09672373533248901,
381
+ 0.10411882400512695,
382
+ 0.11657597869634628,
383
+ 0.10133056342601776,
384
+ 0.11380661278963089,
385
+ 0.10492556542158127,
386
+ 0.07867716252803802,
387
+ 0.11208102852106094,
388
+ 0.12357942014932632,
389
+ 0.09030698239803314,
390
+ 0.11589111387729645,
391
+ 0.07594970613718033,
392
+ 0.08090318739414215,
393
+ 0.08280433714389801,
394
+ 0.06660598516464233,
395
+ 0.11626514047384262,
396
+ 0.08782225102186203,
397
+ 0.07667553424835205,
398
+ 0.0659191906452179,
399
+ 0.06984346359968185,
400
+ 0.07098687440156937,
401
+ 0.10126902908086777,
402
+ 0.09939603507518768,
403
+ 0.08991016447544098,
404
+ 0.07744383811950684,
405
+ 0.06926203519105911,
406
+ 0.06146344915032387,
407
+ 0.09036505222320557,
408
+ 0.08489241451025009,
409
+ 0.062018830329179764,
410
+ 0.05275828391313553,
411
+ 0.07086634635925293,
412
+ 0.06931327283382416,
413
+ 0.09127948433160782,
414
+ 0.07429345697164536,
415
+ 0.08549024164676666,
416
+ 0.06286706030368805,
417
+ 0.05378304794430733,
418
+ 0.044635117053985596,
419
+ 0.05906063690781593,
420
+ 0.053794749081134796,
421
+ 0.0547708198428154,
422
+ 0.02951294183731079,
423
+ 0.049380701035261154,
424
+ 0.06013629585504532,
425
+ 0.042153794318437576,
426
+ 0.059404365718364716,
427
+ 0.057506877928972244,
428
+ 0.0508505143225193,
429
+ 0.046686530113220215,
430
+ 0.07300496101379395,
431
+ 0.035853564739227295,
432
+ 0.04073333367705345,
433
+ 0.04065943509340286,
434
+ 0.045939598232507706,
435
+ 0.052995141595602036,
436
+ 0.049060072749853134,
437
+ 0.03872013837099075,
438
+ 0.043473295867443085,
439
+ 0.03402497619390488,
440
+ 0.05528164654970169,
441
+ 0.0507243312895298,
442
+ 0.05915408954024315,
443
+ 0.04674220830202103,
444
+ 0.051885757595300674,
445
+ 0.040509335696697235,
446
+ 0.03638586029410362,
447
+ 0.025626471266150475,
448
+ 0.02101641520857811,
449
+ 0.033242858946323395,
450
+ 0.027668897062540054,
451
+ 0.03064912185072899,
452
+ 0.035983264446258545,
453
+ 0.04090739041566849,
454
+ 0.02467496506869793,
455
+ 0.03506144881248474,
456
+ 0.023822205141186714,
457
+ 0.024419376626610756,
458
+ 0.029216334223747253,
459
+ 0.03664829209446907,
460
+ 0.033825285732746124,
461
+ 0.0254127848893404,
462
+ 0.029008716344833374,
463
+ 0.032890621572732925,
464
+ 0.036917805671691895,
465
+ 0.03223109617829323,
466
+ 0.02562890388071537,
467
+ 0.029643472284078598,
468
+ 0.03162688761949539,
469
+ 0.04186885803937912,
470
+ 0.028349589556455612,
471
+ 0.03455352038145065,
472
+ 0.027812888845801353,
473
+ 0.024674389511346817,
474
+ 0.03943091258406639,
475
+ 0.022551996633410454
476
  ],
477
  "lr": [
478
+ 8.376068376068378e-06,
479
+ 1.6923076923076924e-05,
480
+ 2.5470085470085475e-05,
481
+ 3.401709401709402e-05,
482
+ 3.99996141174052e-05,
483
+ 3.9992754396617386e-05,
484
+ 3.997732289238075e-05,
485
+ 3.9953326220867826e-05,
486
+ 3.99207746705195e-05,
487
+ 3.987968219763389e-05,
488
+ 3.9830066420382645e-05,
489
+ 3.97719486112573e-05,
490
+ 3.9705353687948734e-05,
491
+ 3.9630310202663935e-05,
492
+ 3.9546850329884316e-05,
493
+ 3.945500985257116e-05,
494
+ 3.9354828146823805e-05,
495
+ 3.924634816499739e-05,
496
+ 3.9129616417287294e-05,
497
+ 3.900468295178809e-05,
498
+ 3.887160133303572e-05,
499
+ 3.8730428619042037e-05,
500
+ 3.858122533683144e-05,
501
+ 3.842405545649026e-05,
502
+ 3.825898636373997e-05,
503
+ 3.808608883104587e-05,
504
+ 3.790543698727386e-05,
505
+ 3.7717108285908e-05,
506
+ 3.752118347184284e-05,
507
+ 3.7317746546764446e-05,
508
+ 3.710688473313514e-05,
509
+ 3.688868843679738e-05,
510
+ 3.666325120821272e-05,
511
+ 3.643066970235256e-05,
512
+ 3.619104363725791e-05,
513
+ 3.5944475751285765e-05,
514
+ 3.569107175906064e-05,
515
+ 3.5430940306149985e-05,
516
+ 3.516419292248301e-05,
517
+ 3.489094397453285e-05,
518
+ 3.461131061628253e-05,
519
+ 3.4325412738995875e-05,
520
+ 3.403337291981479e-05,
521
+ 3.373531636920496e-05,
522
+ 3.343137087727258e-05,
523
+ 3.3121666758975014e-05,
524
+ 3.280633679824903e-05,
525
+ 3.248551619108036e-05,
526
+ 3.21593424875392e-05,
527
+ 3.1827955532806334e-05,
528
+ 3.149149740721537e-05,
529
+ 3.115011236533647e-05,
530
+ 3.080394677412806e-05,
531
+ 3.045314905018279e-05,
532
+ 3.0097869596094755e-05,
533
+ 2.9738260735975154e-05,
534
+ 2.9374476650144227e-05,
535
+ 2.9006673309027263e-05,
536
+ 2.8635008406283132e-05,
537
+ 2.8259641291194015e-05,
538
+ 2.7880732900345262e-05,
539
+ 2.7498445688624725e-05,
540
+ 2.711294355957111e-05,
541
+ 2.6724391795101242e-05,
542
+ 2.633295698464635e-05,
543
+ 2.5938806953727778e-05,
544
+ 2.5542110692002687e-05,
545
+ 2.5143038280810704e-05,
546
+ 2.4741760820252464e-05,
547
+ 2.433845035583137e-05,
548
+ 2.3933279804690064e-05,
549
+ 2.352642288147312e-05,
550
+ 2.3118054023847876e-05,
551
+ 2.2708348317715223e-05,
552
+ 2.2297481422142487e-05,
553
+ 2.188562949405059e-05,
554
+ 2.1472969112687716e-05,
555
+ 2.1059677203921957e-05,
556
+ 2.064593096438528e-05,
557
+ 2.0231907785501493e-05,
558
+ 1.9817785177430605e-05,
559
+ 1.9403740692962372e-05,
560
+ 1.898995185139145e-05,
561
+ 1.8576596062407023e-05,
562
+ 1.816385055002938e-05,
563
+ 1.7751892276626043e-05,
564
+ 1.7340897867040178e-05,
565
+ 1.6931043532863625e-05,
566
+ 1.6522504996887168e-05,
567
+ 1.611545741776037e-05,
568
+ 1.5710075314893218e-05,
569
+ 1.5306532493631954e-05,
570
+ 1.4905001970740918e-05,
571
+ 1.4505655900222602e-05,
572
+ 1.4108665499507574e-05,
573
+ 1.371420097604592e-05,
574
+ 1.332243145433177e-05,
575
+ 1.2933524903392054e-05,
576
+ 1.2547648064770774e-05,
577
+ 1.2164966381039404e-05,
578
+ 1.178564392486436e-05,
579
+ 1.140984332866172e-05,
580
+ 1.1037725714869483e-05,
581
+ 1.066945062686719e-05,
582
+ 1.0305175960572616e-05,
583
+ 9.945057896744699e-06,
584
+ 9.589250834021969e-06,
585
+ 9.237907322724944e-06,
586
+ 8.891177999451028e-06,
587
+ 8.549211522489898e-06,
588
+ 8.212154508087055e-06,
589
+ 7.88015146758299e-06,
590
+ 7.553344745454641e-06,
591
+ 7.231874458286057e-06,
592
+ 6.915878434694157e-06,
593
+ 6.605492156235467e-06,
594
+ 6.30084869931916e-06,
595
+ 6.002078678151244e-06,
596
+ 5.709310188734507e-06,
597
+ 5.422668753947975e-06,
598
+ 5.14227726972974e-06,
599
+ 4.868255952385965e-06,
600
+ 4.600722287048818e-06,
601
+ 4.339790977305362e-06,
602
+ 4.085573896019013e-06,
603
+ 3.838180037364703e-06,
604
+ 3.5977154700981752e-06,
605
+ 3.364283292079631e-06,
606
+ 3.137983586071065e-06,
607
+ 2.9189133768263488e-06,
608
+ 2.707166589492387e-06,
609
+ 2.5028340093392257e-06,
610
+ 2.3060032428363876e-06,
611
+ 2.1167586800920613e-06,
612
+ 1.9351814586713113e-06,
613
+ 1.7613494288088008e-06,
614
+ 1.5953371200309199e-06,
615
+ 1.437215709201667e-06,
616
+ 1.2870529900059636e-06,
617
+ 1.1449133438834802e-06,
618
+ 1.0108577124254482e-06,
619
+ 8.849435712462972e-07,
620
+ 7.67224905341275e-07,
621
+ 6.57752185940721e-07,
622
+ 5.565723488707586e-07,
623
+ 4.637287744298502e-07,
624
+ 3.79261268789719e-07,
625
+ 3.032060469286724e-07,
626
+ 2.3559571710463747e-07,
627
+ 1.7645926687452908e-07,
628
+ 1.2582205066603127e-07,
629
+ 8.370577890698173e-08,
630
+ 5.012850871717989e-08,
631
+ 2.5104636166479735e-08,
632
+ 8.64489010255598e-09,
633
+ 7.563275509769874e-10
634
  ],
635
  "eval_step": [
636
  390,
 
677
  20
678
  ],
679
  "eval_accuracy": [
680
+ 0.0044444444444444444,
681
+ 0.07111111111111111,
682
+ 0.3566666666666667,
683
+ 0.5133333333333333,
684
+ 0.6433333333333333,
685
+ 0.6266666666666667,
686
+ 0.6777777777777778,
687
+ 0.6422222222222222,
688
+ 0.6966666666666667,
689
+ 0.7366666666666667,
690
+ 0.75,
691
+ 0.7633333333333333,
692
+ 0.7722222222222223,
693
+ 0.7922222222222223,
694
+ 0.8388888888888889,
695
+ 0.8188888888888889,
696
+ 0.84,
697
+ 0.8522222222222222,
698
+ 0.8511111111111112,
699
+ 0.8488888888888889
700
  ]
701
  },
702
+ "final_accuracy": 0.8179166666666666,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
706
  "K": null,
707
  "mode": "sft",
708
  "n_digits": 6,
709
+ "n_per_split": 100
710
  },
711
  "splits": {
712
  "add_S0": {
713
+ "full_accuracy": 0.95,
714
+ "n_examples": 100,
715
  "per_subtask": {
716
  "SA": {
717
+ "accuracy": 0.9917355371900827,
718
+ "count": 605
719
  },
720
  "SS": {
721
  "accuracy": 1.0,
722
+ "count": 95
723
  }
724
  }
725
  },
726
  "add_S1": {
727
  "full_accuracy": 1.0,
728
+ "n_examples": 100,
729
  "per_subtask": {
730
  "SA": {
731
  "accuracy": 1.0,
732
+ "count": 204
733
  },
734
  "SC": {
735
  "accuracy": 1.0,
736
+ "count": 169
737
  },
738
  "SS": {
739
  "accuracy": 1.0,
740
+ "count": 31
741
  },
742
  "UC": {
743
  "accuracy": 1.0,
744
+ "count": 296
745
  }
746
  }
747
  },
748
  "add_S2": {
749
+ "full_accuracy": 0.91,
750
+ "n_examples": 100,
751
  "per_subtask": {
752
  "SA": {
753
+ "accuracy": 0.9938650306748467,
754
+ "count": 163
755
  },
756
  "SC": {
757
+ "accuracy": 0.9923076923076923,
758
+ "count": 130
759
  },
760
  "SS": {
761
  "accuracy": 1.0,
762
+ "count": 87
763
  },
764
  "UC": {
765
+ "accuracy": 0.9655172413793104,
766
+ "count": 203
767
  },
768
  "US": {
769
  "accuracy": 1.0,
770
+ "count": 117
771
  }
772
  }
773
  },
774
  "add_S3": {
775
+ "full_accuracy": 0.87,
776
+ "n_examples": 100,
777
  "per_subtask": {
778
  "SA": {
779
  "accuracy": 1.0,
780
+ "count": 121
781
  },
782
  "SC": {
783
  "accuracy": 1.0,
784
+ "count": 121
785
  },
786
  "SS": {
787
  "accuracy": 1.0,
788
+ "count": 49
789
  },
790
  "UC": {
791
+ "accuracy": 0.9354838709677419,
792
+ "count": 186
793
  },
794
  "US": {
795
+ "accuracy": 0.9820627802690582,
796
+ "count": 223
797
  }
798
  }
799
  },
800
  "add_S4": {
801
+ "full_accuracy": 0.71,
802
+ "n_examples": 100,
803
  "per_subtask": {
804
  "SA": {
805
  "accuracy": 1.0,
806
+ "count": 104
807
  },
808
  "SC": {
809
  "accuracy": 1.0,
810
+ "count": 106
811
  },
812
  "SS": {
813
  "accuracy": 1.0,
814
+ "count": 23
815
  },
816
  "UC": {
817
+ "accuracy": 0.85,
818
+ "count": 160
819
  },
820
  "US": {
821
+ "accuracy": 0.9641693811074918,
822
+ "count": 307
823
  }
824
  }
825
  },
826
  "add_S5": {
827
+ "full_accuracy": 0.59,
828
+ "n_examples": 100,
829
  "per_subtask": {
830
  "SA": {
831
  "accuracy": 1.0,
832
+ "count": 100
833
  },
834
  "SC": {
835
  "accuracy": 1.0,
836
+ "count": 100
837
  },
838
  "UC": {
839
+ "accuracy": 0.72,
840
+ "count": 100
841
  },
842
  "US": {
843
+ "accuracy": 0.8775,
844
+ "count": 400
845
  }
846
  }
847
  },
848
  "add_S6": {
849
+ "full_accuracy": 0.74,
850
+ "n_examples": 100,
851
  "per_subtask": {
852
  "SC": {
853
  "accuracy": 1.0,
854
+ "count": 100
855
  },
856
  "UC": {
857
+ "accuracy": 0.85,
858
+ "count": 100
859
  },
860
  "US": {
861
+ "accuracy": 0.882,
862
+ "count": 500
863
  }
864
  }
865
  },
866
  "add_random": {
867
+ "full_accuracy": 0.95,
868
  "n_examples": 200,
869
  "per_subtask": {
870
  "SA": {
871
+ "accuracy": 0.9865771812080537,
872
+ "count": 447
873
  },
874
  "SC": {
875
+ "accuracy": 0.996875,
876
+ "count": 320
877
  },
878
  "SS": {
879
  "accuracy": 1.0,
880
+ "count": 56
881
  },
882
  "UC": {
883
+ "accuracy": 0.9924385633270322,
884
+ "count": 529
885
  },
886
  "US": {
887
  "accuracy": 1.0,
888
+ "count": 48
889
  }
890
  }
891
  },
892
  "add_C3": {
893
+ "full_accuracy": 0.84,
894
+ "n_examples": 100,
895
  "per_subtask": {
896
  "SA": {
897
  "accuracy": 1.0,
898
+ "count": 300
899
  },
900
  "SC": {
901
  "accuracy": 1.0,
902
+ "count": 100
903
  },
904
  "UC": {
905
+ "accuracy": 0.917098445595855,
906
+ "count": 193
907
  },
908
  "US": {
909
  "accuracy": 1.0,
910
+ "count": 107
911
  }
912
  }
913
  },
914
  "add_C4": {
915
  "full_accuracy": 0.9,
916
+ "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
919
  "accuracy": 1.0,
920
+ "count": 200
921
  },
922
  "SC": {
923
  "accuracy": 1.0,
924
+ "count": 100
925
  },
926
  "UC": {
927
+ "accuracy": 0.9609375,
928
+ "count": 256
929
  },
930
  "US": {
931
+ "accuracy": 1.0,
932
+ "count": 144
933
  }
934
  }
935
  },
936
  "add_C5": {
937
+ "full_accuracy": 0.89,
938
+ "n_examples": 100,
939
  "per_subtask": {
940
  "SA": {
941
  "accuracy": 1.0,
942
+ "count": 100
943
  },
944
  "SC": {
945
  "accuracy": 1.0,
946
+ "count": 100
947
  },
948
  "UC": {
949
+ "accuracy": 0.9640522875816994,
950
+ "count": 306
951
  },
952
  "US": {
953
  "accuracy": 1.0,
954
+ "count": 194
955
  }
956
  }
957
  },
958
  "add_C6": {
959
+ "full_accuracy": 0.9,
960
+ "n_examples": 100,
961
  "per_subtask": {
962
  "SC": {
963
  "accuracy": 1.0,
964
+ "count": 100
965
  },
966
  "UC": {
967
+ "accuracy": 0.9754098360655737,
968
+ "count": 366
969
  },
970
  "US": {
971
+ "accuracy": 0.9957264957264957,
972
+ "count": 234
973
  }
974
  }
975
  },
976
  "sub_M0": {
977
+ "full_accuracy": 0.99,
978
+ "n_examples": 100,
979
  "per_subtask": {
980
  "MD": {
981
+ "accuracy": 0.9983361064891847,
982
+ "count": 601
983
  },
984
  "ME": {
985
  "accuracy": 1.0,
986
+ "count": 99
987
  }
988
  }
989
  },
990
  "sub_M1": {
991
+ "full_accuracy": 0.98,
992
+ "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
995
+ "accuracy": 0.996415770609319,
996
+ "count": 279
997
  },
998
  "MB": {
999
+ "accuracy": 0.993103448275862,
1000
+ "count": 145
1001
  },
1002
  "ME": {
1003
  "accuracy": 1.0,
1004
+ "count": 24
1005
  },
1006
  "UB": {
1007
  "accuracy": 1.0,
1008
+ "count": 252
1009
  }
1010
  }
1011
  },
1012
  "sub_M2": {
1013
+ "full_accuracy": 0.92,
1014
+ "n_examples": 100,
1015
  "per_subtask": {
1016
  "MD": {
1017
+ "accuracy": 0.9953051643192489,
1018
+ "count": 213
1019
  },
1020
  "MB": {
1021
+ "accuracy": 0.9911504424778761,
1022
+ "count": 113
1023
  },
1024
  "ME": {
1025
  "accuracy": 1.0,
1026
+ "count": 85
1027
  },
1028
  "UB": {
1029
+ "accuracy": 0.9668508287292817,
1030
+ "count": 181
1031
  },
1032
  "UD": {
1033
  "accuracy": 1.0,
1034
+ "count": 108
1035
  }
1036
  }
1037
  },
1038
  "sub_M3": {
1039
+ "full_accuracy": 0.8,
1040
+ "n_examples": 100,
1041
  "per_subtask": {
1042
  "MD": {
1043
  "accuracy": 1.0,
1044
+ "count": 179
1045
  },
1046
  "MB": {
1047
  "accuracy": 1.0,
1048
+ "count": 103
1049
  },
1050
  "ME": {
1051
  "accuracy": 1.0,
1052
+ "count": 56
1053
  },
1054
  "UB": {
1055
+ "accuracy": 0.8657718120805369,
1056
+ "count": 149
1057
  },
1058
  "UD": {
1059
  "accuracy": 1.0,
1060
+ "count": 213
1061
  }
1062
  }
1063
  },
1064
  "sub_M4": {
1065
+ "full_accuracy": 0.4,
1066
+ "n_examples": 100,
1067
  "per_subtask": {
1068
  "MD": {
1069
  "accuracy": 1.0,
1070
+ "count": 200
1071
  },
1072
  "MB": {
1073
  "accuracy": 1.0,
1074
+ "count": 100
1075
  },
1076
  "UB": {
1077
+ "accuracy": 0.44,
1078
+ "count": 100
1079
  },
1080
  "UD": {
1081
+ "accuracy": 0.97,
1082
+ "count": 300
1083
  }
1084
  }
1085
  },
1086
  "sub_M5": {
1087
+ "full_accuracy": 0.08,
1088
+ "n_examples": 100,
1089
  "per_subtask": {
1090
  "MD": {
1091
  "accuracy": 1.0,
1092
+ "count": 100
1093
  },
1094
  "MB": {
1095
  "accuracy": 1.0,
1096
+ "count": 100
1097
  },
1098
  "UB": {
1099
+ "accuracy": 0.25,
1100
+ "count": 100
1101
  },
1102
  "UD": {
1103
+ "accuracy": 0.6875,
1104
+ "count": 400
1105
  }
1106
  }
1107
  },
1108
  "sub_random": {
1109
+ "full_accuracy": 0.965,
1110
  "n_examples": 200,
1111
  "per_subtask": {
1112
  "MD": {
1113
+ "accuracy": 0.9983333333333333,
1114
+ "count": 600
1115
  },
1116
  "MB": {
1117
+ "accuracy": 0.9925093632958801,
1118
+ "count": 267
1119
  },
1120
  "ME": {
1121
  "accuracy": 1.0,
1122
+ "count": 53
1123
  },
1124
  "UB": {
1125
+ "accuracy": 0.9908883826879271,
1126
+ "count": 439
1127
  },
1128
  "UD": {
1129
  "accuracy": 1.0,
1130
+ "count": 41
1131
  }
1132
  }
1133
  },
1134
  "sub_B3": {
1135
+ "full_accuracy": 0.89,
1136
+ "n_examples": 100,
1137
  "per_subtask": {
1138
  "MD": {
1139
+ "accuracy": 0.9966666666666667,
1140
+ "count": 300
1141
  },
1142
  "MB": {
1143
  "accuracy": 1.0,
1144
+ "count": 100
1145
  },
1146
  "UB": {
1147
+ "accuracy": 0.949238578680203,
1148
+ "count": 197
1149
  },
1150
  "UD": {
1151
  "accuracy": 1.0,
1152
+ "count": 103
1153
  }
1154
  }
1155
  },
1156
  "sub_B4": {
1157
+ "full_accuracy": 0.74,
1158
+ "n_examples": 100,
1159
  "per_subtask": {
1160
  "MD": {
1161
  "accuracy": 1.0,
1162
+ "count": 200
1163
  },
1164
  "MB": {
1165
  "accuracy": 1.0,
1166
+ "count": 100
1167
  },
1168
  "UB": {
1169
+ "accuracy": 0.8947368421052632,
1170
+ "count": 247
1171
  },
1172
  "UD": {
1173
+ "accuracy": 0.9934640522875817,
1174
+ "count": 153
1175
  }
1176
  }
1177
  },
1178
  "sub_B5": {
1179
+ "full_accuracy": 0.7,
1180
+ "n_examples": 100,
1181
  "per_subtask": {
1182
  "MD": {
1183
  "accuracy": 1.0,
1184
+ "count": 100
1185
  },
1186
  "MB": {
1187
  "accuracy": 1.0,
1188
+ "count": 100
1189
  },
1190
  "UB": {
1191
+ "accuracy": 0.9060402684563759,
1192
+ "count": 298
1193
  },
1194
  "UD": {
1195
+ "accuracy": 0.9653465346534653,
1196
+ "count": 202
1197
  }
1198
  }
1199
  }
1200
  },
1201
  "summary": {
1202
+ "overall_accuracy": 0.8179166666666666,
1203
+ "total_examples": 2400,
1204
  "n_splits": 22
1205
  }
1206
  }
add_sub_baseline_25K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bf525d5b1ad9ce7ab59b46e66c0d4b353f9086c36262ae393e08d901ea70fb2
3
  size 650266922
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:795da9d34e3babd627a021a17d4593d73d1e588cccc1e221657909955751d349
3
  size 650266922
add_sub_baseline_25K/train_config.json CHANGED
@@ -1,35 +1,84 @@
1
  {
2
- "mode": "baseline",
3
- "ops": "add_sub",
4
- "n_digits": 6,
5
- "n_layer": 2,
6
- "n_head": 3,
7
- "n_embd": 510,
8
- "abs_vocab": 0,
9
  "K": 4,
 
 
 
 
 
 
10
  "alpha_info_gain": 10.0,
11
  "alpha_abs": 0.1,
12
  "alpha_soft_zipf": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  "batch_size": 64,
 
14
  "num_epochs": 20,
15
- "dataset_size": 25000,
16
- "lr": 8e-05,
 
 
 
17
  "output_dir": "ckpt/sweep/add_sub_baseline_25K",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  "device": "cuda",
19
  "push_to_hub": true,
20
  "no_wandb": false,
21
  "n_params": 162490082,
22
  "run_name": "add_sub_baseline_25K",
23
- "git_commit": "800625019270114adcda289bbd550c4f1109a514",
24
- "timestamp": "2026-04-12T01:58:10.719272+00:00",
25
  "tokenizer": "Qwen/Qwen3-0.6B",
26
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
27
  "dataset_config": "add_sub_6digit",
28
  "model_repo": "thoughtworks/arithmetic-sorl",
29
  "trainer_version": "sft",
30
- "wandb_run_id": "hccg8mv4",
31
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/hccg8mv4",
32
- "final_accuracy": 0.9207142857142857,
33
- "sft_accuracy": 0.9207142857142857,
34
  "eval_method": "ArithmeticEvaluator"
35
  }
 
1
  {
2
+ "num_rollouts": 4,
 
 
 
 
 
 
3
  "K": 4,
4
+ "max_iterations": 2,
5
+ "memory_span_abs": 1792,
6
+ "memory_span_traj": 1792,
7
+ "temperature": 1.0,
8
+ "ar_search": false,
9
+ "response_only_abs": false,
10
  "alpha_info_gain": 10.0,
11
  "alpha_abs": 0.1,
12
  "alpha_soft_zipf": 1.0,
13
+ "alpha_ortho": 0.0,
14
+ "alpha_anchor": 0.0,
15
+ "alpha_jacobi": 0.0,
16
+ "decay": 0.8,
17
+ "target_vocab_util": 0.8,
18
+ "min_abs_ppl": 0.0,
19
+ "zipf_alpha": 1.0,
20
+ "lr": 4e-05,
21
+ "emb_lr_mult": 1.0,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 234,
24
+ "cooldown_frac": 0.4,
25
+ "max_grad_norm": 1.0,
26
+ "vq_abs_pretrain_steps": 0,
27
+ "vq_abs_pretrain_lr": 0.001,
28
+ "vq_abs_pretrain_layer": -1,
29
+ "vq_abs_pretrain_batch_size": 256,
30
+ "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
+ "gradient_accumulation_steps": 1,
33
  "num_epochs": 20,
34
+ "emb_warmup_steps": 0,
35
+ "log_every": 50,
36
+ "eval_every": 390,
37
+ "save_every": 999999,
38
+ "eval_samples": 100,
39
  "output_dir": "ckpt/sweep/add_sub_baseline_25K",
40
+ "eval_K": 4,
41
+ "alpha_traj": 0.0,
42
+ "corrupt_method": "shuffle",
43
+ "corrupt_ratio": 0.3,
44
+ "alpha_contrastive": 1.0,
45
+ "gamma_contrastive": 0.5,
46
+ "alpha_masked_traj": 0.0,
47
+ "mask_nl_ratio": 0.3,
48
+ "mask_nl_mode": "fixed",
49
+ "mask_nl_fixed_id": 0,
50
+ "use_ste": true,
51
+ "n_inner": 1,
52
+ "random_K": null,
53
+ "strip_suffix": null,
54
+ "compress_prefix": null,
55
+ "random_mem_span": null,
56
+ "warmup_ratio": 0.03,
57
+ "beta2": 0.999,
58
+ "seed": 42,
59
+ "n_digits": 6,
60
+ "n_layer": 2,
61
+ "n_head": 3,
62
+ "n_embd": 510,
63
+ "ops": "add_sub",
64
+ "abs_vocab": 0,
65
+ "dataset_size": 25000,
66
+ "mode": "baseline",
67
  "device": "cuda",
68
  "push_to_hub": true,
69
  "no_wandb": false,
70
  "n_params": 162490082,
71
  "run_name": "add_sub_baseline_25K",
72
+ "git_commit": "78d46f8665a87f4b44bd5894bd34f393f2dea51f",
73
+ "timestamp": "2026-04-12T08:59:11.035906+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
+ "wandb_run_id": "4f30dkzf",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/4f30dkzf",
81
+ "final_accuracy": 0.8179166666666666,
82
+ "sft_accuracy": 0.8179166666666666,
83
  "eval_method": "ArithmeticEvaluator"
84
  }