amirali1985 commited on
Commit
a6a1ec1
·
verified ·
1 Parent(s): c6eee08

Upload add_sub_baseline_25K_1L3H510d

Browse files
add_sub_baseline_25K_1L3H510d/metrics.json CHANGED
@@ -159,478 +159,478 @@
159
  7800
160
  ],
161
  "loss": [
162
- 10.19568157196045,
163
- 7.192863464355469,
164
- 6.217032432556152,
165
- 4.67427921295166,
166
- 2.9682857990264893,
167
- 2.0980846881866455,
168
- 1.8946998119354248,
169
- 1.9396218061447144,
170
- 1.8249216079711914,
171
- 1.7901777029037476,
172
- 1.7934296131134033,
173
- 1.7734571695327759,
174
- 1.7680686712265015,
175
- 1.6811468601226807,
176
- 1.6904586553573608,
177
- 1.6290004253387451,
178
- 1.5940027236938477,
179
- 1.5358794927597046,
180
- 1.278952717781067,
181
- 1.1427963972091675,
182
- 0.9103876352310181,
183
- 0.7655139565467834,
184
- 0.7879643440246582,
185
- 0.704889714717865,
186
- 0.7027565240859985,
187
- 0.7044745683670044,
188
- 0.6972912549972534,
189
- 0.6401001811027527,
190
- 0.6621316075325012,
191
- 0.6303651928901672,
192
- 0.5821676254272461,
193
- 0.6099020838737488,
194
- 0.5351438522338867,
195
- 0.5087913870811462,
196
- 0.541701078414917,
197
- 0.47329190373420715,
198
- 0.4839247167110443,
199
- 0.47691941261291504,
200
- 0.4968743920326233,
201
- 0.45558029413223267,
202
- 0.4464017450809479,
203
- 0.45957401394844055,
204
- 0.4282552897930145,
205
- 0.41863182187080383,
206
- 0.4324481785297394,
207
- 0.3976905047893524,
208
- 0.3744252026081085,
209
- 0.3938696086406708,
210
- 0.36305326223373413,
211
- 0.3572494089603424,
212
- 0.39805254340171814,
213
- 0.38645902276039124,
214
- 0.3354526162147522,
215
- 0.3297174274921417,
216
- 0.37510284781455994,
217
- 0.3538384437561035,
218
- 0.3419480621814728,
219
- 0.31991156935691833,
220
- 0.30549606680870056,
221
- 0.3192532956600189,
222
- 0.29912063479423523,
223
- 0.3299557864665985,
224
- 0.3237224221229553,
225
- 0.3043464720249176,
226
- 0.29594531655311584,
227
- 0.2924143970012665,
228
- 0.3392622172832489,
229
- 0.29438456892967224,
230
- 0.2825162708759308,
231
- 0.2688557803630829,
232
- 0.27867329120635986,
233
- 0.26582691073417664,
234
- 0.2573307156562805,
235
- 0.2639952301979065,
236
- 0.2442159354686737,
237
- 0.30415433645248413,
238
- 0.28714659810066223,
239
- 0.2530474364757538,
240
- 0.2529540956020355,
241
- 0.26841235160827637,
242
- 0.2817437946796417,
243
- 0.25053316354751587,
244
- 0.2483910471200943,
245
- 0.2930961549282074,
246
- 0.28800302743911743,
247
- 0.25981736183166504,
248
- 0.23965764045715332,
249
- 0.24082252383232117,
250
- 0.2665214538574219,
251
- 0.24315674602985382,
252
- 0.23547802865505219,
253
- 0.2577818036079407,
254
- 0.2551196217536926,
255
- 0.2866048812866211,
256
- 0.22905278205871582,
257
- 0.2948581576347351,
258
- 0.22631767392158508,
259
- 0.23360957205295563,
260
- 0.23361513018608093,
261
- 0.2707379162311554,
262
- 0.2082580029964447,
263
- 0.259878545999527,
264
- 0.23401972651481628,
265
- 0.22835519909858704,
266
- 0.24494406580924988,
267
- 0.2527218461036682,
268
- 0.23783628642559052,
269
- 0.2323145866394043,
270
- 0.2299260050058365,
271
- 0.22768929600715637,
272
- 0.22232380509376526,
273
- 0.23369023203849792,
274
- 0.2056972235441208,
275
- 0.19805261492729187,
276
- 0.2061125934123993,
277
- 0.22895146906375885,
278
- 0.2325371354818344,
279
- 0.20736224949359894,
280
- 0.2153971642255783,
281
- 0.22081783413887024,
282
- 0.21203044056892395,
283
- 0.21354515850543976,
284
- 0.24314843118190765,
285
- 0.21091873943805695,
286
- 0.22135458886623383,
287
- 0.22974269092082977,
288
- 0.20833232998847961,
289
- 0.20342501997947693,
290
- 0.21650569140911102,
291
- 0.2234790027141571,
292
- 0.21649529039859772,
293
- 0.20806384086608887,
294
- 0.201065331697464,
295
- 0.21765287220478058,
296
- 0.199764221906662,
297
- 0.21314558386802673,
298
- 0.18189337849617004,
299
- 0.1983877569437027,
300
- 0.20944280922412872,
301
- 0.20895661413669586,
302
- 0.20532403886318207,
303
- 0.20170608162879944,
304
- 0.2062072455883026,
305
- 0.21582531929016113,
306
- 0.20745429396629333,
307
- 0.22070619463920593,
308
- 0.20926928520202637,
309
- 0.18736205995082855,
310
- 0.19164054095745087,
311
- 0.2283153533935547,
312
- 0.19645021855831146,
313
- 0.2316746711730957,
314
- 0.21589398384094238,
315
- 0.19139589369297028,
316
- 0.218153715133667,
317
- 0.2088916301727295
318
  ],
319
  "base_loss": [
320
- 10.19568157196045,
321
- 7.192863464355469,
322
- 6.217032432556152,
323
- 4.67427921295166,
324
- 2.9682857990264893,
325
- 2.0980846881866455,
326
- 1.8946998119354248,
327
- 1.9396218061447144,
328
- 1.8249216079711914,
329
- 1.7901777029037476,
330
- 1.7934296131134033,
331
- 1.7734571695327759,
332
- 1.7680686712265015,
333
- 1.6811468601226807,
334
- 1.6904586553573608,
335
- 1.6290004253387451,
336
- 1.5940027236938477,
337
- 1.5358794927597046,
338
- 1.278952717781067,
339
- 1.1427963972091675,
340
- 0.9103876352310181,
341
- 0.7655139565467834,
342
- 0.7879643440246582,
343
- 0.704889714717865,
344
- 0.7027565240859985,
345
- 0.7044745683670044,
346
- 0.6972912549972534,
347
- 0.6401001811027527,
348
- 0.6621316075325012,
349
- 0.6303651928901672,
350
- 0.5821676254272461,
351
- 0.6099020838737488,
352
- 0.5351438522338867,
353
- 0.5087913870811462,
354
- 0.541701078414917,
355
- 0.47329190373420715,
356
- 0.4839247167110443,
357
- 0.47691941261291504,
358
- 0.4968743920326233,
359
- 0.45558029413223267,
360
- 0.4464017450809479,
361
- 0.45957401394844055,
362
- 0.4282552897930145,
363
- 0.41863182187080383,
364
- 0.4324481785297394,
365
- 0.3976905047893524,
366
- 0.3744252026081085,
367
- 0.3938696086406708,
368
- 0.36305326223373413,
369
- 0.3572494089603424,
370
- 0.39805254340171814,
371
- 0.38645902276039124,
372
- 0.3354526162147522,
373
- 0.3297174274921417,
374
- 0.37510284781455994,
375
- 0.3538384437561035,
376
- 0.3419480621814728,
377
- 0.31991156935691833,
378
- 0.30549606680870056,
379
- 0.3192532956600189,
380
- 0.29912063479423523,
381
- 0.3299557864665985,
382
- 0.3237224221229553,
383
- 0.3043464720249176,
384
- 0.29594531655311584,
385
- 0.2924143970012665,
386
- 0.3392622172832489,
387
- 0.29438456892967224,
388
- 0.2825162708759308,
389
- 0.2688557803630829,
390
- 0.27867329120635986,
391
- 0.26582691073417664,
392
- 0.2573307156562805,
393
- 0.2639952301979065,
394
- 0.2442159354686737,
395
- 0.30415433645248413,
396
- 0.28714659810066223,
397
- 0.2530474364757538,
398
- 0.2529540956020355,
399
- 0.26841235160827637,
400
- 0.2817437946796417,
401
- 0.25053316354751587,
402
- 0.2483910471200943,
403
- 0.2930961549282074,
404
- 0.28800302743911743,
405
- 0.25981736183166504,
406
- 0.23965764045715332,
407
- 0.24082252383232117,
408
- 0.2665214538574219,
409
- 0.24315674602985382,
410
- 0.23547802865505219,
411
- 0.2577818036079407,
412
- 0.2551196217536926,
413
- 0.2866048812866211,
414
- 0.22905278205871582,
415
- 0.2948581576347351,
416
- 0.22631767392158508,
417
- 0.23360957205295563,
418
- 0.23361513018608093,
419
- 0.2707379162311554,
420
- 0.2082580029964447,
421
- 0.259878545999527,
422
- 0.23401972651481628,
423
- 0.22835519909858704,
424
- 0.24494406580924988,
425
- 0.2527218461036682,
426
- 0.23783628642559052,
427
- 0.2323145866394043,
428
- 0.2299260050058365,
429
- 0.22768929600715637,
430
- 0.22232380509376526,
431
- 0.23369023203849792,
432
- 0.2056972235441208,
433
- 0.19805261492729187,
434
- 0.2061125934123993,
435
- 0.22895146906375885,
436
- 0.2325371354818344,
437
- 0.20736224949359894,
438
- 0.2153971642255783,
439
- 0.22081783413887024,
440
- 0.21203044056892395,
441
- 0.21354515850543976,
442
- 0.24314843118190765,
443
- 0.21091873943805695,
444
- 0.22135458886623383,
445
- 0.22974269092082977,
446
- 0.20833232998847961,
447
- 0.20342501997947693,
448
- 0.21650569140911102,
449
- 0.2234790027141571,
450
- 0.21649529039859772,
451
- 0.20806384086608887,
452
- 0.201065331697464,
453
- 0.21765287220478058,
454
- 0.199764221906662,
455
- 0.21314558386802673,
456
- 0.18189337849617004,
457
- 0.1983877569437027,
458
- 0.20944280922412872,
459
- 0.20895661413669586,
460
- 0.20532403886318207,
461
- 0.20170608162879944,
462
- 0.2062072455883026,
463
- 0.21582531929016113,
464
- 0.20745429396629333,
465
- 0.22070619463920593,
466
- 0.20926928520202637,
467
- 0.18736205995082855,
468
- 0.19164054095745087,
469
- 0.2283153533935547,
470
- 0.19645021855831146,
471
- 0.2316746711730957,
472
- 0.21589398384094238,
473
- 0.19139589369297028,
474
- 0.218153715133667,
475
- 0.2088916301727295
476
  ],
477
  "lr": [
478
- 8.376068376068378e-06,
479
- 1.6923076923076924e-05,
480
- 2.5470085470085475e-05,
481
- 3.401709401709402e-05,
482
- 3.99996141174052e-05,
483
- 3.9992754396617386e-05,
484
- 3.997732289238075e-05,
485
- 3.9953326220867826e-05,
486
- 3.99207746705195e-05,
487
- 3.987968219763389e-05,
488
- 3.9830066420382645e-05,
489
- 3.97719486112573e-05,
490
- 3.9705353687948734e-05,
491
- 3.9630310202663935e-05,
492
- 3.9546850329884316e-05,
493
- 3.945500985257116e-05,
494
- 3.9354828146823805e-05,
495
- 3.924634816499739e-05,
496
- 3.9129616417287294e-05,
497
- 3.900468295178809e-05,
498
- 3.887160133303572e-05,
499
- 3.8730428619042037e-05,
500
- 3.858122533683144e-05,
501
- 3.842405545649026e-05,
502
- 3.825898636373997e-05,
503
- 3.808608883104587e-05,
504
- 3.790543698727386e-05,
505
- 3.7717108285908e-05,
506
- 3.752118347184284e-05,
507
- 3.7317746546764446e-05,
508
- 3.710688473313514e-05,
509
- 3.688868843679738e-05,
510
- 3.666325120821272e-05,
511
- 3.643066970235256e-05,
512
- 3.619104363725791e-05,
513
- 3.5944475751285765e-05,
514
- 3.569107175906064e-05,
515
- 3.5430940306149985e-05,
516
- 3.516419292248301e-05,
517
- 3.489094397453285e-05,
518
- 3.461131061628253e-05,
519
- 3.4325412738995875e-05,
520
- 3.403337291981479e-05,
521
- 3.373531636920496e-05,
522
- 3.343137087727258e-05,
523
- 3.3121666758975014e-05,
524
- 3.280633679824903e-05,
525
- 3.248551619108036e-05,
526
- 3.21593424875392e-05,
527
- 3.1827955532806334e-05,
528
- 3.149149740721537e-05,
529
- 3.115011236533647e-05,
530
- 3.080394677412806e-05,
531
- 3.045314905018279e-05,
532
- 3.0097869596094755e-05,
533
- 2.9738260735975154e-05,
534
- 2.9374476650144227e-05,
535
- 2.9006673309027263e-05,
536
- 2.8635008406283132e-05,
537
- 2.8259641291194015e-05,
538
- 2.7880732900345262e-05,
539
- 2.7498445688624725e-05,
540
- 2.711294355957111e-05,
541
- 2.6724391795101242e-05,
542
- 2.633295698464635e-05,
543
- 2.5938806953727778e-05,
544
- 2.5542110692002687e-05,
545
- 2.5143038280810704e-05,
546
- 2.4741760820252464e-05,
547
- 2.433845035583137e-05,
548
- 2.3933279804690064e-05,
549
- 2.352642288147312e-05,
550
- 2.3118054023847876e-05,
551
- 2.2708348317715223e-05,
552
- 2.2297481422142487e-05,
553
- 2.188562949405059e-05,
554
- 2.1472969112687716e-05,
555
- 2.1059677203921957e-05,
556
- 2.064593096438528e-05,
557
- 2.0231907785501493e-05,
558
- 1.9817785177430605e-05,
559
- 1.9403740692962372e-05,
560
- 1.898995185139145e-05,
561
- 1.8576596062407023e-05,
562
- 1.816385055002938e-05,
563
- 1.7751892276626043e-05,
564
- 1.7340897867040178e-05,
565
- 1.6931043532863625e-05,
566
- 1.6522504996887168e-05,
567
- 1.611545741776037e-05,
568
- 1.5710075314893218e-05,
569
- 1.5306532493631954e-05,
570
- 1.4905001970740918e-05,
571
- 1.4505655900222602e-05,
572
- 1.4108665499507574e-05,
573
- 1.371420097604592e-05,
574
- 1.332243145433177e-05,
575
- 1.2933524903392054e-05,
576
- 1.2547648064770774e-05,
577
- 1.2164966381039404e-05,
578
- 1.178564392486436e-05,
579
- 1.140984332866172e-05,
580
- 1.1037725714869483e-05,
581
- 1.066945062686719e-05,
582
- 1.0305175960572616e-05,
583
- 9.945057896744699e-06,
584
- 9.589250834021969e-06,
585
- 9.237907322724944e-06,
586
- 8.891177999451028e-06,
587
- 8.549211522489898e-06,
588
- 8.212154508087055e-06,
589
- 7.88015146758299e-06,
590
- 7.553344745454641e-06,
591
- 7.231874458286057e-06,
592
- 6.915878434694157e-06,
593
- 6.605492156235467e-06,
594
- 6.30084869931916e-06,
595
- 6.002078678151244e-06,
596
- 5.709310188734507e-06,
597
- 5.422668753947975e-06,
598
- 5.14227726972974e-06,
599
- 4.868255952385965e-06,
600
- 4.600722287048818e-06,
601
- 4.339790977305362e-06,
602
- 4.085573896019013e-06,
603
- 3.838180037364703e-06,
604
- 3.5977154700981752e-06,
605
- 3.364283292079631e-06,
606
- 3.137983586071065e-06,
607
- 2.9189133768263488e-06,
608
- 2.707166589492387e-06,
609
- 2.5028340093392257e-06,
610
- 2.3060032428363876e-06,
611
- 2.1167586800920613e-06,
612
- 1.9351814586713113e-06,
613
- 1.7613494288088008e-06,
614
- 1.5953371200309199e-06,
615
- 1.437215709201667e-06,
616
- 1.2870529900059636e-06,
617
- 1.1449133438834802e-06,
618
- 1.0108577124254482e-06,
619
- 8.849435712462972e-07,
620
- 7.67224905341275e-07,
621
- 6.57752185940721e-07,
622
- 5.565723488707586e-07,
623
- 4.637287744298502e-07,
624
- 3.79261268789719e-07,
625
- 3.032060469286724e-07,
626
- 2.3559571710463747e-07,
627
- 1.7645926687452908e-07,
628
- 1.2582205066603127e-07,
629
- 8.370577890698173e-08,
630
- 5.012850871717989e-08,
631
- 2.5104636166479735e-08,
632
- 8.64489010255598e-09,
633
- 7.563275509769874e-10
634
  ],
635
  "eval_step": [
636
  390,
@@ -677,29 +677,29 @@
677
  20
678
  ],
679
  "eval_accuracy": [
680
- 0.0033333333333333335,
681
- 0.0033333333333333335,
682
- 0.028888888888888888,
683
- 0.07444444444444444,
684
- 0.14888888888888888,
685
- 0.24555555555555555,
686
- 0.28444444444444444,
687
- 0.32555555555555554,
688
- 0.4177777777777778,
689
- 0.4411111111111111,
690
- 0.42333333333333334,
691
- 0.4166666666666667,
692
- 0.4677777777777778,
693
- 0.46444444444444444,
694
- 0.5133333333333333,
695
- 0.5,
696
- 0.5111111111111111,
697
- 0.4922222222222222,
698
- 0.5,
699
- 0.5077777777777778
700
  ]
701
  },
702
- "final_accuracy": 0.395,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
@@ -710,51 +710,51 @@
710
  },
711
  "splits": {
712
  "add_S0": {
713
- "full_accuracy": 0.69,
714
  "n_examples": 100,
715
  "per_subtask": {
716
  "SA": {
717
- "accuracy": 0.943801652892562,
718
  "count": 605
719
  },
720
  "SS": {
721
- "accuracy": 0.9578947368421052,
722
  "count": 95
723
  }
724
  }
725
  },
726
  "add_S1": {
727
- "full_accuracy": 0.62,
728
  "n_examples": 100,
729
  "per_subtask": {
730
  "SA": {
731
- "accuracy": 0.9656862745098039,
732
  "count": 204
733
  },
734
  "SC": {
735
- "accuracy": 0.9704142011834319,
736
  "count": 169
737
  },
738
  "SS": {
739
- "accuracy": 0.9354838709677419,
740
  "count": 31
741
  },
742
  "UC": {
743
- "accuracy": 0.8986486486486487,
744
  "count": 296
745
  }
746
  }
747
  },
748
  "add_S2": {
749
- "full_accuracy": 0.45,
750
  "n_examples": 100,
751
  "per_subtask": {
752
  "SA": {
753
- "accuracy": 0.9570552147239264,
754
  "count": 163
755
  },
756
  "SC": {
757
- "accuracy": 0.9307692307692308,
758
  "count": 130
759
  },
760
  "SS": {
@@ -762,43 +762,43 @@
762
  "count": 87
763
  },
764
  "UC": {
765
- "accuracy": 0.7733990147783252,
766
  "count": 203
767
  },
768
  "US": {
769
- "accuracy": 0.9572649572649573,
770
  "count": 117
771
  }
772
  }
773
  },
774
  "add_S3": {
775
- "full_accuracy": 0.21,
776
  "n_examples": 100,
777
  "per_subtask": {
778
  "SA": {
779
- "accuracy": 0.9752066115702479,
780
  "count": 121
781
  },
782
  "SC": {
783
- "accuracy": 0.9504132231404959,
784
  "count": 121
785
  },
786
  "SS": {
787
- "accuracy": 0.9591836734693877,
788
  "count": 49
789
  },
790
  "UC": {
791
- "accuracy": 0.6236559139784946,
792
  "count": 186
793
  },
794
  "US": {
795
- "accuracy": 0.7085201793721974,
796
  "count": 223
797
  }
798
  }
799
  },
800
  "add_S4": {
801
- "full_accuracy": 0.32,
802
  "n_examples": 100,
803
  "per_subtask": {
804
  "SA": {
@@ -806,7 +806,7 @@
806
  "count": 104
807
  },
808
  "SC": {
809
- "accuracy": 0.9622641509433962,
810
  "count": 106
811
  },
812
  "SS": {
@@ -814,17 +814,17 @@
814
  "count": 23
815
  },
816
  "UC": {
817
- "accuracy": 0.725,
818
  "count": 160
819
  },
820
  "US": {
821
- "accuracy": 0.5602605863192183,
822
  "count": 307
823
  }
824
  }
825
  },
826
  "add_S5": {
827
- "full_accuracy": 0.3,
828
  "n_examples": 100,
829
  "per_subtask": {
830
  "SA": {
@@ -836,17 +836,17 @@
836
  "count": 100
837
  },
838
  "UC": {
839
- "accuracy": 0.46,
840
  "count": 100
841
  },
842
  "US": {
843
- "accuracy": 0.5475,
844
  "count": 400
845
  }
846
  }
847
  },
848
  "add_S6": {
849
- "full_accuracy": 0.4,
850
  "n_examples": 100,
851
  "per_subtask": {
852
  "SC": {
@@ -854,47 +854,47 @@
854
  "count": 100
855
  },
856
  "UC": {
857
- "accuracy": 0.44,
858
  "count": 100
859
  },
860
  "US": {
861
- "accuracy": 0.494,
862
  "count": 500
863
  }
864
  }
865
  },
866
  "add_random": {
867
- "full_accuracy": 0.675,
868
  "n_examples": 200,
869
  "per_subtask": {
870
  "SA": {
871
- "accuracy": 0.9664429530201343,
872
  "count": 447
873
  },
874
  "SC": {
875
- "accuracy": 0.95625,
876
  "count": 320
877
  },
878
  "SS": {
879
- "accuracy": 0.9821428571428571,
880
  "count": 56
881
  },
882
  "UC": {
883
- "accuracy": 0.9168241965973535,
884
  "count": 529
885
  },
886
  "US": {
887
- "accuracy": 0.8958333333333334,
888
  "count": 48
889
  }
890
  }
891
  },
892
  "add_C3": {
893
- "full_accuracy": 0.42,
894
  "n_examples": 100,
895
  "per_subtask": {
896
  "SA": {
897
- "accuracy": 0.9833333333333333,
898
  "count": 300
899
  },
900
  "SC": {
@@ -902,39 +902,39 @@
902
  "count": 100
903
  },
904
  "UC": {
905
- "accuracy": 0.7409326424870466,
906
  "count": 193
907
  },
908
  "US": {
909
- "accuracy": 0.8037383177570093,
910
  "count": 107
911
  }
912
  }
913
  },
914
  "add_C4": {
915
- "full_accuracy": 0.42,
916
  "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
919
- "accuracy": 0.99,
920
  "count": 200
921
  },
922
  "SC": {
923
- "accuracy": 0.99,
924
  "count": 100
925
  },
926
  "UC": {
927
- "accuracy": 0.7734375,
928
  "count": 256
929
  },
930
  "US": {
931
- "accuracy": 0.7569444444444444,
932
  "count": 144
933
  }
934
  }
935
  },
936
  "add_C5": {
937
- "full_accuracy": 0.37,
938
  "n_examples": 100,
939
  "per_subtask": {
940
  "SA": {
@@ -942,21 +942,21 @@
942
  "count": 100
943
  },
944
  "SC": {
945
- "accuracy": 0.97,
946
  "count": 100
947
  },
948
  "UC": {
949
- "accuracy": 0.7908496732026143,
950
  "count": 306
951
  },
952
  "US": {
953
- "accuracy": 0.7474226804123711,
954
  "count": 194
955
  }
956
  }
957
  },
958
  "add_C6": {
959
- "full_accuracy": 0.32,
960
  "n_examples": 100,
961
  "per_subtask": {
962
  "SC": {
@@ -964,39 +964,39 @@
964
  "count": 100
965
  },
966
  "UC": {
967
- "accuracy": 0.7923497267759563,
968
  "count": 366
969
  },
970
  "US": {
971
- "accuracy": 0.8034188034188035,
972
  "count": 234
973
  }
974
  }
975
  },
976
  "sub_M0": {
977
- "full_accuracy": 0.77,
978
  "n_examples": 100,
979
  "per_subtask": {
980
  "MD": {
981
- "accuracy": 0.9584026622296173,
982
  "count": 601
983
  },
984
  "ME": {
985
- "accuracy": 0.98989898989899,
986
  "count": 99
987
  }
988
  }
989
  },
990
  "sub_M1": {
991
- "full_accuracy": 0.62,
992
  "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
995
- "accuracy": 0.974910394265233,
996
  "count": 279
997
  },
998
  "MB": {
999
- "accuracy": 0.9586206896551724,
1000
  "count": 145
1001
  },
1002
  "ME": {
@@ -1004,17 +1004,17 @@
1004
  "count": 24
1005
  },
1006
  "UB": {
1007
- "accuracy": 0.8849206349206349,
1008
  "count": 252
1009
  }
1010
  }
1011
  },
1012
  "sub_M2": {
1013
- "full_accuracy": 0.28,
1014
  "n_examples": 100,
1015
  "per_subtask": {
1016
  "MD": {
1017
- "accuracy": 0.971830985915493,
1018
  "count": 213
1019
  },
1020
  "MB": {
@@ -1022,29 +1022,29 @@
1022
  "count": 113
1023
  },
1024
  "ME": {
1025
- "accuracy": 0.9529411764705882,
1026
  "count": 85
1027
  },
1028
  "UB": {
1029
- "accuracy": 0.6574585635359116,
1030
  "count": 181
1031
  },
1032
  "UD": {
1033
- "accuracy": 0.8240740740740741,
1034
  "count": 108
1035
  }
1036
  }
1037
  },
1038
  "sub_M3": {
1039
- "full_accuracy": 0.09,
1040
  "n_examples": 100,
1041
  "per_subtask": {
1042
  "MD": {
1043
- "accuracy": 0.994413407821229,
1044
  "count": 179
1045
  },
1046
  "MB": {
1047
- "accuracy": 0.9514563106796117,
1048
  "count": 103
1049
  },
1050
  "ME": {
@@ -1052,11 +1052,11 @@
1052
  "count": 56
1053
  },
1054
  "UB": {
1055
- "accuracy": 0.46308724832214765,
1056
  "count": 149
1057
  },
1058
  "UD": {
1059
- "accuracy": 0.7230046948356808,
1060
  "count": 213
1061
  }
1062
  }
@@ -1070,21 +1070,21 @@
1070
  "count": 200
1071
  },
1072
  "MB": {
1073
- "accuracy": 0.98,
1074
  "count": 100
1075
  },
1076
  "UB": {
1077
- "accuracy": 0.38,
1078
  "count": 100
1079
  },
1080
  "UD": {
1081
- "accuracy": 0.35333333333333333,
1082
  "count": 300
1083
  }
1084
  }
1085
  },
1086
  "sub_M5": {
1087
- "full_accuracy": 0.03,
1088
  "n_examples": 100,
1089
  "per_subtask": {
1090
  "MD": {
@@ -1100,21 +1100,21 @@
1100
  "count": 100
1101
  },
1102
  "UD": {
1103
- "accuracy": 0.285,
1104
  "count": 400
1105
  }
1106
  }
1107
  },
1108
  "sub_random": {
1109
- "full_accuracy": 0.605,
1110
  "n_examples": 200,
1111
  "per_subtask": {
1112
  "MD": {
1113
- "accuracy": 0.965,
1114
  "count": 600
1115
  },
1116
  "MB": {
1117
- "accuracy": 0.9438202247191011,
1118
  "count": 267
1119
  },
1120
  "ME": {
@@ -1122,39 +1122,39 @@
1122
  "count": 53
1123
  },
1124
  "UB": {
1125
- "accuracy": 0.8883826879271071,
1126
  "count": 439
1127
  },
1128
  "UD": {
1129
- "accuracy": 0.9024390243902439,
1130
  "count": 41
1131
  }
1132
  }
1133
  },
1134
  "sub_B3": {
1135
- "full_accuracy": 0.27,
1136
  "n_examples": 100,
1137
  "per_subtask": {
1138
  "MD": {
1139
- "accuracy": 0.9833333333333333,
1140
  "count": 300
1141
  },
1142
  "MB": {
1143
- "accuracy": 0.97,
1144
  "count": 100
1145
  },
1146
  "UB": {
1147
- "accuracy": 0.6040609137055838,
1148
  "count": 197
1149
  },
1150
  "UD": {
1151
- "accuracy": 0.6699029126213593,
1152
  "count": 103
1153
  }
1154
  }
1155
  },
1156
  "sub_B4": {
1157
- "full_accuracy": 0.12,
1158
  "n_examples": 100,
1159
  "per_subtask": {
1160
  "MD": {
@@ -1166,17 +1166,17 @@
1166
  "count": 100
1167
  },
1168
  "UB": {
1169
- "accuracy": 0.6072874493927125,
1170
  "count": 247
1171
  },
1172
  "UD": {
1173
- "accuracy": 0.5620915032679739,
1174
  "count": 153
1175
  }
1176
  }
1177
  },
1178
  "sub_B5": {
1179
- "full_accuracy": 0.17,
1180
  "n_examples": 100,
1181
  "per_subtask": {
1182
  "MD": {
@@ -1188,18 +1188,18 @@
1188
  "count": 100
1189
  },
1190
  "UB": {
1191
- "accuracy": 0.6879194630872483,
1192
  "count": 298
1193
  },
1194
  "UD": {
1195
- "accuracy": 0.5594059405940595,
1196
  "count": 202
1197
  }
1198
  }
1199
  }
1200
  },
1201
  "summary": {
1202
- "overall_accuracy": 0.395,
1203
  "total_examples": 2400,
1204
  "n_splits": 22
1205
  }
 
159
  7800
160
  ],
161
  "loss": [
162
+ 8.74937915802002,
163
+ 6.198915004730225,
164
+ 4.366815090179443,
165
+ 2.2759313583374023,
166
+ 1.9526265859603882,
167
+ 1.877150535583496,
168
+ 1.8139151334762573,
169
+ 1.822188377380371,
170
+ 1.71466064453125,
171
+ 1.590683937072754,
172
+ 1.4826921224594116,
173
+ 1.133190393447876,
174
+ 0.8561835885047913,
175
+ 0.7075616121292114,
176
+ 0.6399551033973694,
177
+ 0.6669952273368835,
178
+ 0.6160100698471069,
179
+ 0.5750561952590942,
180
+ 0.5242418646812439,
181
+ 0.5324519872665405,
182
+ 0.49814528226852417,
183
+ 0.4612176716327667,
184
+ 0.5026556849479675,
185
+ 0.47276973724365234,
186
+ 0.43902602791786194,
187
+ 0.446272611618042,
188
+ 0.38574811816215515,
189
+ 0.3711676001548767,
190
+ 0.3969157338142395,
191
+ 0.39037615060806274,
192
+ 0.3659220337867737,
193
+ 0.2901519238948822,
194
+ 0.372765451669693,
195
+ 0.33219239115715027,
196
+ 0.34733083844184875,
197
+ 0.31526169180870056,
198
+ 0.33172616362571716,
199
+ 0.3349834382534027,
200
+ 0.3454124927520752,
201
+ 0.32427385449409485,
202
+ 0.29763302206993103,
203
+ 0.2891975939273834,
204
+ 0.3104265332221985,
205
+ 0.2937973141670227,
206
+ 0.29948028922080994,
207
+ 0.26262804865837097,
208
+ 0.26943641901016235,
209
+ 0.2750188410282135,
210
+ 0.2822719216346741,
211
+ 0.2253115326166153,
212
+ 0.2866693139076233,
213
+ 0.29621443152427673,
214
+ 0.2518032193183899,
215
+ 0.25790685415267944,
216
+ 0.33121928572654724,
217
+ 0.2776740789413452,
218
+ 0.240325927734375,
219
+ 0.25502628087997437,
220
+ 0.2516288459300995,
221
+ 0.2600439190864563,
222
+ 0.22953486442565918,
223
+ 0.25047045946121216,
224
+ 0.287687748670578,
225
+ 0.24002419412136078,
226
+ 0.20791330933570862,
227
+ 0.24204131960868835,
228
+ 0.27317410707473755,
229
+ 0.24345578253269196,
230
+ 0.21374334394931793,
231
+ 0.21283268928527832,
232
+ 0.21549828350543976,
233
+ 0.19832827150821686,
234
+ 0.21149209141731262,
235
+ 0.1855495721101761,
236
+ 0.19525133073329926,
237
+ 0.23278847336769104,
238
+ 0.2606830298900604,
239
+ 0.20044934749603271,
240
+ 0.181396022439003,
241
+ 0.21328875422477722,
242
+ 0.212825208902359,
243
+ 0.21052296459674835,
244
+ 0.19027332961559296,
245
+ 0.2117026001214981,
246
+ 0.22618059813976288,
247
+ 0.22302715480327606,
248
+ 0.17805133759975433,
249
+ 0.18642711639404297,
250
+ 0.2146042138338089,
251
+ 0.19574260711669922,
252
+ 0.16053321957588196,
253
+ 0.21608540415763855,
254
+ 0.20082899928092957,
255
+ 0.23242345452308655,
256
+ 0.17946086823940277,
257
+ 0.2614651918411255,
258
+ 0.1651460975408554,
259
+ 0.1777617186307907,
260
+ 0.15014827251434326,
261
+ 0.20371000468730927,
262
+ 0.1573261022567749,
263
+ 0.19635023176670074,
264
+ 0.18772493302822113,
265
+ 0.1814812421798706,
266
+ 0.18410103023052216,
267
+ 0.20629878342151642,
268
+ 0.1857832968235016,
269
+ 0.17055557668209076,
270
+ 0.16345703601837158,
271
+ 0.16755636036396027,
272
+ 0.17432273924350739,
273
+ 0.15670685470104218,
274
+ 0.15046896040439606,
275
+ 0.14622943103313446,
276
+ 0.1660774052143097,
277
+ 0.16316993534564972,
278
+ 0.16458259522914886,
279
+ 0.16533777117729187,
280
+ 0.18796680867671967,
281
+ 0.16055859625339508,
282
+ 0.16241776943206787,
283
+ 0.15573906898498535,
284
+ 0.18253065645694733,
285
+ 0.15446387231349945,
286
+ 0.16669152677059174,
287
+ 0.16877718269824982,
288
+ 0.15042708814144135,
289
+ 0.14171156287193298,
290
+ 0.1469859778881073,
291
+ 0.16647039353847504,
292
+ 0.1766190528869629,
293
+ 0.15534673631191254,
294
+ 0.15092872083187103,
295
+ 0.16818329691886902,
296
+ 0.14259760081768036,
297
+ 0.16949117183685303,
298
+ 0.13747766613960266,
299
+ 0.16281220316886902,
300
+ 0.1641203612089157,
301
+ 0.15849816799163818,
302
+ 0.14388667047023773,
303
+ 0.14095112681388855,
304
+ 0.14720116555690765,
305
+ 0.15021368861198425,
306
+ 0.15137387812137604,
307
+ 0.15634380280971527,
308
+ 0.14748618006706238,
309
+ 0.1435571163892746,
310
+ 0.13644962012767792,
311
+ 0.16987796127796173,
312
+ 0.15995793044567108,
313
+ 0.17382332682609558,
314
+ 0.157388374209404,
315
+ 0.13605724275112152,
316
+ 0.174538716673851,
317
+ 0.15863211452960968
318
  ],
319
  "base_loss": [
320
+ 8.74937915802002,
321
+ 6.198915004730225,
322
+ 4.366815090179443,
323
+ 2.2759313583374023,
324
+ 1.9526265859603882,
325
+ 1.877150535583496,
326
+ 1.8139151334762573,
327
+ 1.822188377380371,
328
+ 1.71466064453125,
329
+ 1.590683937072754,
330
+ 1.4826921224594116,
331
+ 1.133190393447876,
332
+ 0.8561835885047913,
333
+ 0.7075616121292114,
334
+ 0.6399551033973694,
335
+ 0.6669952273368835,
336
+ 0.6160100698471069,
337
+ 0.5750561952590942,
338
+ 0.5242418646812439,
339
+ 0.5324519872665405,
340
+ 0.49814528226852417,
341
+ 0.4612176716327667,
342
+ 0.5026556849479675,
343
+ 0.47276973724365234,
344
+ 0.43902602791786194,
345
+ 0.446272611618042,
346
+ 0.38574811816215515,
347
+ 0.3711676001548767,
348
+ 0.3969157338142395,
349
+ 0.39037615060806274,
350
+ 0.3659220337867737,
351
+ 0.2901519238948822,
352
+ 0.372765451669693,
353
+ 0.33219239115715027,
354
+ 0.34733083844184875,
355
+ 0.31526169180870056,
356
+ 0.33172616362571716,
357
+ 0.3349834382534027,
358
+ 0.3454124927520752,
359
+ 0.32427385449409485,
360
+ 0.29763302206993103,
361
+ 0.2891975939273834,
362
+ 0.3104265332221985,
363
+ 0.2937973141670227,
364
+ 0.29948028922080994,
365
+ 0.26262804865837097,
366
+ 0.26943641901016235,
367
+ 0.2750188410282135,
368
+ 0.2822719216346741,
369
+ 0.2253115326166153,
370
+ 0.2866693139076233,
371
+ 0.29621443152427673,
372
+ 0.2518032193183899,
373
+ 0.25790685415267944,
374
+ 0.33121928572654724,
375
+ 0.2776740789413452,
376
+ 0.240325927734375,
377
+ 0.25502628087997437,
378
+ 0.2516288459300995,
379
+ 0.2600439190864563,
380
+ 0.22953486442565918,
381
+ 0.25047045946121216,
382
+ 0.287687748670578,
383
+ 0.24002419412136078,
384
+ 0.20791330933570862,
385
+ 0.24204131960868835,
386
+ 0.27317410707473755,
387
+ 0.24345578253269196,
388
+ 0.21374334394931793,
389
+ 0.21283268928527832,
390
+ 0.21549828350543976,
391
+ 0.19832827150821686,
392
+ 0.21149209141731262,
393
+ 0.1855495721101761,
394
+ 0.19525133073329926,
395
+ 0.23278847336769104,
396
+ 0.2606830298900604,
397
+ 0.20044934749603271,
398
+ 0.181396022439003,
399
+ 0.21328875422477722,
400
+ 0.212825208902359,
401
+ 0.21052296459674835,
402
+ 0.19027332961559296,
403
+ 0.2117026001214981,
404
+ 0.22618059813976288,
405
+ 0.22302715480327606,
406
+ 0.17805133759975433,
407
+ 0.18642711639404297,
408
+ 0.2146042138338089,
409
+ 0.19574260711669922,
410
+ 0.16053321957588196,
411
+ 0.21608540415763855,
412
+ 0.20082899928092957,
413
+ 0.23242345452308655,
414
+ 0.17946086823940277,
415
+ 0.2614651918411255,
416
+ 0.1651460975408554,
417
+ 0.1777617186307907,
418
+ 0.15014827251434326,
419
+ 0.20371000468730927,
420
+ 0.1573261022567749,
421
+ 0.19635023176670074,
422
+ 0.18772493302822113,
423
+ 0.1814812421798706,
424
+ 0.18410103023052216,
425
+ 0.20629878342151642,
426
+ 0.1857832968235016,
427
+ 0.17055557668209076,
428
+ 0.16345703601837158,
429
+ 0.16755636036396027,
430
+ 0.17432273924350739,
431
+ 0.15670685470104218,
432
+ 0.15046896040439606,
433
+ 0.14622943103313446,
434
+ 0.1660774052143097,
435
+ 0.16316993534564972,
436
+ 0.16458259522914886,
437
+ 0.16533777117729187,
438
+ 0.18796680867671967,
439
+ 0.16055859625339508,
440
+ 0.16241776943206787,
441
+ 0.15573906898498535,
442
+ 0.18253065645694733,
443
+ 0.15446387231349945,
444
+ 0.16669152677059174,
445
+ 0.16877718269824982,
446
+ 0.15042708814144135,
447
+ 0.14171156287193298,
448
+ 0.1469859778881073,
449
+ 0.16647039353847504,
450
+ 0.1766190528869629,
451
+ 0.15534673631191254,
452
+ 0.15092872083187103,
453
+ 0.16818329691886902,
454
+ 0.14259760081768036,
455
+ 0.16949117183685303,
456
+ 0.13747766613960266,
457
+ 0.16281220316886902,
458
+ 0.1641203612089157,
459
+ 0.15849816799163818,
460
+ 0.14388667047023773,
461
+ 0.14095112681388855,
462
+ 0.14720116555690765,
463
+ 0.15021368861198425,
464
+ 0.15137387812137604,
465
+ 0.15634380280971527,
466
+ 0.14748618006706238,
467
+ 0.1435571163892746,
468
+ 0.13644962012767792,
469
+ 0.16987796127796173,
470
+ 0.15995793044567108,
471
+ 0.17382332682609558,
472
+ 0.157388374209404,
473
+ 0.13605724275112152,
474
+ 0.174538716673851,
475
+ 0.15863211452960968
476
  ],
477
  "lr": [
478
+ 1.6752136752136756e-05,
479
+ 3.384615384615385e-05,
480
+ 5.094017094017095e-05,
481
+ 6.803418803418804e-05,
482
+ 7.99992282348104e-05,
483
+ 7.998550879323477e-05,
484
+ 7.99546457847615e-05,
485
+ 7.990665244173565e-05,
486
+ 7.9841549341039e-05,
487
+ 7.975936439526778e-05,
488
+ 7.966013284076529e-05,
489
+ 7.95438972225146e-05,
490
+ 7.941070737589747e-05,
491
+ 7.926062040532787e-05,
492
+ 7.909370065976863e-05,
493
+ 7.891001970514232e-05,
494
+ 7.870965629364761e-05,
495
+ 7.849269632999478e-05,
496
+ 7.825923283457459e-05,
497
+ 7.800936590357617e-05,
498
+ 7.774320266607144e-05,
499
+ 7.746085723808407e-05,
500
+ 7.716245067366288e-05,
501
+ 7.684811091298052e-05,
502
+ 7.651797272747994e-05,
503
+ 7.617217766209175e-05,
504
+ 7.581087397454772e-05,
505
+ 7.5434216571816e-05,
506
+ 7.504236694368569e-05,
507
+ 7.463549309352889e-05,
508
+ 7.421376946627028e-05,
509
+ 7.377737687359476e-05,
510
+ 7.332650241642543e-05,
511
+ 7.286133940470513e-05,
512
+ 7.238208727451582e-05,
513
+ 7.188895150257153e-05,
514
+ 7.138214351812129e-05,
515
+ 7.086188061229997e-05,
516
+ 7.032838584496603e-05,
517
+ 6.97818879490657e-05,
518
+ 6.922262123256506e-05,
519
+ 6.865082547799175e-05,
520
+ 6.806674583962958e-05,
521
+ 6.747063273840991e-05,
522
+ 6.686274175454515e-05,
523
+ 6.624333351795003e-05,
524
+ 6.561267359649806e-05,
525
+ 6.497103238216073e-05,
526
+ 6.43186849750784e-05,
527
+ 6.365591106561267e-05,
528
+ 6.298299481443074e-05,
529
+ 6.230022473067294e-05,
530
+ 6.160789354825612e-05,
531
+ 6.090629810036558e-05,
532
+ 6.019573919218951e-05,
533
+ 5.947652147195031e-05,
534
+ 5.8748953300288454e-05,
535
+ 5.8013346618054525e-05,
536
+ 5.7270016812566265e-05,
537
+ 5.651928258238803e-05,
538
+ 5.5761465800690523e-05,
539
+ 5.499689137724945e-05,
540
+ 5.422588711914222e-05,
541
+ 5.3448783590202484e-05,
542
+ 5.26659139692927e-05,
543
+ 5.1877613907455556e-05,
544
+ 5.1084221384005374e-05,
545
+ 5.028607656162141e-05,
546
+ 4.948352164050493e-05,
547
+ 4.867690071166274e-05,
548
+ 4.786655960938013e-05,
549
+ 4.705284576294624e-05,
550
+ 4.623610804769575e-05,
551
+ 4.541669663543045e-05,
552
+ 4.4594962844284974e-05,
553
+ 4.377125898810118e-05,
554
+ 4.294593822537543e-05,
555
+ 4.2119354407843914e-05,
556
+ 4.129186192877056e-05,
557
+ 4.046381557100299e-05,
558
+ 3.963557035486121e-05,
559
+ 3.8807481385924744e-05,
560
+ 3.79799037027829e-05,
561
+ 3.7153192124814045e-05,
562
+ 3.632770110005876e-05,
563
+ 3.5503784553252085e-05,
564
+ 3.4681795734080355e-05,
565
+ 3.386208706572725e-05,
566
+ 3.3045009993774335e-05,
567
+ 3.223091483552074e-05,
568
+ 3.1420150629786436e-05,
569
+ 3.061306498726391e-05,
570
+ 2.9810003941481836e-05,
571
+ 2.9011311800445205e-05,
572
+ 2.8217330999015148e-05,
573
+ 2.742840195209184e-05,
574
+ 2.664486290866354e-05,
575
+ 2.586704980678411e-05,
576
+ 2.509529612954155e-05,
577
+ 2.432993276207881e-05,
578
+ 2.357128784972872e-05,
579
+ 2.281968665732344e-05,
580
+ 2.2075451429738966e-05,
581
+ 2.133890125373438e-05,
582
+ 2.061035192114523e-05,
583
+ 1.9890115793489398e-05,
584
+ 1.9178501668043937e-05,
585
+ 1.847581464544989e-05,
586
+ 1.7782355998902056e-05,
587
+ 1.7098423044979796e-05,
588
+ 1.642430901617411e-05,
589
+ 1.576030293516598e-05,
590
+ 1.5106689490909282e-05,
591
+ 1.4463748916572114e-05,
592
+ 1.3831756869388313e-05,
593
+ 1.3210984312470934e-05,
594
+ 1.260169739863832e-05,
595
+ 1.2004157356302488e-05,
596
+ 1.1418620377469014e-05,
597
+ 1.084533750789595e-05,
598
+ 1.028455453945948e-05,
599
+ 9.73651190477193e-06,
600
+ 9.201444574097636e-06,
601
+ 8.679581954610725e-06,
602
+ 8.171147792038026e-06,
603
+ 7.676360074729406e-06,
604
+ 7.1954309401963505e-06,
605
+ 6.728566584159262e-06,
606
+ 6.27596717214213e-06,
607
+ 5.8378267536526975e-06,
608
+ 5.414333178984774e-06,
609
+ 5.005668018678451e-06,
610
+ 4.612006485672775e-06,
611
+ 4.233517360184123e-06,
612
+ 3.870362917342623e-06,
613
+ 3.5226988576176015e-06,
614
+ 3.1906742400618397e-06,
615
+ 2.874431418403334e-06,
616
+ 2.5741059800119272e-06,
617
+ 2.2898266877669604e-06,
618
+ 2.0217154248508964e-06,
619
+ 1.7698871424925945e-06,
620
+ 1.53444981068255e-06,
621
+ 1.315504371881442e-06,
622
+ 1.1131446977415172e-06,
623
+ 9.274575488597004e-07,
624
+ 7.58522537579438e-07,
625
+ 6.064120938573448e-07,
626
+ 4.7119143420927493e-07,
627
+ 3.5291853374905817e-07,
628
+ 2.5164410133206253e-07,
629
+ 1.6741155781396346e-07,
630
+ 1.0025701743435978e-07,
631
+ 5.020927233295947e-08,
632
+ 1.728978020511196e-08,
633
+ 1.5126551019539748e-09
634
  ],
635
  "eval_step": [
636
  390,
 
677
  20
678
  ],
679
  "eval_accuracy": [
680
+ 0.005555555555555556,
681
+ 0.05333333333333334,
682
+ 0.20555555555555555,
683
+ 0.29333333333333333,
684
+ 0.3055555555555556,
685
+ 0.31777777777777777,
686
+ 0.36777777777777776,
687
+ 0.3988888888888889,
688
+ 0.4388888888888889,
689
+ 0.5322222222222223,
690
+ 0.5355555555555556,
691
+ 0.5366666666666666,
692
+ 0.62,
693
+ 0.5488888888888889,
694
+ 0.5488888888888889,
695
+ 0.5866666666666667,
696
+ 0.5811111111111111,
697
+ 0.5811111111111111,
698
+ 0.5733333333333334,
699
+ 0.5777777777777777
700
  ]
701
  },
702
+ "final_accuracy": 0.49541666666666667,
703
  "sft_eval": {
704
  "config": {
705
  "ops": "add_sub",
 
710
  },
711
  "splits": {
712
  "add_S0": {
713
+ "full_accuracy": 0.8,
714
  "n_examples": 100,
715
  "per_subtask": {
716
  "SA": {
717
+ "accuracy": 0.9636363636363636,
718
  "count": 605
719
  },
720
  "SS": {
721
+ "accuracy": 1.0,
722
  "count": 95
723
  }
724
  }
725
  },
726
  "add_S1": {
727
+ "full_accuracy": 0.82,
728
  "n_examples": 100,
729
  "per_subtask": {
730
  "SA": {
731
+ "accuracy": 0.9754901960784313,
732
  "count": 204
733
  },
734
  "SC": {
735
+ "accuracy": 0.9881656804733728,
736
  "count": 169
737
  },
738
  "SS": {
739
+ "accuracy": 0.967741935483871,
740
  "count": 31
741
  },
742
  "UC": {
743
+ "accuracy": 0.9628378378378378,
744
  "count": 296
745
  }
746
  }
747
  },
748
  "add_S2": {
749
+ "full_accuracy": 0.55,
750
  "n_examples": 100,
751
  "per_subtask": {
752
  "SA": {
753
+ "accuracy": 0.9815950920245399,
754
  "count": 163
755
  },
756
  "SC": {
757
+ "accuracy": 0.9384615384615385,
758
  "count": 130
759
  },
760
  "SS": {
 
762
  "count": 87
763
  },
764
  "UC": {
765
+ "accuracy": 0.8472906403940886,
766
  "count": 203
767
  },
768
  "US": {
769
+ "accuracy": 0.9743589743589743,
770
  "count": 117
771
  }
772
  }
773
  },
774
  "add_S3": {
775
+ "full_accuracy": 0.27,
776
  "n_examples": 100,
777
  "per_subtask": {
778
  "SA": {
779
+ "accuracy": 0.9834710743801653,
780
  "count": 121
781
  },
782
  "SC": {
783
+ "accuracy": 0.9586776859504132,
784
  "count": 121
785
  },
786
  "SS": {
787
+ "accuracy": 0.9795918367346939,
788
  "count": 49
789
  },
790
  "UC": {
791
+ "accuracy": 0.6720430107526881,
792
  "count": 186
793
  },
794
  "US": {
795
+ "accuracy": 0.7668161434977578,
796
  "count": 223
797
  }
798
  }
799
  },
800
  "add_S4": {
801
+ "full_accuracy": 0.36,
802
  "n_examples": 100,
803
  "per_subtask": {
804
  "SA": {
 
806
  "count": 104
807
  },
808
  "SC": {
809
+ "accuracy": 0.9716981132075472,
810
  "count": 106
811
  },
812
  "SS": {
 
814
  "count": 23
815
  },
816
  "UC": {
817
+ "accuracy": 0.7125,
818
  "count": 160
819
  },
820
  "US": {
821
+ "accuracy": 0.6156351791530945,
822
  "count": 307
823
  }
824
  }
825
  },
826
  "add_S5": {
827
+ "full_accuracy": 0.22,
828
  "n_examples": 100,
829
  "per_subtask": {
830
  "SA": {
 
836
  "count": 100
837
  },
838
  "UC": {
839
+ "accuracy": 0.39,
840
  "count": 100
841
  },
842
  "US": {
843
+ "accuracy": 0.5075,
844
  "count": 400
845
  }
846
  }
847
  },
848
  "add_S6": {
849
+ "full_accuracy": 0.55,
850
  "n_examples": 100,
851
  "per_subtask": {
852
  "SC": {
 
854
  "count": 100
855
  },
856
  "UC": {
857
+ "accuracy": 0.61,
858
  "count": 100
859
  },
860
  "US": {
861
+ "accuracy": 0.628,
862
  "count": 500
863
  }
864
  }
865
  },
866
  "add_random": {
867
+ "full_accuracy": 0.75,
868
  "n_examples": 200,
869
  "per_subtask": {
870
  "SA": {
871
+ "accuracy": 0.9753914988814317,
872
  "count": 447
873
  },
874
  "SC": {
875
+ "accuracy": 0.975,
876
  "count": 320
877
  },
878
  "SS": {
879
+ "accuracy": 0.9642857142857143,
880
  "count": 56
881
  },
882
  "UC": {
883
+ "accuracy": 0.9395085066162571,
884
  "count": 529
885
  },
886
  "US": {
887
+ "accuracy": 0.9166666666666666,
888
  "count": 48
889
  }
890
  }
891
  },
892
  "add_C3": {
893
+ "full_accuracy": 0.54,
894
  "n_examples": 100,
895
  "per_subtask": {
896
  "SA": {
897
+ "accuracy": 0.9866666666666667,
898
  "count": 300
899
  },
900
  "SC": {
 
902
  "count": 100
903
  },
904
  "UC": {
905
+ "accuracy": 0.7979274611398963,
906
  "count": 193
907
  },
908
  "US": {
909
+ "accuracy": 0.7757009345794392,
910
  "count": 107
911
  }
912
  }
913
  },
914
  "add_C4": {
915
+ "full_accuracy": 0.48,
916
  "n_examples": 100,
917
  "per_subtask": {
918
  "SA": {
919
+ "accuracy": 0.995,
920
  "count": 200
921
  },
922
  "SC": {
923
+ "accuracy": 1.0,
924
  "count": 100
925
  },
926
  "UC": {
927
+ "accuracy": 0.78515625,
928
  "count": 256
929
  },
930
  "US": {
931
+ "accuracy": 0.75,
932
  "count": 144
933
  }
934
  }
935
  },
936
  "add_C5": {
937
+ "full_accuracy": 0.45,
938
  "n_examples": 100,
939
  "per_subtask": {
940
  "SA": {
 
942
  "count": 100
943
  },
944
  "SC": {
945
+ "accuracy": 1.0,
946
  "count": 100
947
  },
948
  "UC": {
949
+ "accuracy": 0.8137254901960784,
950
  "count": 306
951
  },
952
  "US": {
953
+ "accuracy": 0.8195876288659794,
954
  "count": 194
955
  }
956
  }
957
  },
958
  "add_C6": {
959
+ "full_accuracy": 0.5,
960
  "n_examples": 100,
961
  "per_subtask": {
962
  "SC": {
 
964
  "count": 100
965
  },
966
  "UC": {
967
+ "accuracy": 0.855191256830601,
968
  "count": 366
969
  },
970
  "US": {
971
+ "accuracy": 0.8717948717948718,
972
  "count": 234
973
  }
974
  }
975
  },
976
  "sub_M0": {
977
+ "full_accuracy": 0.88,
978
  "n_examples": 100,
979
  "per_subtask": {
980
  "MD": {
981
+ "accuracy": 0.9800332778702163,
982
  "count": 601
983
  },
984
  "ME": {
985
+ "accuracy": 1.0,
986
  "count": 99
987
  }
988
  }
989
  },
990
  "sub_M1": {
991
+ "full_accuracy": 0.8,
992
  "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
995
+ "accuracy": 0.982078853046595,
996
  "count": 279
997
  },
998
  "MB": {
999
+ "accuracy": 0.9793103448275862,
1000
  "count": 145
1001
  },
1002
  "ME": {
 
1004
  "count": 24
1005
  },
1006
  "UB": {
1007
+ "accuracy": 0.9444444444444444,
1008
  "count": 252
1009
  }
1010
  }
1011
  },
1012
  "sub_M2": {
1013
+ "full_accuracy": 0.55,
1014
  "n_examples": 100,
1015
  "per_subtask": {
1016
  "MD": {
1017
+ "accuracy": 0.9812206572769953,
1018
  "count": 213
1019
  },
1020
  "MB": {
 
1022
  "count": 113
1023
  },
1024
  "ME": {
1025
+ "accuracy": 0.9882352941176471,
1026
  "count": 85
1027
  },
1028
  "UB": {
1029
+ "accuracy": 0.7624309392265194,
1030
  "count": 181
1031
  },
1032
  "UD": {
1033
+ "accuracy": 0.9629629629629629,
1034
  "count": 108
1035
  }
1036
  }
1037
  },
1038
  "sub_M3": {
1039
+ "full_accuracy": 0.17,
1040
  "n_examples": 100,
1041
  "per_subtask": {
1042
  "MD": {
1043
+ "accuracy": 1.0,
1044
  "count": 179
1045
  },
1046
  "MB": {
1047
+ "accuracy": 0.9902912621359223,
1048
  "count": 103
1049
  },
1050
  "ME": {
 
1052
  "count": 56
1053
  },
1054
  "UB": {
1055
+ "accuracy": 0.5637583892617449,
1056
  "count": 149
1057
  },
1058
  "UD": {
1059
+ "accuracy": 0.7699530516431925,
1060
  "count": 213
1061
  }
1062
  }
 
1070
  "count": 200
1071
  },
1072
  "MB": {
1073
+ "accuracy": 1.0,
1074
  "count": 100
1075
  },
1076
  "UB": {
1077
+ "accuracy": 0.39,
1078
  "count": 100
1079
  },
1080
  "UD": {
1081
+ "accuracy": 0.38333333333333336,
1082
  "count": 300
1083
  }
1084
  }
1085
  },
1086
  "sub_M5": {
1087
+ "full_accuracy": 0.04,
1088
  "n_examples": 100,
1089
  "per_subtask": {
1090
  "MD": {
 
1100
  "count": 100
1101
  },
1102
  "UD": {
1103
+ "accuracy": 0.3475,
1104
  "count": 400
1105
  }
1106
  }
1107
  },
1108
  "sub_random": {
1109
+ "full_accuracy": 0.745,
1110
  "n_examples": 200,
1111
  "per_subtask": {
1112
  "MD": {
1113
+ "accuracy": 0.98,
1114
  "count": 600
1115
  },
1116
  "MB": {
1117
+ "accuracy": 0.9700374531835206,
1118
  "count": 267
1119
  },
1120
  "ME": {
 
1122
  "count": 53
1123
  },
1124
  "UB": {
1125
+ "accuracy": 0.9225512528473804,
1126
  "count": 439
1127
  },
1128
  "UD": {
1129
+ "accuracy": 0.975609756097561,
1130
  "count": 41
1131
  }
1132
  }
1133
  },
1134
  "sub_B3": {
1135
+ "full_accuracy": 0.35,
1136
  "n_examples": 100,
1137
  "per_subtask": {
1138
  "MD": {
1139
+ "accuracy": 0.9933333333333333,
1140
  "count": 300
1141
  },
1142
  "MB": {
1143
+ "accuracy": 0.98,
1144
  "count": 100
1145
  },
1146
  "UB": {
1147
+ "accuracy": 0.6852791878172588,
1148
  "count": 197
1149
  },
1150
  "UD": {
1151
+ "accuracy": 0.7184466019417476,
1152
  "count": 103
1153
  }
1154
  }
1155
  },
1156
  "sub_B4": {
1157
+ "full_accuracy": 0.27,
1158
  "n_examples": 100,
1159
  "per_subtask": {
1160
  "MD": {
 
1166
  "count": 100
1167
  },
1168
  "UB": {
1169
+ "accuracy": 0.6923076923076923,
1170
  "count": 247
1171
  },
1172
  "UD": {
1173
+ "accuracy": 0.6666666666666666,
1174
  "count": 153
1175
  }
1176
  }
1177
  },
1178
  "sub_B5": {
1179
+ "full_accuracy": 0.25,
1180
  "n_examples": 100,
1181
  "per_subtask": {
1182
  "MD": {
 
1188
  "count": 100
1189
  },
1190
  "UB": {
1191
+ "accuracy": 0.7348993288590604,
1192
  "count": 298
1193
  },
1194
  "UD": {
1195
+ "accuracy": 0.6683168316831684,
1196
  "count": 202
1197
  }
1198
  }
1199
  }
1200
  },
1201
  "summary": {
1202
+ "overall_accuracy": 0.49541666666666667,
1203
  "total_examples": 2400,
1204
  "n_splits": 22
1205
  }
add_sub_baseline_25K_1L3H510d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b98be00f1db73dc27014b4c1caa5d84a53eb6d87420aaf5f5b520c01b59bab30
3
  size 634642298
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5b3d0a4f5926074c2ad67eaf23945b1b503b7915d0fbafb817b3c7222125cc7
3
  size 634642298
add_sub_baseline_25K_1L3H510d/train_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
- "lr": 4e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 234,
@@ -69,16 +69,16 @@
69
  "no_wandb": false,
70
  "n_params": 158584246,
71
  "run_name": "add_sub_baseline_25K_1L3H510d",
72
- "git_commit": "f447da529caceac8c7d256cbb2cd185cbc50feac",
73
- "timestamp": "2026-04-12T18:11:00.255428+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
- "wandb_run_id": "jqxb8wih",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/jqxb8wih",
81
- "final_accuracy": 0.395,
82
- "sft_accuracy": 0.395,
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
+ "lr": 8e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 234,
 
69
  "no_wandb": false,
70
  "n_params": 158584246,
71
  "run_name": "add_sub_baseline_25K_1L3H510d",
72
+ "git_commit": "dc8dd776fb0c30a4c9073052dcc5e943e0fd80c6",
73
+ "timestamp": "2026-04-13T06:26:07.104526+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "sft",
79
+ "wandb_run_id": "wpi2qkei",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/wpi2qkei",
81
+ "final_accuracy": 0.49541666666666667,
82
+ "sft_accuracy": 0.49541666666666667,
83
  "eval_method": "ArithmeticEvaluator"
84
  }