amirali1985 commited on
Commit
15c9e50
·
verified ·
1 Parent(s): c23af73

Upload add_sub_sorl_v1_abs10_25K

Browse files
add_sub_sorl_v1_abs10_25K/metrics.json CHANGED
@@ -73,509 +73,509 @@
73
  3869
74
  ],
75
  "loss": [
76
- 14.256978034973145,
77
- 8.812192916870117,
78
- 5.189567565917969,
79
- 3.4612081050872803,
80
- 2.9352636337280273,
81
- 3.010927438735962,
82
- 2.87626314163208,
83
- 2.229984998703003,
84
- 1.9719955921173096,
85
- 1.5822030305862427,
86
- 1.6735219955444336,
87
- 1.3240095376968384,
88
- 1.6034573316574097,
89
- 1.584771752357483,
90
- 1.3375678062438965,
91
- 1.5081782341003418,
92
- 1.2838478088378906,
93
- 1.442265510559082,
94
- 1.8373520374298096,
95
- 1.5873284339904785,
96
- 1.0017588138580322,
97
- 0.5844299793243408,
98
- 0.4599916934967041,
99
- 0.4710693061351776,
100
- -0.016015514731407166,
101
- -0.03885940462350845,
102
- -1.8101730346679688,
103
- -6.263668537139893,
104
- -8.383745193481445,
105
- -8.508172035217285,
106
- -9.05787181854248,
107
- -9.528923988342285,
108
- -10.230069160461426,
109
- -10.578496932983398,
110
- -10.426430702209473,
111
- -10.69153118133545,
112
- -11.126657485961914,
113
- -10.420424461364746,
114
- -10.6273775100708,
115
- -8.421341896057129,
116
- -7.00389289855957,
117
- -6.054770469665527,
118
- -3.3519654273986816,
119
- -2.9612250328063965,
120
- -2.2057387828826904,
121
- -1.278799057006836,
122
- -1.6747382879257202,
123
- -0.8737431168556213,
124
- -0.8648754358291626,
125
- -0.9642059803009033,
126
- -1.3165982961654663,
127
- -0.8842989802360535,
128
- -0.5905858278274536,
129
- -0.7463504076004028,
130
- -1.1404083967208862,
131
- -0.4939427971839905,
132
- -0.732154905796051,
133
- -0.5326835513114929,
134
- -0.6656410694122314,
135
- -0.5514172315597534,
136
- -0.4456074833869934,
137
- -0.8364714980125427,
138
- -0.6598624587059021,
139
- -0.48375943303108215,
140
- -0.8276984691619873,
141
- -0.6905249953269958,
142
- -0.7523627281188965,
143
- -0.6108655333518982,
144
- -0.634128749370575,
145
- -0.6320534944534302
146
  ],
147
  "base_loss": [
148
- 8.179495811462402,
149
- 6.115806579589844,
150
- 4.016663551330566,
151
- 2.338233232498169,
152
- 2.049792528152466,
153
- 1.9500830173492432,
154
- 1.8704925775527954,
155
- 1.915109634399414,
156
- 1.9027211666107178,
157
- 1.822627067565918,
158
- 1.7563773393630981,
159
- 1.8559844493865967,
160
- 1.810850739479065,
161
- 1.8188568353652954,
162
- 1.8670803308486938,
163
- 1.7376196384429932,
164
- 1.7410252094268799,
165
- 1.6789716482162476,
166
- 1.7815697193145752,
167
- 1.7297338247299194,
168
- 1.76444411277771,
169
- 1.7341679334640503,
170
- 1.745019555091858,
171
- 1.7294776439666748,
172
- 1.7048413753509521,
173
- 1.7179557085037231,
174
- 1.7174227237701416,
175
- 1.8109327554702759,
176
- 1.6843860149383545,
177
- 1.691207766532898,
178
- 1.6398429870605469,
179
- 1.6830883026123047,
180
- 1.7041343450546265,
181
- 1.7096806764602661,
182
- 1.6330136060714722,
183
- 1.572530746459961,
184
- 1.6290007829666138,
185
- 1.4542722702026367,
186
- 1.4407005310058594,
187
- 1.222497820854187,
188
- 1.0295950174331665,
189
- 0.9284564256668091,
190
- 0.6118221282958984,
191
- 0.5249842405319214,
192
- 0.4513547718524933,
193
- 0.31663909554481506,
194
- 0.3410135805606842,
195
- 0.3127397894859314,
196
- 0.23937572538852692,
197
- 0.23965051770210266,
198
- 0.2704034447669983,
199
- 0.22560283541679382,
200
- 0.20714624226093292,
201
- 0.18898002803325653,
202
- 0.21477903425693512,
203
- 0.18460628390312195,
204
- 0.20580320060253143,
205
- 0.16052566468715668,
206
- 0.17262044548988342,
207
- 0.14774015545845032,
208
- 0.14181342720985413,
209
- 0.2022370994091034,
210
- 0.15932251513004303,
211
- 0.11304645985364914,
212
- 0.16339722275733948,
213
- 0.16573865711688995,
214
- 0.17047688364982605,
215
- 0.13443663716316223,
216
- 0.14277073740959167,
217
- 0.12313588708639145
218
  ],
219
  "info_loss": [
220
- -0.23002290725708008,
221
- -0.16527986526489258,
222
- -0.11838364601135254,
223
- -0.08787274360656738,
224
- -0.10383403301239014,
225
- -0.0843881368637085,
226
- -0.08847904205322266,
227
- -0.1563432216644287,
228
- -0.1807953119277954,
229
- -0.21196186542510986,
230
- -0.19581389427185059,
231
- -0.24078822135925293,
232
- -0.2084348201751709,
233
- -0.21077227592468262,
234
- -0.2398141622543335,
235
- -0.2098095417022705,
236
- -0.231908917427063,
237
- -0.20753967761993408,
238
- -0.17290937900543213,
239
- -0.17037701606750488,
240
- -0.12913298606872559,
241
- -0.12954926490783691,
242
- -0.1401742696762085,
243
- -0.13750088214874268,
244
- -0.18243885040283203,
245
- -0.18511807918548584,
246
- -0.36848652362823486,
247
- -0.8207617402076721,
248
- -1.0179661512374878,
249
- -1.0324950218200684,
250
- -1.079706072807312,
251
- -1.1315070390701294,
252
- -1.2022056579589844,
253
- -1.2376500368118286,
254
- -1.2149099111557007,
255
- -1.234117865562439,
256
- -1.2836066484451294,
257
- -1.1954413652420044,
258
- -1.2143906354904175,
259
- -0.9723633527755737,
260
- -0.8120318651199341,
261
- -0.7064701318740845,
262
- -0.40395891666412354,
263
- -0.3564797043800354,
264
- -0.2730256915092468,
265
- -0.1676408350467682,
266
- -0.2091994285583496,
267
- -0.12594006955623627,
268
- -0.1181057021021843,
269
- -0.12815025448799133,
270
- -0.16598695516586304,
271
- -0.11813508719205856,
272
- -0.08626975864171982,
273
- -0.09994277358055115,
274
- -0.14211693406105042,
275
- -0.07376568019390106,
276
- -0.10002028197050095,
277
- -0.07552778720855713,
278
- -0.08972552418708801,
279
- -0.0756625384092331,
280
- -0.06394617259502411,
281
- -0.10925082862377167,
282
- -0.08743764460086823,
283
- -0.06474234163761139,
284
- -0.10444813966751099,
285
- -0.09072288870811462,
286
- -0.09688038378953934,
287
- -0.07976917922496796,
288
- -0.08310963958501816,
289
- -0.08014579117298126
290
  ],
291
  "abs_loss": [
292
- 2.2766036987304688,
293
- 2.091233730316162,
294
- 1.9168267250061035,
295
- 1.879494547843933,
296
- 1.8019380569458008,
297
- 1.82309889793396,
298
- 1.8274445533752441,
299
- 1.8011195659637451,
300
- 1.8016701936721802,
301
- 1.8573944568634033,
302
- 1.8407838344573975,
303
- 1.8531955480575562,
304
- 1.851827621459961,
305
- 1.843540906906128,
306
- 1.8486919403076172,
307
- 1.8462592363357544,
308
- 1.8527169227600098,
309
- 1.8005094528198242,
310
- 1.8157070875167847,
311
- 1.8012362718582153,
312
- 1.0213181972503662,
313
- 0.5417872667312622,
314
- 0.5030105113983154,
315
- 0.5118759274482727,
316
- 0.4955863952636719,
317
- 0.44640424847602844,
318
- 0.6110158562660217,
319
- 0.558056116104126,
320
- 0.486600399017334,
321
- 0.562961757183075,
322
- 0.47528234124183655,
323
- 0.49910861253738403,
324
- 0.46877163648605347,
325
- 0.44574135541915894,
326
- 0.5675186514854431,
327
- 0.4594317078590393,
328
- 0.49148818850517273,
329
- 0.5518755912780762,
330
- 0.5050919055938721,
331
- 0.4664192795753479,
332
- 0.5244419574737549,
333
- 0.47135239839553833,
334
- 0.5251164436340332,
335
- 0.5231102705001831,
336
- 0.5319302678108215,
337
- 0.5147315263748169,
338
- 0.4716704487800598,
339
- 0.5043060779571533,
340
- 0.5314348936080933,
341
- 0.6154720783233643,
342
- 0.6017305850982666,
343
- 0.5053713321685791,
344
- 0.5168009996414185,
345
- 0.5287708044052124,
346
- 0.5177637338638306,
347
- 0.46657508611679077,
348
- 0.4951359033584595,
349
- 0.4861401319503784,
350
- 0.40949398279190063,
351
- 0.38447850942611694,
352
- 0.40168479084968567,
353
- 0.3637966215610504,
354
- 0.41874241828918457,
355
- 0.4040564298629761,
356
- 0.3729878067970276,
357
- 0.36216187477111816,
358
- 0.3261665105819702,
359
- 0.3961992561817169,
360
- 0.40525567531585693,
361
- 0.347781777381897
362
  ],
363
  "zipf_loss": [
364
- 8.15005111694336,
365
- 4.140061378479004,
366
- 2.16505765914917,
367
- 1.8137528896331787,
368
- 1.7436177730560303,
369
- 1.7224159240722656,
370
- 1.707816481590271,
371
- 1.6981955766677856,
372
- 1.697060465812683,
373
- 1.6934552192687988,
374
- 1.6912052631378174,
375
- 1.6905877590179443,
376
- 1.6917719841003418,
377
- 1.6892836093902588,
378
- 1.6837599277496338,
379
- 1.6840280294418335,
380
- 1.6766400337219238,
381
- 1.658639669418335,
382
- 1.6033053398132324,
383
- 1.3812410831451416,
384
- 0.426512748003006,
385
- 0.0915759801864624,
386
- 0.06641378998756409,
387
- 0.06541288644075394,
388
- 0.05397297441959381,
389
- 0.049725256860256195,
390
- 0.09616798162460327,
391
- 0.07721105217933655,
392
- 0.06286955624818802,
393
- 0.06927378475666046,
394
- 0.05181824788451195,
395
- 0.05314686521887779,
396
- 0.04097625985741615,
397
- 0.04374873265624046,
398
- 0.032902587205171585,
399
- 0.03117380663752556,
400
- 0.03125934302806854,
401
- 0.02452976070344448,
402
- 0.025319188833236694,
403
- 0.03315161168575287,
404
- 0.03438675403594971,
405
- 0.03433896601200104,
406
- 0.023289574310183525,
407
- 0.02627655677497387,
408
- 0.019970480352640152,
409
- 0.029496990144252777,
410
- 0.029075412079691887,
411
- 0.022487249225378036,
412
- 0.02366235852241516,
413
- 0.016098806634545326,
414
- 0.01269473321735859,
415
- 0.02091185189783573,
416
- 0.01328546553850174,
417
- 0.011220241896808147,
418
- 0.014205502346158028,
419
- 0.01245021354407072,
420
- 0.012731088325381279,
421
- 0.013454608619213104,
422
- 0.01804427243769169,
423
- 0.019020183011889458,
424
- 0.011872355826199055,
425
- 0.017420047894120216,
426
- 0.013317206874489784,
427
- 0.010211849585175514,
428
- 0.016086921095848083,
429
- 0.014749030582606792,
430
- 0.013347554951906204,
431
- 0.012769741006195545,
432
- 0.013671365566551685,
433
- 0.011490320786833763
434
  ],
435
  "denoise_loss": [],
436
  "ortho_loss": [
437
- 0.5640959739685059,
438
- 0.2631226181983948,
439
- 0.14999699592590332,
440
- 0.12239736318588257,
441
- 0.09714409708976746,
442
- 0.07303234189748764,
443
- 0.06550216674804688,
444
- 0.06051327660679817,
445
- 0.07507379353046417,
446
- 0.084007628262043,
447
- 0.08215909451246262,
448
- 0.08330931514501572,
449
- 0.08743671327829361,
450
- 0.09553566575050354,
451
- 0.08920977264642715,
452
- 0.09277433902025223,
453
- 0.09217482805252075,
454
- 0.09592001140117645,
455
- 0.10014844685792923,
456
- 0.10119500756263733,
457
- 0.09707954525947571,
458
- 0.097630575299263,
459
- 0.09751289337873459,
460
- 0.09296217560768127,
461
- 0.08584143221378326,
462
- 0.08582761883735657,
463
- 0.08925554901361465,
464
- 0.09694031625986099,
465
- 0.10842876881361008,
466
- 0.11049525439739227,
467
- 0.11342362314462662,
468
- 0.11486317962408066,
469
- 0.11290319263935089,
470
- 0.1154816746711731,
471
- 0.11996712535619736,
472
- 0.13185496628284454,
473
- 0.12799306213855743,
474
- 0.13368353247642517,
475
- 0.13123612105846405,
476
- 0.13329772651195526,
477
- 0.1350865513086319,
478
- 0.13771839439868927,
479
- 0.1451946496963501,
480
- 0.15061640739440918,
481
- 0.1515263319015503,
482
- 0.1512250006198883,
483
- 0.14959852397441864,
484
- 0.15471874177455902,
485
- 0.1546550989151001,
486
- 0.15259236097335815,
487
- 0.1592143177986145,
488
- 0.16225320100784302,
489
- 0.16515889763832092,
490
- 0.16539309918880463,
491
- 0.16331565380096436,
492
- 0.16385121643543243,
493
- 0.17084619402885437,
494
- 0.17373836040496826,
495
- 0.17300517857074738,
496
- 0.17302757501602173,
497
- 0.17285387217998505,
498
- 0.17389743030071259,
499
- 0.17504020035266876,
500
- 0.17735148966312408,
501
- 0.18059377372264862,
502
- 0.18277068436145782,
503
- 0.1846488118171692,
504
- 0.18622788786888123,
505
- 0.18851174414157867,
506
- 0.19029369950294495
507
  ],
508
  "lr": [
509
- 1.6752136752136756e-05,
510
- 3.384615384615385e-05,
511
- 4e-05,
512
- 4e-05,
513
- 4e-05,
514
- 4e-05,
515
- 4e-05,
516
- 4e-05,
517
- 4e-05,
518
- 4e-05,
519
- 4e-05,
520
- 4e-05,
521
- 4e-05,
522
- 4e-05,
523
- 4e-05,
524
- 4e-05,
525
- 4e-05,
526
- 4e-05,
527
- 4e-05,
528
- 4e-05,
529
- 4e-05,
530
- 4e-05,
531
- 4e-05,
532
- 4e-05,
533
- 4e-05,
534
- 4e-05,
535
- 4e-05,
536
- 4e-05,
537
- 4e-05,
538
- 4e-05,
539
- 4e-05,
540
- 4e-05,
541
- 4e-05,
542
- 4e-05,
543
- 4e-05,
544
- 4e-05,
545
- 4e-05,
546
- 4e-05,
547
- 4e-05,
548
- 4e-05,
549
- 4e-05,
550
- 4e-05,
551
- 3.9947798576324814e-05,
552
- 3.8761402583706826e-05,
553
- 3.757500659108885e-05,
554
- 3.6388610598470864e-05,
555
- 3.5202214605852884e-05,
556
- 3.401581861323491e-05,
557
- 3.282942262061693e-05,
558
- 3.0670181914052204e-05,
559
- 2.948378592143422e-05,
560
- 2.8297389928816243e-05,
561
- 2.711099393619826e-05,
562
- 2.5924597943580284e-05,
563
- 2.4738201950962303e-05,
564
- 2.3551805958344316e-05,
565
- 2.1392565251779595e-05,
566
- 2.020616925916161e-05,
567
- 1.901977326654364e-05,
568
- 1.783337727392566e-05,
569
- 1.6646981281307675e-05,
570
- 1.546058528868969e-05,
571
- 1.427418929607171e-05,
572
- 1.2114948589506984e-05,
573
- 1.0928552596889013e-05,
574
- 9.742156604271029e-06,
575
- 8.555760611653046e-06,
576
- 7.369364619035064e-06,
577
- 6.182968626417082e-06,
578
- 4.996572633799099e-06
579
  ],
580
  "emb_lr": [],
581
  "eval_step": [
@@ -593,17 +593,17 @@
593
  "eval_accuracy": [
594
  0.01,
595
  0.0,
596
- 0.01,
597
- 0.12,
598
- 0.26,
599
- 0.6,
600
- 0.59,
601
- 0.76,
602
- 0.78,
603
- 0.83
604
  ]
605
  },
606
- "final_accuracy": 0.6191666666666666,
607
  "sft_eval": {
608
  "config": {
609
  "ops": "add_sub",
@@ -614,11 +614,11 @@
614
  },
615
  "splits": {
616
  "add_S0": {
617
- "full_accuracy": 0.82,
618
  "n_examples": 100,
619
  "per_subtask": {
620
  "SA": {
621
- "accuracy": 0.9735537190082645,
622
  "count": 605
623
  },
624
  "SS": {
@@ -628,107 +628,107 @@
628
  }
629
  },
630
  "add_S1": {
631
- "full_accuracy": 0.83,
632
  "n_examples": 100,
633
  "per_subtask": {
634
  "SA": {
635
- "accuracy": 0.9754901960784313,
636
  "count": 204
637
  },
638
  "SC": {
639
- "accuracy": 0.9881656804733728,
640
  "count": 169
641
  },
642
  "SS": {
643
- "accuracy": 1.0,
644
  "count": 31
645
  },
646
  "UC": {
647
- "accuracy": 0.9628378378378378,
648
  "count": 296
649
  }
650
  }
651
  },
652
  "add_S2": {
653
- "full_accuracy": 0.59,
654
  "n_examples": 100,
655
  "per_subtask": {
656
  "SA": {
657
- "accuracy": 0.9815950920245399,
658
  "count": 163
659
  },
660
  "SC": {
661
- "accuracy": 0.9384615384615385,
662
  "count": 130
663
  },
664
  "SS": {
665
- "accuracy": 0.9425287356321839,
666
  "count": 87
667
  },
668
  "UC": {
669
- "accuracy": 0.8669950738916257,
670
  "count": 203
671
  },
672
  "US": {
673
- "accuracy": 0.9145299145299145,
674
  "count": 117
675
  }
676
  }
677
  },
678
  "add_S3": {
679
- "full_accuracy": 0.38,
680
  "n_examples": 100,
681
  "per_subtask": {
682
  "SA": {
683
- "accuracy": 0.9586776859504132,
684
  "count": 121
685
  },
686
  "SC": {
687
- "accuracy": 0.9752066115702479,
688
  "count": 121
689
  },
690
  "SS": {
691
- "accuracy": 0.9387755102040817,
692
  "count": 49
693
  },
694
  "UC": {
695
- "accuracy": 0.7204301075268817,
696
  "count": 186
697
  },
698
  "US": {
699
- "accuracy": 0.7982062780269058,
700
  "count": 223
701
  }
702
  }
703
  },
704
  "add_S4": {
705
- "full_accuracy": 0.31,
706
  "n_examples": 100,
707
  "per_subtask": {
708
  "SA": {
709
- "accuracy": 0.9807692307692307,
710
  "count": 104
711
  },
712
  "SC": {
713
- "accuracy": 0.9716981132075472,
714
  "count": 106
715
  },
716
  "SS": {
717
- "accuracy": 1.0,
718
  "count": 23
719
  },
720
  "UC": {
721
- "accuracy": 0.63125,
722
  "count": 160
723
  },
724
  "US": {
725
- "accuracy": 0.6742671009771987,
726
  "count": 307
727
  }
728
  }
729
  },
730
  "add_S5": {
731
- "full_accuracy": 0.28,
732
  "n_examples": 100,
733
  "per_subtask": {
734
  "SA": {
@@ -736,15 +736,15 @@
736
  "count": 100
737
  },
738
  "SC": {
739
- "accuracy": 1.0,
740
  "count": 100
741
  },
742
  "UC": {
743
- "accuracy": 0.46,
744
  "count": 100
745
  },
746
  "US": {
747
- "accuracy": 0.51,
748
  "count": 400
749
  }
750
  }
@@ -758,87 +758,87 @@
758
  "count": 100
759
  },
760
  "UC": {
761
- "accuracy": 0.44,
762
  "count": 100
763
  },
764
  "US": {
765
- "accuracy": 0.582,
766
  "count": 500
767
  }
768
  }
769
  },
770
  "add_random": {
771
- "full_accuracy": 0.785,
772
  "n_examples": 200,
773
  "per_subtask": {
774
  "SA": {
775
- "accuracy": 0.9753914988814317,
776
  "count": 447
777
  },
778
  "SC": {
779
- "accuracy": 0.98125,
780
  "count": 320
781
  },
782
  "SS": {
783
- "accuracy": 0.9821428571428571,
784
  "count": 56
785
  },
786
  "UC": {
787
- "accuracy": 0.9489603024574669,
788
  "count": 529
789
  },
790
  "US": {
791
- "accuracy": 0.7708333333333334,
792
  "count": 48
793
  }
794
  }
795
  },
796
  "add_C3": {
797
- "full_accuracy": 0.56,
798
  "n_examples": 100,
799
  "per_subtask": {
800
  "SA": {
801
- "accuracy": 0.9966666666666667,
802
  "count": 300
803
  },
804
  "SC": {
805
- "accuracy": 1.0,
806
  "count": 100
807
  },
808
  "UC": {
809
- "accuracy": 0.772020725388601,
810
  "count": 193
811
  },
812
  "US": {
813
- "accuracy": 0.8037383177570093,
814
  "count": 107
815
  }
816
  }
817
  },
818
  "add_C4": {
819
- "full_accuracy": 0.54,
820
  "n_examples": 100,
821
  "per_subtask": {
822
  "SA": {
823
- "accuracy": 1.0,
824
  "count": 200
825
  },
826
  "SC": {
827
- "accuracy": 1.0,
828
  "count": 100
829
  },
830
  "UC": {
831
- "accuracy": 0.8359375,
832
  "count": 256
833
  },
834
  "US": {
835
- "accuracy": 0.8611111111111112,
836
  "count": 144
837
  }
838
  }
839
  },
840
  "add_C5": {
841
- "full_accuracy": 0.4,
842
  "n_examples": 100,
843
  "per_subtask": {
844
  "SA": {
@@ -846,21 +846,21 @@
846
  "count": 100
847
  },
848
  "SC": {
849
- "accuracy": 1.0,
850
  "count": 100
851
  },
852
  "UC": {
853
- "accuracy": 0.803921568627451,
854
  "count": 306
855
  },
856
  "US": {
857
- "accuracy": 0.7731958762886598,
858
  "count": 194
859
  }
860
  }
861
  },
862
  "add_C6": {
863
- "full_accuracy": 0.52,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SC": {
@@ -868,21 +868,21 @@
868
  "count": 100
869
  },
870
  "UC": {
871
- "accuracy": 0.8688524590163934,
872
  "count": 366
873
  },
874
  "US": {
875
- "accuracy": 0.8675213675213675,
876
  "count": 234
877
  }
878
  }
879
  },
880
  "sub_M0": {
881
- "full_accuracy": 0.9,
882
  "n_examples": 100,
883
  "per_subtask": {
884
  "MD": {
885
- "accuracy": 0.9850249584026622,
886
  "count": 601
887
  },
888
  "ME": {
@@ -892,15 +892,15 @@
892
  }
893
  },
894
  "sub_M1": {
895
- "full_accuracy": 0.9,
896
  "n_examples": 100,
897
  "per_subtask": {
898
  "MD": {
899
- "accuracy": 0.989247311827957,
900
  "count": 279
901
  },
902
  "MB": {
903
- "accuracy": 0.9793103448275862,
904
  "count": 145
905
  },
906
  "ME": {
@@ -908,39 +908,39 @@
908
  "count": 24
909
  },
910
  "UB": {
911
- "accuracy": 0.9880952380952381,
912
  "count": 252
913
  }
914
  }
915
  },
916
  "sub_M2": {
917
- "full_accuracy": 0.65,
918
  "n_examples": 100,
919
  "per_subtask": {
920
  "MD": {
921
- "accuracy": 0.9953051643192489,
922
  "count": 213
923
  },
924
  "MB": {
925
- "accuracy": 1.0,
926
  "count": 113
927
  },
928
  "ME": {
929
- "accuracy": 0.9764705882352941,
930
  "count": 85
931
  },
932
  "UB": {
933
- "accuracy": 0.8176795580110497,
934
  "count": 181
935
  },
936
  "UD": {
937
- "accuracy": 0.9537037037037037,
938
  "count": 108
939
  }
940
  }
941
  },
942
  "sub_M3": {
943
- "full_accuracy": 0.24,
944
  "n_examples": 100,
945
  "per_subtask": {
946
  "MD": {
@@ -948,25 +948,25 @@
948
  "count": 179
949
  },
950
  "MB": {
951
- "accuracy": 0.9902912621359223,
952
  "count": 103
953
  },
954
  "ME": {
955
- "accuracy": 1.0,
956
  "count": 56
957
  },
958
  "UB": {
959
- "accuracy": 0.5234899328859061,
960
  "count": 149
961
  },
962
  "UD": {
963
- "accuracy": 0.7934272300469484,
964
  "count": 213
965
  }
966
  }
967
  },
968
  "sub_M4": {
969
- "full_accuracy": 0.01,
970
  "n_examples": 100,
971
  "per_subtask": {
972
  "MD": {
@@ -978,17 +978,17 @@
978
  "count": 100
979
  },
980
  "UB": {
981
- "accuracy": 0.39,
982
  "count": 100
983
  },
984
  "UD": {
985
- "accuracy": 0.43666666666666665,
986
  "count": 300
987
  }
988
  }
989
  },
990
  "sub_M5": {
991
- "full_accuracy": 0.0,
992
  "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
@@ -1000,21 +1000,21 @@
1000
  "count": 100
1001
  },
1002
  "UB": {
1003
- "accuracy": 0.39,
1004
  "count": 100
1005
  },
1006
  "UD": {
1007
- "accuracy": 0.275,
1008
  "count": 400
1009
  }
1010
  }
1011
  },
1012
  "sub_random": {
1013
- "full_accuracy": 0.81,
1014
  "n_examples": 200,
1015
  "per_subtask": {
1016
  "MD": {
1017
- "accuracy": 0.9933333333333333,
1018
  "count": 600
1019
  },
1020
  "MB": {
@@ -1022,65 +1022,65 @@
1022
  "count": 267
1023
  },
1024
  "ME": {
1025
- "accuracy": 0.9622641509433962,
1026
  "count": 53
1027
  },
1028
  "UB": {
1029
- "accuracy": 0.9476082004555809,
1030
  "count": 439
1031
  },
1032
  "UD": {
1033
- "accuracy": 0.975609756097561,
1034
  "count": 41
1035
  }
1036
  }
1037
  },
1038
  "sub_B3": {
1039
- "full_accuracy": 0.62,
1040
  "n_examples": 100,
1041
  "per_subtask": {
1042
  "MD": {
1043
- "accuracy": 0.9966666666666667,
1044
  "count": 300
1045
  },
1046
  "MB": {
1047
- "accuracy": 1.0,
1048
  "count": 100
1049
  },
1050
  "UB": {
1051
- "accuracy": 0.8274111675126904,
1052
  "count": 197
1053
  },
1054
  "UD": {
1055
- "accuracy": 0.7669902912621359,
1056
  "count": 103
1057
  }
1058
  }
1059
  },
1060
  "sub_B4": {
1061
- "full_accuracy": 0.38,
1062
  "n_examples": 100,
1063
  "per_subtask": {
1064
  "MD": {
1065
- "accuracy": 1.0,
1066
  "count": 200
1067
  },
1068
  "MB": {
1069
- "accuracy": 1.0,
1070
  "count": 100
1071
  },
1072
  "UB": {
1073
- "accuracy": 0.7732793522267206,
1074
  "count": 247
1075
  },
1076
  "UD": {
1077
- "accuracy": 0.7450980392156863,
1078
  "count": 153
1079
  }
1080
  }
1081
  },
1082
  "sub_B5": {
1083
- "full_accuracy": 0.37,
1084
  "n_examples": 100,
1085
  "per_subtask": {
1086
  "MD": {
@@ -1092,18 +1092,18 @@
1092
  "count": 100
1093
  },
1094
  "UB": {
1095
- "accuracy": 0.785234899328859,
1096
  "count": 298
1097
  },
1098
  "UD": {
1099
- "accuracy": 0.7524752475247525,
1100
  "count": 202
1101
  }
1102
  }
1103
  }
1104
  },
1105
  "summary": {
1106
- "overall_accuracy": 0.5375,
1107
  "total_examples": 2400,
1108
  "n_splits": 22
1109
  }
@@ -1118,11 +1118,11 @@
1118
  },
1119
  "splits": {
1120
  "add_S0": {
1121
- "full_accuracy": 0.97,
1122
  "n_examples": 100,
1123
  "per_subtask": {
1124
  "SA": {
1125
- "accuracy": 0.9950413223140496,
1126
  "count": 605
1127
  },
1128
  "SS": {
@@ -1132,15 +1132,15 @@
1132
  }
1133
  },
1134
  "add_S1": {
1135
- "full_accuracy": 0.9,
1136
  "n_examples": 100,
1137
  "per_subtask": {
1138
  "SA": {
1139
- "accuracy": 0.9852941176470589,
1140
  "count": 204
1141
  },
1142
  "SC": {
1143
- "accuracy": 0.9822485207100592,
1144
  "count": 169
1145
  },
1146
  "SS": {
@@ -1148,29 +1148,29 @@
1148
  "count": 31
1149
  },
1150
  "UC": {
1151
- "accuracy": 0.9864864864864865,
1152
  "count": 296
1153
  }
1154
  }
1155
  },
1156
  "add_S2": {
1157
- "full_accuracy": 0.76,
1158
  "n_examples": 100,
1159
  "per_subtask": {
1160
  "SA": {
1161
- "accuracy": 0.9815950920245399,
1162
  "count": 163
1163
  },
1164
  "SC": {
1165
- "accuracy": 0.9615384615384616,
1166
  "count": 130
1167
  },
1168
  "SS": {
1169
- "accuracy": 0.9425287356321839,
1170
  "count": 87
1171
  },
1172
  "UC": {
1173
- "accuracy": 0.9261083743842364,
1174
  "count": 203
1175
  },
1176
  "US": {
@@ -1180,33 +1180,33 @@
1180
  }
1181
  },
1182
  "add_S3": {
1183
- "full_accuracy": 0.57,
1184
  "n_examples": 100,
1185
  "per_subtask": {
1186
  "SA": {
1187
- "accuracy": 0.9917355371900827,
1188
  "count": 121
1189
  },
1190
  "SC": {
1191
- "accuracy": 0.9834710743801653,
1192
  "count": 121
1193
  },
1194
  "SS": {
1195
- "accuracy": 0.9591836734693877,
1196
  "count": 49
1197
  },
1198
  "UC": {
1199
- "accuracy": 0.7956989247311828,
1200
  "count": 186
1201
  },
1202
  "US": {
1203
- "accuracy": 0.9192825112107623,
1204
  "count": 223
1205
  }
1206
  }
1207
  },
1208
  "add_S4": {
1209
- "full_accuracy": 0.38,
1210
  "n_examples": 100,
1211
  "per_subtask": {
1212
  "SA": {
@@ -1222,17 +1222,17 @@
1222
  "count": 23
1223
  },
1224
  "UC": {
1225
- "accuracy": 0.68125,
1226
  "count": 160
1227
  },
1228
  "US": {
1229
- "accuracy": 0.7654723127035831,
1230
  "count": 307
1231
  }
1232
  }
1233
  },
1234
  "add_S5": {
1235
- "full_accuracy": 0.19,
1236
  "n_examples": 100,
1237
  "per_subtask": {
1238
  "SA": {
@@ -1244,17 +1244,17 @@
1244
  "count": 100
1245
  },
1246
  "UC": {
1247
- "accuracy": 0.32,
1248
  "count": 100
1249
  },
1250
  "US": {
1251
- "accuracy": 0.505,
1252
  "count": 400
1253
  }
1254
  }
1255
  },
1256
  "add_S6": {
1257
- "full_accuracy": 0.38,
1258
  "n_examples": 100,
1259
  "per_subtask": {
1260
  "SC": {
@@ -1262,43 +1262,43 @@
1262
  "count": 100
1263
  },
1264
  "UC": {
1265
- "accuracy": 0.59,
1266
  "count": 100
1267
  },
1268
  "US": {
1269
- "accuracy": 0.634,
1270
  "count": 500
1271
  }
1272
  }
1273
  },
1274
  "add_random": {
1275
- "full_accuracy": 0.885,
1276
  "n_examples": 200,
1277
  "per_subtask": {
1278
  "SA": {
1279
- "accuracy": 0.9865771812080537,
1280
  "count": 447
1281
  },
1282
  "SC": {
1283
- "accuracy": 0.98125,
1284
  "count": 320
1285
  },
1286
  "SS": {
1287
- "accuracy": 0.9642857142857143,
1288
  "count": 56
1289
  },
1290
  "UC": {
1291
- "accuracy": 0.9792060491493384,
1292
  "count": 529
1293
  },
1294
  "US": {
1295
- "accuracy": 0.9375,
1296
  "count": 48
1297
  }
1298
  }
1299
  },
1300
  "add_C3": {
1301
- "full_accuracy": 0.71,
1302
  "n_examples": 100,
1303
  "per_subtask": {
1304
  "SA": {
@@ -1310,17 +1310,17 @@
1310
  "count": 100
1311
  },
1312
  "UC": {
1313
- "accuracy": 0.8497409326424871,
1314
  "count": 193
1315
  },
1316
  "US": {
1317
- "accuracy": 0.9345794392523364,
1318
  "count": 107
1319
  }
1320
  }
1321
  },
1322
  "add_C4": {
1323
- "full_accuracy": 0.76,
1324
  "n_examples": 100,
1325
  "per_subtask": {
1326
  "SA": {
@@ -1332,17 +1332,17 @@
1332
  "count": 100
1333
  },
1334
  "UC": {
1335
- "accuracy": 0.91015625,
1336
  "count": 256
1337
  },
1338
  "US": {
1339
- "accuracy": 0.9236111111111112,
1340
  "count": 144
1341
  }
1342
  }
1343
  },
1344
  "add_C5": {
1345
- "full_accuracy": 0.54,
1346
  "n_examples": 100,
1347
  "per_subtask": {
1348
  "SA": {
@@ -1354,17 +1354,17 @@
1354
  "count": 100
1355
  },
1356
  "UC": {
1357
- "accuracy": 0.8594771241830066,
1358
  "count": 306
1359
  },
1360
  "US": {
1361
- "accuracy": 0.8402061855670103,
1362
  "count": 194
1363
  }
1364
  }
1365
  },
1366
  "add_C6": {
1367
- "full_accuracy": 0.68,
1368
  "n_examples": 100,
1369
  "per_subtask": {
1370
  "SC": {
@@ -1372,21 +1372,21 @@
1372
  "count": 100
1373
  },
1374
  "UC": {
1375
- "accuracy": 0.9098360655737705,
1376
  "count": 366
1377
  },
1378
  "US": {
1379
- "accuracy": 0.9487179487179487,
1380
  "count": 234
1381
  }
1382
  }
1383
  },
1384
  "sub_M0": {
1385
- "full_accuracy": 0.98,
1386
  "n_examples": 100,
1387
  "per_subtask": {
1388
  "MD": {
1389
- "accuracy": 0.9966722129783694,
1390
  "count": 601
1391
  },
1392
  "ME": {
@@ -1396,7 +1396,7 @@
1396
  }
1397
  },
1398
  "sub_M1": {
1399
- "full_accuracy": 0.97,
1400
  "n_examples": 100,
1401
  "per_subtask": {
1402
  "MD": {
@@ -1404,11 +1404,11 @@
1404
  "count": 279
1405
  },
1406
  "MB": {
1407
- "accuracy": 0.9862068965517241,
1408
  "count": 145
1409
  },
1410
  "ME": {
1411
- "accuracy": 0.9583333333333334,
1412
  "count": 24
1413
  },
1414
  "UB": {
@@ -1418,15 +1418,15 @@
1418
  }
1419
  },
1420
  "sub_M2": {
1421
- "full_accuracy": 0.63,
1422
  "n_examples": 100,
1423
  "per_subtask": {
1424
  "MD": {
1425
- "accuracy": 0.9906103286384976,
1426
  "count": 213
1427
  },
1428
  "MB": {
1429
- "accuracy": 0.9911504424778761,
1430
  "count": 113
1431
  },
1432
  "ME": {
@@ -1434,7 +1434,7 @@
1434
  "count": 85
1435
  },
1436
  "UB": {
1437
- "accuracy": 0.8121546961325967,
1438
  "count": 181
1439
  },
1440
  "UD": {
@@ -1444,15 +1444,15 @@
1444
  }
1445
  },
1446
  "sub_M3": {
1447
- "full_accuracy": 0.2,
1448
  "n_examples": 100,
1449
  "per_subtask": {
1450
  "MD": {
1451
- "accuracy": 0.994413407821229,
1452
  "count": 179
1453
  },
1454
  "MB": {
1455
- "accuracy": 0.9902912621359223,
1456
  "count": 103
1457
  },
1458
  "ME": {
@@ -1460,17 +1460,17 @@
1460
  "count": 56
1461
  },
1462
  "UB": {
1463
- "accuracy": 0.4966442953020134,
1464
  "count": 149
1465
  },
1466
  "UD": {
1467
- "accuracy": 0.8075117370892019,
1468
  "count": 213
1469
  }
1470
  }
1471
  },
1472
  "sub_M4": {
1473
- "full_accuracy": 0.06,
1474
  "n_examples": 100,
1475
  "per_subtask": {
1476
  "MD": {
@@ -1482,17 +1482,17 @@
1482
  "count": 100
1483
  },
1484
  "UB": {
1485
- "accuracy": 0.25,
1486
  "count": 100
1487
  },
1488
  "UD": {
1489
- "accuracy": 0.5,
1490
  "count": 300
1491
  }
1492
  }
1493
  },
1494
  "sub_M5": {
1495
- "full_accuracy": 0.08,
1496
  "n_examples": 100,
1497
  "per_subtask": {
1498
  "MD": {
@@ -1504,25 +1504,25 @@
1504
  "count": 100
1505
  },
1506
  "UB": {
1507
- "accuracy": 0.22,
1508
  "count": 100
1509
  },
1510
  "UD": {
1511
- "accuracy": 0.425,
1512
  "count": 400
1513
  }
1514
  }
1515
  },
1516
  "sub_random": {
1517
- "full_accuracy": 0.87,
1518
  "n_examples": 200,
1519
  "per_subtask": {
1520
  "MD": {
1521
- "accuracy": 0.9933333333333333,
1522
  "count": 600
1523
  },
1524
  "MB": {
1525
- "accuracy": 0.9887640449438202,
1526
  "count": 267
1527
  },
1528
  "ME": {
@@ -1530,21 +1530,21 @@
1530
  "count": 53
1531
  },
1532
  "UB": {
1533
- "accuracy": 0.9567198177676538,
1534
  "count": 439
1535
  },
1536
  "UD": {
1537
- "accuracy": 0.975609756097561,
1538
  "count": 41
1539
  }
1540
  }
1541
  },
1542
  "sub_B3": {
1543
- "full_accuracy": 0.67,
1544
  "n_examples": 100,
1545
  "per_subtask": {
1546
  "MD": {
1547
- "accuracy": 0.9933333333333333,
1548
  "count": 300
1549
  },
1550
  "MB": {
@@ -1552,17 +1552,17 @@
1552
  "count": 100
1553
  },
1554
  "UB": {
1555
- "accuracy": 0.8629441624365483,
1556
  "count": 197
1557
  },
1558
  "UD": {
1559
- "accuracy": 0.8349514563106796,
1560
  "count": 103
1561
  }
1562
  }
1563
  },
1564
  "sub_B4": {
1565
- "full_accuracy": 0.55,
1566
  "n_examples": 100,
1567
  "per_subtask": {
1568
  "MD": {
@@ -1574,17 +1574,17 @@
1574
  "count": 100
1575
  },
1576
  "UB": {
1577
- "accuracy": 0.8259109311740891,
1578
  "count": 247
1579
  },
1580
  "UD": {
1581
- "accuracy": 0.8169934640522876,
1582
  "count": 153
1583
  }
1584
  }
1585
  },
1586
  "sub_B5": {
1587
- "full_accuracy": 0.38,
1588
  "n_examples": 100,
1589
  "per_subtask": {
1590
  "MD": {
@@ -1596,22 +1596,22 @@
1596
  "count": 100
1597
  },
1598
  "UB": {
1599
- "accuracy": 0.7986577181208053,
1600
  "count": 298
1601
  },
1602
  "UD": {
1603
- "accuracy": 0.7574257425742574,
1604
  "count": 202
1605
  }
1606
  }
1607
  }
1608
  },
1609
  "summary": {
1610
- "overall_accuracy": 0.6191666666666666,
1611
  "total_examples": 2400,
1612
  "n_splits": 22
1613
  }
1614
  },
1615
- "sorl_overall_accuracy": 0.6191666666666666,
1616
- "sft_overall_accuracy": 0.5375
1617
  }
 
73
  3869
74
  ],
75
  "loss": [
76
+ 12.149293899536133,
77
+ 5.929956436157227,
78
+ 2.9691457748413086,
79
+ 2.728782892227173,
80
+ 2.947129726409912,
81
+ 2.653822422027588,
82
+ 2.7324559688568115,
83
+ 2.199665069580078,
84
+ 1.981349229812622,
85
+ 1.708694338798523,
86
+ 1.9832797050476074,
87
+ 0.28236889839172363,
88
+ -0.3522413372993469,
89
+ -0.5116673111915588,
90
+ -5.281974792480469,
91
+ -7.145444393157959,
92
+ -9.3859281539917,
93
+ -9.920683860778809,
94
+ -10.891196250915527,
95
+ -11.45218563079834,
96
+ -12.591782569885254,
97
+ -13.195165634155273,
98
+ -13.34947395324707,
99
+ -13.665068626403809,
100
+ -13.288025856018066,
101
+ -13.608355522155762,
102
+ -13.651628494262695,
103
+ -13.755077362060547,
104
+ -13.579610824584961,
105
+ -13.95695972442627,
106
+ -13.180118560791016,
107
+ -13.482614517211914,
108
+ -13.230802536010742,
109
+ -12.17918586730957,
110
+ -10.092674255371094,
111
+ -6.810609340667725,
112
+ -6.023334503173828,
113
+ -5.472952842712402,
114
+ -5.557395935058594,
115
+ -5.194924831390381,
116
+ -4.636050701141357,
117
+ -5.174846172332764,
118
+ -4.493751049041748,
119
+ -4.809574127197266,
120
+ -4.58535099029541,
121
+ -3.341444253921509,
122
+ -4.477597236633301,
123
+ -3.532081365585327,
124
+ -3.53200626373291,
125
+ -3.320857286453247,
126
+ -3.7618467807769775,
127
+ -3.268784761428833,
128
+ -3.0646679401397705,
129
+ -2.330127477645874,
130
+ -2.34325909614563,
131
+ -2.682443141937256,
132
+ -2.45845365524292,
133
+ -2.6193289756774902,
134
+ -2.703904867172241,
135
+ -2.787076711654663,
136
+ -2.433518171310425,
137
+ -2.264406442642212,
138
+ -1.8057308197021484,
139
+ -1.878203272819519,
140
+ -2.12003231048584,
141
+ -2.0383877754211426,
142
+ -2.1631381511688232,
143
+ -2.256495714187622,
144
+ -2.0132031440734863,
145
+ -1.6519348621368408
146
  ],
147
  "base_loss": [
148
+ 7.158026695251465,
149
+ 4.391642093658447,
150
+ 2.067096710205078,
151
+ 1.9902334213256836,
152
+ 1.9624253511428833,
153
+ 1.9029481410980225,
154
+ 1.8107260465621948,
155
+ 1.85612952709198,
156
+ 1.8203506469726562,
157
+ 1.7635061740875244,
158
+ 1.710713505744934,
159
+ 1.8186479806900024,
160
+ 1.7509092092514038,
161
+ 1.7742748260498047,
162
+ 1.9272178411483765,
163
+ 1.6962559223175049,
164
+ 1.742572546005249,
165
+ 1.651781678199768,
166
+ 1.743944764137268,
167
+ 1.6543207168579102,
168
+ 1.713712453842163,
169
+ 1.6910855770111084,
170
+ 1.6437350511550903,
171
+ 1.6731939315795898,
172
+ 1.651646614074707,
173
+ 1.6904518604278564,
174
+ 1.622711420059204,
175
+ 1.637228012084961,
176
+ 1.63518488407135,
177
+ 1.6473982334136963,
178
+ 1.5401723384857178,
179
+ 1.5839508771896362,
180
+ 1.5411393642425537,
181
+ 1.4250189065933228,
182
+ 1.1851530075073242,
183
+ 0.8436911702156067,
184
+ 0.7452899217605591,
185
+ 0.6555761098861694,
186
+ 0.6856979727745056,
187
+ 0.64599609375,
188
+ 0.5920876264572144,
189
+ 0.6134722828865051,
190
+ 0.5435965657234192,
191
+ 0.574462890625,
192
+ 0.5591358542442322,
193
+ 0.4351283609867096,
194
+ 0.5301463007926941,
195
+ 0.43902722001075745,
196
+ 0.4200439453125,
197
+ 0.4339711666107178,
198
+ 0.4334247410297394,
199
+ 0.3855840861797333,
200
+ 0.36069685220718384,
201
+ 0.29347044229507446,
202
+ 0.2842704951763153,
203
+ 0.3291773498058319,
204
+ 0.29186540842056274,
205
+ 0.3001766502857208,
206
+ 0.30834123492240906,
207
+ 0.3233821392059326,
208
+ 0.2841505706310272,
209
+ 0.26011982560157776,
210
+ 0.2230202853679657,
211
+ 0.22029760479927063,
212
+ 0.24059365689754486,
213
+ 0.23020339012145996,
214
+ 0.2445732206106186,
215
+ 0.2575667202472687,
216
+ 0.22733500599861145,
217
+ 0.18718865513801575
218
  ],
219
  "info_loss": [
220
+ -0.2241072654724121,
221
+ -0.10442543029785156,
222
+ -0.10411179065704346,
223
+ -0.1153557300567627,
224
+ -0.08893084526062012,
225
+ -0.11244285106658936,
226
+ -0.09568226337432861,
227
+ -0.15248167514801025,
228
+ -0.16945350170135498,
229
+ -0.18039953708648682,
230
+ -0.11919641494750977,
231
+ -0.1902679204940796,
232
+ -0.22634315490722656,
233
+ -0.24306809902191162,
234
+ -0.7322999238967896,
235
+ -0.894987165927887,
236
+ -1.1232069730758667,
237
+ -1.1656540632247925,
238
+ -1.2713799476623535,
239
+ -1.3181113004684448,
240
+ -1.4363821744918823,
241
+ -1.4957610368728638,
242
+ -1.5041522979736328,
243
+ -1.53890860080719,
244
+ -1.4987587928771973,
245
+ -1.5347204208374023,
246
+ -1.5322381258010864,
247
+ -1.543616533279419,
248
+ -1.5256987810134888,
249
+ -1.564131736755371,
250
+ -1.476341724395752,
251
+ -1.5095551013946533,
252
+ -1.480704426765442,
253
+ -1.3635960817337036,
254
+ -1.1337618827819824,
255
+ -0.7684282064437866,
256
+ -0.6805359125137329,
257
+ -0.6158878207206726,
258
+ -0.6266580820083618,
259
+ -0.5865643620491028,
260
+ -0.5252509117126465,
261
+ -0.582808792591095,
262
+ -0.5078607201576233,
263
+ -0.5414115190505981,
264
+ -0.5167695879936218,
265
+ -0.3809068500995636,
266
+ -0.5038758516311646,
267
+ -0.4019751250743866,
268
+ -0.3984307050704956,
269
+ -0.378909170627594,
270
+ -0.42118901014328003,
271
+ -0.3680881857872009,
272
+ -0.344409704208374,
273
+ -0.2639271020889282,
274
+ -0.26402419805526733,
275
+ -0.30391913652420044,
276
+ -0.2776675522327423,
277
+ -0.2931075692176819,
278
+ -0.30392688512802124,
279
+ -0.3120562434196472,
280
+ -0.2729867696762085,
281
+ -0.2548932731151581,
282
+ -0.20517690479755402,
283
+ -0.21110056340694427,
284
+ -0.2384495884180069,
285
+ -0.2285153567790985,
286
+ -0.2433067113161087,
287
+ -0.2530016601085663,
288
+ -0.22626806795597076,
289
+ -0.1859578937292099
290
  ],
291
  "abs_loss": [
292
+ 2.222752094268799,
293
+ 1.926999568939209,
294
+ 1.8093197345733643,
295
+ 1.8339921236038208,
296
+ 1.759389877319336,
297
+ 1.8244154453277588,
298
+ 1.8642914295196533,
299
+ 1.744714617729187,
300
+ 1.7237495183944702,
301
+ 1.6763195991516113,
302
+ 1.4341479539871216,
303
+ 0.8320420980453491,
304
+ 0.58698970079422,
305
+ 0.5428140163421631,
306
+ 0.43639320135116577,
307
+ 0.37460923194885254,
308
+ 0.30772864818573,
309
+ 0.2563968002796173,
310
+ 0.2319200038909912,
311
+ 0.29884329438209534,
312
+ 0.24885083734989166,
313
+ 0.2609776258468628,
314
+ 0.20813710987567902,
315
+ 0.26550132036209106,
316
+ 0.2435016632080078,
317
+ 0.19937652349472046,
318
+ 0.21581536531448364,
319
+ 0.20377680659294128,
320
+ 0.22351153194904327,
321
+ 0.20612581074237823,
322
+ 0.16499526798725128,
323
+ 0.1456555277109146,
324
+ 0.09782898426055908,
325
+ 0.13325135409832,
326
+ 0.1102524921298027,
327
+ 0.09799818694591522,
328
+ 0.09346786141395569,
329
+ 0.09431788325309753,
330
+ 0.08420988917350769,
331
+ 0.06995618343353271,
332
+ 0.052987173199653625,
333
+ 0.060495052486658096,
334
+ 0.040083788335323334,
335
+ 0.05029800534248352,
336
+ 0.041374556720256805,
337
+ 0.047441694885492325,
338
+ 0.026004668325185776,
339
+ 0.01636277139186859,
340
+ 0.01893680915236473,
341
+ 0.04104611277580261,
342
+ 0.03436857461929321,
343
+ 0.024059899151325226,
344
+ 0.014520613476634026,
345
+ 0.02061675861477852,
346
+ 0.021550625562667847,
347
+ 0.015878906473517418,
348
+ 0.039841003715991974,
349
+ 0.019661087542772293,
350
+ 0.02790391817688942,
351
+ 0.006339336279779673,
352
+ 0.00404171971604228,
353
+ 0.011382443830370903,
354
+ 0.010472345165908337,
355
+ 0.007120729424059391,
356
+ 0.012258393689990044,
357
+ 0.012964775785803795,
358
+ 0.011299973353743553,
359
+ 0.016975682228803635,
360
+ 0.011367934755980968,
361
+ 0.010527610778808594
362
  ],
363
  "zipf_loss": [
364
+ 7.010064601898193,
365
+ 2.38986873626709,
366
+ 1.7622350454330444,
367
+ 1.708707571029663,
368
+ 1.6980737447738647,
369
+ 1.6928613185882568,
370
+ 1.6921234130859375,
371
+ 1.6938809156417847,
372
+ 1.6831586360931396,
373
+ 1.5815515518188477,
374
+ 1.321115493774414,
375
+ 0.2831959128379822,
376
+ 0.10158200562000275,
377
+ 0.09045746177434921,
378
+ 0.07016711682081223,
379
+ 0.07071135938167572,
380
+ 0.07279551029205322,
381
+ 0.05843561142683029,
382
+ 0.05546531453728676,
383
+ 0.044722579419612885,
384
+ 0.03344176337122917,
385
+ 0.04526158794760704,
386
+ 0.02750018984079361,
387
+ 0.024273207411170006,
388
+ 0.023565033450722694,
389
+ 0.02845955640077591,
390
+ 0.026460008695721626,
391
+ 0.02348167449235916,
392
+ 0.019840076565742493,
393
+ 0.016346683725714684,
394
+ 0.026626750826835632,
395
+ 0.01441938430070877,
396
+ 0.025319378823041916,
397
+ 0.01843027025461197,
398
+ 0.04876657947897911,
399
+ 0.02018149197101593,
400
+ 0.02738766372203827,
401
+ 0.02091757208108902,
402
+ 0.015065893530845642,
403
+ 0.017726987600326538,
404
+ 0.019072210416197777,
405
+ 0.033719502389431,
406
+ 0.03725103288888931,
407
+ 0.025048401206731796,
408
+ 0.01907178945839405,
409
+ 0.027751600369811058,
410
+ 0.02841419354081154,
411
+ 0.04700608178973198,
412
+ 0.030363060534000397,
413
+ 0.0301585104316473,
414
+ 0.013181859627366066,
415
+ 0.02410723641514778,
416
+ 0.01728036440908909,
417
+ 0.01361159235239029,
418
+ 0.010557424277067184,
419
+ 0.025982879102230072,
420
+ 0.02237217128276825,
421
+ 0.009604002349078655,
422
+ 0.024232294410467148,
423
+ 0.009469791315495968,
424
+ 0.011794861406087875,
425
+ 0.023268207907676697,
426
+ 0.021970605477690697,
427
+ 0.011792691424489021,
428
+ 0.022644102573394775,
429
+ 0.015265952795743942,
430
+ 0.024225622415542603,
431
+ 0.014256678521633148,
432
+ 0.02100590616464615,
433
+ 0.01940278336405754
434
  ],
435
  "denoise_loss": [],
436
  "ortho_loss": [
437
+ 0.3968747854232788,
438
+ 0.14926865696907043,
439
+ 0.07227928936481476,
440
+ 0.04244796931743622,
441
+ 0.04083942249417305,
442
+ 0.028592364862561226,
443
+ 0.02875097654759884,
444
+ 0.043541643768548965,
445
+ 0.058582015335559845,
446
+ 0.07232064008712769,
447
+ 0.08619970083236694,
448
+ 0.0898292139172554,
449
+ 0.08418357372283936,
450
+ 0.08772807568311691,
451
+ 0.09836088865995407,
452
+ 0.10131695121526718,
453
+ 0.10624177753925323,
454
+ 0.10543321073055267,
455
+ 0.10600952059030533,
456
+ 0.10784824192523956,
457
+ 0.10756736248731613,
458
+ 0.10823886841535568,
459
+ 0.10757088661193848,
460
+ 0.10496333986520767,
461
+ 0.10412697494029999,
462
+ 0.10040485858917236,
463
+ 0.10272025316953659,
464
+ 0.09985368698835373,
465
+ 0.09758050739765167,
466
+ 0.09805158525705338,
467
+ 0.09213665872812271,
468
+ 0.0930456593632698,
469
+ 0.09284541010856628,
470
+ 0.09186448901891708,
471
+ 0.09377385675907135,
472
+ 0.09639732539653778,
473
+ 0.11027049273252487,
474
+ 0.11258107423782349,
475
+ 0.10643363744020462,
476
+ 0.1086091622710228,
477
+ 0.12804602086544037,
478
+ 0.1236853078007698,
479
+ 0.12697289884090424,
480
+ 0.13189586997032166,
481
+ 0.1433846354484558,
482
+ 0.13937915861606598,
483
+ 0.1361546367406845,
484
+ 0.14090222120285034,
485
+ 0.1492113620042801,
486
+ 0.1503925323486328,
487
+ 0.1537848263978958,
488
+ 0.14883005619049072,
489
+ 0.16153961420059204,
490
+ 0.1610225886106491,
491
+ 0.16643743216991425,
492
+ 0.16853205859661102,
493
+ 0.17710106074810028,
494
+ 0.18421341478824615,
495
+ 0.1849854439496994,
496
+ 0.18908445537090302,
497
+ 0.19132596254348755,
498
+ 0.1937350183725357,
499
+ 0.20879752933979034,
500
+ 0.21170519292354584,
501
+ 0.21465632319450378,
502
+ 0.21528464555740356,
503
+ 0.2186288684606552,
504
+ 0.22191277146339417,
505
+ 0.22332896292209625,
506
+ 0.22412705421447754
507
  ],
508
  "lr": [
509
+ 3.350427350427351e-05,
510
+ 6.76923076923077e-05,
511
+ 8e-05,
512
+ 8e-05,
513
+ 8e-05,
514
+ 8e-05,
515
+ 8e-05,
516
+ 8e-05,
517
+ 8e-05,
518
+ 8e-05,
519
+ 8e-05,
520
+ 8e-05,
521
+ 8e-05,
522
+ 8e-05,
523
+ 8e-05,
524
+ 8e-05,
525
+ 8e-05,
526
+ 8e-05,
527
+ 8e-05,
528
+ 8e-05,
529
+ 8e-05,
530
+ 8e-05,
531
+ 8e-05,
532
+ 8e-05,
533
+ 8e-05,
534
+ 8e-05,
535
+ 8e-05,
536
+ 8e-05,
537
+ 8e-05,
538
+ 8e-05,
539
+ 8e-05,
540
+ 8e-05,
541
+ 8e-05,
542
+ 8e-05,
543
+ 8e-05,
544
+ 8e-05,
545
+ 8e-05,
546
+ 8e-05,
547
+ 8e-05,
548
+ 8e-05,
549
+ 8e-05,
550
+ 8e-05,
551
+ 7.989559715264963e-05,
552
+ 7.752280516741365e-05,
553
+ 7.51500131821777e-05,
554
+ 7.277722119694173e-05,
555
+ 7.040442921170577e-05,
556
+ 6.803163722646982e-05,
557
+ 6.565884524123386e-05,
558
+ 6.134036382810441e-05,
559
+ 5.896757184286844e-05,
560
+ 5.6594779857632485e-05,
561
+ 5.422198787239652e-05,
562
+ 5.184919588716057e-05,
563
+ 4.947640390192461e-05,
564
+ 4.710361191668863e-05,
565
+ 4.278513050355919e-05,
566
+ 4.041233851832322e-05,
567
+ 3.803954653308728e-05,
568
+ 3.566675454785132e-05,
569
+ 3.329396256261535e-05,
570
+ 3.092117057737938e-05,
571
+ 2.854837859214342e-05,
572
+ 2.4229897179013967e-05,
573
+ 2.1857105193778026e-05,
574
+ 1.9484313208542057e-05,
575
+ 1.7111521223306092e-05,
576
+ 1.4738729238070129e-05,
577
+ 1.2365937252834164e-05,
578
+ 9.993145267598198e-06
579
  ],
580
  "emb_lr": [],
581
  "eval_step": [
 
593
  "eval_accuracy": [
594
  0.01,
595
  0.0,
596
+ 0.67,
597
+ 0.85,
598
+ 0.88,
599
+ 0.91,
600
+ 0.93,
601
+ 0.95,
602
+ 1.0,
603
+ 0.99
604
  ]
605
  },
606
+ "final_accuracy": 0.9704166666666667,
607
  "sft_eval": {
608
  "config": {
609
  "ops": "add_sub",
 
614
  },
615
  "splits": {
616
  "add_S0": {
617
+ "full_accuracy": 0.55,
618
  "n_examples": 100,
619
  "per_subtask": {
620
  "SA": {
621
+ "accuracy": 0.9190082644628099,
622
  "count": 605
623
  },
624
  "SS": {
 
628
  }
629
  },
630
  "add_S1": {
631
+ "full_accuracy": 0.62,
632
  "n_examples": 100,
633
  "per_subtask": {
634
  "SA": {
635
+ "accuracy": 0.9215686274509803,
636
  "count": 204
637
  },
638
  "SC": {
639
+ "accuracy": 0.9467455621301775,
640
  "count": 169
641
  },
642
  "SS": {
643
+ "accuracy": 0.9032258064516129,
644
  "count": 31
645
  },
646
  "UC": {
647
+ "accuracy": 0.9391891891891891,
648
  "count": 296
649
  }
650
  }
651
  },
652
  "add_S2": {
653
+ "full_accuracy": 0.54,
654
  "n_examples": 100,
655
  "per_subtask": {
656
  "SA": {
657
+ "accuracy": 0.9447852760736196,
658
  "count": 163
659
  },
660
  "SC": {
661
+ "accuracy": 0.8769230769230769,
662
  "count": 130
663
  },
664
  "SS": {
665
+ "accuracy": 0.9310344827586207,
666
  "count": 87
667
  },
668
  "UC": {
669
+ "accuracy": 0.8472906403940886,
670
  "count": 203
671
  },
672
  "US": {
673
+ "accuracy": 0.9572649572649573,
674
  "count": 117
675
  }
676
  }
677
  },
678
  "add_S3": {
679
+ "full_accuracy": 0.42,
680
  "n_examples": 100,
681
  "per_subtask": {
682
  "SA": {
683
+ "accuracy": 0.9504132231404959,
684
  "count": 121
685
  },
686
  "SC": {
687
+ "accuracy": 0.9338842975206612,
688
  "count": 121
689
  },
690
  "SS": {
691
+ "accuracy": 0.8775510204081632,
692
  "count": 49
693
  },
694
  "UC": {
695
+ "accuracy": 0.7956989247311828,
696
  "count": 186
697
  },
698
  "US": {
699
+ "accuracy": 0.8609865470852018,
700
  "count": 223
701
  }
702
  }
703
  },
704
  "add_S4": {
705
+ "full_accuracy": 0.32,
706
  "n_examples": 100,
707
  "per_subtask": {
708
  "SA": {
709
+ "accuracy": 0.9519230769230769,
710
  "count": 104
711
  },
712
  "SC": {
713
+ "accuracy": 0.8962264150943396,
714
  "count": 106
715
  },
716
  "SS": {
717
+ "accuracy": 0.8695652173913043,
718
  "count": 23
719
  },
720
  "UC": {
721
+ "accuracy": 0.70625,
722
  "count": 160
723
  },
724
  "US": {
725
+ "accuracy": 0.6710097719869706,
726
  "count": 307
727
  }
728
  }
729
  },
730
  "add_S5": {
731
+ "full_accuracy": 0.26,
732
  "n_examples": 100,
733
  "per_subtask": {
734
  "SA": {
 
736
  "count": 100
737
  },
738
  "SC": {
739
+ "accuracy": 0.93,
740
  "count": 100
741
  },
742
  "UC": {
743
+ "accuracy": 0.47,
744
  "count": 100
745
  },
746
  "US": {
747
+ "accuracy": 0.44,
748
  "count": 400
749
  }
750
  }
 
758
  "count": 100
759
  },
760
  "UC": {
761
+ "accuracy": 0.43,
762
  "count": 100
763
  },
764
  "US": {
765
+ "accuracy": 0.516,
766
  "count": 500
767
  }
768
  }
769
  },
770
  "add_random": {
771
+ "full_accuracy": 0.7,
772
  "n_examples": 200,
773
  "per_subtask": {
774
  "SA": {
775
+ "accuracy": 0.941834451901566,
776
  "count": 447
777
  },
778
  "SC": {
779
+ "accuracy": 0.94375,
780
  "count": 320
781
  },
782
  "SS": {
783
+ "accuracy": 0.9642857142857143,
784
  "count": 56
785
  },
786
  "UC": {
787
+ "accuracy": 0.9640831758034026,
788
  "count": 529
789
  },
790
  "US": {
791
+ "accuracy": 0.8125,
792
  "count": 48
793
  }
794
  }
795
  },
796
  "add_C3": {
797
+ "full_accuracy": 0.52,
798
  "n_examples": 100,
799
  "per_subtask": {
800
  "SA": {
801
+ "accuracy": 0.99,
802
  "count": 300
803
  },
804
  "SC": {
805
+ "accuracy": 0.99,
806
  "count": 100
807
  },
808
  "UC": {
809
+ "accuracy": 0.7927461139896373,
810
  "count": 193
811
  },
812
  "US": {
813
+ "accuracy": 0.7570093457943925,
814
  "count": 107
815
  }
816
  }
817
  },
818
  "add_C4": {
819
+ "full_accuracy": 0.53,
820
  "n_examples": 100,
821
  "per_subtask": {
822
  "SA": {
823
+ "accuracy": 0.99,
824
  "count": 200
825
  },
826
  "SC": {
827
+ "accuracy": 0.95,
828
  "count": 100
829
  },
830
  "UC": {
831
+ "accuracy": 0.8515625,
832
  "count": 256
833
  },
834
  "US": {
835
+ "accuracy": 0.7916666666666666,
836
  "count": 144
837
  }
838
  }
839
  },
840
  "add_C5": {
841
+ "full_accuracy": 0.47,
842
  "n_examples": 100,
843
  "per_subtask": {
844
  "SA": {
 
846
  "count": 100
847
  },
848
  "SC": {
849
+ "accuracy": 0.97,
850
  "count": 100
851
  },
852
  "UC": {
853
+ "accuracy": 0.8235294117647058,
854
  "count": 306
855
  },
856
  "US": {
857
+ "accuracy": 0.845360824742268,
858
  "count": 194
859
  }
860
  }
861
  },
862
  "add_C6": {
863
+ "full_accuracy": 0.41,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SC": {
 
868
  "count": 100
869
  },
870
  "UC": {
871
+ "accuracy": 0.825136612021858,
872
  "count": 366
873
  },
874
  "US": {
875
+ "accuracy": 0.8803418803418803,
876
  "count": 234
877
  }
878
  }
879
  },
880
  "sub_M0": {
881
+ "full_accuracy": 0.8,
882
  "n_examples": 100,
883
  "per_subtask": {
884
  "MD": {
885
+ "accuracy": 0.9683860232945092,
886
  "count": 601
887
  },
888
  "ME": {
 
892
  }
893
  },
894
  "sub_M1": {
895
+ "full_accuracy": 0.67,
896
  "n_examples": 100,
897
  "per_subtask": {
898
  "MD": {
899
+ "accuracy": 0.982078853046595,
900
  "count": 279
901
  },
902
  "MB": {
903
+ "accuracy": 0.9379310344827586,
904
  "count": 145
905
  },
906
  "ME": {
 
908
  "count": 24
909
  },
910
  "UB": {
911
+ "accuracy": 0.9047619047619048,
912
  "count": 252
913
  }
914
  }
915
  },
916
  "sub_M2": {
917
+ "full_accuracy": 0.38,
918
  "n_examples": 100,
919
  "per_subtask": {
920
  "MD": {
921
+ "accuracy": 0.9906103286384976,
922
  "count": 213
923
  },
924
  "MB": {
925
+ "accuracy": 0.9380530973451328,
926
  "count": 113
927
  },
928
  "ME": {
929
+ "accuracy": 0.9882352941176471,
930
  "count": 85
931
  },
932
  "UB": {
933
+ "accuracy": 0.6961325966850829,
934
  "count": 181
935
  },
936
  "UD": {
937
+ "accuracy": 0.8981481481481481,
938
  "count": 108
939
  }
940
  }
941
  },
942
  "sub_M3": {
943
+ "full_accuracy": 0.12,
944
  "n_examples": 100,
945
  "per_subtask": {
946
  "MD": {
 
948
  "count": 179
949
  },
950
  "MB": {
951
+ "accuracy": 0.9223300970873787,
952
  "count": 103
953
  },
954
  "ME": {
955
+ "accuracy": 0.9821428571428571,
956
  "count": 56
957
  },
958
  "UB": {
959
+ "accuracy": 0.48322147651006714,
960
  "count": 149
961
  },
962
  "UD": {
963
+ "accuracy": 0.6291079812206573,
964
  "count": 213
965
  }
966
  }
967
  },
968
  "sub_M4": {
969
+ "full_accuracy": 0.06,
970
  "n_examples": 100,
971
  "per_subtask": {
972
  "MD": {
 
978
  "count": 100
979
  },
980
  "UB": {
981
+ "accuracy": 0.3,
982
  "count": 100
983
  },
984
  "UD": {
985
+ "accuracy": 0.38666666666666666,
986
  "count": 300
987
  }
988
  }
989
  },
990
  "sub_M5": {
991
+ "full_accuracy": 0.03,
992
  "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
 
1000
  "count": 100
1001
  },
1002
  "UB": {
1003
+ "accuracy": 0.19,
1004
  "count": 100
1005
  },
1006
  "UD": {
1007
+ "accuracy": 0.2775,
1008
  "count": 400
1009
  }
1010
  }
1011
  },
1012
  "sub_random": {
1013
+ "full_accuracy": 0.655,
1014
  "n_examples": 200,
1015
  "per_subtask": {
1016
  "MD": {
1017
+ "accuracy": 0.9833333333333333,
1018
  "count": 600
1019
  },
1020
  "MB": {
 
1022
  "count": 267
1023
  },
1024
  "ME": {
1025
+ "accuracy": 1.0,
1026
  "count": 53
1027
  },
1028
  "UB": {
1029
+ "accuracy": 0.876993166287016,
1030
  "count": 439
1031
  },
1032
  "UD": {
1033
+ "accuracy": 0.8780487804878049,
1034
  "count": 41
1035
  }
1036
  }
1037
  },
1038
  "sub_B3": {
1039
+ "full_accuracy": 0.35,
1040
  "n_examples": 100,
1041
  "per_subtask": {
1042
  "MD": {
1043
+ "accuracy": 0.99,
1044
  "count": 300
1045
  },
1046
  "MB": {
1047
+ "accuracy": 0.95,
1048
  "count": 100
1049
  },
1050
  "UB": {
1051
+ "accuracy": 0.6751269035532995,
1052
  "count": 197
1053
  },
1054
  "UD": {
1055
+ "accuracy": 0.6893203883495146,
1056
  "count": 103
1057
  }
1058
  }
1059
  },
1060
  "sub_B4": {
1061
+ "full_accuracy": 0.24,
1062
  "n_examples": 100,
1063
  "per_subtask": {
1064
  "MD": {
1065
+ "accuracy": 0.995,
1066
  "count": 200
1067
  },
1068
  "MB": {
1069
+ "accuracy": 0.94,
1070
  "count": 100
1071
  },
1072
  "UB": {
1073
+ "accuracy": 0.7206477732793523,
1074
  "count": 247
1075
  },
1076
  "UD": {
1077
+ "accuracy": 0.6209150326797386,
1078
  "count": 153
1079
  }
1080
  }
1081
  },
1082
  "sub_B5": {
1083
+ "full_accuracy": 0.21,
1084
  "n_examples": 100,
1085
  "per_subtask": {
1086
  "MD": {
 
1092
  "count": 100
1093
  },
1094
  "UB": {
1095
+ "accuracy": 0.7013422818791947,
1096
  "count": 298
1097
  },
1098
  "UD": {
1099
+ "accuracy": 0.6782178217821783,
1100
  "count": 202
1101
  }
1102
  }
1103
  }
1104
  },
1105
  "summary": {
1106
+ "overall_accuracy": 0.4425,
1107
  "total_examples": 2400,
1108
  "n_splits": 22
1109
  }
 
1118
  },
1119
  "splits": {
1120
  "add_S0": {
1121
+ "full_accuracy": 1.0,
1122
  "n_examples": 100,
1123
  "per_subtask": {
1124
  "SA": {
1125
+ "accuracy": 1.0,
1126
  "count": 605
1127
  },
1128
  "SS": {
 
1132
  }
1133
  },
1134
  "add_S1": {
1135
+ "full_accuracy": 1.0,
1136
  "n_examples": 100,
1137
  "per_subtask": {
1138
  "SA": {
1139
+ "accuracy": 1.0,
1140
  "count": 204
1141
  },
1142
  "SC": {
1143
+ "accuracy": 1.0,
1144
  "count": 169
1145
  },
1146
  "SS": {
 
1148
  "count": 31
1149
  },
1150
  "UC": {
1151
+ "accuracy": 1.0,
1152
  "count": 296
1153
  }
1154
  }
1155
  },
1156
  "add_S2": {
1157
+ "full_accuracy": 1.0,
1158
  "n_examples": 100,
1159
  "per_subtask": {
1160
  "SA": {
1161
+ "accuracy": 1.0,
1162
  "count": 163
1163
  },
1164
  "SC": {
1165
+ "accuracy": 1.0,
1166
  "count": 130
1167
  },
1168
  "SS": {
1169
+ "accuracy": 1.0,
1170
  "count": 87
1171
  },
1172
  "UC": {
1173
+ "accuracy": 1.0,
1174
  "count": 203
1175
  },
1176
  "US": {
 
1180
  }
1181
  },
1182
  "add_S3": {
1183
+ "full_accuracy": 1.0,
1184
  "n_examples": 100,
1185
  "per_subtask": {
1186
  "SA": {
1187
+ "accuracy": 1.0,
1188
  "count": 121
1189
  },
1190
  "SC": {
1191
+ "accuracy": 1.0,
1192
  "count": 121
1193
  },
1194
  "SS": {
1195
+ "accuracy": 1.0,
1196
  "count": 49
1197
  },
1198
  "UC": {
1199
+ "accuracy": 1.0,
1200
  "count": 186
1201
  },
1202
  "US": {
1203
+ "accuracy": 1.0,
1204
  "count": 223
1205
  }
1206
  }
1207
  },
1208
  "add_S4": {
1209
+ "full_accuracy": 0.95,
1210
  "n_examples": 100,
1211
  "per_subtask": {
1212
  "SA": {
 
1222
  "count": 23
1223
  },
1224
  "UC": {
1225
+ "accuracy": 0.96875,
1226
  "count": 160
1227
  },
1228
  "US": {
1229
+ "accuracy": 1.0,
1230
  "count": 307
1231
  }
1232
  }
1233
  },
1234
  "add_S5": {
1235
+ "full_accuracy": 0.79,
1236
  "n_examples": 100,
1237
  "per_subtask": {
1238
  "SA": {
 
1244
  "count": 100
1245
  },
1246
  "UC": {
1247
+ "accuracy": 0.8,
1248
  "count": 100
1249
  },
1250
  "US": {
1251
+ "accuracy": 0.99,
1252
  "count": 400
1253
  }
1254
  }
1255
  },
1256
  "add_S6": {
1257
+ "full_accuracy": 0.97,
1258
  "n_examples": 100,
1259
  "per_subtask": {
1260
  "SC": {
 
1262
  "count": 100
1263
  },
1264
  "UC": {
1265
+ "accuracy": 0.97,
1266
  "count": 100
1267
  },
1268
  "US": {
1269
+ "accuracy": 0.992,
1270
  "count": 500
1271
  }
1272
  }
1273
  },
1274
  "add_random": {
1275
+ "full_accuracy": 1.0,
1276
  "n_examples": 200,
1277
  "per_subtask": {
1278
  "SA": {
1279
+ "accuracy": 1.0,
1280
  "count": 447
1281
  },
1282
  "SC": {
1283
+ "accuracy": 1.0,
1284
  "count": 320
1285
  },
1286
  "SS": {
1287
+ "accuracy": 1.0,
1288
  "count": 56
1289
  },
1290
  "UC": {
1291
+ "accuracy": 1.0,
1292
  "count": 529
1293
  },
1294
  "US": {
1295
+ "accuracy": 1.0,
1296
  "count": 48
1297
  }
1298
  }
1299
  },
1300
  "add_C3": {
1301
+ "full_accuracy": 0.99,
1302
  "n_examples": 100,
1303
  "per_subtask": {
1304
  "SA": {
 
1310
  "count": 100
1311
  },
1312
  "UC": {
1313
+ "accuracy": 0.9948186528497409,
1314
  "count": 193
1315
  },
1316
  "US": {
1317
+ "accuracy": 1.0,
1318
  "count": 107
1319
  }
1320
  }
1321
  },
1322
  "add_C4": {
1323
+ "full_accuracy": 0.99,
1324
  "n_examples": 100,
1325
  "per_subtask": {
1326
  "SA": {
 
1332
  "count": 100
1333
  },
1334
  "UC": {
1335
+ "accuracy": 0.99609375,
1336
  "count": 256
1337
  },
1338
  "US": {
1339
+ "accuracy": 1.0,
1340
  "count": 144
1341
  }
1342
  }
1343
  },
1344
  "add_C5": {
1345
+ "full_accuracy": 0.98,
1346
  "n_examples": 100,
1347
  "per_subtask": {
1348
  "SA": {
 
1354
  "count": 100
1355
  },
1356
  "UC": {
1357
+ "accuracy": 0.9934640522875817,
1358
  "count": 306
1359
  },
1360
  "US": {
1361
+ "accuracy": 1.0,
1362
  "count": 194
1363
  }
1364
  }
1365
  },
1366
  "add_C6": {
1367
+ "full_accuracy": 1.0,
1368
  "n_examples": 100,
1369
  "per_subtask": {
1370
  "SC": {
 
1372
  "count": 100
1373
  },
1374
  "UC": {
1375
+ "accuracy": 1.0,
1376
  "count": 366
1377
  },
1378
  "US": {
1379
+ "accuracy": 1.0,
1380
  "count": 234
1381
  }
1382
  }
1383
  },
1384
  "sub_M0": {
1385
+ "full_accuracy": 1.0,
1386
  "n_examples": 100,
1387
  "per_subtask": {
1388
  "MD": {
1389
+ "accuracy": 1.0,
1390
  "count": 601
1391
  },
1392
  "ME": {
 
1396
  }
1397
  },
1398
  "sub_M1": {
1399
+ "full_accuracy": 1.0,
1400
  "n_examples": 100,
1401
  "per_subtask": {
1402
  "MD": {
 
1404
  "count": 279
1405
  },
1406
  "MB": {
1407
+ "accuracy": 1.0,
1408
  "count": 145
1409
  },
1410
  "ME": {
1411
+ "accuracy": 1.0,
1412
  "count": 24
1413
  },
1414
  "UB": {
 
1418
  }
1419
  },
1420
  "sub_M2": {
1421
+ "full_accuracy": 1.0,
1422
  "n_examples": 100,
1423
  "per_subtask": {
1424
  "MD": {
1425
+ "accuracy": 1.0,
1426
  "count": 213
1427
  },
1428
  "MB": {
1429
+ "accuracy": 1.0,
1430
  "count": 113
1431
  },
1432
  "ME": {
 
1434
  "count": 85
1435
  },
1436
  "UB": {
1437
+ "accuracy": 1.0,
1438
  "count": 181
1439
  },
1440
  "UD": {
 
1444
  }
1445
  },
1446
  "sub_M3": {
1447
+ "full_accuracy": 1.0,
1448
  "n_examples": 100,
1449
  "per_subtask": {
1450
  "MD": {
1451
+ "accuracy": 1.0,
1452
  "count": 179
1453
  },
1454
  "MB": {
1455
+ "accuracy": 1.0,
1456
  "count": 103
1457
  },
1458
  "ME": {
 
1460
  "count": 56
1461
  },
1462
  "UB": {
1463
+ "accuracy": 1.0,
1464
  "count": 149
1465
  },
1466
  "UD": {
1467
+ "accuracy": 1.0,
1468
  "count": 213
1469
  }
1470
  }
1471
  },
1472
  "sub_M4": {
1473
+ "full_accuracy": 0.85,
1474
  "n_examples": 100,
1475
  "per_subtask": {
1476
  "MD": {
 
1482
  "count": 100
1483
  },
1484
  "UB": {
1485
+ "accuracy": 0.85,
1486
  "count": 100
1487
  },
1488
  "UD": {
1489
+ "accuracy": 1.0,
1490
  "count": 300
1491
  }
1492
  }
1493
  },
1494
  "sub_M5": {
1495
+ "full_accuracy": 0.79,
1496
  "n_examples": 100,
1497
  "per_subtask": {
1498
  "MD": {
 
1504
  "count": 100
1505
  },
1506
  "UB": {
1507
+ "accuracy": 0.82,
1508
  "count": 100
1509
  },
1510
  "UD": {
1511
+ "accuracy": 0.99,
1512
  "count": 400
1513
  }
1514
  }
1515
  },
1516
  "sub_random": {
1517
+ "full_accuracy": 1.0,
1518
  "n_examples": 200,
1519
  "per_subtask": {
1520
  "MD": {
1521
+ "accuracy": 1.0,
1522
  "count": 600
1523
  },
1524
  "MB": {
1525
+ "accuracy": 1.0,
1526
  "count": 267
1527
  },
1528
  "ME": {
 
1530
  "count": 53
1531
  },
1532
  "UB": {
1533
+ "accuracy": 1.0,
1534
  "count": 439
1535
  },
1536
  "UD": {
1537
+ "accuracy": 1.0,
1538
  "count": 41
1539
  }
1540
  }
1541
  },
1542
  "sub_B3": {
1543
+ "full_accuracy": 1.0,
1544
  "n_examples": 100,
1545
  "per_subtask": {
1546
  "MD": {
1547
+ "accuracy": 1.0,
1548
  "count": 300
1549
  },
1550
  "MB": {
 
1552
  "count": 100
1553
  },
1554
  "UB": {
1555
+ "accuracy": 1.0,
1556
  "count": 197
1557
  },
1558
  "UD": {
1559
+ "accuracy": 1.0,
1560
  "count": 103
1561
  }
1562
  }
1563
  },
1564
  "sub_B4": {
1565
+ "full_accuracy": 0.98,
1566
  "n_examples": 100,
1567
  "per_subtask": {
1568
  "MD": {
 
1574
  "count": 100
1575
  },
1576
  "UB": {
1577
+ "accuracy": 0.9919028340080972,
1578
  "count": 247
1579
  },
1580
  "UD": {
1581
+ "accuracy": 1.0,
1582
  "count": 153
1583
  }
1584
  }
1585
  },
1586
  "sub_B5": {
1587
+ "full_accuracy": 1.0,
1588
  "n_examples": 100,
1589
  "per_subtask": {
1590
  "MD": {
 
1596
  "count": 100
1597
  },
1598
  "UB": {
1599
+ "accuracy": 1.0,
1600
  "count": 298
1601
  },
1602
  "UD": {
1603
+ "accuracy": 1.0,
1604
  "count": 202
1605
  }
1606
  }
1607
  }
1608
  },
1609
  "summary": {
1610
+ "overall_accuracy": 0.9704166666666667,
1611
  "total_examples": 2400,
1612
  "n_splits": 22
1613
  }
1614
  },
1615
+ "sorl_overall_accuracy": 0.9704166666666667,
1616
+ "sft_overall_accuracy": 0.4425
1617
  }
add_sub_sorl_v1_abs10_25K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c44e7d1362819ab8f5d7105db96ac5f8cd27c038610420dcca40684cc135fc24
3
  size 650303660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a787d3e4f23b07be0ae2cb91c4645498390ee02c864e39b0e6a18bf8ed9d1578
3
  size 650303660
add_sub_sorl_v1_abs10_25K/train_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
- "lr": 4e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 117,
@@ -69,16 +69,16 @@
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_25K",
72
- "git_commit": "78d46f8665a87f4b44bd5894bd34f393f2dea51f",
73
- "timestamp": "2026-04-12T08:59:10.961931+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "pmvjbi05",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/pmvjbi05",
81
- "final_accuracy": 0.6191666666666666,
82
- "sft_accuracy": 0.5375,
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
+ "lr": 8e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 117,
 
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_25K",
72
+ "git_commit": "dc8dd776fb0c30a4c9073052dcc5e943e0fd80c6",
73
+ "timestamp": "2026-04-13T07:21:12.270040+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
+ "wandb_run_id": "3umyj3g0",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/3umyj3g0",
81
+ "final_accuracy": 0.9704166666666667,
82
+ "sft_accuracy": 0.4425,
83
  "eval_method": "ArithmeticEvaluator"
84
  }