amirali1985 commited on
Commit
e5cab5c
·
verified ·
1 Parent(s): ba05dd7

Upload add_sub_sorl_v1_abs10_K1_25K

Browse files
add_sub_sorl_v1_abs10_K1_25K/metrics.json CHANGED
@@ -73,509 +73,509 @@
73
  3869
74
  ],
75
  "loss": [
76
- 2.775237560272217,
77
- 7.547610282897949,
78
- 4.404772758483887,
79
- 3.531778335571289,
80
- 3.0909929275512695,
81
- 3.0005178451538086,
82
- 2.930567741394043,
83
- 2.380660057067871,
84
- 1.8402857780456543,
85
- 1.5253196954727173,
86
- 0.6456812620162964,
87
- -0.922775387763977,
88
- -5.972268104553223,
89
- -7.329289436340332,
90
- -3.549975872039795,
91
- -2.491389274597168,
92
- -0.8877972364425659,
93
- -0.7063808441162109,
94
- -1.138548493385315,
95
- -0.44369208812713623,
96
- -0.18519365787506104,
97
- 0.001989603042602539,
98
- -0.2683281898498535,
99
- 0.038404107093811035,
100
- 0.15909254550933838,
101
- 0.3572588562965393,
102
- -0.12331002950668335,
103
- 0.23269200325012207,
104
- -0.06346249580383301,
105
- -0.600041389465332,
106
- -0.17923784255981445,
107
- -0.3249526619911194,
108
- 0.10653692483901978,
109
- -0.6031087636947632,
110
- -0.2606835961341858,
111
- -0.31056416034698486,
112
- -0.71147620677948,
113
- -0.4193435311317444,
114
- -0.8142799139022827,
115
- -0.7582582235336304,
116
- -0.38260582089424133,
117
- -0.4925423562526703,
118
- -0.6277319192886353,
119
- -0.5789560675621033,
120
- -0.5000548958778381,
121
- -0.24027329683303833,
122
- -0.5376996994018555,
123
- -0.7423339486122131,
124
- -0.6099227070808411,
125
- -0.734022855758667,
126
- -0.5901182293891907,
127
- -0.4833343029022217,
128
- -0.50217604637146,
129
- -0.4287641942501068,
130
- -0.42581605911254883,
131
- -0.32260385155677795,
132
- -0.4461747407913208,
133
- -0.4050253629684448,
134
- -0.6488711833953857,
135
- -0.3880271315574646,
136
- -0.5698164105415344,
137
- -0.6381738781929016,
138
- -0.5308898091316223,
139
- -0.3668437600135803,
140
- -0.5520040988922119,
141
- -0.463625431060791,
142
- -0.6266764402389526,
143
- -0.4801936149597168,
144
- -0.22993049025535583,
145
- -0.4441494047641754
146
  ],
147
  "base_loss": [
148
- 9.28781795501709,
149
- 6.408207416534424,
150
- 4.156429767608643,
151
- 2.373605966567993,
152
- 2.0184969902038574,
153
- 1.929623007774353,
154
- 1.8580621480941772,
155
- 1.8653138875961304,
156
- 1.8420647382736206,
157
- 1.7876325845718384,
158
- 1.7426172494888306,
159
- 1.8583365678787231,
160
- 1.8177855014801025,
161
- 1.7561644315719604,
162
- 1.1391898393630981,
163
- 0.8618606925010681,
164
- 0.6386224627494812,
165
- 0.6016677021980286,
166
- 0.5611599087715149,
167
- 0.43799519538879395,
168
- 0.39136165380477905,
169
- 0.35541364550590515,
170
- 0.3650291860103607,
171
- 0.28820252418518066,
172
- 0.28319448232650757,
173
- 0.27565693855285645,
174
- 0.26511654257774353,
175
- 0.17914986610412598,
176
- 0.23786573112010956,
177
- 0.3100714087486267,
178
- 0.20881570875644684,
179
- 0.2222459614276886,
180
- 0.1805049628019333,
181
- 0.21834571659564972,
182
- 0.17398054897785187,
183
- 0.17169272899627686,
184
- 0.214468851685524,
185
- 0.16018731892108917,
186
- 0.21521098911762238,
187
- 0.1976483017206192,
188
- 0.16451016068458557,
189
- 0.15567553043365479,
190
- 0.1631745547056198,
191
- 0.17332789301872253,
192
- 0.16741959750652313,
193
- 0.14030645787715912,
194
- 0.16195133328437805,
195
- 0.19442276656627655,
196
- 0.23462989926338196,
197
- 0.1750485599040985,
198
- 0.15816131234169006,
199
- 0.1345469057559967,
200
- 0.11074493825435638,
201
- 0.09530355036258698,
202
- 0.12718015909194946,
203
- 0.1237618550658226,
204
- 0.15354827046394348,
205
- 0.0893278494477272,
206
- 0.1284264326095581,
207
- 0.0889713391661644,
208
- 0.10795746743679047,
209
- 0.11995094269514084,
210
- 0.11075253039598465,
211
- 0.09607528150081635,
212
- 0.0994424819946289,
213
- 0.13344082236289978,
214
- 0.12076304107904434,
215
- 0.10045450925827026,
216
- 0.07217186689376831,
217
- 0.08645479381084442
218
  ],
219
  "info_loss": [
220
- -1.4931654930114746,
221
- -0.3287515640258789,
222
- -0.20309758186340332,
223
- -0.08040070533752441,
224
- -0.08293402194976807,
225
- -0.08145439624786377,
226
- -0.08084726333618164,
227
- -0.13623690605163574,
228
- -0.18764793872833252,
229
- -0.21354830265045166,
230
- -0.2971622943878174,
231
- -0.4655829668045044,
232
- -0.9671027064323425,
233
- -1.0975041389465332,
234
- -0.6573877930641174,
235
- -0.52280592918396,
236
- -0.33613646030426025,
237
- -0.30700165033340454,
238
- -0.33531850576400757,
239
- -0.24739503860473633,
240
- -0.20575667917728424,
241
- -0.1729479283094406,
242
- -0.19086585938930511,
243
- -0.1466362029314041,
244
- -0.12648341059684753,
245
- -0.09660688042640686,
246
- -0.13731198012828827,
247
- -0.090257927775383,
248
- -0.11520253866910934,
249
- -0.16884556412696838,
250
- -0.1119970828294754,
251
- -0.12500318884849548,
252
- -0.07407157123088837,
253
- -0.1408563256263733,
254
- -0.10861283540725708,
255
- -0.10317053645849228,
256
- -0.1429494023323059,
257
- -0.10317841172218323,
258
- -0.14690537750720978,
259
- -0.13502144813537598,
260
- -0.0957949236035347,
261
- -0.10858335345983505,
262
- -0.11433392018079758,
263
- -0.11800597608089447,
264
- -0.1043647900223732,
265
- -0.07300866395235062,
266
- -0.10729879140853882,
267
- -0.12777504324913025,
268
- -0.1146274283528328,
269
- -0.11757340282201767,
270
- -0.10103149712085724,
271
- -0.0873810350894928,
272
- -0.08418776094913483,
273
- -0.07348176836967468,
274
- -0.08262161910533905,
275
- -0.06471286714076996,
276
- -0.08114015311002731,
277
- -0.07110169529914856,
278
- -0.09566681832075119,
279
- -0.07004982978105545,
280
- -0.08596527576446533,
281
- -0.09278135001659393,
282
- -0.08236707746982574,
283
- -0.06275559961795807,
284
- -0.08083701133728027,
285
- -0.07647520303726196,
286
- -0.08978942036628723,
287
- -0.07253386080265045,
288
- -0.04718723148107529,
289
- -0.06880763173103333
290
  ],
291
  "abs_loss": [
292
- 2.287022590637207,
293
- 2.0916876792907715,
294
- 1.904157280921936,
295
- 1.865134596824646,
296
- 1.8645482063293457,
297
- 1.8384100198745728,
298
- 1.8436139822006226,
299
- 1.8555344343185425,
300
- 1.8272294998168945,
301
- 1.8342450857162476,
302
- 1.8453730344772339,
303
- 1.838832974433899,
304
- 1.8428555727005005,
305
- 1.8253898620605469,
306
- 1.7948323488235474,
307
- 1.7442383766174316,
308
- 1.575785517692566,
309
- 1.3669052124023438,
310
- 1.1249511241912842,
311
- 0.8671944737434387,
312
- 0.7276226878166199,
313
- 0.617413341999054,
314
- 0.5250348448753357,
315
- 0.44361987709999084,
316
- 0.42031577229499817,
317
- 0.41864728927612305,
318
- 0.43554285168647766,
319
- 0.3743334114551544,
320
- 0.34732961654663086,
321
- 0.33214130997657776,
322
- 0.3264453709125519,
323
- 0.31868138909339905,
324
- 0.3359517753124237,
325
- 0.2563074827194214,
326
- 0.2655089795589447,
327
- 0.24500541388988495,
328
- 0.28663334250450134,
329
- 0.2430928349494934,
330
- 0.24621863663196564,
331
- 0.22454464435577393,
332
- 0.22838687896728516,
333
- 0.23042821884155273,
334
- 0.2049625664949417,
335
- 0.21573372185230255,
336
- 0.21731531620025635,
337
- 0.21410906314849854,
338
- 0.1734648495912552,
339
- 0.17448820173740387,
340
- 0.18207435309886932,
341
- 0.17394252121448517,
342
- 0.17870847880840302,
343
- 0.1585991531610489,
344
- 0.17257511615753174,
345
- 0.18372474610805511,
346
- 0.1582813858985901,
347
- 0.1431683897972107,
348
- 0.14724594354629517,
349
- 0.15524940192699432,
350
- 0.12837021052837372,
351
- 0.15147118270397186,
352
- 0.13836684823036194,
353
- 0.14143264293670654,
354
- 0.1055116131901741,
355
- 0.12111660093069077,
356
- 0.1088944599032402,
357
- 0.12397897243499756,
358
- 0.12244991213083267,
359
- 0.11925437301397324,
360
- 0.11537864059209824,
361
- 0.12550200521945953
362
  ],
363
  "zipf_loss": [
364
- 8.190372467041016,
365
- 4.21774959564209,
366
- 2.0889029502868652,
367
- 1.7756659984588623,
368
- 1.715381383895874,
369
- 1.7015979290008545,
370
- 1.6966168880462646,
371
- 1.6921616792678833,
372
- 1.6919775009155273,
373
- 1.6897456645965576,
374
- 1.6901496648788452,
375
- 1.690834403038025,
376
- 1.696687936782837,
377
- 1.7070481777191162,
378
- 1.7052288055419922,
379
- 1.7003854513168335,
380
- 1.6773663759231567,
381
- 1.6252772808074951,
382
- 1.540981650352478,
383
- 1.5055437088012695,
384
- 1.4082492589950562,
385
- 1.3143138885498047,
386
- 1.2227977514266968,
387
- 1.1722016334533691,
388
- 1.0987006425857544,
389
- 1.0058059692382812,
390
- 0.9411389231681824,
391
- 0.9186881184577942,
392
- 0.8159641623497009,
393
- 0.7451286315917969,
394
- 0.6992727518081665,
395
- 0.6709651947021484,
396
- 0.6331524848937988,
397
- 0.5614780187606812,
398
- 0.6249133348464966,
399
- 0.5249479413032532,
400
- 0.4748857021331787,
401
- 0.4279439449310303,
402
- 0.4149409234523773,
403
- 0.37185347080230713,
404
- 0.3879944980144501,
405
- 0.4145728647708893,
406
- 0.33193638920783997,
407
- 0.40620243549346924,
408
- 0.3544418215751648,
409
- 0.3280959725379944,
410
- 0.3559904098510742,
411
- 0.32354503870010376,
412
- 0.2835143208503723,
413
- 0.24926838278770447,
414
- 0.2441646009683609,
415
- 0.2400692105293274,
416
- 0.21169903874397278,
417
- 0.1923774778842926,
418
- 0.25739187002182007,
419
- 0.1864461600780487,
420
- 0.1969538927078247,
421
- 0.20113880932331085,
422
- 0.16653358936309814,
423
- 0.208352729678154,
424
- 0.16804218292236328,
425
- 0.1555454134941101,
426
- 0.17147725820541382,
427
- 0.15252524614334106,
428
- 0.1460340917110443,
429
- 0.15528786182403564,
430
- 0.13820970058441162,
431
- 0.1327650099992752,
432
- 0.15823212265968323,
433
- 0.1449219286441803
434
  ],
435
  "denoise_loss": [],
436
  "ortho_loss": [
437
- 0.6088240742683411,
438
- 0.33010056614875793,
439
- 0.22861000895500183,
440
- 0.21283285319805145,
441
- 0.17951442301273346,
442
- 0.16018734872341156,
443
- 0.11169280111789703,
444
- 0.12691719830036163,
445
- 0.1456661969423294,
446
- 0.15746210515499115,
447
- 0.1443333923816681,
448
- 0.16156138479709625,
449
- 0.20633621513843536,
450
- 0.22460925579071045,
451
- 0.26161104440689087,
452
- 0.27444061636924744,
453
- 0.28116294741630554,
454
- 0.28717944025993347,
455
- 0.3055819571018219,
456
- 0.325307697057724,
457
- 0.3221662938594818,
458
- 0.3159293830394745,
459
- 0.3048442602157593,
460
- 0.30672410130500793,
461
- 0.301876962184906,
462
- 0.29914870858192444,
463
- 0.31866511702537537,
464
- 0.33082228899002075,
465
- 0.33669188618659973,
466
- 0.32973605394363403,
467
- 0.32963281869888306,
468
- 0.33795875310897827,
469
- 0.3385995030403137,
470
- 0.3364645540714264,
471
- 0.33232346177101135,
472
- 0.33818763494491577,
473
- 0.3330787122249603,
474
- 0.3314555287361145,
475
- 0.33205369114875793,
476
- 0.3319832682609558,
477
- 0.32697418332099915,
478
- 0.3278193771839142,
479
- 0.3068898022174835,
480
- 0.309592068195343,
481
- 0.3145006597042084,
482
- 0.3080871105194092,
483
- 0.3092295825481415,
484
- 0.3054082989692688,
485
- 0.30440065264701843,
486
- 0.299023300409317,
487
- 0.3067370355129242,
488
- 0.31260034441947937,
489
- 0.3118235766887665,
490
- 0.3144630491733551,
491
- 0.31215304136276245,
492
- 0.31217923760414124,
493
- 0.31190818548202515,
494
- 0.3100627064704895,
495
- 0.3103569447994232,
496
- 0.3121229112148285,
497
- 0.3065374791622162,
498
- 0.3073209226131439,
499
- 0.3084685206413269,
500
- 0.30692750215530396,
501
- 0.3080828785896301,
502
- 0.30695533752441406,
503
- 0.3065243661403656,
504
- 0.30688145756721497,
505
- 0.3082342743873596,
506
- 0.30828753113746643
507
  ],
508
  "lr": [
509
- 1.6752136752136756e-05,
510
- 3.384615384615385e-05,
511
- 4e-05,
512
- 4e-05,
513
- 4e-05,
514
- 4e-05,
515
- 4e-05,
516
- 4e-05,
517
- 4e-05,
518
- 4e-05,
519
- 4e-05,
520
- 4e-05,
521
- 4e-05,
522
- 4e-05,
523
- 4e-05,
524
- 4e-05,
525
- 4e-05,
526
- 4e-05,
527
- 4e-05,
528
- 4e-05,
529
- 4e-05,
530
- 4e-05,
531
- 4e-05,
532
- 4e-05,
533
- 4e-05,
534
- 4e-05,
535
- 4e-05,
536
- 4e-05,
537
- 4e-05,
538
- 4e-05,
539
- 4e-05,
540
- 4e-05,
541
- 4e-05,
542
- 4e-05,
543
- 4e-05,
544
- 4e-05,
545
- 4e-05,
546
- 4e-05,
547
- 4e-05,
548
- 4e-05,
549
- 4e-05,
550
- 4e-05,
551
- 3.9947798576324814e-05,
552
- 3.8761402583706826e-05,
553
- 3.757500659108885e-05,
554
- 3.6388610598470864e-05,
555
- 3.5202214605852884e-05,
556
- 3.401581861323491e-05,
557
- 3.282942262061693e-05,
558
- 3.0670181914052204e-05,
559
- 2.948378592143422e-05,
560
- 2.8297389928816243e-05,
561
- 2.711099393619826e-05,
562
- 2.5924597943580284e-05,
563
- 2.4738201950962303e-05,
564
- 2.3551805958344316e-05,
565
- 2.1392565251779595e-05,
566
- 2.020616925916161e-05,
567
- 1.901977326654364e-05,
568
- 1.783337727392566e-05,
569
- 1.6646981281307675e-05,
570
- 1.546058528868969e-05,
571
- 1.427418929607171e-05,
572
- 1.2114948589506984e-05,
573
- 1.0928552596889013e-05,
574
- 9.742156604271029e-06,
575
- 8.555760611653046e-06,
576
- 7.369364619035064e-06,
577
- 6.182968626417082e-06,
578
- 4.996572633799099e-06
579
  ],
580
  "emb_lr": [],
581
  "eval_step": [
@@ -603,7 +603,7 @@
603
  0.0
604
  ]
605
  },
606
- "final_accuracy": 0.7783333333333333,
607
  "sft_eval": {
608
  "config": {
609
  "ops": "add_sub",
@@ -618,25 +618,25 @@
618
  "n_examples": 100,
619
  "per_subtask": {
620
  "SA": {
621
- "accuracy": 0.996694214876033,
622
  "count": 605
623
  },
624
  "SS": {
625
- "accuracy": 1.0,
626
  "count": 95
627
  }
628
  }
629
  },
630
  "add_S1": {
631
- "full_accuracy": 0.86,
632
  "n_examples": 100,
633
  "per_subtask": {
634
  "SA": {
635
- "accuracy": 0.9754901960784313,
636
  "count": 204
637
  },
638
  "SC": {
639
- "accuracy": 0.9763313609467456,
640
  "count": 169
641
  },
642
  "SS": {
@@ -644,13 +644,13 @@
644
  "count": 31
645
  },
646
  "UC": {
647
- "accuracy": 0.9831081081081081,
648
  "count": 296
649
  }
650
  }
651
  },
652
  "add_S2": {
653
- "full_accuracy": 0.84,
654
  "n_examples": 100,
655
  "per_subtask": {
656
  "SA": {
@@ -662,11 +662,11 @@
662
  "count": 130
663
  },
664
  "SS": {
665
- "accuracy": 1.0,
666
  "count": 87
667
  },
668
  "UC": {
669
- "accuracy": 0.9408866995073891,
670
  "count": 203
671
  },
672
  "US": {
@@ -676,7 +676,7 @@
676
  }
677
  },
678
  "add_S3": {
679
- "full_accuracy": 0.33,
680
  "n_examples": 100,
681
  "per_subtask": {
682
  "SA": {
@@ -684,33 +684,33 @@
684
  "count": 121
685
  },
686
  "SC": {
687
- "accuracy": 0.9834710743801653,
688
  "count": 121
689
  },
690
  "SS": {
691
- "accuracy": 0.9591836734693877,
692
  "count": 49
693
  },
694
  "UC": {
695
- "accuracy": 0.6505376344086021,
696
  "count": 186
697
  },
698
  "US": {
699
- "accuracy": 0.9147982062780269,
700
  "count": 223
701
  }
702
  }
703
  },
704
  "add_S4": {
705
- "full_accuracy": 0.29,
706
  "n_examples": 100,
707
  "per_subtask": {
708
  "SA": {
709
- "accuracy": 0.9903846153846154,
710
  "count": 104
711
  },
712
  "SC": {
713
- "accuracy": 0.9716981132075472,
714
  "count": 106
715
  },
716
  "SS": {
@@ -718,17 +718,17 @@
718
  "count": 23
719
  },
720
  "UC": {
721
- "accuracy": 0.61875,
722
  "count": 160
723
  },
724
  "US": {
725
- "accuracy": 0.7100977198697068,
726
  "count": 307
727
  }
728
  }
729
  },
730
  "add_S5": {
731
- "full_accuracy": 0.14,
732
  "n_examples": 100,
733
  "per_subtask": {
734
  "SA": {
@@ -740,17 +740,17 @@
740
  "count": 100
741
  },
742
  "UC": {
743
- "accuracy": 0.18,
744
  "count": 100
745
  },
746
  "US": {
747
- "accuracy": 0.4975,
748
  "count": 400
749
  }
750
  }
751
  },
752
  "add_S6": {
753
- "full_accuracy": 0.28,
754
  "n_examples": 100,
755
  "per_subtask": {
756
  "SC": {
@@ -758,25 +758,25 @@
758
  "count": 100
759
  },
760
  "UC": {
761
- "accuracy": 0.42,
762
  "count": 100
763
  },
764
  "US": {
765
- "accuracy": 0.526,
766
  "count": 500
767
  }
768
  }
769
  },
770
  "add_random": {
771
- "full_accuracy": 0.88,
772
  "n_examples": 200,
773
  "per_subtask": {
774
  "SA": {
775
- "accuracy": 0.9888143176733781,
776
  "count": 447
777
  },
778
  "SC": {
779
- "accuracy": 0.9875,
780
  "count": 320
781
  },
782
  "SS": {
@@ -784,7 +784,7 @@
784
  "count": 56
785
  },
786
  "UC": {
787
- "accuracy": 0.9716446124763705,
788
  "count": 529
789
  },
790
  "US": {
@@ -794,7 +794,7 @@
794
  }
795
  },
796
  "add_C3": {
797
- "full_accuracy": 0.62,
798
  "n_examples": 100,
799
  "per_subtask": {
800
  "SA": {
@@ -802,21 +802,21 @@
802
  "count": 300
803
  },
804
  "SC": {
805
- "accuracy": 0.99,
806
  "count": 100
807
  },
808
  "UC": {
809
- "accuracy": 0.8082901554404145,
810
  "count": 193
811
  },
812
  "US": {
813
- "accuracy": 0.9158878504672897,
814
  "count": 107
815
  }
816
  }
817
  },
818
  "add_C4": {
819
- "full_accuracy": 0.65,
820
  "n_examples": 100,
821
  "per_subtask": {
822
  "SA": {
@@ -828,39 +828,39 @@
828
  "count": 100
829
  },
830
  "UC": {
831
- "accuracy": 0.875,
832
  "count": 256
833
  },
834
  "US": {
835
- "accuracy": 0.8819444444444444,
836
  "count": 144
837
  }
838
  }
839
  },
840
  "add_C5": {
841
- "full_accuracy": 0.52,
842
  "n_examples": 100,
843
  "per_subtask": {
844
  "SA": {
845
- "accuracy": 0.95,
846
  "count": 100
847
  },
848
  "SC": {
849
- "accuracy": 0.99,
850
  "count": 100
851
  },
852
  "UC": {
853
- "accuracy": 0.8496732026143791,
854
  "count": 306
855
  },
856
  "US": {
857
- "accuracy": 0.8350515463917526,
858
  "count": 194
859
  }
860
  }
861
  },
862
  "add_C6": {
863
- "full_accuracy": 0.56,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SC": {
@@ -868,21 +868,21 @@
868
  "count": 100
869
  },
870
  "UC": {
871
- "accuracy": 0.8852459016393442,
872
  "count": 366
873
  },
874
  "US": {
875
- "accuracy": 0.905982905982906,
876
  "count": 234
877
  }
878
  }
879
  },
880
  "sub_M0": {
881
- "full_accuracy": 0.91,
882
  "n_examples": 100,
883
  "per_subtask": {
884
  "MD": {
885
- "accuracy": 0.9850249584026622,
886
  "count": 601
887
  },
888
  "ME": {
@@ -914,15 +914,15 @@
914
  }
915
  },
916
  "sub_M2": {
917
- "full_accuracy": 0.85,
918
  "n_examples": 100,
919
  "per_subtask": {
920
  "MD": {
921
- "accuracy": 0.9906103286384976,
922
  "count": 213
923
  },
924
  "MB": {
925
- "accuracy": 0.9823008849557522,
926
  "count": 113
927
  },
928
  "ME": {
@@ -930,7 +930,7 @@
930
  "count": 85
931
  },
932
  "UB": {
933
- "accuracy": 0.9281767955801105,
934
  "count": 181
935
  },
936
  "UD": {
@@ -940,7 +940,7 @@
940
  }
941
  },
942
  "sub_M3": {
943
- "full_accuracy": 0.3,
944
  "n_examples": 100,
945
  "per_subtask": {
946
  "MD": {
@@ -948,7 +948,7 @@
948
  "count": 179
949
  },
950
  "MB": {
951
- "accuracy": 0.9805825242718447,
952
  "count": 103
953
  },
954
  "ME": {
@@ -956,17 +956,17 @@
956
  "count": 56
957
  },
958
  "UB": {
959
- "accuracy": 0.6040268456375839,
960
  "count": 149
961
  },
962
  "UD": {
963
- "accuracy": 0.9061032863849765,
964
  "count": 213
965
  }
966
  }
967
  },
968
  "sub_M4": {
969
- "full_accuracy": 0.03,
970
  "n_examples": 100,
971
  "per_subtask": {
972
  "MD": {
@@ -978,17 +978,17 @@
978
  "count": 100
979
  },
980
  "UB": {
981
- "accuracy": 0.43,
982
  "count": 100
983
  },
984
  "UD": {
985
- "accuracy": 0.49333333333333335,
986
  "count": 300
987
  }
988
  }
989
  },
990
  "sub_M5": {
991
- "full_accuracy": 0.06,
992
  "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
@@ -1000,43 +1000,43 @@
1000
  "count": 100
1001
  },
1002
  "UB": {
1003
- "accuracy": 0.41,
1004
  "count": 100
1005
  },
1006
  "UD": {
1007
- "accuracy": 0.4075,
1008
  "count": 400
1009
  }
1010
  }
1011
  },
1012
  "sub_random": {
1013
- "full_accuracy": 0.875,
1014
  "n_examples": 200,
1015
  "per_subtask": {
1016
  "MD": {
1017
- "accuracy": 0.99,
1018
  "count": 600
1019
  },
1020
  "MB": {
1021
- "accuracy": 0.9887640449438202,
1022
  "count": 267
1023
  },
1024
  "ME": {
1025
- "accuracy": 1.0,
1026
  "count": 53
1027
  },
1028
  "UB": {
1029
- "accuracy": 0.9635535307517085,
1030
  "count": 439
1031
  },
1032
  "UD": {
1033
- "accuracy": 0.975609756097561,
1034
  "count": 41
1035
  }
1036
  }
1037
  },
1038
  "sub_B3": {
1039
- "full_accuracy": 0.75,
1040
  "n_examples": 100,
1041
  "per_subtask": {
1042
  "MD": {
@@ -1052,13 +1052,13 @@
1052
  "count": 197
1053
  },
1054
  "UD": {
1055
- "accuracy": 0.9514563106796117,
1056
  "count": 103
1057
  }
1058
  }
1059
  },
1060
  "sub_B4": {
1061
- "full_accuracy": 0.55,
1062
  "n_examples": 100,
1063
  "per_subtask": {
1064
  "MD": {
@@ -1070,17 +1070,17 @@
1070
  "count": 100
1071
  },
1072
  "UB": {
1073
- "accuracy": 0.8421052631578947,
1074
  "count": 247
1075
  },
1076
  "UD": {
1077
- "accuracy": 0.869281045751634,
1078
  "count": 153
1079
  }
1080
  }
1081
  },
1082
  "sub_B5": {
1083
- "full_accuracy": 0.48,
1084
  "n_examples": 100,
1085
  "per_subtask": {
1086
  "MD": {
@@ -1092,18 +1092,18 @@
1092
  "count": 100
1093
  },
1094
  "UB": {
1095
- "accuracy": 0.8624161073825504,
1096
  "count": 298
1097
  },
1098
  "UD": {
1099
- "accuracy": 0.8118811881188119,
1100
  "count": 202
1101
  }
1102
  }
1103
  }
1104
  },
1105
  "summary": {
1106
- "overall_accuracy": 0.6016666666666667,
1107
  "total_examples": 2400,
1108
  "n_splits": 22
1109
  }
@@ -1118,11 +1118,11 @@
1118
  },
1119
  "splits": {
1120
  "add_S0": {
1121
- "full_accuracy": 1.0,
1122
  "n_examples": 100,
1123
  "per_subtask": {
1124
  "SA": {
1125
- "accuracy": 1.0,
1126
  "count": 605
1127
  },
1128
  "SS": {
@@ -1154,23 +1154,23 @@
1154
  }
1155
  },
1156
  "add_S2": {
1157
- "full_accuracy": 0.96,
1158
  "n_examples": 100,
1159
  "per_subtask": {
1160
  "SA": {
1161
- "accuracy": 0.9938650306748467,
1162
  "count": 163
1163
  },
1164
  "SC": {
1165
- "accuracy": 0.9923076923076923,
1166
  "count": 130
1167
  },
1168
  "SS": {
1169
- "accuracy": 0.9770114942528736,
1170
  "count": 87
1171
  },
1172
  "UC": {
1173
- "accuracy": 0.9901477832512315,
1174
  "count": 203
1175
  },
1176
  "US": {
@@ -1180,7 +1180,7 @@
1180
  }
1181
  },
1182
  "add_S3": {
1183
- "full_accuracy": 0.67,
1184
  "n_examples": 100,
1185
  "per_subtask": {
1186
  "SA": {
@@ -1188,7 +1188,7 @@
1188
  "count": 121
1189
  },
1190
  "SC": {
1191
- "accuracy": 0.9917355371900827,
1192
  "count": 121
1193
  },
1194
  "SS": {
@@ -1196,17 +1196,17 @@
1196
  "count": 49
1197
  },
1198
  "UC": {
1199
- "accuracy": 0.8279569892473119,
1200
  "count": 186
1201
  },
1202
  "US": {
1203
- "accuracy": 0.9955156950672646,
1204
  "count": 223
1205
  }
1206
  }
1207
  },
1208
  "add_S4": {
1209
- "full_accuracy": 0.45,
1210
  "n_examples": 100,
1211
  "per_subtask": {
1212
  "SA": {
@@ -1222,17 +1222,17 @@
1222
  "count": 23
1223
  },
1224
  "UC": {
1225
- "accuracy": 0.6875,
1226
  "count": 160
1227
  },
1228
  "US": {
1229
- "accuracy": 0.8273615635179153,
1230
  "count": 307
1231
  }
1232
  }
1233
  },
1234
  "add_S5": {
1235
- "full_accuracy": 0.21,
1236
  "n_examples": 100,
1237
  "per_subtask": {
1238
  "SA": {
@@ -1244,17 +1244,17 @@
1244
  "count": 100
1245
  },
1246
  "UC": {
1247
- "accuracy": 0.33,
1248
  "count": 100
1249
  },
1250
  "US": {
1251
- "accuracy": 0.6175,
1252
  "count": 400
1253
  }
1254
  }
1255
  },
1256
  "add_S6": {
1257
- "full_accuracy": 0.19,
1258
  "n_examples": 100,
1259
  "per_subtask": {
1260
  "SC": {
@@ -1262,33 +1262,33 @@
1262
  "count": 100
1263
  },
1264
  "UC": {
1265
- "accuracy": 0.34,
1266
  "count": 100
1267
  },
1268
  "US": {
1269
- "accuracy": 0.54,
1270
  "count": 500
1271
  }
1272
  }
1273
  },
1274
  "add_random": {
1275
- "full_accuracy": 0.97,
1276
  "n_examples": 200,
1277
  "per_subtask": {
1278
  "SA": {
1279
- "accuracy": 0.9888143176733781,
1280
  "count": 447
1281
  },
1282
  "SC": {
1283
- "accuracy": 1.0,
1284
  "count": 320
1285
  },
1286
  "SS": {
1287
- "accuracy": 0.9642857142857143,
1288
  "count": 56
1289
  },
1290
  "UC": {
1291
- "accuracy": 0.998109640831758,
1292
  "count": 529
1293
  },
1294
  "US": {
@@ -1298,7 +1298,7 @@
1298
  }
1299
  },
1300
  "add_C3": {
1301
- "full_accuracy": 0.82,
1302
  "n_examples": 100,
1303
  "per_subtask": {
1304
  "SA": {
@@ -1310,17 +1310,17 @@
1310
  "count": 100
1311
  },
1312
  "UC": {
1313
- "accuracy": 0.9067357512953368,
1314
  "count": 193
1315
  },
1316
  "US": {
1317
- "accuracy": 0.9906542056074766,
1318
  "count": 107
1319
  }
1320
  }
1321
  },
1322
  "add_C4": {
1323
- "full_accuracy": 0.79,
1324
  "n_examples": 100,
1325
  "per_subtask": {
1326
  "SA": {
@@ -1332,17 +1332,17 @@
1332
  "count": 100
1333
  },
1334
  "UC": {
1335
- "accuracy": 0.92578125,
1336
  "count": 256
1337
  },
1338
  "US": {
1339
- "accuracy": 0.9583333333333334,
1340
  "count": 144
1341
  }
1342
  }
1343
  },
1344
  "add_C5": {
1345
- "full_accuracy": 0.78,
1346
  "n_examples": 100,
1347
  "per_subtask": {
1348
  "SA": {
@@ -1354,17 +1354,17 @@
1354
  "count": 100
1355
  },
1356
  "UC": {
1357
- "accuracy": 0.9379084967320261,
1358
  "count": 306
1359
  },
1360
  "US": {
1361
- "accuracy": 0.865979381443299,
1362
  "count": 194
1363
  }
1364
  }
1365
  },
1366
  "add_C6": {
1367
- "full_accuracy": 0.85,
1368
  "n_examples": 100,
1369
  "per_subtask": {
1370
  "SC": {
@@ -1372,21 +1372,21 @@
1372
  "count": 100
1373
  },
1374
  "UC": {
1375
- "accuracy": 0.9617486338797814,
1376
  "count": 366
1377
  },
1378
  "US": {
1379
- "accuracy": 0.9658119658119658,
1380
  "count": 234
1381
  }
1382
  }
1383
  },
1384
  "sub_M0": {
1385
- "full_accuracy": 0.96,
1386
  "n_examples": 100,
1387
  "per_subtask": {
1388
  "MD": {
1389
- "accuracy": 0.9933444259567388,
1390
  "count": 601
1391
  },
1392
  "ME": {
@@ -1418,11 +1418,11 @@
1418
  }
1419
  },
1420
  "sub_M2": {
1421
- "full_accuracy": 0.97,
1422
  "n_examples": 100,
1423
  "per_subtask": {
1424
  "MD": {
1425
- "accuracy": 0.9953051643192489,
1426
  "count": 213
1427
  },
1428
  "MB": {
@@ -1434,7 +1434,7 @@
1434
  "count": 85
1435
  },
1436
  "UB": {
1437
- "accuracy": 0.988950276243094,
1438
  "count": 181
1439
  },
1440
  "UD": {
@@ -1444,7 +1444,7 @@
1444
  }
1445
  },
1446
  "sub_M3": {
1447
- "full_accuracy": 0.73,
1448
  "n_examples": 100,
1449
  "per_subtask": {
1450
  "MD": {
@@ -1460,17 +1460,17 @@
1460
  "count": 56
1461
  },
1462
  "UB": {
1463
- "accuracy": 0.8322147651006712,
1464
  "count": 149
1465
  },
1466
  "UD": {
1467
- "accuracy": 0.9906103286384976,
1468
  "count": 213
1469
  }
1470
  }
1471
  },
1472
  "sub_M4": {
1473
- "full_accuracy": 0.48,
1474
  "n_examples": 100,
1475
  "per_subtask": {
1476
  "MD": {
@@ -1482,17 +1482,17 @@
1482
  "count": 100
1483
  },
1484
  "UB": {
1485
- "accuracy": 0.62,
1486
  "count": 100
1487
  },
1488
  "UD": {
1489
- "accuracy": 0.8666666666666667,
1490
  "count": 300
1491
  }
1492
  }
1493
  },
1494
  "sub_M5": {
1495
- "full_accuracy": 0.35,
1496
  "n_examples": 100,
1497
  "per_subtask": {
1498
  "MD": {
@@ -1504,25 +1504,25 @@
1504
  "count": 100
1505
  },
1506
  "UB": {
1507
- "accuracy": 0.68,
1508
  "count": 100
1509
  },
1510
  "UD": {
1511
- "accuracy": 0.755,
1512
  "count": 400
1513
  }
1514
  }
1515
  },
1516
  "sub_random": {
1517
- "full_accuracy": 0.985,
1518
  "n_examples": 200,
1519
  "per_subtask": {
1520
  "MD": {
1521
- "accuracy": 0.9966666666666667,
1522
  "count": 600
1523
  },
1524
  "MB": {
1525
- "accuracy": 0.9962546816479401,
1526
  "count": 267
1527
  },
1528
  "ME": {
@@ -1540,11 +1540,11 @@
1540
  }
1541
  },
1542
  "sub_B3": {
1543
- "full_accuracy": 0.9,
1544
  "n_examples": 100,
1545
  "per_subtask": {
1546
  "MD": {
1547
- "accuracy": 0.9966666666666667,
1548
  "count": 300
1549
  },
1550
  "MB": {
@@ -1552,7 +1552,7 @@
1552
  "count": 100
1553
  },
1554
  "UB": {
1555
- "accuracy": 0.9543147208121827,
1556
  "count": 197
1557
  },
1558
  "UD": {
@@ -1562,7 +1562,7 @@
1562
  }
1563
  },
1564
  "sub_B4": {
1565
- "full_accuracy": 0.87,
1566
  "n_examples": 100,
1567
  "per_subtask": {
1568
  "MD": {
@@ -1574,17 +1574,17 @@
1574
  "count": 100
1575
  },
1576
  "UB": {
1577
- "accuracy": 0.9554655870445344,
1578
  "count": 247
1579
  },
1580
  "UD": {
1581
- "accuracy": 0.954248366013072,
1582
  "count": 153
1583
  }
1584
  }
1585
  },
1586
  "sub_B5": {
1587
- "full_accuracy": 0.79,
1588
  "n_examples": 100,
1589
  "per_subtask": {
1590
  "MD": {
@@ -1596,22 +1596,22 @@
1596
  "count": 100
1597
  },
1598
  "UB": {
1599
- "accuracy": 0.9328859060402684,
1600
  "count": 298
1601
  },
1602
  "UD": {
1603
- "accuracy": 0.9554455445544554,
1604
  "count": 202
1605
  }
1606
  }
1607
  }
1608
  },
1609
  "summary": {
1610
- "overall_accuracy": 0.7783333333333333,
1611
  "total_examples": 2400,
1612
  "n_splits": 22
1613
  }
1614
  },
1615
- "sorl_overall_accuracy": 0.7783333333333333,
1616
- "sft_overall_accuracy": 0.6016666666666667
1617
  }
 
73
  3869
74
  ],
75
  "loss": [
76
+ 9.190461158752441,
77
+ 5.166501045227051,
78
+ 3.2305479049682617,
79
+ 3.0906858444213867,
80
+ 2.994469165802002,
81
+ 3.048895835876465,
82
+ 3.2591943740844727,
83
+ 1.6000936031341553,
84
+ -0.03324759006500244,
85
+ -3.219552516937256,
86
+ -4.197554588317871,
87
+ -2.6743619441986084,
88
+ -2.2282357215881348,
89
+ -1.7437527179718018,
90
+ -1.24261474609375,
91
+ -1.1773107051849365,
92
+ -0.9963365793228149,
93
+ -0.18717944622039795,
94
+ -0.994360089302063,
95
+ -0.17535710334777832,
96
+ -0.5866737961769104,
97
+ -0.32630622386932373,
98
+ -0.1583651304244995,
99
+ -0.5506547689437866,
100
+ -0.6750422716140747,
101
+ -0.5237396955490112,
102
+ -0.5621165037155151,
103
+ -0.07628041505813599,
104
+ -0.1960809826850891,
105
+ -0.5238226652145386,
106
+ -0.45479297637939453,
107
+ -0.5785545706748962,
108
+ -0.572192907333374,
109
+ -0.7545036673545837,
110
+ -0.39050161838531494,
111
+ -0.3878746032714844,
112
+ -0.246707946062088,
113
+ -0.3712904155254364,
114
+ -0.7410520911216736,
115
+ -0.7477802634239197,
116
+ -0.9374204277992249,
117
+ -0.6002537608146667,
118
+ -0.5804862380027771,
119
+ -0.7307106256484985,
120
+ -1.1449896097183228,
121
+ -0.7023072242736816,
122
+ -0.2751615643501282,
123
+ -0.8681702613830566,
124
+ -0.6093825101852417,
125
+ -0.3929210901260376,
126
+ -0.7827325463294983,
127
+ -0.7987732887268066,
128
+ -0.6832472085952759,
129
+ -0.5688872337341309,
130
+ -0.6018052101135254,
131
+ -0.8418792486190796,
132
+ -0.7785307765007019,
133
+ -0.337721049785614,
134
+ -0.5132121443748474,
135
+ -0.4073343873023987,
136
+ -0.3655291199684143,
137
+ -0.6873089075088501,
138
+ -0.4807700514793396,
139
+ -0.39133989810943604,
140
+ -0.4098546802997589,
141
+ -0.797407865524292,
142
+ -0.44859200716018677,
143
+ -0.4712650775909424,
144
+ -0.3992188572883606,
145
+ -0.4064672291278839
146
  ],
147
  "base_loss": [
148
+ 7.707293510437012,
149
+ 4.35805606842041,
150
+ 2.015561819076538,
151
+ 1.9516109228134155,
152
+ 1.93378746509552,
153
+ 1.8740915060043335,
154
+ 1.8301212787628174,
155
+ 1.8807543516159058,
156
+ 1.8543447256088257,
157
+ 1.7816431522369385,
158
+ 1.449736475944519,
159
+ 1.0750778913497925,
160
+ 0.8969014286994934,
161
+ 0.7201328873634338,
162
+ 0.5857338309288025,
163
+ 0.515350341796875,
164
+ 0.4651491045951843,
165
+ 0.402709424495697,
166
+ 0.44452181458473206,
167
+ 0.35987958312034607,
168
+ 0.3687458038330078,
169
+ 0.309616357088089,
170
+ 0.28466305136680603,
171
+ 0.2800248861312866,
172
+ 0.37805360555648804,
173
+ 0.349456250667572,
174
+ 0.2721202075481415,
175
+ 0.1941368132829666,
176
+ 0.2818446159362793,
177
+ 0.24511845409870148,
178
+ 0.22365573048591614,
179
+ 0.2581332325935364,
180
+ 0.22468924522399902,
181
+ 0.2866680920124054,
182
+ 0.19988836348056793,
183
+ 0.21447350084781647,
184
+ 0.19538240134716034,
185
+ 0.19758661091327667,
186
+ 0.17485487461090088,
187
+ 0.1870802789926529,
188
+ 0.1876666396856308,
189
+ 0.1586993783712387,
190
+ 0.15935318171977997,
191
+ 0.1772172749042511,
192
+ 0.20511488616466522,
193
+ 0.14057905972003937,
194
+ 0.14634256064891815,
195
+ 0.17220208048820496,
196
+ 0.15756499767303467,
197
+ 0.12289166450500488,
198
+ 0.13189402222633362,
199
+ 0.12689675390720367,
200
+ 0.1268821805715561,
201
+ 0.0976928099989891,
202
+ 0.09254758805036545,
203
+ 0.1167963370680809,
204
+ 0.12230370193719864,
205
+ 0.061435870826244354,
206
+ 0.10646288096904755,
207
+ 0.06817925721406937,
208
+ 0.07916043698787689,
209
+ 0.09582412242889404,
210
+ 0.07675866037607193,
211
+ 0.06337268650531769,
212
+ 0.06639906018972397,
213
+ 0.10464208573102951,
214
+ 0.06369388103485107,
215
+ 0.07750404626131058,
216
+ 0.05772814899682999,
217
+ 0.0618344210088253
218
  ],
219
  "info_loss": [
220
+ -0.5703163146972656,
221
+ -0.166015625,
222
+ -0.06974220275878906,
223
+ -0.0745776891708374,
224
+ -0.08152985572814941,
225
+ -0.06991314888000488,
226
+ -0.04411435127258301,
227
+ -0.21549618244171143,
228
+ -0.37584030628204346,
229
+ -0.6874351501464844,
230
+ -0.7523373961448669,
231
+ -0.557295024394989,
232
+ -0.4853973090648651,
233
+ -0.40835925936698914,
234
+ -0.3295533061027527,
235
+ -0.3078162670135498,
236
+ -0.2742995619773865,
237
+ -0.18321914970874786,
238
+ -0.2575337886810303,
239
+ -0.15976710617542267,
240
+ -0.1911141574382782,
241
+ -0.15414360165596008,
242
+ -0.13090449571609497,
243
+ -0.15989911556243896,
244
+ -0.18713147938251495,
245
+ -0.1621924787759781,
246
+ -0.15000823140144348,
247
+ -0.09506124258041382,
248
+ -0.10266301035881042,
249
+ -0.13723227381706238,
250
+ -0.12602582573890686,
251
+ -0.13937442004680634,
252
+ -0.1260860562324524,
253
+ -0.14851446449756622,
254
+ -0.10906409472227097,
255
+ -0.09848323464393616,
256
+ -0.07746206969022751,
257
+ -0.0828794315457344,
258
+ -0.11464187502861023,
259
+ -0.11303246766328812,
260
+ -0.1308683156967163,
261
+ -0.09133747220039368,
262
+ -0.09089986979961395,
263
+ -0.1075134426355362,
264
+ -0.1480427086353302,
265
+ -0.09612017124891281,
266
+ -0.054484449326992035,
267
+ -0.1174512505531311,
268
+ -0.08855501562356949,
269
+ -0.06146596744656563,
270
+ -0.10301853716373444,
271
+ -0.10187290608882904,
272
+ -0.09169016033411026,
273
+ -0.07557268440723419,
274
+ -0.0797661542892456,
275
+ -0.10500068217515945,
276
+ -0.0984283983707428,
277
+ -0.047317616641521454,
278
+ -0.06959307193756104,
279
+ -0.05479596555233002,
280
+ -0.05312419682741165,
281
+ -0.08555228263139725,
282
+ -0.06329430639743805,
283
+ -0.052405696362257004,
284
+ -0.05460964888334274,
285
+ -0.09755084663629532,
286
+ -0.05779761075973511,
287
+ -0.06151659041643143,
288
+ -0.05198967456817627,
289
+ -0.05291672796010971
290
  ],
291
  "abs_loss": [
292
+ 2.2391064167022705,
293
+ 1.903133749961853,
294
+ 1.8471856117248535,
295
+ 1.870505690574646,
296
+ 1.8405386209487915,
297
+ 1.8321367502212524,
298
+ 1.8374236822128296,
299
+ 1.8635932207107544,
300
+ 1.8500641584396362,
301
+ 1.8130455017089844,
302
+ 1.789480209350586,
303
+ 1.4839378595352173,
304
+ 1.1312556266784668,
305
+ 0.8757233619689941,
306
+ 0.6512569785118103,
307
+ 0.6017574667930603,
308
+ 0.5022904872894287,
309
+ 0.46593570709228516,
310
+ 0.40201544761657715,
311
+ 0.35612478852272034,
312
+ 0.29723620414733887,
313
+ 0.2846054136753082,
314
+ 0.2813468277454376,
315
+ 0.2518315017223358,
316
+ 0.23474085330963135,
317
+ 0.1778661459684372,
318
+ 0.22422951459884644,
319
+ 0.22165711224079132,
320
+ 0.1990116387605667,
321
+ 0.21514342725276947,
322
+ 0.20219220221042633,
323
+ 0.18571658432483673,
324
+ 0.1271841675043106,
325
+ 0.12184059619903564,
326
+ 0.13935674726963043,
327
+ 0.1843566745519638,
328
+ 0.11972486972808838,
329
+ 0.11449414491653442,
330
+ 0.1205715611577034,
331
+ 0.07506386190652847,
332
+ 0.07510096579790115,
333
+ 0.07919437438249588,
334
+ 0.09589538723230362,
335
+ 0.06470391154289246,
336
+ 0.06804872304201126,
337
+ 0.06406687945127487,
338
+ 0.07336432486772537,
339
+ 0.04453839734196663,
340
+ 0.06374714523553848,
341
+ 0.0641837790608406,
342
+ 0.051210492849349976,
343
+ 0.06126769259572029,
344
+ 0.05609944835305214,
345
+ 0.045193735510110855,
346
+ 0.06044153496623039,
347
+ 0.03156470134854317,
348
+ 0.04949544370174408,
349
+ 0.04925629124045372,
350
+ 0.037908535450696945,
351
+ 0.052750349044799805,
352
+ 0.05024566873908043,
353
+ 0.0473237968981266,
354
+ 0.056845005601644516,
355
+ 0.0363769493997097,
356
+ 0.03889637067914009,
357
+ 0.04386458173394203,
358
+ 0.030996553599834442,
359
+ 0.02613372914493084,
360
+ 0.025389796122908592,
361
+ 0.03995569422841072
362
  ],
363
  "zipf_loss": [
364
+ 6.962420463562012,
365
+ 2.278287887573242,
366
+ 1.727689504623413,
367
+ 1.6978013515472412,
368
+ 1.6919262409210205,
369
+ 1.690722107887268,
370
+ 1.686474084854126,
371
+ 1.6879417896270752,
372
+ 1.6858043670654297,
373
+ 1.6918517351150513,
374
+ 1.6971348524093628,
375
+ 1.675116777420044,
376
+ 1.6157103776931763,
377
+ 1.5321345329284668,
378
+ 1.4020588397979736,
379
+ 1.3253259658813477,
380
+ 1.2312809228897095,
381
+ 1.195708990097046,
382
+ 1.0962542295455933,
383
+ 1.0268218517303467,
384
+ 0.9259983897209167,
385
+ 0.877052903175354,
386
+ 0.8378820419311523,
387
+ 0.7431282997131348,
388
+ 0.7947447299957275,
389
+ 0.7309422492980957,
390
+ 0.6434226036071777,
391
+ 0.6580294966697693,
392
+ 0.5288034081459045,
393
+ 0.5818673372268677,
394
+ 0.561590313911438,
395
+ 0.5384848713874817,
396
+ 0.45125994086265564,
397
+ 0.4317888617515564,
398
+ 0.4863152503967285,
399
+ 0.3640486001968384,
400
+ 0.3205578625202179,
401
+ 0.24846789240837097,
402
+ 0.21845467388629913,
403
+ 0.18795782327651978,
404
+ 0.17608600854873657,
405
+ 0.14650212228298187,
406
+ 0.15956974029541016,
407
+ 0.16073615849018097,
408
+ 0.12351780384778976,
409
+ 0.11190879344940186,
410
+ 0.11600394546985626,
411
+ 0.12968632578849792,
412
+ 0.11222794651985168,
413
+ 0.0924285426735878,
414
+ 0.11043774336576462,
415
+ 0.08693233132362366,
416
+ 0.10116228461265564,
417
+ 0.0846274122595787,
418
+ 0.0972646176815033,
419
+ 0.08817484229803085,
420
+ 0.07849989831447601,
421
+ 0.06909362971782684,
422
+ 0.07246481627225876,
423
+ 0.06717102229595184,
424
+ 0.08152782917022705,
425
+ 0.06765741109848022,
426
+ 0.06972979009151459,
427
+ 0.06570670753717422,
428
+ 0.0659530982375145,
429
+ 0.06907197833061218,
430
+ 0.06259053200483322,
431
+ 0.06378341466188431,
432
+ 0.0604107640683651,
433
+ 0.05687008425593376
434
  ],
435
  "denoise_loss": [],
436
  "ortho_loss": [
437
+ 0.4214286804199219,
438
+ 0.18882839381694794,
439
+ 0.1212303414940834,
440
+ 0.11552999913692474,
441
+ 0.08706260472536087,
442
+ 0.09062568098306656,
443
+ 0.08478324860334396,
444
+ 0.08018892258405685,
445
+ 0.09239780157804489,
446
+ 0.1168321818113327,
447
+ 0.13350531458854675,
448
+ 0.15595880150794983,
449
+ 0.1914532333612442,
450
+ 0.1955980360507965,
451
+ 0.20734401047229767,
452
+ 0.2079848051071167,
453
+ 0.22458618879318237,
454
+ 0.217812642455101,
455
+ 0.22359661757946014,
456
+ 0.22261333465576172,
457
+ 0.21620140969753265,
458
+ 0.2151515632867813,
459
+ 0.2155912071466446,
460
+ 0.21206055581569672,
461
+ 0.21063639223575592,
462
+ 0.21173329651355743,
463
+ 0.2016972005367279,
464
+ 0.1977507472038269,
465
+ 0.19506840407848358,
466
+ 0.19696591794490814,
467
+ 0.17341327667236328,
468
+ 0.16453030705451965,
469
+ 0.18576037883758545,
470
+ 0.1828179508447647,
471
+ 0.1950673907995224,
472
+ 0.1837579756975174,
473
+ 0.1741635948419571,
474
+ 0.17811855673789978,
475
+ 0.16840724647045135,
476
+ 0.16616372764110565,
477
+ 0.17120879888534546,
478
+ 0.16129401326179504,
479
+ 0.16783320903778076,
480
+ 0.16347329318523407,
481
+ 0.16215452551841736,
482
+ 0.16703131794929504,
483
+ 0.16951780021190643,
484
+ 0.16996146738529205,
485
+ 0.16912420094013214,
486
+ 0.1748487651348114,
487
+ 0.1752723902463913,
488
+ 0.18083271384239197,
489
+ 0.17920422554016113,
490
+ 0.18311761319637299,
491
+ 0.18380235135555267,
492
+ 0.18301932513713837,
493
+ 0.1814393401145935,
494
+ 0.18424195051193237,
495
+ 0.18770591914653778,
496
+ 0.18873360753059387,
497
+ 0.18713155388832092,
498
+ 0.18882162868976593,
499
+ 0.18982088565826416,
500
+ 0.18827897310256958,
501
+ 0.18959622085094452,
502
+ 0.18957337737083435,
503
+ 0.1906057596206665,
504
+ 0.1932428628206253,
505
+ 0.19275683164596558,
506
+ 0.1928425282239914
507
  ],
508
  "lr": [
509
+ 3.350427350427351e-05,
510
+ 6.76923076923077e-05,
511
+ 8e-05,
512
+ 8e-05,
513
+ 8e-05,
514
+ 8e-05,
515
+ 8e-05,
516
+ 8e-05,
517
+ 8e-05,
518
+ 8e-05,
519
+ 8e-05,
520
+ 8e-05,
521
+ 8e-05,
522
+ 8e-05,
523
+ 8e-05,
524
+ 8e-05,
525
+ 8e-05,
526
+ 8e-05,
527
+ 8e-05,
528
+ 8e-05,
529
+ 8e-05,
530
+ 8e-05,
531
+ 8e-05,
532
+ 8e-05,
533
+ 8e-05,
534
+ 8e-05,
535
+ 8e-05,
536
+ 8e-05,
537
+ 8e-05,
538
+ 8e-05,
539
+ 8e-05,
540
+ 8e-05,
541
+ 8e-05,
542
+ 8e-05,
543
+ 8e-05,
544
+ 8e-05,
545
+ 8e-05,
546
+ 8e-05,
547
+ 8e-05,
548
+ 8e-05,
549
+ 8e-05,
550
+ 8e-05,
551
+ 7.989559715264963e-05,
552
+ 7.752280516741365e-05,
553
+ 7.51500131821777e-05,
554
+ 7.277722119694173e-05,
555
+ 7.040442921170577e-05,
556
+ 6.803163722646982e-05,
557
+ 6.565884524123386e-05,
558
+ 6.134036382810441e-05,
559
+ 5.896757184286844e-05,
560
+ 5.6594779857632485e-05,
561
+ 5.422198787239652e-05,
562
+ 5.184919588716057e-05,
563
+ 4.947640390192461e-05,
564
+ 4.710361191668863e-05,
565
+ 4.278513050355919e-05,
566
+ 4.041233851832322e-05,
567
+ 3.803954653308728e-05,
568
+ 3.566675454785132e-05,
569
+ 3.329396256261535e-05,
570
+ 3.092117057737938e-05,
571
+ 2.854837859214342e-05,
572
+ 2.4229897179013967e-05,
573
+ 2.1857105193778026e-05,
574
+ 1.9484313208542057e-05,
575
+ 1.7111521223306092e-05,
576
+ 1.4738729238070129e-05,
577
+ 1.2365937252834164e-05,
578
+ 9.993145267598198e-06
579
  ],
580
  "emb_lr": [],
581
  "eval_step": [
 
603
  0.0
604
  ]
605
  },
606
+ "final_accuracy": 0.8683333333333333,
607
  "sft_eval": {
608
  "config": {
609
  "ops": "add_sub",
 
618
  "n_examples": 100,
619
  "per_subtask": {
620
  "SA": {
621
+ "accuracy": 1.0,
622
  "count": 605
623
  },
624
  "SS": {
625
+ "accuracy": 0.9789473684210527,
626
  "count": 95
627
  }
628
  }
629
  },
630
  "add_S1": {
631
+ "full_accuracy": 0.99,
632
  "n_examples": 100,
633
  "per_subtask": {
634
  "SA": {
635
+ "accuracy": 0.9950980392156863,
636
  "count": 204
637
  },
638
  "SC": {
639
+ "accuracy": 1.0,
640
  "count": 169
641
  },
642
  "SS": {
 
644
  "count": 31
645
  },
646
  "UC": {
647
+ "accuracy": 1.0,
648
  "count": 296
649
  }
650
  }
651
  },
652
  "add_S2": {
653
+ "full_accuracy": 0.85,
654
  "n_examples": 100,
655
  "per_subtask": {
656
  "SA": {
 
662
  "count": 130
663
  },
664
  "SS": {
665
+ "accuracy": 0.9655172413793104,
666
  "count": 87
667
  },
668
  "UC": {
669
+ "accuracy": 0.9655172413793104,
670
  "count": 203
671
  },
672
  "US": {
 
676
  }
677
  },
678
  "add_S3": {
679
+ "full_accuracy": 0.5,
680
  "n_examples": 100,
681
  "per_subtask": {
682
  "SA": {
 
684
  "count": 121
685
  },
686
  "SC": {
687
+ "accuracy": 0.9752066115702479,
688
  "count": 121
689
  },
690
  "SS": {
691
+ "accuracy": 1.0,
692
  "count": 49
693
  },
694
  "UC": {
695
+ "accuracy": 0.7580645161290323,
696
  "count": 186
697
  },
698
  "US": {
699
+ "accuracy": 0.9730941704035875,
700
  "count": 223
701
  }
702
  }
703
  },
704
  "add_S4": {
705
+ "full_accuracy": 0.37,
706
  "n_examples": 100,
707
  "per_subtask": {
708
  "SA": {
709
+ "accuracy": 1.0,
710
  "count": 104
711
  },
712
  "SC": {
713
+ "accuracy": 0.9811320754716981,
714
  "count": 106
715
  },
716
  "SS": {
 
718
  "count": 23
719
  },
720
  "UC": {
721
+ "accuracy": 0.66875,
722
  "count": 160
723
  },
724
  "US": {
725
+ "accuracy": 0.8045602605863192,
726
  "count": 307
727
  }
728
  }
729
  },
730
  "add_S5": {
731
+ "full_accuracy": 0.18,
732
  "n_examples": 100,
733
  "per_subtask": {
734
  "SA": {
 
740
  "count": 100
741
  },
742
  "UC": {
743
+ "accuracy": 0.3,
744
  "count": 100
745
  },
746
  "US": {
747
+ "accuracy": 0.605,
748
  "count": 400
749
  }
750
  }
751
  },
752
  "add_S6": {
753
+ "full_accuracy": 0.38,
754
  "n_examples": 100,
755
  "per_subtask": {
756
  "SC": {
 
758
  "count": 100
759
  },
760
  "UC": {
761
+ "accuracy": 0.53,
762
  "count": 100
763
  },
764
  "US": {
765
+ "accuracy": 0.644,
766
  "count": 500
767
  }
768
  }
769
  },
770
  "add_random": {
771
+ "full_accuracy": 0.925,
772
  "n_examples": 200,
773
  "per_subtask": {
774
  "SA": {
775
+ "accuracy": 0.9910514541387024,
776
  "count": 447
777
  },
778
  "SC": {
779
+ "accuracy": 0.984375,
780
  "count": 320
781
  },
782
  "SS": {
 
784
  "count": 56
785
  },
786
  "UC": {
787
+ "accuracy": 0.9886578449905482,
788
  "count": 529
789
  },
790
  "US": {
 
794
  }
795
  },
796
  "add_C3": {
797
+ "full_accuracy": 0.81,
798
  "n_examples": 100,
799
  "per_subtask": {
800
  "SA": {
 
802
  "count": 300
803
  },
804
  "SC": {
805
+ "accuracy": 1.0,
806
  "count": 100
807
  },
808
  "UC": {
809
+ "accuracy": 0.9067357512953368,
810
  "count": 193
811
  },
812
  "US": {
813
+ "accuracy": 0.9626168224299065,
814
  "count": 107
815
  }
816
  }
817
  },
818
  "add_C4": {
819
+ "full_accuracy": 0.75,
820
  "n_examples": 100,
821
  "per_subtask": {
822
  "SA": {
 
828
  "count": 100
829
  },
830
  "UC": {
831
+ "accuracy": 0.90625,
832
  "count": 256
833
  },
834
  "US": {
835
+ "accuracy": 0.9305555555555556,
836
  "count": 144
837
  }
838
  }
839
  },
840
  "add_C5": {
841
+ "full_accuracy": 0.68,
842
  "n_examples": 100,
843
  "per_subtask": {
844
  "SA": {
845
+ "accuracy": 1.0,
846
  "count": 100
847
  },
848
  "SC": {
849
+ "accuracy": 1.0,
850
  "count": 100
851
  },
852
  "UC": {
853
+ "accuracy": 0.9084967320261438,
854
  "count": 306
855
  },
856
  "US": {
857
+ "accuracy": 0.8556701030927835,
858
  "count": 194
859
  }
860
  }
861
  },
862
  "add_C6": {
863
+ "full_accuracy": 0.76,
864
  "n_examples": 100,
865
  "per_subtask": {
866
  "SC": {
 
868
  "count": 100
869
  },
870
  "UC": {
871
+ "accuracy": 0.9344262295081968,
872
  "count": 366
873
  },
874
  "US": {
875
+ "accuracy": 0.9700854700854701,
876
  "count": 234
877
  }
878
  }
879
  },
880
  "sub_M0": {
881
+ "full_accuracy": 1.0,
882
  "n_examples": 100,
883
  "per_subtask": {
884
  "MD": {
885
+ "accuracy": 1.0,
886
  "count": 601
887
  },
888
  "ME": {
 
914
  }
915
  },
916
  "sub_M2": {
917
+ "full_accuracy": 0.9,
918
  "n_examples": 100,
919
  "per_subtask": {
920
  "MD": {
921
+ "accuracy": 1.0,
922
  "count": 213
923
  },
924
  "MB": {
925
+ "accuracy": 0.9911504424778761,
926
  "count": 113
927
  },
928
  "ME": {
 
930
  "count": 85
931
  },
932
  "UB": {
933
+ "accuracy": 0.9502762430939227,
934
  "count": 181
935
  },
936
  "UD": {
 
940
  }
941
  },
942
  "sub_M3": {
943
+ "full_accuracy": 0.19,
944
  "n_examples": 100,
945
  "per_subtask": {
946
  "MD": {
 
948
  "count": 179
949
  },
950
  "MB": {
951
+ "accuracy": 1.0,
952
  "count": 103
953
  },
954
  "ME": {
 
956
  "count": 56
957
  },
958
  "UB": {
959
+ "accuracy": 0.4697986577181208,
960
  "count": 149
961
  },
962
  "UD": {
963
+ "accuracy": 0.9765258215962441,
964
  "count": 213
965
  }
966
  }
967
  },
968
  "sub_M4": {
969
+ "full_accuracy": 0.01,
970
  "n_examples": 100,
971
  "per_subtask": {
972
  "MD": {
 
978
  "count": 100
979
  },
980
  "UB": {
981
+ "accuracy": 0.1,
982
  "count": 100
983
  },
984
  "UD": {
985
+ "accuracy": 0.57,
986
  "count": 300
987
  }
988
  }
989
  },
990
  "sub_M5": {
991
+ "full_accuracy": 0.0,
992
  "n_examples": 100,
993
  "per_subtask": {
994
  "MD": {
 
1000
  "count": 100
1001
  },
1002
  "UB": {
1003
+ "accuracy": 0.02,
1004
  "count": 100
1005
  },
1006
  "UD": {
1007
+ "accuracy": 0.435,
1008
  "count": 400
1009
  }
1010
  }
1011
  },
1012
  "sub_random": {
1013
+ "full_accuracy": 0.96,
1014
  "n_examples": 200,
1015
  "per_subtask": {
1016
  "MD": {
1017
+ "accuracy": 0.9966666666666667,
1018
  "count": 600
1019
  },
1020
  "MB": {
1021
+ "accuracy": 1.0,
1022
  "count": 267
1023
  },
1024
  "ME": {
1025
+ "accuracy": 0.9811320754716981,
1026
  "count": 53
1027
  },
1028
  "UB": {
1029
+ "accuracy": 0.9886104783599089,
1030
  "count": 439
1031
  },
1032
  "UD": {
1033
+ "accuracy": 1.0,
1034
  "count": 41
1035
  }
1036
  }
1037
  },
1038
  "sub_B3": {
1039
+ "full_accuracy": 0.77,
1040
  "n_examples": 100,
1041
  "per_subtask": {
1042
  "MD": {
 
1052
  "count": 197
1053
  },
1054
  "UD": {
1055
+ "accuracy": 0.9805825242718447,
1056
  "count": 103
1057
  }
1058
  }
1059
  },
1060
  "sub_B4": {
1061
+ "full_accuracy": 0.6,
1062
  "n_examples": 100,
1063
  "per_subtask": {
1064
  "MD": {
 
1070
  "count": 100
1071
  },
1072
  "UB": {
1073
+ "accuracy": 0.8380566801619433,
1074
  "count": 247
1075
  },
1076
  "UD": {
1077
+ "accuracy": 0.8758169934640523,
1078
  "count": 153
1079
  }
1080
  }
1081
  },
1082
  "sub_B5": {
1083
+ "full_accuracy": 0.5,
1084
  "n_examples": 100,
1085
  "per_subtask": {
1086
  "MD": {
 
1092
  "count": 100
1093
  },
1094
  "UB": {
1095
+ "accuracy": 0.8355704697986577,
1096
  "count": 298
1097
  },
1098
  "UD": {
1099
+ "accuracy": 0.8663366336633663,
1100
  "count": 202
1101
  }
1102
  }
1103
  }
1104
  },
1105
  "summary": {
1106
+ "overall_accuracy": 0.66375,
1107
  "total_examples": 2400,
1108
  "n_splits": 22
1109
  }
 
1118
  },
1119
  "splits": {
1120
  "add_S0": {
1121
+ "full_accuracy": 0.99,
1122
  "n_examples": 100,
1123
  "per_subtask": {
1124
  "SA": {
1125
+ "accuracy": 0.9983471074380166,
1126
  "count": 605
1127
  },
1128
  "SS": {
 
1154
  }
1155
  },
1156
  "add_S2": {
1157
+ "full_accuracy": 0.99,
1158
  "n_examples": 100,
1159
  "per_subtask": {
1160
  "SA": {
1161
+ "accuracy": 1.0,
1162
  "count": 163
1163
  },
1164
  "SC": {
1165
+ "accuracy": 1.0,
1166
  "count": 130
1167
  },
1168
  "SS": {
1169
+ "accuracy": 1.0,
1170
  "count": 87
1171
  },
1172
  "UC": {
1173
+ "accuracy": 0.9950738916256158,
1174
  "count": 203
1175
  },
1176
  "US": {
 
1180
  }
1181
  },
1182
  "add_S3": {
1183
+ "full_accuracy": 0.98,
1184
  "n_examples": 100,
1185
  "per_subtask": {
1186
  "SA": {
 
1188
  "count": 121
1189
  },
1190
  "SC": {
1191
+ "accuracy": 1.0,
1192
  "count": 121
1193
  },
1194
  "SS": {
 
1196
  "count": 49
1197
  },
1198
  "UC": {
1199
+ "accuracy": 0.989247311827957,
1200
  "count": 186
1201
  },
1202
  "US": {
1203
+ "accuracy": 1.0,
1204
  "count": 223
1205
  }
1206
  }
1207
  },
1208
  "add_S4": {
1209
+ "full_accuracy": 0.86,
1210
  "n_examples": 100,
1211
  "per_subtask": {
1212
  "SA": {
 
1222
  "count": 23
1223
  },
1224
  "UC": {
1225
+ "accuracy": 0.9125,
1226
  "count": 160
1227
  },
1228
  "US": {
1229
+ "accuracy": 0.9869706840390879,
1230
  "count": 307
1231
  }
1232
  }
1233
  },
1234
  "add_S5": {
1235
+ "full_accuracy": 0.35,
1236
  "n_examples": 100,
1237
  "per_subtask": {
1238
  "SA": {
 
1244
  "count": 100
1245
  },
1246
  "UC": {
1247
+ "accuracy": 0.37,
1248
  "count": 100
1249
  },
1250
  "US": {
1251
+ "accuracy": 0.8925,
1252
  "count": 400
1253
  }
1254
  }
1255
  },
1256
  "add_S6": {
1257
+ "full_accuracy": 0.54,
1258
  "n_examples": 100,
1259
  "per_subtask": {
1260
  "SC": {
 
1262
  "count": 100
1263
  },
1264
  "UC": {
1265
+ "accuracy": 0.55,
1266
  "count": 100
1267
  },
1268
  "US": {
1269
+ "accuracy": 0.882,
1270
  "count": 500
1271
  }
1272
  }
1273
  },
1274
  "add_random": {
1275
+ "full_accuracy": 0.98,
1276
  "n_examples": 200,
1277
  "per_subtask": {
1278
  "SA": {
1279
+ "accuracy": 0.9932885906040269,
1280
  "count": 447
1281
  },
1282
  "SC": {
1283
+ "accuracy": 0.996875,
1284
  "count": 320
1285
  },
1286
  "SS": {
1287
+ "accuracy": 1.0,
1288
  "count": 56
1289
  },
1290
  "UC": {
1291
+ "accuracy": 1.0,
1292
  "count": 529
1293
  },
1294
  "US": {
 
1298
  }
1299
  },
1300
  "add_C3": {
1301
+ "full_accuracy": 0.95,
1302
  "n_examples": 100,
1303
  "per_subtask": {
1304
  "SA": {
 
1310
  "count": 100
1311
  },
1312
  "UC": {
1313
+ "accuracy": 0.9740932642487047,
1314
  "count": 193
1315
  },
1316
  "US": {
1317
+ "accuracy": 1.0,
1318
  "count": 107
1319
  }
1320
  }
1321
  },
1322
  "add_C4": {
1323
+ "full_accuracy": 0.96,
1324
  "n_examples": 100,
1325
  "per_subtask": {
1326
  "SA": {
 
1332
  "count": 100
1333
  },
1334
  "UC": {
1335
+ "accuracy": 0.98828125,
1336
  "count": 256
1337
  },
1338
  "US": {
1339
+ "accuracy": 0.9861111111111112,
1340
  "count": 144
1341
  }
1342
  }
1343
  },
1344
  "add_C5": {
1345
+ "full_accuracy": 0.89,
1346
  "n_examples": 100,
1347
  "per_subtask": {
1348
  "SA": {
 
1354
  "count": 100
1355
  },
1356
  "UC": {
1357
+ "accuracy": 0.9705882352941176,
1358
  "count": 306
1359
  },
1360
  "US": {
1361
+ "accuracy": 0.9742268041237113,
1362
  "count": 194
1363
  }
1364
  }
1365
  },
1366
  "add_C6": {
1367
+ "full_accuracy": 0.96,
1368
  "n_examples": 100,
1369
  "per_subtask": {
1370
  "SC": {
 
1372
  "count": 100
1373
  },
1374
  "UC": {
1375
+ "accuracy": 0.9890710382513661,
1376
  "count": 366
1377
  },
1378
  "US": {
1379
+ "accuracy": 0.9957264957264957,
1380
  "count": 234
1381
  }
1382
  }
1383
  },
1384
  "sub_M0": {
1385
+ "full_accuracy": 1.0,
1386
  "n_examples": 100,
1387
  "per_subtask": {
1388
  "MD": {
1389
+ "accuracy": 1.0,
1390
  "count": 601
1391
  },
1392
  "ME": {
 
1418
  }
1419
  },
1420
  "sub_M2": {
1421
+ "full_accuracy": 1.0,
1422
  "n_examples": 100,
1423
  "per_subtask": {
1424
  "MD": {
1425
+ "accuracy": 1.0,
1426
  "count": 213
1427
  },
1428
  "MB": {
 
1434
  "count": 85
1435
  },
1436
  "UB": {
1437
+ "accuracy": 1.0,
1438
  "count": 181
1439
  },
1440
  "UD": {
 
1444
  }
1445
  },
1446
  "sub_M3": {
1447
+ "full_accuracy": 0.95,
1448
  "n_examples": 100,
1449
  "per_subtask": {
1450
  "MD": {
 
1460
  "count": 56
1461
  },
1462
  "UB": {
1463
+ "accuracy": 0.9664429530201343,
1464
  "count": 149
1465
  },
1466
  "UD": {
1467
+ "accuracy": 0.9953051643192489,
1468
  "count": 213
1469
  }
1470
  }
1471
  },
1472
  "sub_M4": {
1473
+ "full_accuracy": 0.55,
1474
  "n_examples": 100,
1475
  "per_subtask": {
1476
  "MD": {
 
1482
  "count": 100
1483
  },
1484
  "UB": {
1485
+ "accuracy": 0.63,
1486
  "count": 100
1487
  },
1488
  "UD": {
1489
+ "accuracy": 0.95,
1490
  "count": 300
1491
  }
1492
  }
1493
  },
1494
  "sub_M5": {
1495
+ "full_accuracy": 0.11,
1496
  "n_examples": 100,
1497
  "per_subtask": {
1498
  "MD": {
 
1504
  "count": 100
1505
  },
1506
  "UB": {
1507
+ "accuracy": 0.11,
1508
  "count": 100
1509
  },
1510
  "UD": {
1511
+ "accuracy": 0.895,
1512
  "count": 400
1513
  }
1514
  }
1515
  },
1516
  "sub_random": {
1517
+ "full_accuracy": 1.0,
1518
  "n_examples": 200,
1519
  "per_subtask": {
1520
  "MD": {
1521
+ "accuracy": 1.0,
1522
  "count": 600
1523
  },
1524
  "MB": {
1525
+ "accuracy": 1.0,
1526
  "count": 267
1527
  },
1528
  "ME": {
 
1540
  }
1541
  },
1542
  "sub_B3": {
1543
+ "full_accuracy": 0.95,
1544
  "n_examples": 100,
1545
  "per_subtask": {
1546
  "MD": {
1547
+ "accuracy": 1.0,
1548
  "count": 300
1549
  },
1550
  "MB": {
 
1552
  "count": 100
1553
  },
1554
  "UB": {
1555
+ "accuracy": 0.9746192893401016,
1556
  "count": 197
1557
  },
1558
  "UD": {
 
1562
  }
1563
  },
1564
  "sub_B4": {
1565
+ "full_accuracy": 0.95,
1566
  "n_examples": 100,
1567
  "per_subtask": {
1568
  "MD": {
 
1574
  "count": 100
1575
  },
1576
  "UB": {
1577
+ "accuracy": 0.9838056680161943,
1578
  "count": 247
1579
  },
1580
  "UD": {
1581
+ "accuracy": 0.9803921568627451,
1582
  "count": 153
1583
  }
1584
  }
1585
  },
1586
  "sub_B5": {
1587
+ "full_accuracy": 0.9,
1588
  "n_examples": 100,
1589
  "per_subtask": {
1590
  "MD": {
 
1596
  "count": 100
1597
  },
1598
  "UB": {
1599
+ "accuracy": 0.9664429530201343,
1600
  "count": 298
1601
  },
1602
  "UD": {
1603
+ "accuracy": 0.9801980198019802,
1604
  "count": 202
1605
  }
1606
  }
1607
  }
1608
  },
1609
  "summary": {
1610
+ "overall_accuracy": 0.8683333333333333,
1611
  "total_examples": 2400,
1612
  "n_splits": 22
1613
  }
1614
  },
1615
+ "sorl_overall_accuracy": 0.8683333333333333,
1616
+ "sft_overall_accuracy": 0.66375
1617
  }
add_sub_sorl_v1_abs10_K1_25K/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3abff1d3d65f28a467e0107b3021da4659af9733b2521b6687ffa6fbb31e60f
3
  size 650303660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf047dee834e21f59a68ae2338d8d11dc22cd4fd412eacace785b4753ce6d636
3
  size 650303660
add_sub_sorl_v1_abs10_K1_25K/train_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
- "lr": 4e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 117,
@@ -69,16 +69,16 @@
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_25K",
72
- "git_commit": "78d46f8665a87f4b44bd5894bd34f393f2dea51f",
73
- "timestamp": "2026-04-12T08:59:10.014670+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "je0pfgat",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/je0pfgat",
81
- "final_accuracy": 0.7783333333333333,
82
- "sft_accuracy": 0.6016666666666667,
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
17
  "target_vocab_util": 0.8,
18
  "min_abs_ppl": 0.0,
19
  "zipf_alpha": 1.0,
20
+ "lr": 8e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
  "warmup_steps": 117,
 
69
  "no_wandb": false,
70
  "n_params": 162499262,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_25K",
72
+ "git_commit": "8d5ee5420119746ef4e2c87570eb250c9718f643",
73
+ "timestamp": "2026-04-12T21:07:34.657454+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
+ "wandb_run_id": "ihgpsgwk",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/ihgpsgwk",
81
+ "final_accuracy": 0.8683333333333333,
82
+ "sft_accuracy": 0.66375,
83
  "eval_method": "ArithmeticEvaluator"
84
  }