amirali1985 commited on
Commit
aec10d2
·
verified ·
1 Parent(s): e718e4b

Upload add_sub_sorl_v1_abs10_K1_25K_2L1H128d

Browse files
add_sub_sorl_v1_abs10_K1_25K_2L1H128d/metrics.json CHANGED
@@ -70,444 +70,936 @@
70
  3719,
71
  3769,
72
  3819,
73
- 3869
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  ],
75
  "loss": [
76
- 13.454885482788086,
77
- 11.478792190551758,
78
- 13.050385475158691,
79
- 14.577110290527344,
80
- 14.17066764831543,
81
- 13.036783218383789,
82
- 11.886457443237305,
83
- 10.250679016113281,
84
- 9.566696166992188,
85
- 8.962414741516113,
86
- 8.374322891235352,
87
- 7.972871780395508,
88
- 7.557723045349121,
89
- 7.117671012878418,
90
- 6.462696075439453,
91
- 6.050843238830566,
92
- 5.741342067718506,
93
- 5.391325950622559,
94
- 5.264486789703369,
95
- 4.956804275512695,
96
- 4.797525405883789,
97
- 4.500096321105957,
98
- 4.433718681335449,
99
- 4.317060470581055,
100
- 4.279264450073242,
101
- 4.234098434448242,
102
- 4.042954921722412,
103
- 4.035151481628418,
104
- 4.045926094055176,
105
- 3.9358861446380615,
106
- 3.753159761428833,
107
- 3.7763824462890625,
108
- 3.8419253826141357,
109
- 3.6180100440979004,
110
- 3.3646740913391113,
111
- 3.16855525970459,
112
- 3.360339641571045,
113
- 2.8317813873291016,
114
- 2.9290781021118164,
115
- 2.785353660583496,
116
- 2.54134464263916,
117
- 2.228949785232544,
118
- 2.0700716972351074,
119
- 1.8415426015853882,
120
- 1.5988508462905884,
121
- 1.569923758506775,
122
- 1.2837797403335571,
123
- 1.1172637939453125,
124
- 0.7710850834846497,
125
- 0.6829363107681274,
126
- 0.49595534801483154,
127
- 0.17024922370910645,
128
- 0.21459567546844482,
129
- -0.10922026634216309,
130
- -0.38342714309692383,
131
- -0.10504281520843506,
132
- -0.5501455068588257,
133
- -0.7160437107086182,
134
- -0.707771897315979,
135
- -0.9133639335632324,
136
- -1.1110361814498901,
137
- -0.8491053581237793,
138
- -0.9960892200469971,
139
- -1.0281953811645508,
140
- -1.2926632165908813,
141
- -1.2633980512619019,
142
- -1.2680187225341797,
143
- -1.4050925970077515,
144
- -1.2741034030914307,
145
- -1.4242706298828125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  ],
147
  "base_loss": [
148
- 11.880859375,
149
- 11.633416175842285,
150
- 11.221420288085938,
151
- 10.539042472839355,
152
- 10.006148338317871,
153
- 9.441154479980469,
154
- 8.854585647583008,
155
- 7.859694480895996,
156
- 7.329966068267822,
157
- 6.851498603820801,
158
- 6.352517127990723,
159
- 6.018871784210205,
160
- 5.628158092498779,
161
- 5.2450385093688965,
162
- 4.57428503036499,
163
- 4.198834419250488,
164
- 3.868682861328125,
165
- 3.5653653144836426,
166
- 3.412782907485962,
167
- 3.165587902069092,
168
- 2.9751205444335938,
169
- 2.7248756885528564,
170
- 2.6281933784484863,
171
- 2.598451852798462,
172
- 2.5072898864746094,
173
- 2.5161983966827393,
174
- 2.393761157989502,
175
- 2.3599026203155518,
176
- 2.3197877407073975,
177
- 2.3430163860321045,
178
- 2.2583587169647217,
179
- 2.2844526767730713,
180
- 2.254804849624634,
181
- 2.290438413619995,
182
- 2.2097489833831787,
183
- 2.197057008743286,
184
- 2.2262990474700928,
185
- 2.1319544315338135,
186
- 2.107748031616211,
187
- 2.173170328140259,
188
- 2.1438701152801514,
189
- 2.181342363357544,
190
- 2.0934159755706787,
191
- 2.126227617263794,
192
- 2.09466814994812,
193
- 2.0434868335723877,
194
- 2.1309261322021484,
195
- 2.075498342514038,
196
- 2.1343188285827637,
197
- 2.0033481121063232,
198
- 2.0886590480804443,
199
- 2.092684507369995,
200
- 2.020324945449829,
201
- 1.984065055847168,
202
- 2.0311124324798584,
203
- 2.026329755783081,
204
- 1.9996259212493896,
205
- 2.0779316425323486,
206
- 1.9736812114715576,
207
- 2.030266284942627,
208
- 2.1000616550445557,
209
- 2.038893461227417,
210
- 1.9640614986419678,
211
- 2.036902904510498,
212
- 2.0785582065582275,
213
- 2.0229272842407227,
214
- 2.0268771648406982,
215
- 2.0155155658721924,
216
- 1.9266163110733032,
217
- 2.076498508453369
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  ],
219
  "info_loss": [
220
- -0.7863264083862305,
221
- -0.9126081466674805,
222
- -0.61785888671875,
223
- -0.2595643997192383,
224
- -0.1060333251953125,
225
- -0.05567169189453125,
226
- -0.03524017333984375,
227
- -0.020551681518554688,
228
- -0.01478719711303711,
229
- -0.013951778411865234,
230
- -0.01464700698852539,
231
- -0.015613079071044922,
232
- -0.013948440551757812,
233
- -0.016277790069580078,
234
- -0.010252952575683594,
235
- -0.012321949005126953,
236
- -0.008607149124145508,
237
- -0.012037992477416992,
238
- -0.008737802505493164,
239
- -0.013904571533203125,
240
- -0.010097980499267578,
241
- -0.013910055160522461,
242
- -0.010562896728515625,
243
- -0.018952369689941406,
244
- -0.013309478759765625,
245
- -0.018421173095703125,
246
- -0.025274276733398438,
247
- -0.02254462242126465,
248
- -0.01696610450744629,
249
- -0.03021550178527832,
250
- -0.03984832763671875,
251
- -0.0401759147644043,
252
- -0.030680418014526367,
253
- -0.05649089813232422,
254
- -0.07359719276428223,
255
- -0.09176802635192871,
256
- -0.07550787925720215,
257
- -0.11891603469848633,
258
- -0.10670852661132812,
259
- -0.1276233196258545,
260
- -0.14898061752319336,
261
- -0.1840728521347046,
262
- -0.19084644317626953,
263
- -0.2173391580581665,
264
- -0.2383211851119995,
265
- -0.23623204231262207,
266
- -0.27347636222839355,
267
- -0.28460466861724854,
268
- -0.32501447200775146,
269
- -0.32063066959381104,
270
- -0.34785377979278564,
271
- -0.38096821308135986,
272
- -0.3693283796310425,
273
- -0.39820408821105957,
274
- -0.4302417039871216,
275
- -0.40181398391723633,
276
- -0.4437370300292969,
277
- -0.46821415424346924,
278
- -0.4569128751754761,
279
- -0.4830566644668579,
280
- -0.5098972320556641,
281
- -0.4775048494338989,
282
- -0.4846118688583374,
283
- -0.4952908754348755,
284
- -0.5258979797363281,
285
- -0.5176137685775757,
286
- -0.518396258354187,
287
- -0.5309580564498901,
288
- -0.5087277889251709,
289
- -0.5388277769088745
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  ],
291
  "abs_loss": [
292
- 2.3025481700897217,
293
- 2.2900214195251465,
294
- 2.254666566848755,
295
- 2.190378189086914,
296
- 2.0967636108398438,
297
- 2.037475347518921,
298
- 1.9807038307189941,
299
- 1.8861985206604004,
300
- 1.8569782972335815,
301
- 1.824637532234192,
302
- 1.8228012323379517,
303
- 1.818422794342041,
304
- 1.8206357955932617,
305
- 1.822617530822754,
306
- 1.804533839225769,
307
- 1.8139501810073853,
308
- 1.793150544166565,
309
- 1.7828887701034546,
310
- 1.8064988851547241,
311
- 1.8026375770568848,
312
- 1.797040343284607,
313
- 1.8011388778686523,
314
- 1.8015633821487427,
315
- 1.8015815019607544,
316
- 1.799573302268982,
317
- 1.7933950424194336,
318
- 1.805014967918396,
319
- 1.8085817098617554,
320
- 1.7863329648971558,
321
- 1.7919312715530396,
322
- 1.783199429512024,
323
- 1.7947336435317993,
324
- 1.8047763109207153,
325
- 1.8011966943740845,
326
- 1.788076400756836,
327
- 1.7798686027526855,
328
- 1.7843881845474243,
329
- 1.7868894338607788,
330
- 1.7851332426071167,
331
- 1.7916141748428345,
332
- 1.7841373682022095,
333
- 1.7945789098739624,
334
- 1.7695688009262085,
335
- 1.8075765371322632,
336
- 1.8012295961380005,
337
- 1.8081436157226562,
338
- 1.7965914011001587,
339
- 1.7999669313430786,
340
- 1.7932742834091187,
341
- 1.7773023843765259,
342
- 1.7790831327438354,
343
- 1.7893775701522827,
344
- 1.7931427955627441,
345
- 1.8046531677246094,
346
- 1.7975425720214844,
347
- 1.7841027975082397,
348
- 1.7952299118041992,
349
- 1.8000589609146118,
350
- 1.7956489324569702,
351
- 1.7838338613510132,
352
- 1.7932010889053345,
353
- 1.785744547843933,
354
- 1.773536205291748,
355
- 1.7905513048171997,
356
- 1.792868971824646,
357
- 1.8099616765975952,
358
- 1.801303505897522,
359
- 1.800384521484375,
360
- 1.7764925956726074,
361
- 1.7878471612930298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  ],
363
  "zipf_loss": [
364
- 9.207036018371582,
365
- 8.74245548248291,
366
- 7.782087326049805,
367
- 6.414673805236816,
368
- 5.0151753425598145,
369
- 3.9485976696014404,
370
- 3.1862025260925293,
371
- 2.4078822135925293,
372
- 2.198904514312744,
373
- 2.067970037460327,
374
- 1.9859960079193115,
375
- 1.928288459777832,
376
- 1.8869855403900146,
377
- 1.853148341178894,
378
- 1.8104872703552246,
379
- 1.7938334941864014,
380
- 1.7794156074523926,
381
- 1.7680517435073853,
382
- 1.7584320306777954,
383
- 1.7499982118606567,
384
- 1.743680715560913,
385
- 1.7342076301574707,
386
- 1.730997920036316,
387
- 1.7279741764068604,
388
- 1.7251121997833252,
389
- 1.7227725982666016,
390
- 1.7214350700378418,
391
- 1.7198371887207031,
392
- 1.7171660661697388,
393
- 1.7158317565917969,
394
- 1.7149643898010254,
395
- 1.7142155170440674,
396
- 1.713447093963623,
397
- 1.7123608589172363,
398
- 1.7120893001556396,
399
- 1.7111916542053223,
400
- 1.7106807231903076,
401
- 1.7102984189987183,
402
- 1.7099019289016724,
403
- 1.7092549800872803,
404
- 1.7088669538497925,
405
- 1.7088780403137207,
406
- 1.7081632614135742,
407
- 1.707948923110962,
408
- 1.7072715759277344,
409
- 1.7079429626464844,
410
- 1.7079581022262573,
411
- 1.707815408706665,
412
- 1.7075835466384888,
413
- 1.7081646919250488,
414
- 1.707925796508789,
415
- 1.7083090543746948,
416
- 1.708240270614624,
417
- 1.7082902193069458,
418
- 1.708122968673706,
419
- 1.7083569765090942,
420
- 1.7080758810043335,
421
- 1.708160161972046,
422
- 1.7081109285354614,
423
- 1.7085528373718262,
424
- 1.7085543870925903,
425
- 1.70847487449646,
426
- 1.7086145877838135,
427
- 1.7087550163269043,
428
- 1.708471417427063,
429
- 1.708816409111023,
430
- 1.7089366912841797,
431
- 1.7089341878890991,
432
- 1.708909034729004,
433
- 1.7087242603302002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
  ],
435
  "denoise_loss": [],
436
  "ortho_loss": [
437
- 0.6665944457054138,
438
- 0.6689225435256958,
439
- 0.6665652394294739,
440
- 0.5638207197189331,
441
- 0.5186919569969177,
442
- 0.4937678277492523,
443
- 0.440208375453949,
444
- 0.44368138909339905,
445
- 0.4490883946418762,
446
- 0.46086809039115906,
447
- 0.47744476795196533,
448
- 0.4832924008369446,
449
- 0.5013949871063232,
450
- 0.49298223853111267,
451
- 0.48781320452690125,
452
- 0.4863438904285431,
453
- 0.4936782121658325,
454
- 0.49258145689964294,
455
- 0.49533185362815857,
456
- 0.507675290107727,
457
- 0.5198795795440674,
458
- 0.49237918853759766,
459
- 0.5015400648117065,
460
- 0.4975549578666687,
461
- 0.4957054555416107,
462
- 0.49398577213287354,
463
- 0.507213294506073,
464
- 0.5123801827430725,
465
- 0.5207350254058838,
466
- 0.5060194134712219,
467
- 0.5115739107131958,
468
- 0.5285096764564514,
469
- 0.5283535122871399,
470
- 0.524570643901825,
471
- 0.5327265858650208,
472
- 0.549440324306488,
473
- 0.5494626760482788,
474
- 0.558219313621521,
475
- 0.5633650422096252,
476
- 0.5702787637710571,
477
- 0.5571073293685913,
478
- 0.5566506385803223,
479
- 0.5707675814628601,
480
- 0.5843377113342285,
481
- 0.584276556968689,
482
- 0.5893025994300842,
483
- 0.6109833121299744,
484
- 0.6101059317588806,
485
- 0.6098511815071106,
486
- 0.6212247610092163,
487
- 0.6309865713119507,
488
- 0.6344567537307739,
489
- 0.6354795098304749,
490
- 0.647710919380188,
491
- 0.6540753841400146,
492
- 0.6593545079231262,
493
- 0.6664237976074219,
494
- 0.6748871803283691,
495
- 0.6771064400672913,
496
- 0.6822501420974731,
497
- 0.6884104013442993,
498
- 0.6926484704017639,
499
- 0.6983345150947571,
500
- 0.7035016417503357,
501
- 0.7052139639854431,
502
- 0.7094832062721252,
503
- 0.7137283682823181,
504
- 0.7142598628997803,
505
- 0.7140603065490723,
506
- 0.716733455657959
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  ],
508
  "lr": [
509
- 8.376068376068378e-06,
510
- 1.6923076923076924e-05,
 
 
511
  2e-05,
512
  2e-05,
513
  2e-05,
@@ -548,34 +1040,102 @@
548
  2e-05,
549
  2e-05,
550
  2e-05,
551
- 1.9973899288162407e-05,
552
- 1.9380701291853413e-05,
553
- 1.8787503295544426e-05,
554
- 1.8194305299235432e-05,
555
- 1.7601107302926442e-05,
556
- 1.7007909306617455e-05,
557
- 1.6414711310308464e-05,
558
- 1.5335090957026102e-05,
559
- 1.474189296071711e-05,
560
- 1.4148694964408121e-05,
561
- 1.355549696809913e-05,
562
- 1.2962298971790142e-05,
563
- 1.2369100975481152e-05,
564
- 1.1775902979172158e-05,
565
- 1.0696282625889797e-05,
566
- 1.0103084629580805e-05,
567
- 9.50988663327182e-06,
568
- 8.91668863696283e-06,
569
- 8.323490640653837e-06,
570
- 7.730292644344845e-06,
571
- 7.137094648035855e-06,
572
- 6.057474294753492e-06,
573
- 5.4642762984445065e-06,
574
- 4.871078302135514e-06,
575
- 4.277880305826523e-06,
576
- 3.684682309517532e-06,
577
- 3.091484313208541e-06,
578
- 2.4982863168995496e-06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
579
  ],
580
  "emb_lr": [],
581
  "eval_step": [
@@ -588,588 +1148,608 @@
588
  2696,
589
  3087,
590
  3478,
591
- 3869
 
 
 
 
 
 
 
 
 
 
592
  ],
593
  "eval_accuracy": [
 
 
 
 
 
 
594
  0.01,
595
  0.0,
596
- 0.0,
597
  0.01,
598
- 0.0,
599
- 0.0,
600
  0.01,
601
- 0.0,
602
- 0.0,
 
 
 
 
 
 
 
603
  0.01
604
  ]
605
  },
606
- "final_accuracy": 0.020416666666666666,
607
  "sft_eval": {
608
  "config": {
609
  "ops": "add_sub",
610
  "K": null,
611
  "mode": "sft",
612
  "n_digits": 6,
613
- "n_per_split": 50
614
  },
615
  "splits": {
616
  "add_S0": {
617
  "full_accuracy": 0.0,
618
- "digit_accuracy": 0.3457142857142857,
619
- "n_examples": 50,
620
  "per_subtask": {
621
  "SA": {
622
- "accuracy": 0.22372881355932203,
623
- "count": 295
624
  },
625
  "SS": {
626
- "accuracy": 1.0,
627
- "count": 55
628
  }
629
  }
630
  },
631
  "add_S1": {
632
  "full_accuracy": 0.0,
633
- "digit_accuracy": 0.19714285714285715,
634
- "n_examples": 50,
635
  "per_subtask": {
636
  "SA": {
637
- "accuracy": 0.25396825396825395,
638
- "count": 126
639
  },
640
  "SC": {
641
- "accuracy": 0.05063291139240506,
642
- "count": 79
643
  },
644
  "SS": {
645
- "accuracy": 0.8571428571428571,
646
- "count": 21
647
  },
648
  "UC": {
649
- "accuracy": 0.12096774193548387,
650
- "count": 124
651
  }
652
  }
653
  },
654
  "add_S2": {
655
  "full_accuracy": 0.0,
656
- "digit_accuracy": 0.37142857142857144,
657
- "n_examples": 50,
658
  "per_subtask": {
659
  "SA": {
660
- "accuracy": 0.48,
661
- "count": 75
662
  },
663
  "SC": {
664
- "accuracy": 0.08064516129032258,
665
- "count": 62
666
  },
667
  "SS": {
668
- "accuracy": 0.8717948717948718,
669
- "count": 39
670
  },
671
  "UC": {
672
- "accuracy": 0.23423423423423423,
673
- "count": 111
674
  },
675
  "US": {
676
- "accuracy": 0.4603174603174603,
677
- "count": 63
678
  }
679
  }
680
  },
681
  "add_S3": {
682
  "full_accuracy": 0.0,
683
- "digit_accuracy": 0.3057142857142857,
684
- "n_examples": 50,
685
  "per_subtask": {
686
  "SA": {
687
- "accuracy": 0.5833333333333334,
688
- "count": 60
689
  },
690
  "SC": {
691
- "accuracy": 0.05263157894736842,
692
- "count": 57
693
  },
694
  "SS": {
695
- "accuracy": 0.8947368421052632,
696
- "count": 19
697
  },
698
  "UC": {
699
- "accuracy": 0.16346153846153846,
700
- "count": 104
701
  },
702
  "US": {
703
- "accuracy": 0.3181818181818182,
704
- "count": 110
705
  }
706
  }
707
  },
708
  "add_S4": {
709
  "full_accuracy": 0.0,
710
- "digit_accuracy": 0.29714285714285715,
711
- "n_examples": 50,
712
  "per_subtask": {
713
  "SA": {
714
- "accuracy": 0.5208333333333334,
715
- "count": 48
716
  },
717
  "SC": {
718
- "accuracy": 0.057692307692307696,
719
- "count": 52
720
  },
721
  "SS": {
722
- "accuracy": 0.7142857142857143,
723
- "count": 7
724
  },
725
  "UC": {
726
- "accuracy": 0.21348314606741572,
727
- "count": 89
728
  },
729
  "US": {
730
- "accuracy": 0.33766233766233766,
731
- "count": 154
732
  }
733
  }
734
  },
735
  "add_S5": {
736
  "full_accuracy": 0.0,
737
- "digit_accuracy": 0.09142857142857143,
738
- "n_examples": 50,
739
  "per_subtask": {
740
  "SA": {
741
- "accuracy": 0.5,
742
- "count": 50
743
  },
744
  "SC": {
745
- "accuracy": 0.0,
746
- "count": 50
747
  },
748
  "UC": {
749
- "accuracy": 0.06,
750
- "count": 50
751
  },
752
  "US": {
753
- "accuracy": 0.02,
754
- "count": 200
755
  }
756
  }
757
  },
758
  "add_S6": {
759
- "full_accuracy": 0.0,
760
- "digit_accuracy": 0.12,
761
- "n_examples": 50,
762
  "per_subtask": {
763
  "SC": {
764
- "accuracy": 0.0,
765
- "count": 50
766
  },
767
  "UC": {
768
- "accuracy": 0.14,
769
- "count": 50
770
  },
771
  "US": {
772
- "accuracy": 0.14,
773
- "count": 250
774
  }
775
  }
776
  },
777
  "add_random": {
778
  "full_accuracy": 0.0,
779
- "digit_accuracy": 0.19857142857142857,
780
  "n_examples": 200,
781
  "per_subtask": {
782
  "SA": {
783
- "accuracy": 0.2482598607888631,
784
- "count": 431
785
  },
786
  "SC": {
787
- "accuracy": 0.05379746835443038,
788
- "count": 316
789
  },
790
  "SS": {
791
- "accuracy": 0.8461538461538461,
792
- "count": 39
793
  },
794
  "UC": {
795
- "accuracy": 0.17142857142857143,
796
- "count": 560
797
  },
798
  "US": {
799
- "accuracy": 0.46296296296296297,
800
- "count": 54
801
  }
802
  }
803
  },
804
  "add_C1": {
805
  "full_accuracy": 0.0,
806
- "digit_accuracy": 0.12285714285714286,
807
- "n_examples": 50,
808
  "per_subtask": {
809
  "SA": {
810
- "accuracy": 0.172,
811
- "count": 250
812
  },
813
  "SC": {
814
  "accuracy": 0.0,
815
- "count": 50
816
  },
817
  "UC": {
818
- "accuracy": 0.0,
819
- "count": 50
820
  }
821
  }
822
  },
823
  "add_C2": {
824
  "full_accuracy": 0.0,
825
- "digit_accuracy": 0.12285714285714286,
826
- "n_examples": 50,
827
  "per_subtask": {
828
  "SA": {
829
- "accuracy": 0.19,
830
- "count": 200
831
  },
832
  "SC": {
833
- "accuracy": 0.02,
834
- "count": 50
835
  },
836
  "UC": {
837
- "accuracy": 0.04819277108433735,
838
- "count": 83
839
  },
840
  "US": {
841
- "accuracy": 0.0,
842
- "count": 17
843
  }
844
  }
845
  },
846
  "add_C3": {
847
  "full_accuracy": 0.0,
848
- "digit_accuracy": 0.13428571428571429,
849
- "n_examples": 50,
850
  "per_subtask": {
851
  "SA": {
852
- "accuracy": 0.24666666666666667,
853
- "count": 150
854
  },
855
  "SC": {
856
- "accuracy": 0.0,
857
- "count": 50
858
- },
859
- "UC": {
860
  "accuracy": 0.06,
861
  "count": 100
862
  },
 
 
 
 
863
  "US": {
864
- "accuracy": 0.08,
865
- "count": 50
866
  }
867
  }
868
  },
869
  "add_C4": {
870
  "full_accuracy": 0.0,
871
- "digit_accuracy": 0.12857142857142856,
872
- "n_examples": 50,
873
  "per_subtask": {
874
  "SA": {
875
- "accuracy": 0.35,
876
- "count": 100
877
  },
878
  "SC": {
879
- "accuracy": 0.0,
880
- "count": 50
881
  },
882
  "UC": {
883
- "accuracy": 0.03787878787878788,
884
- "count": 132
885
  },
886
  "US": {
887
- "accuracy": 0.07352941176470588,
888
- "count": 68
889
  }
890
  }
891
  },
892
  "add_C5": {
893
  "full_accuracy": 0.0,
894
- "digit_accuracy": 0.21428571428571427,
895
- "n_examples": 50,
896
  "per_subtask": {
897
  "SA": {
898
- "accuracy": 0.36,
899
- "count": 50
900
  },
901
  "SC": {
902
- "accuracy": 0.04,
903
- "count": 50
904
  },
905
  "UC": {
906
- "accuracy": 0.13013698630136986,
907
- "count": 146
908
  },
909
  "US": {
910
- "accuracy": 0.34615384615384615,
911
- "count": 104
912
  }
913
  }
914
  },
915
  "add_C6": {
916
- "full_accuracy": 0.0,
917
- "digit_accuracy": 0.3,
918
- "n_examples": 50,
919
  "per_subtask": {
920
  "SC": {
921
- "accuracy": 0.0,
922
- "count": 50
923
  },
924
  "UC": {
925
- "accuracy": 0.20105820105820105,
926
- "count": 189
927
  },
928
  "US": {
929
- "accuracy": 0.6036036036036037,
930
- "count": 111
931
  }
932
  }
933
  },
934
  "sub_M0": {
935
  "full_accuracy": 0.0,
936
- "digit_accuracy": 0.3057142857142857,
937
- "n_examples": 50,
938
  "per_subtask": {
939
  "MD": {
940
- "accuracy": 0.19801980198019803,
941
- "count": 303
942
  },
943
  "ME": {
944
- "accuracy": 1.0,
945
- "count": 47
946
  }
947
  }
948
  },
949
  "sub_M1": {
950
  "full_accuracy": 0.0,
951
- "digit_accuracy": 0.26285714285714284,
952
- "n_examples": 50,
953
  "per_subtask": {
954
  "MD": {
955
- "accuracy": 0.3900709219858156,
956
- "count": 141
957
  },
958
  "MB": {
959
  "accuracy": 0.0,
960
- "count": 72
961
  },
962
  "ME": {
963
- "accuracy": 1.0,
964
- "count": 18
965
  },
966
  "UB": {
967
- "accuracy": 0.15966386554621848,
968
- "count": 119
969
  }
970
  }
971
  },
972
  "sub_M2": {
973
  "full_accuracy": 0.0,
974
- "digit_accuracy": 0.38285714285714284,
975
- "n_examples": 50,
976
  "per_subtask": {
977
  "MD": {
978
- "accuracy": 0.6428571428571429,
979
- "count": 112
980
  },
981
  "MB": {
982
- "accuracy": 0.0,
983
- "count": 53
984
  },
985
  "ME": {
986
- "accuracy": 1.0,
987
- "count": 47
988
  },
989
  "UB": {
990
- "accuracy": 0.17647058823529413,
991
- "count": 85
992
  },
993
  "UD": {
994
- "accuracy": 0.0,
995
- "count": 53
996
  }
997
  }
998
  },
999
  "sub_M3": {
1000
  "full_accuracy": 0.0,
1001
- "digit_accuracy": 0.28,
1002
- "n_examples": 50,
1003
  "per_subtask": {
1004
  "MD": {
1005
- "accuracy": 0.6494845360824743,
1006
- "count": 97
1007
  },
1008
  "MB": {
1009
- "accuracy": 0.0,
1010
- "count": 51
1011
  },
1012
  "ME": {
1013
- "accuracy": 1.0,
1014
- "count": 27
1015
  },
1016
  "UB": {
1017
- "accuracy": 0.10810810810810811,
1018
- "count": 74
1019
  },
1020
  "UD": {
1021
- "accuracy": 0.0,
1022
- "count": 101
1023
  }
1024
  }
1025
  },
1026
  "sub_M4": {
1027
  "full_accuracy": 0.0,
1028
- "digit_accuracy": 0.21142857142857144,
1029
- "n_examples": 50,
1030
  "per_subtask": {
1031
  "MD": {
1032
- "accuracy": 0.5,
1033
- "count": 100
1034
  },
1035
  "MB": {
1036
  "accuracy": 0.0,
1037
- "count": 50
1038
  },
1039
  "UB": {
1040
- "accuracy": 0.48,
1041
- "count": 50
1042
  },
1043
  "UD": {
1044
- "accuracy": 0.0,
1045
- "count": 150
1046
  }
1047
  }
1048
  },
1049
  "sub_M5": {
1050
  "full_accuracy": 0.0,
1051
- "digit_accuracy": 0.18285714285714286,
1052
- "n_examples": 50,
1053
  "per_subtask": {
1054
  "MD": {
1055
  "accuracy": 1.0,
1056
- "count": 50
1057
  },
1058
  "MB": {
1059
- "accuracy": 0.0,
1060
- "count": 50
1061
  },
1062
  "UB": {
1063
- "accuracy": 0.28,
1064
- "count": 50
1065
  },
1066
  "UD": {
1067
- "accuracy": 0.0,
1068
- "count": 200
1069
  }
1070
  }
1071
  },
1072
  "sub_random": {
1073
  "full_accuracy": 0.0,
1074
- "digit_accuracy": 0.2307142857142857,
1075
  "n_examples": 200,
1076
  "per_subtask": {
1077
  "MD": {
1078
- "accuracy": 0.37719298245614036,
1079
- "count": 570
1080
  },
1081
  "MB": {
1082
- "accuracy": 0.0,
1083
- "count": 277
1084
  },
1085
  "ME": {
1086
- "accuracy": 1.0,
1087
  "count": 53
1088
  },
1089
  "UB": {
1090
- "accuracy": 0.11677282377919321,
1091
- "count": 471
1092
  },
1093
  "UD": {
1094
- "accuracy": 0.0,
1095
- "count": 29
1096
  }
1097
  }
1098
  },
1099
  "sub_B3": {
1100
  "full_accuracy": 0.0,
1101
- "digit_accuracy": 0.18857142857142858,
1102
- "n_examples": 50,
1103
  "per_subtask": {
1104
  "MD": {
1105
- "accuracy": 0.3333333333333333,
1106
- "count": 150
1107
  },
1108
  "MB": {
1109
- "accuracy": 0.0,
1110
- "count": 50
1111
  },
1112
  "UB": {
1113
- "accuracy": 0.15841584158415842,
1114
- "count": 101
1115
  },
1116
  "UD": {
1117
- "accuracy": 0.0,
1118
- "count": 49
1119
  }
1120
  }
1121
  },
1122
  "sub_B4": {
1123
  "full_accuracy": 0.0,
1124
- "digit_accuracy": 0.17714285714285713,
1125
- "n_examples": 50,
1126
  "per_subtask": {
1127
  "MD": {
1128
- "accuracy": 0.5,
1129
- "count": 100
1130
  },
1131
  "MB": {
1132
  "accuracy": 0.0,
1133
- "count": 50
1134
  },
1135
  "UB": {
1136
- "accuracy": 0.09917355371900827,
1137
- "count": 121
1138
  },
1139
  "UD": {
1140
- "accuracy": 0.0,
1141
- "count": 79
1142
  }
1143
  }
1144
  },
1145
  "sub_B5": {
1146
  "full_accuracy": 0.0,
1147
- "digit_accuracy": 0.18,
1148
- "n_examples": 50,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
- "count": 50
1153
  },
1154
  "MB": {
1155
- "accuracy": 0.0,
1156
- "count": 50
1157
  },
1158
  "UB": {
1159
- "accuracy": 0.08552631578947369,
1160
- "count": 152
1161
  },
1162
  "UD": {
1163
- "accuracy": 0.0,
1164
- "count": 98
1165
  }
1166
  }
1167
  }
1168
  },
1169
  "summary": {
1170
- "overall_accuracy": 0.0,
1171
- "digit_accuracy": 0.22114285714285714,
1172
- "total_examples": 1500,
1173
  "n_splits": 24
1174
  }
1175
  },
@@ -1179,569 +1759,569 @@
1179
  "K": 1,
1180
  "mode": "sorl",
1181
  "n_digits": 6,
1182
- "n_per_split": 50
1183
  },
1184
  "splits": {
1185
  "add_S0": {
1186
- "full_accuracy": 0.08,
1187
- "digit_accuracy": 0.7114285714285714,
1188
- "n_examples": 50,
1189
  "per_subtask": {
1190
  "SA": {
1191
- "accuracy": 0.688135593220339,
1192
- "count": 295
1193
  },
1194
  "SS": {
1195
- "accuracy": 0.8363636363636363,
1196
- "count": 55
1197
  }
1198
  }
1199
  },
1200
  "add_S1": {
1201
- "full_accuracy": 0.0,
1202
- "digit_accuracy": 0.6228571428571429,
1203
- "n_examples": 50,
1204
  "per_subtask": {
1205
  "SA": {
1206
- "accuracy": 0.7142857142857143,
1207
- "count": 126
1208
  },
1209
  "SC": {
1210
- "accuracy": 0.7215189873417721,
1211
- "count": 79
1212
  },
1213
  "SS": {
1214
- "accuracy": 0.8571428571428571,
1215
- "count": 21
1216
  },
1217
  "UC": {
1218
- "accuracy": 0.4274193548387097,
1219
- "count": 124
1220
  }
1221
  }
1222
  },
1223
  "add_S2": {
1224
- "full_accuracy": 0.02,
1225
- "digit_accuracy": 0.5857142857142857,
1226
- "n_examples": 50,
1227
  "per_subtask": {
1228
  "SA": {
1229
- "accuracy": 0.7333333333333333,
1230
- "count": 75
1231
  },
1232
  "SC": {
1233
- "accuracy": 0.7096774193548387,
1234
- "count": 62
1235
  },
1236
  "SS": {
1237
- "accuracy": 0.7435897435897436,
1238
- "count": 39
1239
  },
1240
  "UC": {
1241
- "accuracy": 0.36936936936936937,
1242
- "count": 111
1243
  },
1244
  "US": {
1245
- "accuracy": 0.5714285714285714,
1246
- "count": 63
1247
  }
1248
  }
1249
  },
1250
  "add_S3": {
1251
- "full_accuracy": 0.02,
1252
- "digit_accuracy": 0.5485714285714286,
1253
- "n_examples": 50,
1254
  "per_subtask": {
1255
  "SA": {
1256
- "accuracy": 0.7833333333333333,
1257
- "count": 60
1258
  },
1259
  "SC": {
1260
- "accuracy": 0.631578947368421,
1261
- "count": 57
1262
  },
1263
  "SS": {
1264
- "accuracy": 0.7894736842105263,
1265
- "count": 19
1266
  },
1267
  "UC": {
1268
- "accuracy": 0.3942307692307692,
1269
- "count": 104
1270
  },
1271
  "US": {
1272
- "accuracy": 0.4818181818181818,
1273
- "count": 110
1274
  }
1275
  }
1276
  },
1277
  "add_S4": {
1278
- "full_accuracy": 0.04,
1279
- "digit_accuracy": 0.46,
1280
- "n_examples": 50,
1281
  "per_subtask": {
1282
  "SA": {
1283
- "accuracy": 0.7708333333333334,
1284
- "count": 48
1285
  },
1286
  "SC": {
1287
- "accuracy": 0.6923076923076923,
1288
- "count": 52
1289
  },
1290
  "SS": {
1291
- "accuracy": 0.7142857142857143,
1292
- "count": 7
1293
  },
1294
  "UC": {
1295
- "accuracy": 0.3258426966292135,
1296
- "count": 89
1297
  },
1298
  "US": {
1299
- "accuracy": 0.35064935064935066,
1300
- "count": 154
1301
  }
1302
  }
1303
  },
1304
  "add_S5": {
1305
- "full_accuracy": 0.02,
1306
- "digit_accuracy": 0.3314285714285714,
1307
- "n_examples": 50,
1308
  "per_subtask": {
1309
  "SA": {
1310
- "accuracy": 0.68,
1311
- "count": 50
1312
  },
1313
  "SC": {
1314
- "accuracy": 0.72,
1315
- "count": 50
1316
  },
1317
  "UC": {
1318
- "accuracy": 0.12,
1319
- "count": 50
1320
  },
1321
  "US": {
1322
- "accuracy": 0.2,
1323
- "count": 200
1324
  }
1325
  }
1326
  },
1327
  "add_S6": {
1328
- "full_accuracy": 0.0,
1329
- "digit_accuracy": 0.09142857142857143,
1330
- "n_examples": 50,
1331
  "per_subtask": {
1332
  "SC": {
1333
- "accuracy": 0.52,
1334
- "count": 50
1335
  },
1336
  "UC": {
1337
- "accuracy": 0.0,
1338
- "count": 50
1339
  },
1340
  "US": {
1341
- "accuracy": 0.024,
1342
- "count": 250
1343
  }
1344
  }
1345
  },
1346
  "add_random": {
1347
- "full_accuracy": 0.025,
1348
- "digit_accuracy": 0.5957142857142858,
1349
  "n_examples": 200,
1350
  "per_subtask": {
1351
  "SA": {
1352
- "accuracy": 0.6682134570765661,
1353
- "count": 431
1354
  },
1355
  "SC": {
1356
- "accuracy": 0.6930379746835443,
1357
- "count": 316
1358
  },
1359
  "SS": {
1360
- "accuracy": 0.8974358974358975,
1361
- "count": 39
1362
  },
1363
  "UC": {
1364
- "accuracy": 0.4589285714285714,
1365
- "count": 560
1366
  },
1367
  "US": {
1368
- "accuracy": 0.6481481481481481,
1369
- "count": 54
1370
  }
1371
  }
1372
  },
1373
  "add_C1": {
1374
- "full_accuracy": 0.02,
1375
- "digit_accuracy": 0.7228571428571429,
1376
- "n_examples": 50,
1377
  "per_subtask": {
1378
  "SA": {
1379
- "accuracy": 0.82,
1380
- "count": 250
1381
  },
1382
  "SC": {
1383
- "accuracy": 0.78,
1384
- "count": 50
1385
  },
1386
  "UC": {
1387
- "accuracy": 0.18,
1388
- "count": 50
1389
  }
1390
  }
1391
  },
1392
  "add_C2": {
1393
- "full_accuracy": 0.0,
1394
- "digit_accuracy": 0.6485714285714286,
1395
- "n_examples": 50,
1396
  "per_subtask": {
1397
  "SA": {
1398
- "accuracy": 0.81,
1399
- "count": 200
1400
  },
1401
  "SC": {
1402
- "accuracy": 0.76,
1403
- "count": 50
1404
  },
1405
  "UC": {
1406
- "accuracy": 0.20481927710843373,
1407
- "count": 83
1408
  },
1409
  "US": {
1410
- "accuracy": 0.5882352941176471,
1411
- "count": 17
1412
  }
1413
  }
1414
  },
1415
  "add_C3": {
1416
- "full_accuracy": 0.04,
1417
- "digit_accuracy": 0.62,
1418
- "n_examples": 50,
1419
  "per_subtask": {
1420
  "SA": {
1421
- "accuracy": 0.8,
1422
- "count": 150
1423
  },
1424
  "SC": {
1425
- "accuracy": 0.78,
1426
- "count": 50
1427
  },
1428
  "UC": {
1429
- "accuracy": 0.28,
1430
- "count": 100
1431
  },
1432
  "US": {
1433
- "accuracy": 0.6,
1434
- "count": 50
1435
  }
1436
  }
1437
  },
1438
  "add_C4": {
1439
- "full_accuracy": 0.0,
1440
- "digit_accuracy": 0.52,
1441
- "n_examples": 50,
1442
  "per_subtask": {
1443
  "SA": {
1444
- "accuracy": 0.76,
1445
- "count": 100
1446
  },
1447
  "SC": {
1448
- "accuracy": 0.68,
1449
- "count": 50
1450
  },
1451
  "UC": {
1452
- "accuracy": 0.15151515151515152,
1453
- "count": 132
1454
  },
1455
  "US": {
1456
- "accuracy": 0.7647058823529411,
1457
- "count": 68
1458
  }
1459
  }
1460
  },
1461
  "add_C5": {
1462
- "full_accuracy": 0.02,
1463
- "digit_accuracy": 0.5,
1464
- "n_examples": 50,
1465
  "per_subtask": {
1466
  "SA": {
1467
- "accuracy": 0.76,
1468
- "count": 50
1469
  },
1470
  "SC": {
1471
- "accuracy": 0.72,
1472
- "count": 50
1473
  },
1474
  "UC": {
1475
- "accuracy": 0.2465753424657534,
1476
- "count": 146
1477
  },
1478
  "US": {
1479
- "accuracy": 0.625,
1480
- "count": 104
1481
  }
1482
  }
1483
  },
1484
  "add_C6": {
1485
- "full_accuracy": 0.0,
1486
- "digit_accuracy": 0.44571428571428573,
1487
- "n_examples": 50,
1488
  "per_subtask": {
1489
  "SC": {
1490
- "accuracy": 0.66,
1491
- "count": 50
1492
  },
1493
  "UC": {
1494
- "accuracy": 0.37037037037037035,
1495
- "count": 189
1496
  },
1497
  "US": {
1498
- "accuracy": 0.4774774774774775,
1499
- "count": 111
1500
  }
1501
  }
1502
  },
1503
  "sub_M0": {
1504
- "full_accuracy": 0.06,
1505
- "digit_accuracy": 0.6371428571428571,
1506
- "n_examples": 50,
1507
  "per_subtask": {
1508
  "MD": {
1509
- "accuracy": 0.5808580858085809,
1510
- "count": 303
1511
  },
1512
  "ME": {
1513
- "accuracy": 1.0,
1514
- "count": 47
1515
  }
1516
  }
1517
  },
1518
  "sub_M1": {
1519
- "full_accuracy": 0.0,
1520
- "digit_accuracy": 0.5828571428571429,
1521
- "n_examples": 50,
1522
  "per_subtask": {
1523
  "MD": {
1524
- "accuracy": 0.6453900709219859,
1525
- "count": 141
1526
  },
1527
  "MB": {
1528
- "accuracy": 0.4027777777777778,
1529
- "count": 72
1530
  },
1531
  "ME": {
1532
- "accuracy": 0.8888888888888888,
1533
- "count": 18
1534
  },
1535
  "UB": {
1536
- "accuracy": 0.5714285714285714,
1537
- "count": 119
1538
  }
1539
  }
1540
  },
1541
  "sub_M2": {
1542
- "full_accuracy": 0.02,
1543
- "digit_accuracy": 0.5885714285714285,
1544
- "n_examples": 50,
1545
  "per_subtask": {
1546
  "MD": {
1547
- "accuracy": 0.8035714285714286,
1548
- "count": 112
1549
  },
1550
  "MB": {
1551
- "accuracy": 0.33962264150943394,
1552
- "count": 53
1553
  },
1554
  "ME": {
1555
- "accuracy": 0.9574468085106383,
1556
- "count": 47
1557
  },
1558
  "UB": {
1559
- "accuracy": 0.5058823529411764,
1560
- "count": 85
1561
  },
1562
  "UD": {
1563
- "accuracy": 0.18867924528301888,
1564
- "count": 53
1565
  }
1566
  }
1567
  },
1568
  "sub_M3": {
1569
- "full_accuracy": 0.0,
1570
- "digit_accuracy": 0.44857142857142857,
1571
- "n_examples": 50,
1572
  "per_subtask": {
1573
  "MD": {
1574
- "accuracy": 0.7628865979381443,
1575
- "count": 97
1576
  },
1577
  "MB": {
1578
- "accuracy": 0.19607843137254902,
1579
- "count": 51
1580
  },
1581
  "ME": {
1582
- "accuracy": 0.9629629629629629,
1583
- "count": 27
1584
  },
1585
  "UB": {
1586
- "accuracy": 0.5675675675675675,
1587
- "count": 74
1588
  },
1589
  "UD": {
1590
- "accuracy": 0.04950495049504951,
1591
- "count": 101
1592
  }
1593
  }
1594
  },
1595
  "sub_M4": {
1596
- "full_accuracy": 0.02,
1597
- "digit_accuracy": 0.3485714285714286,
1598
- "n_examples": 50,
1599
  "per_subtask": {
1600
  "MD": {
1601
- "accuracy": 0.62,
1602
- "count": 100
1603
  },
1604
  "MB": {
1605
- "accuracy": 0.18,
1606
- "count": 50
1607
  },
1608
  "UB": {
1609
- "accuracy": 0.78,
1610
- "count": 50
1611
  },
1612
  "UD": {
1613
- "accuracy": 0.08,
1614
- "count": 150
1615
  }
1616
  }
1617
  },
1618
  "sub_M5": {
1619
- "full_accuracy": 0.02,
1620
- "digit_accuracy": 0.28285714285714286,
1621
- "n_examples": 50,
1622
  "per_subtask": {
1623
  "MD": {
1624
  "accuracy": 1.0,
1625
- "count": 50
1626
  },
1627
  "MB": {
1628
- "accuracy": 0.02,
1629
- "count": 50
1630
  },
1631
  "UB": {
1632
- "accuracy": 0.74,
1633
- "count": 50
1634
  },
1635
  "UD": {
1636
- "accuracy": 0.055,
1637
- "count": 200
1638
  }
1639
  }
1640
  },
1641
  "sub_random": {
1642
- "full_accuracy": 0.025,
1643
- "digit_accuracy": 0.5657142857142857,
1644
  "n_examples": 200,
1645
  "per_subtask": {
1646
  "MD": {
1647
- "accuracy": 0.6385964912280702,
1648
- "count": 570
1649
  },
1650
  "MB": {
1651
- "accuracy": 0.45126353790613716,
1652
- "count": 277
1653
  },
1654
  "ME": {
1655
- "accuracy": 0.9245283018867925,
1656
  "count": 53
1657
  },
1658
  "UB": {
1659
- "accuracy": 0.5329087048832272,
1660
- "count": 471
1661
  },
1662
  "UD": {
1663
- "accuracy": 0.10344827586206896,
1664
- "count": 29
1665
  }
1666
  }
1667
  },
1668
  "sub_B3": {
1669
- "full_accuracy": 0.0,
1670
- "digit_accuracy": 0.4542857142857143,
1671
- "n_examples": 50,
1672
  "per_subtask": {
1673
  "MD": {
1674
- "accuracy": 0.5666666666666667,
1675
- "count": 150
1676
  },
1677
  "MB": {
1678
- "accuracy": 0.36,
1679
- "count": 50
1680
  },
1681
  "UB": {
1682
- "accuracy": 0.5148514851485149,
1683
- "count": 101
1684
  },
1685
  "UD": {
1686
- "accuracy": 0.08163265306122448,
1687
- "count": 49
1688
  }
1689
  }
1690
  },
1691
  "sub_B4": {
1692
- "full_accuracy": 0.0,
1693
- "digit_accuracy": 0.4342857142857143,
1694
- "n_examples": 50,
1695
  "per_subtask": {
1696
  "MD": {
1697
- "accuracy": 0.6,
1698
- "count": 100
1699
  },
1700
  "MB": {
1701
- "accuracy": 0.36,
1702
- "count": 50
1703
  },
1704
  "UB": {
1705
- "accuracy": 0.4793388429752066,
1706
- "count": 121
1707
  },
1708
  "UD": {
1709
- "accuracy": 0.20253164556962025,
1710
- "count": 79
1711
  }
1712
  }
1713
  },
1714
  "sub_B5": {
1715
- "full_accuracy": 0.0,
1716
- "digit_accuracy": 0.46,
1717
- "n_examples": 50,
1718
  "per_subtask": {
1719
  "MD": {
1720
  "accuracy": 1.0,
1721
- "count": 50
1722
  },
1723
  "MB": {
1724
- "accuracy": 0.2,
1725
- "count": 50
1726
  },
1727
  "UB": {
1728
- "accuracy": 0.4407894736842105,
1729
- "count": 152
1730
  },
1731
  "UD": {
1732
- "accuracy": 0.3469387755102041,
1733
- "count": 98
1734
  }
1735
  }
1736
  }
1737
  },
1738
  "summary": {
1739
- "overall_accuracy": 0.019333333333333334,
1740
- "digit_accuracy": 0.522952380952381,
1741
- "total_examples": 1500,
1742
  "n_splits": 24
1743
  }
1744
  },
1745
- "sorl_overall_accuracy": 0.020416666666666666,
1746
- "sft_overall_accuracy": 0.0
1747
  }
 
70
  3719,
71
  3769,
72
  3819,
73
+ 3869,
74
+ 3960,
75
+ 4010,
76
+ 4060,
77
+ 4110,
78
+ 4160,
79
+ 4210,
80
+ 4260,
81
+ 4351,
82
+ 4401,
83
+ 4451,
84
+ 4501,
85
+ 4551,
86
+ 4601,
87
+ 4651,
88
+ 4742,
89
+ 4792,
90
+ 4842,
91
+ 4892,
92
+ 4942,
93
+ 4992,
94
+ 5042,
95
+ 5133,
96
+ 5183,
97
+ 5233,
98
+ 5283,
99
+ 5333,
100
+ 5383,
101
+ 5433,
102
+ 5524,
103
+ 5574,
104
+ 5624,
105
+ 5674,
106
+ 5724,
107
+ 5774,
108
+ 5824,
109
+ 5915,
110
+ 5965,
111
+ 6015,
112
+ 6065,
113
+ 6115,
114
+ 6165,
115
+ 6215,
116
+ 6306,
117
+ 6356,
118
+ 6406,
119
+ 6456,
120
+ 6506,
121
+ 6556,
122
+ 6606,
123
+ 6697,
124
+ 6747,
125
+ 6797,
126
+ 6847,
127
+ 6897,
128
+ 6947,
129
+ 6997,
130
+ 7088,
131
+ 7138,
132
+ 7188,
133
+ 7238,
134
+ 7288,
135
+ 7338,
136
+ 7388,
137
+ 7479,
138
+ 7529,
139
+ 7579,
140
+ 7629,
141
+ 7679,
142
+ 7729,
143
+ 7779
144
  ],
145
  "loss": [
146
+ 15.438215255737305,
147
+ 11.698705673217773,
148
+ 11.35198974609375,
149
+ 11.583304405212402,
150
+ 13.52648639678955,
151
+ 14.204401016235352,
152
+ 12.998001098632812,
153
+ 11.166366577148438,
154
+ 10.07100772857666,
155
+ 9.512551307678223,
156
+ 8.862318992614746,
157
+ 8.31992244720459,
158
+ 7.685413837432861,
159
+ 7.327797889709473,
160
+ 6.562270641326904,
161
+ 6.250687599182129,
162
+ 5.861550331115723,
163
+ 5.564685344696045,
164
+ 5.315859794616699,
165
+ 5.033608436584473,
166
+ 4.932528972625732,
167
+ 4.55800724029541,
168
+ 4.5580596923828125,
169
+ 4.372578144073486,
170
+ 4.254790306091309,
171
+ 4.210170745849609,
172
+ 4.063313007354736,
173
+ 4.026899814605713,
174
+ 4.05972957611084,
175
+ 3.9455442428588867,
176
+ 3.8566818237304688,
177
+ 3.797435760498047,
178
+ 3.6889798641204834,
179
+ 3.740785837173462,
180
+ 3.5677337646484375,
181
+ 3.458096504211426,
182
+ 3.2327089309692383,
183
+ 3.0761547088623047,
184
+ 2.7873401641845703,
185
+ 2.8123507499694824,
186
+ 2.591607093811035,
187
+ 2.6727454662323,
188
+ 2.263127088546753,
189
+ 2.2004404067993164,
190
+ 1.9771809577941895,
191
+ 1.6977636814117432,
192
+ 1.3677500486373901,
193
+ 1.4016914367675781,
194
+ 1.1960985660552979,
195
+ 0.7710381150245667,
196
+ 0.9146587252616882,
197
+ 0.6109359264373779,
198
+ 0.4325253963470459,
199
+ 0.3495136499404907,
200
+ -0.13719046115875244,
201
+ -0.3947882652282715,
202
+ -0.37380528450012207,
203
+ -0.5639680624008179,
204
+ -0.7665214538574219,
205
+ -1.1608381271362305,
206
+ -1.1922835111618042,
207
+ -1.4211184978485107,
208
+ -1.504981517791748,
209
+ -1.635431170463562,
210
+ -2.1759157180786133,
211
+ -1.9983330965042114,
212
+ -2.054861545562744,
213
+ -2.627793312072754,
214
+ -2.457620143890381,
215
+ -3.170637369155884,
216
+ -2.9656014442443848,
217
+ -2.9379959106445312,
218
+ -3.5171995162963867,
219
+ -3.3045692443847656,
220
+ -3.2605278491973877,
221
+ -3.6454877853393555,
222
+ -3.7729570865631104,
223
+ -3.7499589920043945,
224
+ -4.199748516082764,
225
+ -3.9465160369873047,
226
+ -4.559499740600586,
227
+ -4.674802780151367,
228
+ -4.359200477600098,
229
+ -4.95430326461792,
230
+ -5.200588226318359,
231
+ -5.365489959716797,
232
+ -5.439723491668701,
233
+ -5.131085395812988,
234
+ -5.364423751831055,
235
+ -5.8407769203186035,
236
+ -5.78155517578125,
237
+ -5.790386199951172,
238
+ -6.290831089019775,
239
+ -6.478070259094238,
240
+ -6.362205505371094,
241
+ -6.496718406677246,
242
+ -6.589505672454834,
243
+ -6.66395902633667,
244
+ -7.2407732009887695,
245
+ -6.973321914672852,
246
+ -7.167119979858398,
247
+ -7.098984718322754,
248
+ -7.72056770324707,
249
+ -7.13163948059082,
250
+ -7.3493571281433105,
251
+ -7.345783233642578,
252
+ -7.536464691162109,
253
+ -7.644423484802246,
254
+ -7.953723907470703,
255
+ -7.990867614746094,
256
+ -7.49429988861084,
257
+ -7.873161792755127,
258
+ -8.245901107788086,
259
+ -7.67487907409668,
260
+ -8.254423141479492,
261
+ -8.392303466796875,
262
+ -8.373635292053223,
263
+ -8.042476654052734,
264
+ -8.853918075561523,
265
+ -8.413291931152344,
266
+ -8.181249618530273,
267
+ -8.420297622680664,
268
+ -8.319875717163086,
269
+ -9.00948429107666,
270
+ -8.943597793579102,
271
+ -8.881410598754883,
272
+ -8.481571197509766,
273
+ -8.24708366394043,
274
+ -8.969070434570312,
275
+ -9.175048828125,
276
+ -9.334626197814941,
277
+ -8.908629417419434,
278
+ -8.629892349243164,
279
+ -8.761899948120117,
280
+ -8.947555541992188,
281
+ -9.112472534179688,
282
+ -8.871394157409668,
283
+ -8.744105339050293,
284
+ -9.35738754272461,
285
+ -8.966874122619629
286
  ],
287
  "base_loss": [
288
+ 11.9345703125,
289
+ 11.817322731018066,
290
+ 11.699155807495117,
291
+ 11.364814758300781,
292
+ 10.792383193969727,
293
+ 10.061384201049805,
294
+ 9.513914108276367,
295
+ 8.453359603881836,
296
+ 7.739280700683594,
297
+ 7.290415287017822,
298
+ 6.789176940917969,
299
+ 6.299552917480469,
300
+ 5.759676456451416,
301
+ 5.421372413635254,
302
+ 4.712368965148926,
303
+ 4.4211649894714355,
304
+ 4.006752967834473,
305
+ 3.7291598320007324,
306
+ 3.5219593048095703,
307
+ 3.2360665798187256,
308
+ 3.1453914642333984,
309
+ 2.8143575191497803,
310
+ 2.7994096279144287,
311
+ 2.621875047683716,
312
+ 2.5696024894714355,
313
+ 2.534588098526001,
314
+ 2.4512345790863037,
315
+ 2.396812915802002,
316
+ 2.3602030277252197,
317
+ 2.3069894313812256,
318
+ 2.3317739963531494,
319
+ 2.2865636348724365,
320
+ 2.288856267929077,
321
+ 2.304441213607788,
322
+ 2.1969876289367676,
323
+ 2.251948118209839,
324
+ 2.1730990409851074,
325
+ 2.1580147743225098,
326
+ 2.1613924503326416,
327
+ 2.200089693069458,
328
+ 2.163917303085327,
329
+ 2.1799607276916504,
330
+ 2.1372125148773193,
331
+ 2.189011335372925,
332
+ 2.124418258666992,
333
+ 2.116373300552368,
334
+ 2.096065044403076,
335
+ 2.1109848022460938,
336
+ 2.1218228340148926,
337
+ 2.142543077468872,
338
+ 2.032945156097412,
339
+ 2.0584824085235596,
340
+ 2.0884969234466553,
341
+ 2.0404374599456787,
342
+ 2.1102449893951416,
343
+ 2.1072633266448975,
344
+ 2.0765204429626465,
345
+ 2.0333292484283447,
346
+ 2.0203804969787598,
347
+ 2.087628126144409,
348
+ 2.0504987239837646,
349
+ 2.02152681350708,
350
+ 2.0398547649383545,
351
+ 1.9614677429199219,
352
+ 2.066767454147339,
353
+ 2.0372583866119385,
354
+ 1.978596568107605,
355
+ 2.0089972019195557,
356
+ 2.0185484886169434,
357
+ 2.0397589206695557,
358
+ 2.0083587169647217,
359
+ 1.9470815658569336,
360
+ 1.9792274236679077,
361
+ 1.9839227199554443,
362
+ 1.9301831722259521,
363
+ 1.9690905809402466,
364
+ 1.9336986541748047,
365
+ 1.8985034227371216,
366
+ 1.9798091650009155,
367
+ 1.8981035947799683,
368
+ 1.983641505241394,
369
+ 2.005276918411255,
370
+ 1.9155381917953491,
371
+ 1.9187160730361938,
372
+ 1.9617462158203125,
373
+ 1.9528653621673584,
374
+ 1.993743658065796,
375
+ 1.8443126678466797,
376
+ 1.8498674631118774,
377
+ 1.9844404458999634,
378
+ 1.9315986633300781,
379
+ 1.8545875549316406,
380
+ 1.9561283588409424,
381
+ 1.9475451707839966,
382
+ 1.8881018161773682,
383
+ 1.904740333557129,
384
+ 1.900530457496643,
385
+ 1.924454689025879,
386
+ 1.9552252292633057,
387
+ 1.8688642978668213,
388
+ 1.9144586324691772,
389
+ 1.8758822679519653,
390
+ 1.9754207134246826,
391
+ 1.840330958366394,
392
+ 1.850648045539856,
393
+ 1.8594962358474731,
394
+ 1.8436225652694702,
395
+ 1.8632913827896118,
396
+ 1.902077555656433,
397
+ 1.9200162887573242,
398
+ 1.8076122999191284,
399
+ 1.8773219585418701,
400
+ 1.8715859651565552,
401
+ 1.8249441385269165,
402
+ 1.876495599746704,
403
+ 1.8286583423614502,
404
+ 1.8903467655181885,
405
+ 1.8312424421310425,
406
+ 1.9122331142425537,
407
+ 1.8325960636138916,
408
+ 1.8598573207855225,
409
+ 1.8170515298843384,
410
+ 1.8049308061599731,
411
+ 1.8960034847259521,
412
+ 1.9015015363693237,
413
+ 1.9183950424194336,
414
+ 1.8224385976791382,
415
+ 1.7621303796768188,
416
+ 1.877859354019165,
417
+ 1.9046906232833862,
418
+ 1.89964759349823,
419
+ 1.851623773574829,
420
+ 1.782663345336914,
421
+ 1.8026123046875,
422
+ 1.8264520168304443,
423
+ 1.8560417890548706,
424
+ 1.8178356885910034,
425
+ 1.7928167581558228,
426
+ 1.8647464513778687,
427
+ 1.832586646080017
428
  ],
429
  "info_loss": [
430
+ -0.5988006591796875,
431
+ -0.9409332275390625,
432
+ -0.927983283996582,
433
+ -0.79962158203125,
434
+ -0.4240531921386719,
435
+ -0.13122177124023438,
436
+ -0.06601619720458984,
437
+ -0.02817535400390625,
438
+ -0.021374225616455078,
439
+ -0.011267662048339844,
440
+ -0.014470100402832031,
441
+ -0.011033058166503906,
442
+ -0.014579296112060547,
443
+ -0.012886524200439453,
444
+ -0.01365804672241211,
445
+ -0.01392221450805664,
446
+ -0.00992727279663086,
447
+ -0.010815858840942383,
448
+ -0.013843536376953125,
449
+ -0.012883901596069336,
450
+ -0.013205766677856445,
451
+ -0.016824722290039062,
452
+ -0.015227794647216797,
453
+ -0.015698909759521484,
454
+ -0.02181529998779297,
455
+ -0.022611379623413086,
456
+ -0.028711318969726562,
457
+ -0.026511430740356445,
458
+ -0.01956796646118164,
459
+ -0.02561187744140625,
460
+ -0.036810874938964844,
461
+ -0.03793692588806152,
462
+ -0.04924178123474121,
463
+ -0.045455217361450195,
464
+ -0.05187726020812988,
465
+ -0.0681753158569336,
466
+ -0.0828087329864502,
467
+ -0.09702324867248535,
468
+ -0.12611079216003418,
469
+ -0.12749648094177246,
470
+ -0.14578962326049805,
471
+ -0.13935232162475586,
472
+ -0.17599689960479736,
473
+ -0.187652587890625,
474
+ -0.20319104194641113,
475
+ -0.23052692413330078,
476
+ -0.26147031784057617,
477
+ -0.2596166133880615,
478
+ -0.28107666969299316,
479
+ -0.3258247375488281,
480
+ -0.300618052482605,
481
+ -0.33340394496917725,
482
+ -0.354347825050354,
483
+ -0.35777151584625244,
484
+ -0.4135168790817261,
485
+ -0.43906593322753906,
486
+ -0.4341762065887451,
487
+ -0.4487084150314331,
488
+ -0.46751904487609863,
489
+ -0.513728141784668,
490
+ -0.513069748878479,
491
+ -0.5332159996032715,
492
+ -0.5433926582336426,
493
+ -0.5485368967056274,
494
+ -0.6132192611694336,
495
+ -0.5926066637039185,
496
+ -0.5921446084976196,
497
+ -0.6526178121566772,
498
+ -0.636623740196228,
499
+ -0.7098889350891113,
500
+ -0.6862444877624512,
501
+ -0.6774629354476929,
502
+ -0.7385284900665283,
503
+ -0.7177724838256836,
504
+ -0.7079417705535889,
505
+ -0.7501206398010254,
506
+ -0.759358286857605,
507
+ -0.7535332441329956,
508
+ -0.8066447973251343,
509
+ -0.7729413509368896,
510
+ -0.842991828918457,
511
+ -0.8565130233764648,
512
+ -0.8158915042877197,
513
+ -0.8756431341171265,
514
+ -0.9046518802642822,
515
+ -0.920454740524292,
516
+ -0.9318876266479492,
517
+ -0.8859519958496094,
518
+ -0.9098713994026184,
519
+ -0.9709377288818359,
520
+ -0.9593663215637207,
521
+ -0.9529187083244324,
522
+ -1.013043761253357,
523
+ -1.0308873653411865,
524
+ -1.013200044631958,
525
+ -1.0285481214523315,
526
+ -1.0371031761169434,
527
+ -1.046999216079712,
528
+ -1.1075186729431152,
529
+ -1.0722583532333374,
530
+ -1.0959675312042236,
531
+ -1.0852978229522705,
532
+ -1.1574437618255615,
533
+ -1.0848921537399292,
534
+ -1.1078517436981201,
535
+ -1.1082909107208252,
536
+ -1.1257438659667969,
537
+ -1.138296127319336,
538
+ -1.1731445789337158,
539
+ -1.1784374713897705,
540
+ -1.1174108982086182,
541
+ -1.162405252456665,
542
+ -1.19862699508667,
543
+ -1.136898159980774,
544
+ -1.2002137899398804,
545
+ -1.2089275121688843,
546
+ -1.2131140232086182,
547
+ -1.1734609603881836,
548
+ -1.263019323348999,
549
+ -1.2105844020843506,
550
+ -1.1900787353515625,
551
+ -1.2095270156860352,
552
+ -1.1981027126312256,
553
+ -1.2763946056365967,
554
+ -1.270012617111206,
555
+ -1.265164852142334,
556
+ -1.215531587600708,
557
+ -1.1855628490447998,
558
+ -1.2688469886779785,
559
+ -1.2921745777130127,
560
+ -1.3072056770324707,
561
+ -1.2597821950912476,
562
+ -1.2257564067840576,
563
+ -1.2400943040847778,
564
+ -1.260976791381836,
565
+ -1.2807002067565918,
566
+ -1.252894401550293,
567
+ -1.236717700958252,
568
+ -1.3059754371643066,
569
+ -1.2633001804351807
570
  ],
571
  "abs_loss": [
572
+ 2.3030121326446533,
573
+ 2.299145460128784,
574
+ 2.289541244506836,
575
+ 2.273411750793457,
576
+ 2.2070724964141846,
577
+ 2.1113064289093018,
578
+ 2.0303194522857666,
579
+ 1.9397281408309937,
580
+ 1.868037223815918,
581
+ 1.8619756698608398,
582
+ 1.8237711191177368,
583
+ 1.8233033418655396,
584
+ 1.8083871603012085,
585
+ 1.8085123300552368,
586
+ 1.807869553565979,
587
+ 1.8163694143295288,
588
+ 1.8033103942871094,
589
+ 1.8110136985778809,
590
+ 1.789840579032898,
591
+ 1.801681399345398,
592
+ 1.7901602983474731,
593
+ 1.79034423828125,
594
+ 1.817492127418518,
595
+ 1.8122094869613647,
596
+ 1.796120047569275,
597
+ 1.8059641122817993,
598
+ 1.7956987619400024,
599
+ 1.7762395143508911,
600
+ 1.796688437461853,
601
+ 1.8057736158370972,
602
+ 1.8015772104263306,
603
+ 1.7790518999099731,
604
+ 1.8092012405395508,
605
+ 1.7991009950637817,
606
+ 1.7919492721557617,
607
+ 1.787245750427246,
608
+ 1.7877745628356934,
609
+ 1.7972835302352905,
610
+ 1.7926057577133179,
611
+ 1.7955513000488281,
612
+ 1.7774804830551147,
613
+ 1.789096474647522,
614
+ 1.7900733947753906,
615
+ 1.8124494552612305,
616
+ 1.7770217657089233,
617
+ 1.7979646921157837,
618
+ 1.795373797416687,
619
+ 1.8022948503494263,
620
+ 1.7857869863510132,
621
+ 1.7971566915512085,
622
+ 1.8082588911056519,
623
+ 1.7944446802139282,
624
+ 1.8030062913894653,
625
+ 1.7937690019607544,
626
+ 1.8005198240280151,
627
+ 1.804343581199646,
628
+ 1.826321005821228,
629
+ 1.8007465600967407,
630
+ 1.79544198513031,
631
+ 1.7963615655899048,
632
+ 1.7872138023376465,
633
+ 1.798779845237732,
634
+ 1.7960869073867798,
635
+ 1.79216730594635,
636
+ 1.7983245849609375,
637
+ 1.810585379600525,
638
+ 1.7835979461669922,
639
+ 1.8006205558776855,
640
+ 1.8101950883865356,
641
+ 1.7974900007247925,
642
+ 1.7953786849975586,
643
+ 1.8133975267410278,
644
+ 1.803335189819336,
645
+ 1.8109596967697144,
646
+ 1.8093476295471191,
647
+ 1.790522575378418,
648
+ 1.8022232055664062,
649
+ 1.7956289052963257,
650
+ 1.7983678579330444,
651
+ 1.785042405128479,
652
+ 1.8020159006118774,
653
+ 1.7974272966384888,
654
+ 1.7881141901016235,
655
+ 1.7824897766113281,
656
+ 1.7926312685012817,
657
+ 1.8120203018188477,
658
+ 1.8074384927749634,
659
+ 1.7968801259994507,
660
+ 1.8008607625961304,
661
+ 1.798738956451416,
662
+ 1.774499773979187,
663
+ 1.8125265836715698,
664
+ 1.8151473999023438,
665
+ 1.809104561805725,
666
+ 1.7991762161254883,
667
+ 1.8214031457901,
668
+ 1.7966961860656738,
669
+ 1.8064117431640625,
670
+ 1.7899032831192017,
671
+ 1.8015222549438477,
672
+ 1.784250259399414,
673
+ 1.7856217622756958,
674
+ 1.788508415222168,
675
+ 1.7838386297225952,
676
+ 1.7966995239257812,
677
+ 1.7942562103271484,
678
+ 1.804557204246521,
679
+ 1.7753878831863403,
680
+ 1.7882212400436401,
681
+ 1.7564178705215454,
682
+ 1.762564778327942,
683
+ 1.7851663827896118,
684
+ 1.7425185441970825,
685
+ 1.7506004571914673,
686
+ 1.7620363235473633,
687
+ 1.7629045248031616,
688
+ 1.736350178718567,
689
+ 1.721484661102295,
690
+ 1.7447915077209473,
691
+ 1.6947892904281616,
692
+ 1.7224642038345337,
693
+ 1.7099981307983398,
694
+ 1.7195545434951782,
695
+ 1.7268810272216797,
696
+ 1.7024202346801758,
697
+ 1.6876293420791626,
698
+ 1.6866716146469116,
699
+ 1.6718778610229492,
700
+ 1.6505346298217773,
701
+ 1.6802678108215332,
702
+ 1.6462732553482056,
703
+ 1.676039695739746,
704
+ 1.6758023500442505,
705
+ 1.6639236211776733,
706
+ 1.667460560798645,
707
+ 1.6574944257736206,
708
+ 1.6825519800186157,
709
+ 1.6431549787521362,
710
+ 1.6516785621643066,
711
+ 1.6596717834472656
712
  ],
713
  "zipf_loss": [
714
+ 9.261350631713867,
715
+ 9.060800552368164,
716
+ 8.703712463378906,
717
+ 7.987364292144775,
718
+ 6.753927707672119,
719
+ 5.244103908538818,
720
+ 3.9412174224853516,
721
+ 2.8007874488830566,
722
+ 2.3586654663085938,
723
+ 2.1486148834228516,
724
+ 2.035465717315674,
725
+ 1.948370337486267,
726
+ 1.8906917572021484,
727
+ 1.85443913936615,
728
+ 1.8056950569152832,
729
+ 1.7871077060699463,
730
+ 1.7737386226654053,
731
+ 1.7625828981399536,
732
+ 1.7533515691757202,
733
+ 1.7462127208709717,
734
+ 1.7401790618896484,
735
+ 1.7328624725341797,
736
+ 1.7291789054870605,
737
+ 1.726471185684204,
738
+ 1.7237286567687988,
739
+ 1.7211002111434937,
740
+ 1.7196215391159058,
741
+ 1.7175772190093994,
742
+ 1.7155373096466064,
743
+ 1.7140963077545166,
744
+ 1.7128589153289795,
745
+ 1.7123363018035889,
746
+ 1.7116212844848633,
747
+ 1.710986614227295,
748
+ 1.710323691368103,
749
+ 1.709177017211914,
750
+ 1.7089197635650635,
751
+ 1.7086440324783325,
752
+ 1.7077951431274414,
753
+ 1.70767080783844,
754
+ 1.7078380584716797,
755
+ 1.7073982954025269,
756
+ 1.706876277923584,
757
+ 1.706709861755371,
758
+ 1.7069709300994873,
759
+ 1.7068631649017334,
760
+ 1.7068507671356201,
761
+ 1.7066432237625122,
762
+ 1.7064636945724487,
763
+ 1.707026720046997,
764
+ 1.7070682048797607,
765
+ 1.7070485353469849,
766
+ 1.707206130027771,
767
+ 1.7074145078659058,
768
+ 1.7076815366744995,
769
+ 1.7081732749938965,
770
+ 1.7088041305541992,
771
+ 1.7097123861312866,
772
+ 1.7087442874908447,
773
+ 1.7091789245605469,
774
+ 1.7091935873031616,
775
+ 1.7096366882324219,
776
+ 1.7094817161560059,
777
+ 1.709253191947937,
778
+ 1.7096768617630005,
779
+ 1.7094162702560425,
780
+ 1.7096279859542847,
781
+ 1.7093257904052734,
782
+ 1.7090497016906738,
783
+ 1.7087442874908447,
784
+ 1.7089468240737915,
785
+ 1.7082123756408691,
786
+ 1.708524227142334,
787
+ 1.7081365585327148,
788
+ 1.7077720165252686,
789
+ 1.707575798034668,
790
+ 1.70670485496521,
791
+ 1.70730721950531,
792
+ 1.707053780555725,
793
+ 1.7062891721725464,
794
+ 1.706575632095337,
795
+ 1.7053077220916748,
796
+ 1.7053639888763428,
797
+ 1.7051633596420288,
798
+ 1.7049205303192139,
799
+ 1.7049903869628906,
800
+ 1.7046650648117065,
801
+ 1.7044339179992676,
802
+ 1.704336404800415,
803
+ 1.704285979270935,
804
+ 1.7030590772628784,
805
+ 1.7029609680175781,
806
+ 1.7019639015197754,
807
+ 1.7023484706878662,
808
+ 1.7017755508422852,
809
+ 1.7018823623657227,
810
+ 1.7013258934020996,
811
+ 1.7009366750717163,
812
+ 1.7001982927322388,
813
+ 1.7002460956573486,
814
+ 1.6996716260910034,
815
+ 1.6995488405227661,
816
+ 1.6995980739593506,
817
+ 1.698567271232605,
818
+ 1.6988425254821777,
819
+ 1.6982042789459229,
820
+ 1.6968954801559448,
821
+ 1.6977074146270752,
822
+ 1.6968213319778442,
823
+ 1.6978480815887451,
824
+ 1.6959397792816162,
825
+ 1.6950517892837524,
826
+ 1.6945316791534424,
827
+ 1.6940975189208984,
828
+ 1.6950159072875977,
829
+ 1.692023754119873,
830
+ 1.693522334098816,
831
+ 1.6887413263320923,
832
+ 1.6895627975463867,
833
+ 1.690476894378662,
834
+ 1.6874343156814575,
835
+ 1.6869207620620728,
836
+ 1.6842659711837769,
837
+ 1.6857703924179077,
838
+ 1.6847844123840332,
839
+ 1.6830801963806152,
840
+ 1.6826388835906982,
841
+ 1.6792271137237549,
842
+ 1.6764867305755615,
843
+ 1.6739792823791504,
844
+ 1.6731560230255127,
845
+ 1.6699650287628174,
846
+ 1.6774280071258545,
847
+ 1.670038104057312,
848
+ 1.66901433467865,
849
+ 1.6727375984191895,
850
+ 1.6714589595794678,
851
+ 1.6659390926361084,
852
+ 1.6724532842636108,
853
+ 1.667574167251587
854
  ],
855
  "denoise_loss": [],
856
  "ortho_loss": [
857
+ 0.7279962301254272,
858
+ 0.6579767465591431,
859
+ 0.6817273497581482,
860
+ 0.6108765006065369,
861
+ 0.579704225063324,
862
+ 0.5299669504165649,
863
+ 0.508258581161499,
864
+ 0.44487300515174866,
865
+ 0.45012974739074707,
866
+ 0.47072339057922363,
867
+ 0.4518543779850006,
868
+ 0.4742773473262787,
869
+ 0.48254117369651794,
870
+ 0.49086296558380127,
871
+ 0.5048026442527771,
872
+ 0.4966145157814026,
873
+ 0.49529483914375305,
874
+ 0.5079429745674133,
875
+ 0.49587127566337585,
876
+ 0.5039945244789124,
877
+ 0.5124256610870361,
878
+ 0.5046111345291138,
879
+ 0.5036980509757996,
880
+ 0.5007200837135315,
881
+ 0.4964256286621094,
882
+ 0.47616976499557495,
883
+ 0.4820389449596405,
884
+ 0.4889011085033417,
885
+ 0.4739861488342285,
886
+ 0.4770560562610626,
887
+ 0.48463118076324463,
888
+ 0.48521560430526733,
889
+ 0.47872859239578247,
890
+ 0.47701188921928406,
891
+ 0.47266486287117004,
892
+ 0.4762444496154785,
893
+ 0.49047499895095825,
894
+ 0.49516916275024414,
895
+ 0.49629321694374084,
896
+ 0.49382537603378296,
897
+ 0.5009201169013977,
898
+ 0.514494776725769,
899
+ 0.5163015723228455,
900
+ 0.5229976177215576,
901
+ 0.5241018533706665,
902
+ 0.5448073744773865,
903
+ 0.5465819835662842,
904
+ 0.5636215806007385,
905
+ 0.5807940363883972,
906
+ 0.6014026999473572,
907
+ 0.6071130633354187,
908
+ 0.6144046187400818,
909
+ 0.6215061545372009,
910
+ 0.6127831339836121,
911
+ 0.6225723624229431,
912
+ 0.635498583316803,
913
+ 0.6355625987052917,
914
+ 0.6451646685600281,
915
+ 0.6524924039840698,
916
+ 0.6498976349830627,
917
+ 0.6616995930671692,
918
+ 0.6724307537078857,
919
+ 0.6730102300643921,
920
+ 0.6778948903083801,
921
+ 0.7103477716445923,
922
+ 0.7236534357070923,
923
+ 0.7376241087913513,
924
+ 0.7405605316162109,
925
+ 0.7441432476043701,
926
+ 0.7507919669151306,
927
+ 0.7573267817497253,
928
+ 0.7659734487533569,
929
+ 0.778541088104248,
930
+ 0.7812742590904236,
931
+ 0.7830356359481812,
932
+ 0.7777599692344666,
933
+ 0.7870175838470459,
934
+ 0.7978265881538391,
935
+ 0.7991683483123779,
936
+ 0.8024166822433472,
937
+ 0.8022057414054871,
938
+ 0.8008230328559875,
939
+ 0.7993347644805908,
940
+ 0.795768678188324,
941
+ 0.7919573783874512,
942
+ 0.7857846617698669,
943
+ 0.785811722278595,
944
+ 0.7959153056144714,
945
+ 0.7985197305679321,
946
+ 0.8080344796180725,
947
+ 0.8078311681747437,
948
+ 0.8130998015403748,
949
+ 0.8166013956069946,
950
+ 0.8189990520477295,
951
+ 0.8234202265739441,
952
+ 0.8193479776382446,
953
+ 0.8200980424880981,
954
+ 0.8206661939620972,
955
+ 0.8180730938911438,
956
+ 0.8158308267593384,
957
+ 0.8163653016090393,
958
+ 0.8168421387672424,
959
+ 0.8173116445541382,
960
+ 0.8186611533164978,
961
+ 0.8203118443489075,
962
+ 0.8226667642593384,
963
+ 0.8240026235580444,
964
+ 0.8218040466308594,
965
+ 0.8227177858352661,
966
+ 0.8246716260910034,
967
+ 0.8230711221694946,
968
+ 0.8251593708992004,
969
+ 0.8275840282440186,
970
+ 0.8295618891716003,
971
+ 0.8331630229949951,
972
+ 0.8328480124473572,
973
+ 0.8344019651412964,
974
+ 0.8350807428359985,
975
+ 0.8389624953269958,
976
+ 0.8427870273590088,
977
+ 0.8432278037071228,
978
+ 0.8458070755004883,
979
+ 0.8469033241271973,
980
+ 0.8485611081123352,
981
+ 0.8497316241264343,
982
+ 0.8513529896736145,
983
+ 0.8531062006950378,
984
+ 0.8545234203338623,
985
+ 0.8543184995651245,
986
+ 0.8548187613487244,
987
+ 0.8546685576438904,
988
+ 0.854476273059845,
989
+ 0.856360673904419,
990
+ 0.8580570220947266,
991
+ 0.8584591746330261,
992
+ 0.858634352684021,
993
+ 0.8592213988304138,
994
+ 0.8611096143722534,
995
+ 0.8617076873779297,
996
+ 0.8623132109642029
997
  ],
998
  "lr": [
999
+ 4.188034188034189e-06,
1000
+ 8.461538461538462e-06,
1001
+ 1.2735042735042738e-05,
1002
+ 1.700854700854701e-05,
1003
  2e-05,
1004
  2e-05,
1005
  2e-05,
 
1040
  2e-05,
1041
  2e-05,
1042
  2e-05,
1043
+ 2e-05,
1044
+ 2e-05,
1045
+ 2e-05,
1046
+ 2e-05,
1047
+ 2e-05,
1048
+ 2e-05,
1049
+ 2e-05,
1050
+ 2e-05,
1051
+ 2e-05,
1052
+ 2e-05,
1053
+ 2e-05,
1054
+ 2e-05,
1055
+ 2e-05,
1056
+ 2e-05,
1057
+ 2e-05,
1058
+ 2e-05,
1059
+ 2e-05,
1060
+ 2e-05,
1061
+ 2e-05,
1062
+ 2e-05,
1063
+ 2e-05,
1064
+ 2e-05,
1065
+ 2e-05,
1066
+ 2e-05,
1067
+ 2e-05,
1068
+ 2e-05,
1069
+ 2e-05,
1070
+ 2e-05,
1071
+ 2e-05,
1072
+ 2e-05,
1073
+ 2e-05,
1074
+ 2e-05,
1075
+ 2e-05,
1076
+ 2e-05,
1077
+ 2e-05,
1078
+ 2e-05,
1079
+ 2e-05,
1080
+ 2e-05,
1081
+ 2e-05,
1082
+ 2e-05,
1083
+ 2e-05,
1084
+ 1.9967967308199317e-05,
1085
+ 1.967136831004482e-05,
1086
+ 1.9374769311890326e-05,
1087
+ 1.907817031373583e-05,
1088
+ 1.8781571315581332e-05,
1089
+ 1.848497231742684e-05,
1090
+ 1.7945162140785656e-05,
1091
+ 1.7648563142631164e-05,
1092
+ 1.735196414447667e-05,
1093
+ 1.705536514632217e-05,
1094
+ 1.675876614816768e-05,
1095
+ 1.6462167150013183e-05,
1096
+ 1.6165568151858688e-05,
1097
+ 1.5625757975217507e-05,
1098
+ 1.532915897706301e-05,
1099
+ 1.5032559978908518e-05,
1100
+ 1.473596098075402e-05,
1101
+ 1.4439361982599525e-05,
1102
+ 1.4142762984445033e-05,
1103
+ 1.3846163986290536e-05,
1104
+ 1.3306353809649356e-05,
1105
+ 1.300975481149486e-05,
1106
+ 1.2713155813340366e-05,
1107
+ 1.2416556815185872e-05,
1108
+ 1.2119957817031374e-05,
1109
+ 1.182335881887688e-05,
1110
+ 1.1526759820722382e-05,
1111
+ 1.0986949644081204e-05,
1112
+ 1.0690350645926705e-05,
1113
+ 1.0393751647772212e-05,
1114
+ 1.009715264961772e-05,
1115
+ 9.800553651463221e-06,
1116
+ 9.50395465330873e-06,
1117
+ 9.207355655154231e-06,
1118
+ 8.66754547851305e-06,
1119
+ 8.370946480358558e-06,
1120
+ 8.07434748220406e-06,
1121
+ 7.777748484049568e-06,
1122
+ 7.481149485895069e-06,
1123
+ 7.184550487740576e-06,
1124
+ 6.887951489586082e-06,
1125
+ 6.3481413129448985e-06,
1126
+ 6.051542314790406e-06,
1127
+ 5.754943316635908e-06,
1128
+ 5.4583443184814145e-06,
1129
+ 5.161745320326917e-06,
1130
+ 4.865146322172423e-06,
1131
+ 4.5685473240179305e-06,
1132
+ 4.028737147376746e-06,
1133
+ 3.732138149222253e-06,
1134
+ 3.435539151067755e-06,
1135
+ 3.1389401529132617e-06,
1136
+ 2.8423411547587686e-06,
1137
+ 2.5457421566042704e-06,
1138
+ 2.2491431584497773e-06
1139
  ],
1140
  "emb_lr": [],
1141
  "eval_step": [
 
1148
  2696,
1149
  3087,
1150
  3478,
1151
+ 3869,
1152
+ 4260,
1153
+ 4651,
1154
+ 5042,
1155
+ 5433,
1156
+ 5824,
1157
+ 6215,
1158
+ 6606,
1159
+ 6997,
1160
+ 7388,
1161
+ 7779
1162
  ],
1163
  "eval_accuracy": [
1164
+ 0.02,
1165
+ 0.02,
1166
+ 0.02,
1167
+ 0.02,
1168
+ 0.02,
1169
+ 0.02,
1170
  0.01,
1171
  0.0,
 
1172
  0.01,
 
 
1173
  0.01,
1174
+ 0.01,
1175
+ 0.01,
1176
+ 0.01,
1177
+ 0.01,
1178
+ 0.01,
1179
+ 0.01,
1180
+ 0.01,
1181
+ 0.01,
1182
+ 0.01,
1183
  0.01
1184
  ]
1185
  },
1186
+ "final_accuracy": 0.20653846153846153,
1187
  "sft_eval": {
1188
  "config": {
1189
  "ops": "add_sub",
1190
  "K": null,
1191
  "mode": "sft",
1192
  "n_digits": 6,
1193
+ "n_per_split": 100
1194
  },
1195
  "splits": {
1196
  "add_S0": {
1197
  "full_accuracy": 0.0,
1198
+ "digit_accuracy": 0.3142857142857143,
1199
+ "n_examples": 100,
1200
  "per_subtask": {
1201
  "SA": {
1202
+ "accuracy": 0.21487603305785125,
1203
+ "count": 605
1204
  },
1205
  "SS": {
1206
+ "accuracy": 0.9473684210526315,
1207
+ "count": 95
1208
  }
1209
  }
1210
  },
1211
  "add_S1": {
1212
  "full_accuracy": 0.0,
1213
+ "digit_accuracy": 0.22142857142857142,
1214
+ "n_examples": 100,
1215
  "per_subtask": {
1216
  "SA": {
1217
+ "accuracy": 0.24019607843137256,
1218
+ "count": 204
1219
  },
1220
  "SC": {
1221
+ "accuracy": 0.10650887573964497,
1222
+ "count": 169
1223
  },
1224
  "SS": {
1225
+ "accuracy": 0.8387096774193549,
1226
+ "count": 31
1227
  },
1228
  "UC": {
1229
+ "accuracy": 0.20945945945945946,
1230
+ "count": 296
1231
  }
1232
  }
1233
  },
1234
  "add_S2": {
1235
  "full_accuracy": 0.0,
1236
+ "digit_accuracy": 0.34285714285714286,
1237
+ "n_examples": 100,
1238
  "per_subtask": {
1239
  "SA": {
1240
+ "accuracy": 0.3619631901840491,
1241
+ "count": 163
1242
  },
1243
  "SC": {
1244
+ "accuracy": 0.06923076923076923,
1245
+ "count": 130
1246
  },
1247
  "SS": {
1248
+ "accuracy": 0.5632183908045977,
1249
+ "count": 87
1250
  },
1251
  "UC": {
1252
+ "accuracy": 0.32019704433497537,
1253
+ "count": 203
1254
  },
1255
  "US": {
1256
+ "accuracy": 0.49572649572649574,
1257
+ "count": 117
1258
  }
1259
  }
1260
  },
1261
  "add_S3": {
1262
  "full_accuracy": 0.0,
1263
+ "digit_accuracy": 0.3271428571428571,
1264
+ "n_examples": 100,
1265
  "per_subtask": {
1266
  "SA": {
1267
+ "accuracy": 0.3884297520661157,
1268
+ "count": 121
1269
  },
1270
  "SC": {
1271
+ "accuracy": 0.06611570247933884,
1272
+ "count": 121
1273
  },
1274
  "SS": {
1275
+ "accuracy": 0.7346938775510204,
1276
+ "count": 49
1277
  },
1278
  "UC": {
1279
+ "accuracy": 0.27419354838709675,
1280
+ "count": 186
1281
  },
1282
  "US": {
1283
+ "accuracy": 0.3901345291479821,
1284
+ "count": 223
1285
  }
1286
  }
1287
  },
1288
  "add_S4": {
1289
  "full_accuracy": 0.0,
1290
+ "digit_accuracy": 0.4042857142857143,
1291
+ "n_examples": 100,
1292
  "per_subtask": {
1293
  "SA": {
1294
+ "accuracy": 0.3557692307692308,
1295
+ "count": 104
1296
  },
1297
  "SC": {
1298
+ "accuracy": 0.10377358490566038,
1299
+ "count": 106
1300
  },
1301
  "SS": {
1302
+ "accuracy": 0.5652173913043478,
1303
+ "count": 23
1304
  },
1305
  "UC": {
1306
+ "accuracy": 0.38125,
1307
+ "count": 160
1308
  },
1309
  "US": {
1310
+ "accuracy": 0.5244299674267101,
1311
+ "count": 307
1312
  }
1313
  }
1314
  },
1315
  "add_S5": {
1316
  "full_accuracy": 0.0,
1317
+ "digit_accuracy": 0.13,
1318
+ "n_examples": 100,
1319
  "per_subtask": {
1320
  "SA": {
1321
+ "accuracy": 0.46,
1322
+ "count": 100
1323
  },
1324
  "SC": {
1325
+ "accuracy": 0.01,
1326
+ "count": 100
1327
  },
1328
  "UC": {
1329
+ "accuracy": 0.16,
1330
+ "count": 100
1331
  },
1332
  "US": {
1333
+ "accuracy": 0.07,
1334
+ "count": 400
1335
  }
1336
  }
1337
  },
1338
  "add_S6": {
1339
+ "full_accuracy": 0.1,
1340
+ "digit_accuracy": 0.4857142857142857,
1341
+ "n_examples": 100,
1342
  "per_subtask": {
1343
  "SC": {
1344
+ "accuracy": 0.1,
1345
+ "count": 100
1346
  },
1347
  "UC": {
1348
+ "accuracy": 0.58,
1349
+ "count": 100
1350
  },
1351
  "US": {
1352
+ "accuracy": 0.544,
1353
+ "count": 500
1354
  }
1355
  }
1356
  },
1357
  "add_random": {
1358
  "full_accuracy": 0.0,
1359
+ "digit_accuracy": 0.2307142857142857,
1360
  "n_examples": 200,
1361
  "per_subtask": {
1362
  "SA": {
1363
+ "accuracy": 0.25279642058165547,
1364
+ "count": 447
1365
  },
1366
  "SC": {
1367
+ "accuracy": 0.09375,
1368
+ "count": 320
1369
  },
1370
  "SS": {
1371
+ "accuracy": 0.625,
1372
+ "count": 56
1373
  },
1374
  "UC": {
1375
+ "accuracy": 0.23062381852551986,
1376
+ "count": 529
1377
  },
1378
  "US": {
1379
+ "accuracy": 0.4791666666666667,
1380
+ "count": 48
1381
  }
1382
  }
1383
  },
1384
  "add_C1": {
1385
  "full_accuracy": 0.0,
1386
+ "digit_accuracy": 0.1442857142857143,
1387
+ "n_examples": 100,
1388
  "per_subtask": {
1389
  "SA": {
1390
+ "accuracy": 0.178,
1391
+ "count": 500
1392
  },
1393
  "SC": {
1394
  "accuracy": 0.0,
1395
+ "count": 100
1396
  },
1397
  "UC": {
1398
+ "accuracy": 0.12,
1399
+ "count": 100
1400
  }
1401
  }
1402
  },
1403
  "add_C2": {
1404
  "full_accuracy": 0.0,
1405
+ "digit_accuracy": 0.15142857142857144,
1406
+ "n_examples": 100,
1407
  "per_subtask": {
1408
  "SA": {
1409
+ "accuracy": 0.215,
1410
+ "count": 400
1411
  },
1412
  "SC": {
1413
+ "accuracy": 0.01,
1414
+ "count": 100
1415
  },
1416
  "UC": {
1417
+ "accuracy": 0.09615384615384616,
1418
+ "count": 156
1419
  },
1420
  "US": {
1421
+ "accuracy": 0.09090909090909091,
1422
+ "count": 44
1423
  }
1424
  }
1425
  },
1426
  "add_C3": {
1427
  "full_accuracy": 0.0,
1428
+ "digit_accuracy": 0.15428571428571428,
1429
+ "n_examples": 100,
1430
  "per_subtask": {
1431
  "SA": {
1432
+ "accuracy": 0.25,
1433
+ "count": 300
1434
  },
1435
  "SC": {
 
 
 
 
1436
  "accuracy": 0.06,
1437
  "count": 100
1438
  },
1439
+ "UC": {
1440
+ "accuracy": 0.09045226130653267,
1441
+ "count": 199
1442
+ },
1443
  "US": {
1444
+ "accuracy": 0.0891089108910891,
1445
+ "count": 101
1446
  }
1447
  }
1448
  },
1449
  "add_C4": {
1450
  "full_accuracy": 0.0,
1451
+ "digit_accuracy": 0.18571428571428572,
1452
+ "n_examples": 100,
1453
  "per_subtask": {
1454
  "SA": {
1455
+ "accuracy": 0.34,
1456
+ "count": 200
1457
  },
1458
  "SC": {
1459
+ "accuracy": 0.05,
1460
+ "count": 100
1461
  },
1462
  "UC": {
1463
+ "accuracy": 0.10227272727272728,
1464
+ "count": 264
1465
  },
1466
  "US": {
1467
+ "accuracy": 0.22058823529411764,
1468
+ "count": 136
1469
  }
1470
  }
1471
  },
1472
  "add_C5": {
1473
  "full_accuracy": 0.0,
1474
+ "digit_accuracy": 0.21,
1475
+ "n_examples": 100,
1476
  "per_subtask": {
1477
  "SA": {
1478
+ "accuracy": 0.52,
1479
+ "count": 100
1480
  },
1481
  "SC": {
1482
+ "accuracy": 0.06,
1483
+ "count": 100
1484
  },
1485
  "UC": {
1486
+ "accuracy": 0.11935483870967742,
1487
+ "count": 310
1488
  },
1489
  "US": {
1490
+ "accuracy": 0.2736842105263158,
1491
+ "count": 190
1492
  }
1493
  }
1494
  },
1495
  "add_C6": {
1496
+ "full_accuracy": 0.01,
1497
+ "digit_accuracy": 0.36428571428571427,
1498
+ "n_examples": 100,
1499
  "per_subtask": {
1500
  "SC": {
1501
+ "accuracy": 0.12,
1502
+ "count": 100
1503
  },
1504
  "UC": {
1505
+ "accuracy": 0.24324324324324326,
1506
+ "count": 370
1507
  },
1508
  "US": {
1509
+ "accuracy": 0.6652173913043479,
1510
+ "count": 230
1511
  }
1512
  }
1513
  },
1514
  "sub_M0": {
1515
  "full_accuracy": 0.0,
1516
+ "digit_accuracy": 0.3028571428571429,
1517
+ "n_examples": 100,
1518
  "per_subtask": {
1519
  "MD": {
1520
+ "accuracy": 0.216260162601626,
1521
+ "count": 615
1522
  },
1523
  "ME": {
1524
+ "accuracy": 0.9294117647058824,
1525
+ "count": 85
1526
  }
1527
  }
1528
  },
1529
  "sub_M1": {
1530
  "full_accuracy": 0.0,
1531
+ "digit_accuracy": 0.23142857142857143,
1532
+ "n_examples": 100,
1533
  "per_subtask": {
1534
  "MD": {
1535
+ "accuracy": 0.3904109589041096,
1536
+ "count": 292
1537
  },
1538
  "MB": {
1539
  "accuracy": 0.0,
1540
+ "count": 144
1541
  },
1542
  "ME": {
1543
+ "accuracy": 0.92,
1544
+ "count": 25
1545
  },
1546
  "UB": {
1547
+ "accuracy": 0.10460251046025104,
1548
+ "count": 239
1549
  }
1550
  }
1551
  },
1552
  "sub_M2": {
1553
  "full_accuracy": 0.0,
1554
+ "digit_accuracy": 0.35714285714285715,
1555
+ "n_examples": 100,
1556
  "per_subtask": {
1557
  "MD": {
1558
+ "accuracy": 0.6113744075829384,
1559
+ "count": 211
1560
  },
1561
  "MB": {
1562
+ "accuracy": 0.02608695652173913,
1563
+ "count": 115
1564
  },
1565
  "ME": {
1566
+ "accuracy": 0.9529411764705882,
1567
+ "count": 85
1568
  },
1569
  "UB": {
1570
+ "accuracy": 0.15469613259668508,
1571
+ "count": 181
1572
  },
1573
  "UD": {
1574
+ "accuracy": 0.08333333333333333,
1575
+ "count": 108
1576
  }
1577
  }
1578
  },
1579
  "sub_M3": {
1580
  "full_accuracy": 0.0,
1581
+ "digit_accuracy": 0.33714285714285713,
1582
+ "n_examples": 100,
1583
  "per_subtask": {
1584
  "MD": {
1585
+ "accuracy": 0.7430167597765364,
1586
+ "count": 179
1587
  },
1588
  "MB": {
1589
+ "accuracy": 0.009708737864077669,
1590
+ "count": 103
1591
  },
1592
  "ME": {
1593
+ "accuracy": 0.9821428571428571,
1594
+ "count": 56
1595
  },
1596
  "UB": {
1597
+ "accuracy": 0.15436241610738255,
1598
+ "count": 149
1599
  },
1600
  "UD": {
1601
+ "accuracy": 0.11267605633802817,
1602
+ "count": 213
1603
  }
1604
  }
1605
  },
1606
  "sub_M4": {
1607
  "full_accuracy": 0.0,
1608
+ "digit_accuracy": 0.22285714285714286,
1609
+ "n_examples": 100,
1610
  "per_subtask": {
1611
  "MD": {
1612
+ "accuracy": 0.53,
1613
+ "count": 200
1614
  },
1615
  "MB": {
1616
  "accuracy": 0.0,
1617
+ "count": 100
1618
  },
1619
  "UB": {
1620
+ "accuracy": 0.33,
1621
+ "count": 100
1622
  },
1623
  "UD": {
1624
+ "accuracy": 0.056666666666666664,
1625
+ "count": 300
1626
  }
1627
  }
1628
  },
1629
  "sub_M5": {
1630
  "full_accuracy": 0.0,
1631
+ "digit_accuracy": 0.23857142857142857,
1632
+ "n_examples": 100,
1633
  "per_subtask": {
1634
  "MD": {
1635
  "accuracy": 1.0,
1636
+ "count": 100
1637
  },
1638
  "MB": {
1639
+ "accuracy": 0.03,
1640
+ "count": 100
1641
  },
1642
  "UB": {
1643
+ "accuracy": 0.35,
1644
+ "count": 100
1645
  },
1646
  "UD": {
1647
+ "accuracy": 0.0725,
1648
+ "count": 400
1649
  }
1650
  }
1651
  },
1652
  "sub_random": {
1653
  "full_accuracy": 0.0,
1654
+ "digit_accuracy": 0.24642857142857144,
1655
  "n_examples": 200,
1656
  "per_subtask": {
1657
  "MD": {
1658
+ "accuracy": 0.39166666666666666,
1659
+ "count": 600
1660
  },
1661
  "MB": {
1662
+ "accuracy": 0.00749063670411985,
1663
+ "count": 267
1664
  },
1665
  "ME": {
1666
+ "accuracy": 0.9433962264150944,
1667
  "count": 53
1668
  },
1669
  "UB": {
1670
+ "accuracy": 0.12984054669703873,
1671
+ "count": 439
1672
  },
1673
  "UD": {
1674
+ "accuracy": 0.024390243902439025,
1675
+ "count": 41
1676
  }
1677
  }
1678
  },
1679
  "sub_B3": {
1680
  "full_accuracy": 0.0,
1681
+ "digit_accuracy": 0.21142857142857144,
1682
+ "n_examples": 100,
1683
  "per_subtask": {
1684
  "MD": {
1685
+ "accuracy": 0.36,
1686
+ "count": 300
1687
  },
1688
  "MB": {
1689
+ "accuracy": 0.02,
1690
+ "count": 100
1691
  },
1692
  "UB": {
1693
+ "accuracy": 0.17258883248730963,
1694
+ "count": 197
1695
  },
1696
  "UD": {
1697
+ "accuracy": 0.038834951456310676,
1698
+ "count": 103
1699
  }
1700
  }
1701
  },
1702
  "sub_B4": {
1703
  "full_accuracy": 0.0,
1704
+ "digit_accuracy": 0.22285714285714286,
1705
+ "n_examples": 100,
1706
  "per_subtask": {
1707
  "MD": {
1708
+ "accuracy": 0.53,
1709
+ "count": 200
1710
  },
1711
  "MB": {
1712
  "accuracy": 0.0,
1713
+ "count": 100
1714
  },
1715
  "UB": {
1716
+ "accuracy": 0.14979757085020243,
1717
+ "count": 247
1718
  },
1719
  "UD": {
1720
+ "accuracy": 0.08496732026143791,
1721
+ "count": 153
1722
  }
1723
  }
1724
  },
1725
  "sub_B5": {
1726
  "full_accuracy": 0.0,
1727
+ "digit_accuracy": 0.21857142857142858,
1728
+ "n_examples": 100,
1729
  "per_subtask": {
1730
  "MD": {
1731
  "accuracy": 1.0,
1732
+ "count": 100
1733
  },
1734
  "MB": {
1735
+ "accuracy": 0.02,
1736
+ "count": 100
1737
  },
1738
  "UB": {
1739
+ "accuracy": 0.14093959731543623,
1740
+ "count": 298
1741
  },
1742
  "UD": {
1743
+ "accuracy": 0.04455445544554455,
1744
+ "count": 202
1745
  }
1746
  }
1747
  }
1748
  },
1749
  "summary": {
1750
+ "overall_accuracy": 0.004230769230769231,
1751
+ "digit_accuracy": 0.25884615384615384,
1752
+ "total_examples": 2600,
1753
  "n_splits": 24
1754
  }
1755
  },
 
1759
  "K": 1,
1760
  "mode": "sorl",
1761
  "n_digits": 6,
1762
+ "n_per_split": 100
1763
  },
1764
  "splits": {
1765
  "add_S0": {
1766
+ "full_accuracy": 0.46,
1767
+ "digit_accuracy": 0.9114285714285715,
1768
+ "n_examples": 100,
1769
  "per_subtask": {
1770
  "SA": {
1771
+ "accuracy": 0.8975206611570248,
1772
+ "count": 605
1773
  },
1774
  "SS": {
1775
+ "accuracy": 1.0,
1776
+ "count": 95
1777
  }
1778
  }
1779
  },
1780
  "add_S1": {
1781
+ "full_accuracy": 0.43,
1782
+ "digit_accuracy": 0.9,
1783
+ "n_examples": 100,
1784
  "per_subtask": {
1785
  "SA": {
1786
+ "accuracy": 0.9264705882352942,
1787
+ "count": 204
1788
  },
1789
  "SC": {
1790
+ "accuracy": 0.8875739644970414,
1791
+ "count": 169
1792
  },
1793
  "SS": {
1794
+ "accuracy": 1.0,
1795
+ "count": 31
1796
  },
1797
  "UC": {
1798
+ "accuracy": 0.8783783783783784,
1799
+ "count": 296
1800
  }
1801
  }
1802
  },
1803
  "add_S2": {
1804
+ "full_accuracy": 0.19,
1805
+ "digit_accuracy": 0.82,
1806
+ "n_examples": 100,
1807
  "per_subtask": {
1808
  "SA": {
1809
+ "accuracy": 0.8834355828220859,
1810
+ "count": 163
1811
  },
1812
  "SC": {
1813
+ "accuracy": 0.8692307692307693,
1814
+ "count": 130
1815
  },
1816
  "SS": {
1817
+ "accuracy": 0.9655172413793104,
1818
+ "count": 87
1819
  },
1820
  "UC": {
1821
+ "accuracy": 0.6354679802955665,
1822
+ "count": 203
1823
  },
1824
  "US": {
1825
+ "accuracy": 0.8888888888888888,
1826
+ "count": 117
1827
  }
1828
  }
1829
  },
1830
  "add_S3": {
1831
+ "full_accuracy": 0.09,
1832
+ "digit_accuracy": 0.75,
1833
+ "n_examples": 100,
1834
  "per_subtask": {
1835
  "SA": {
1836
+ "accuracy": 0.9173553719008265,
1837
+ "count": 121
1838
  },
1839
  "SC": {
1840
+ "accuracy": 0.8925619834710744,
1841
+ "count": 121
1842
  },
1843
  "SS": {
1844
+ "accuracy": 0.9795918367346939,
1845
+ "count": 49
1846
  },
1847
  "UC": {
1848
+ "accuracy": 0.5645161290322581,
1849
+ "count": 186
1850
  },
1851
  "US": {
1852
+ "accuracy": 0.6860986547085202,
1853
+ "count": 223
1854
  }
1855
  }
1856
  },
1857
  "add_S4": {
1858
+ "full_accuracy": 0.15,
1859
+ "digit_accuracy": 0.6914285714285714,
1860
+ "n_examples": 100,
1861
  "per_subtask": {
1862
  "SA": {
1863
+ "accuracy": 0.9711538461538461,
1864
+ "count": 104
1865
  },
1866
  "SC": {
1867
+ "accuracy": 0.9245283018867925,
1868
+ "count": 106
1869
  },
1870
  "SS": {
1871
+ "accuracy": 1.0,
1872
+ "count": 23
1873
  },
1874
  "UC": {
1875
+ "accuracy": 0.56875,
1876
+ "count": 160
1877
  },
1878
  "US": {
1879
+ "accuracy": 0.5570032573289903,
1880
+ "count": 307
1881
  }
1882
  }
1883
  },
1884
  "add_S5": {
1885
+ "full_accuracy": 0.11,
1886
+ "digit_accuracy": 0.5214285714285715,
1887
+ "n_examples": 100,
1888
  "per_subtask": {
1889
  "SA": {
1890
+ "accuracy": 0.99,
1891
+ "count": 100
1892
  },
1893
  "SC": {
1894
+ "accuracy": 0.97,
1895
+ "count": 100
1896
  },
1897
  "UC": {
1898
+ "accuracy": 0.33,
1899
+ "count": 100
1900
  },
1901
  "US": {
1902
+ "accuracy": 0.34,
1903
+ "count": 400
1904
  }
1905
  }
1906
  },
1907
  "add_S6": {
1908
+ "full_accuracy": 0.14,
1909
+ "digit_accuracy": 0.43142857142857144,
1910
+ "n_examples": 100,
1911
  "per_subtask": {
1912
  "SC": {
1913
+ "accuracy": 1.0,
1914
+ "count": 100
1915
  },
1916
  "UC": {
1917
+ "accuracy": 0.21,
1918
+ "count": 100
1919
  },
1920
  "US": {
1921
+ "accuracy": 0.362,
1922
+ "count": 500
1923
  }
1924
  }
1925
  },
1926
  "add_random": {
1927
+ "full_accuracy": 0.315,
1928
+ "digit_accuracy": 0.8671428571428571,
1929
  "n_examples": 200,
1930
  "per_subtask": {
1931
  "SA": {
1932
+ "accuracy": 0.8926174496644296,
1933
+ "count": 447
1934
  },
1935
  "SC": {
1936
+ "accuracy": 0.93125,
1937
+ "count": 320
1938
  },
1939
  "SS": {
1940
+ "accuracy": 0.9464285714285714,
1941
+ "count": 56
1942
  },
1943
  "UC": {
1944
+ "accuracy": 0.8071833648393195,
1945
+ "count": 529
1946
  },
1947
  "US": {
1948
+ "accuracy": 0.7708333333333334,
1949
+ "count": 48
1950
  }
1951
  }
1952
  },
1953
  "add_C1": {
1954
+ "full_accuracy": 0.31,
1955
+ "digit_accuracy": 0.8614285714285714,
1956
+ "n_examples": 100,
1957
  "per_subtask": {
1958
  "SA": {
1959
+ "accuracy": 0.9,
1960
+ "count": 500
1961
  },
1962
  "SC": {
1963
+ "accuracy": 0.91,
1964
+ "count": 100
1965
  },
1966
  "UC": {
1967
+ "accuracy": 0.62,
1968
+ "count": 100
1969
  }
1970
  }
1971
  },
1972
  "add_C2": {
1973
+ "full_accuracy": 0.34,
1974
+ "digit_accuracy": 0.8528571428571429,
1975
+ "n_examples": 100,
1976
  "per_subtask": {
1977
  "SA": {
1978
+ "accuracy": 0.905,
1979
+ "count": 400
1980
  },
1981
  "SC": {
1982
+ "accuracy": 0.92,
1983
+ "count": 100
1984
  },
1985
  "UC": {
1986
+ "accuracy": 0.6794871794871795,
1987
+ "count": 156
1988
  },
1989
  "US": {
1990
+ "accuracy": 0.8409090909090909,
1991
+ "count": 44
1992
  }
1993
  }
1994
  },
1995
  "add_C3": {
1996
+ "full_accuracy": 0.23,
1997
+ "digit_accuracy": 0.8328571428571429,
1998
+ "n_examples": 100,
1999
  "per_subtask": {
2000
  "SA": {
2001
+ "accuracy": 0.9366666666666666,
2002
+ "count": 300
2003
  },
2004
  "SC": {
2005
+ "accuracy": 0.95,
2006
+ "count": 100
2007
  },
2008
  "UC": {
2009
+ "accuracy": 0.6432160804020101,
2010
+ "count": 199
2011
  },
2012
  "US": {
2013
+ "accuracy": 0.7821782178217822,
2014
+ "count": 101
2015
  }
2016
  }
2017
  },
2018
  "add_C4": {
2019
+ "full_accuracy": 0.16,
2020
+ "digit_accuracy": 0.8014285714285714,
2021
+ "n_examples": 100,
2022
  "per_subtask": {
2023
  "SA": {
2024
+ "accuracy": 0.945,
2025
+ "count": 200
2026
  },
2027
  "SC": {
2028
+ "accuracy": 0.97,
2029
+ "count": 100
2030
  },
2031
  "UC": {
2032
+ "accuracy": 0.6287878787878788,
2033
+ "count": 264
2034
  },
2035
  "US": {
2036
+ "accuracy": 0.8014705882352942,
2037
+ "count": 136
2038
  }
2039
  }
2040
  },
2041
  "add_C5": {
2042
+ "full_accuracy": 0.13,
2043
+ "digit_accuracy": 0.7571428571428571,
2044
+ "n_examples": 100,
2045
  "per_subtask": {
2046
  "SA": {
2047
+ "accuracy": 1.0,
2048
+ "count": 100
2049
  },
2050
  "SC": {
2051
+ "accuracy": 0.97,
2052
+ "count": 100
2053
  },
2054
  "UC": {
2055
+ "accuracy": 0.5806451612903226,
2056
+ "count": 310
2057
  },
2058
  "US": {
2059
+ "accuracy": 0.8052631578947368,
2060
+ "count": 190
2061
  }
2062
  }
2063
  },
2064
  "add_C6": {
2065
+ "full_accuracy": 0.15,
2066
+ "digit_accuracy": 0.7857142857142857,
2067
+ "n_examples": 100,
2068
  "per_subtask": {
2069
  "SC": {
2070
+ "accuracy": 1.0,
2071
+ "count": 100
2072
  },
2073
  "UC": {
2074
+ "accuracy": 0.6891891891891891,
2075
+ "count": 370
2076
  },
2077
  "US": {
2078
+ "accuracy": 0.8478260869565217,
2079
+ "count": 230
2080
  }
2081
  }
2082
  },
2083
  "sub_M0": {
2084
+ "full_accuracy": 0.44,
2085
+ "digit_accuracy": 0.9,
2086
+ "n_examples": 100,
2087
  "per_subtask": {
2088
  "MD": {
2089
+ "accuracy": 0.8878048780487805,
2090
+ "count": 615
2091
  },
2092
  "ME": {
2093
+ "accuracy": 0.9882352941176471,
2094
+ "count": 85
2095
  }
2096
  }
2097
  },
2098
  "sub_M1": {
2099
+ "full_accuracy": 0.22,
2100
+ "digit_accuracy": 0.8314285714285714,
2101
+ "n_examples": 100,
2102
  "per_subtask": {
2103
  "MD": {
2104
+ "accuracy": 0.910958904109589,
2105
+ "count": 292
2106
  },
2107
  "MB": {
2108
+ "accuracy": 0.8888888888888888,
2109
+ "count": 144
2110
  },
2111
  "ME": {
2112
+ "accuracy": 1.0,
2113
+ "count": 25
2114
  },
2115
  "UB": {
2116
+ "accuracy": 0.6820083682008368,
2117
+ "count": 239
2118
  }
2119
  }
2120
  },
2121
  "sub_M2": {
2122
+ "full_accuracy": 0.21,
2123
+ "digit_accuracy": 0.8314285714285714,
2124
+ "n_examples": 100,
2125
  "per_subtask": {
2126
  "MD": {
2127
+ "accuracy": 0.9241706161137441,
2128
+ "count": 211
2129
  },
2130
  "MB": {
2131
+ "accuracy": 0.9391304347826087,
2132
+ "count": 115
2133
  },
2134
  "ME": {
2135
+ "accuracy": 1.0,
2136
+ "count": 85
2137
  },
2138
  "UB": {
2139
+ "accuracy": 0.5248618784530387,
2140
+ "count": 181
2141
  },
2142
  "UD": {
2143
+ "accuracy": 0.9166666666666666,
2144
+ "count": 108
2145
  }
2146
  }
2147
  },
2148
  "sub_M3": {
2149
+ "full_accuracy": 0.02,
2150
+ "digit_accuracy": 0.7014285714285714,
2151
+ "n_examples": 100,
2152
  "per_subtask": {
2153
  "MD": {
2154
+ "accuracy": 0.9664804469273743,
2155
+ "count": 179
2156
  },
2157
  "MB": {
2158
+ "accuracy": 0.941747572815534,
2159
+ "count": 103
2160
  },
2161
  "ME": {
2162
+ "accuracy": 0.9821428571428571,
2163
+ "count": 56
2164
  },
2165
  "UB": {
2166
+ "accuracy": 0.44966442953020136,
2167
+ "count": 149
2168
  },
2169
  "UD": {
2170
+ "accuracy": 0.4647887323943662,
2171
+ "count": 213
2172
  }
2173
  }
2174
  },
2175
  "sub_M4": {
2176
+ "full_accuracy": 0.04,
2177
+ "digit_accuracy": 0.63,
2178
+ "n_examples": 100,
2179
  "per_subtask": {
2180
  "MD": {
2181
+ "accuracy": 0.925,
2182
+ "count": 200
2183
  },
2184
  "MB": {
2185
+ "accuracy": 0.95,
2186
+ "count": 100
2187
  },
2188
  "UB": {
2189
+ "accuracy": 0.59,
2190
+ "count": 100
2191
  },
2192
  "UD": {
2193
+ "accuracy": 0.34,
2194
+ "count": 300
2195
  }
2196
  }
2197
  },
2198
  "sub_M5": {
2199
+ "full_accuracy": 0.04,
2200
+ "digit_accuracy": 0.5014285714285714,
2201
+ "n_examples": 100,
2202
  "per_subtask": {
2203
  "MD": {
2204
  "accuracy": 1.0,
2205
+ "count": 100
2206
  },
2207
  "MB": {
2208
+ "accuracy": 1.0,
2209
+ "count": 100
2210
  },
2211
  "UB": {
2212
+ "accuracy": 0.62,
2213
+ "count": 100
2214
  },
2215
  "UD": {
2216
+ "accuracy": 0.2225,
2217
+ "count": 400
2218
  }
2219
  }
2220
  },
2221
  "sub_random": {
2222
+ "full_accuracy": 0.32,
2223
+ "digit_accuracy": 0.8492857142857143,
2224
  "n_examples": 200,
2225
  "per_subtask": {
2226
  "MD": {
2227
+ "accuracy": 0.905,
2228
+ "count": 600
2229
  },
2230
  "MB": {
2231
+ "accuracy": 0.9775280898876404,
2232
+ "count": 267
2233
  },
2234
  "ME": {
2235
+ "accuracy": 0.9811320754716981,
2236
  "count": 53
2237
  },
2238
  "UB": {
2239
+ "accuracy": 0.6674259681093394,
2240
+ "count": 439
2241
  },
2242
  "UD": {
2243
+ "accuracy": 0.975609756097561,
2244
+ "count": 41
2245
  }
2246
  }
2247
  },
2248
  "sub_B3": {
2249
+ "full_accuracy": 0.08,
2250
+ "digit_accuracy": 0.7871428571428571,
2251
+ "n_examples": 100,
2252
  "per_subtask": {
2253
  "MD": {
2254
+ "accuracy": 0.8666666666666667,
2255
+ "count": 300
2256
  },
2257
  "MB": {
2258
+ "accuracy": 0.95,
2259
+ "count": 100
2260
  },
2261
  "UB": {
2262
+ "accuracy": 0.5786802030456852,
2263
+ "count": 197
2264
  },
2265
  "UD": {
2266
+ "accuracy": 0.7961165048543689,
2267
+ "count": 103
2268
  }
2269
  }
2270
  },
2271
  "sub_B4": {
2272
+ "full_accuracy": 0.07,
2273
+ "digit_accuracy": 0.7342857142857143,
2274
+ "n_examples": 100,
2275
  "per_subtask": {
2276
  "MD": {
2277
+ "accuracy": 0.895,
2278
+ "count": 200
2279
  },
2280
  "MB": {
2281
+ "accuracy": 0.93,
2282
+ "count": 100
2283
  },
2284
  "UB": {
2285
+ "accuracy": 0.5789473684210527,
2286
+ "count": 247
2287
  },
2288
  "UD": {
2289
+ "accuracy": 0.6470588235294118,
2290
+ "count": 153
2291
  }
2292
  }
2293
  },
2294
  "sub_B5": {
2295
+ "full_accuracy": 0.09,
2296
+ "digit_accuracy": 0.7557142857142857,
2297
+ "n_examples": 100,
2298
  "per_subtask": {
2299
  "MD": {
2300
  "accuracy": 1.0,
2301
+ "count": 100
2302
  },
2303
  "MB": {
2304
+ "accuracy": 1.0,
2305
+ "count": 100
2306
  },
2307
  "UB": {
2308
+ "accuracy": 0.6375838926174496,
2309
+ "count": 298
2310
  },
2311
  "UD": {
2312
+ "accuracy": 0.6881188118811881,
2313
+ "count": 202
2314
  }
2315
  }
2316
  }
2317
  },
2318
  "summary": {
2319
+ "overall_accuracy": 0.20653846153846153,
2320
+ "digit_accuracy": 0.7701098901098901,
2321
+ "total_examples": 2600,
2322
  "n_splits": 24
2323
  }
2324
  },
2325
+ "sorl_overall_accuracy": 0.20653846153846153,
2326
+ "sft_overall_accuracy": 0.004230769230769231
2327
  }
add_sub_sorl_v1_abs10_K1_25K_2L1H128d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:552c3c376c464a283388ace103d56b77840cec872425d931fcab7533aaebd6fd
3
  size 157702060
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcb5ae69edf2987d564ede203c9db7efacfd8822d3c6cfb5e13932b8c61bf6a6
3
  size 157702060
add_sub_sorl_v1_abs10_K1_25K_2L1H128d/train_config.json CHANGED
@@ -20,7 +20,7 @@
20
  "lr": 2e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
- "warmup_steps": 117,
24
  "cooldown_frac": 0.4,
25
  "max_grad_norm": 1.0,
26
  "vq_abs_pretrain_steps": 0,
@@ -30,7 +30,7 @@
30
  "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
  "gradient_accumulation_steps": 1,
33
- "num_epochs": 10,
34
  "emb_warmup_steps": 0,
35
  "log_every": 50,
36
  "eval_every": 390,
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 39348864,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_25K_2L1H128d",
72
- "git_commit": "f447da529caceac8c7d256cbb2cd185cbc50feac",
73
- "timestamp": "2026-04-12T13:04:02.986587+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "r858hl5t",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/r858hl5t",
81
- "final_accuracy": 0.020416666666666666,
82
- "sft_accuracy": 0.0,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
20
  "lr": 2e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
+ "warmup_steps": 234,
24
  "cooldown_frac": 0.4,
25
  "max_grad_norm": 1.0,
26
  "vq_abs_pretrain_steps": 0,
 
30
  "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
  "gradient_accumulation_steps": 1,
33
+ "num_epochs": 20,
34
  "emb_warmup_steps": 0,
35
  "log_every": 50,
36
  "eval_every": 390,
 
69
  "no_wandb": false,
70
  "n_params": 39348864,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_25K_2L1H128d",
72
+ "git_commit": "f835493c19eb98267697007042c9d440cad2afbb",
73
+ "timestamp": "2026-04-15T18:28:58.775284+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_25K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "v1",
80
+ "wandb_run_id": "imtqvdf8",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/imtqvdf8",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.20653846153846153,
86
+ "sft_accuracy": 0.004230769230769231,
87
  "eval_method": "ArithmeticEvaluator"
88
  }