amirali1985 commited on
Commit
2a6cd83
·
verified ·
1 Parent(s): 3d15e07

Upload add_sub_sorl_v1_abs10_K1_25K_1L2H256d

Browse files
add_sub_sorl_v1_abs10_K1_25K_1L2H256d/metrics.json CHANGED
@@ -70,444 +70,936 @@
70
  3719,
71
  3769,
72
  3819,
73
- 3869
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  ],
75
  "loss": [
76
- 8.524430274963379,
77
- 9.103901863098145,
78
- 11.476999282836914,
79
- 11.07170295715332,
80
- 9.617006301879883,
81
- 8.757355690002441,
82
- 7.827155113220215,
83
- 6.2943830490112305,
84
- 5.568047523498535,
85
- 4.920413970947266,
86
- 4.179653167724609,
87
- 4.015377998352051,
88
- 3.8744664192199707,
89
- 3.6563000679016113,
90
- 3.714816093444824,
91
- 3.138887882232666,
92
- 3.476172924041748,
93
- 3.5434532165527344,
94
- 3.2397027015686035,
95
- 3.118135929107666,
96
- 3.188394546508789,
97
- 3.03411602973938,
98
- 3.202545166015625,
99
- 3.2864937782287598,
100
- 2.669011116027832,
101
- 3.0694193840026855,
102
- 2.755049705505371,
103
- 2.658294677734375,
104
- 3.083820343017578,
105
- 2.523517608642578,
106
- 2.064281463623047,
107
- 2.1752371788024902,
108
- 2.3656363487243652,
109
- 1.9907090663909912,
110
- 1.5538188219070435,
111
- 0.5142228603363037,
112
- 0.4625096321105957,
113
- -0.34851527214050293,
114
- -0.9209808111190796,
115
- -0.848351240158081,
116
- -1.2908415794372559,
117
- -1.9059382677078247,
118
- -2.5187909603118896,
119
- -2.6178600788116455,
120
- -2.9667201042175293,
121
- -3.4197542667388916,
122
- -3.4599928855895996,
123
- -4.1904425621032715,
124
- -5.044626235961914,
125
- -4.426793098449707,
126
- -4.773674011230469,
127
- -5.006862640380859,
128
- -5.084996223449707,
129
- -5.224812030792236,
130
- -5.75303316116333,
131
- -5.572091102600098,
132
- -5.412012100219727,
133
- -5.735694408416748,
134
- -4.803545951843262,
135
- -4.7789177894592285,
136
- -4.737234115600586,
137
- -4.012823104858398,
138
- -3.9731242656707764,
139
- -4.444588661193848,
140
- -4.273018836975098,
141
- -4.604288101196289,
142
- -4.26378059387207,
143
- -4.285333156585693,
144
- -3.932020902633667,
145
- -4.814791202545166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  ],
147
  "base_loss": [
148
- 11.692604064941406,
149
- 10.586286544799805,
150
- 9.575356483459473,
151
- 8.626004219055176,
152
- 7.766415119171143,
153
- 6.867415904998779,
154
- 5.944822788238525,
155
- 4.488890647888184,
156
- 3.712851047515869,
157
- 3.1012394428253174,
158
- 2.6632230281829834,
159
- 2.547546863555908,
160
- 2.3669917583465576,
161
- 2.289210081100464,
162
- 2.217154026031494,
163
- 2.0566632747650146,
164
- 2.032634735107422,
165
- 1.9611835479736328,
166
- 2.062990427017212,
167
- 1.9896939992904663,
168
- 1.9653056859970093,
169
- 1.9700582027435303,
170
- 1.9406718015670776,
171
- 1.9608590602874756,
172
- 1.9188401699066162,
173
- 1.9645260572433472,
174
- 1.8941099643707275,
175
- 1.8756904602050781,
176
- 1.9264477491378784,
177
- 1.9439666271209717,
178
- 1.8485826253890991,
179
- 1.9204241037368774,
180
- 1.8739402294158936,
181
- 1.8911492824554443,
182
- 1.90359628200531,
183
- 1.8942089080810547,
184
- 1.9322471618652344,
185
- 1.8236247301101685,
186
- 1.8280603885650635,
187
- 1.8796042203903198,
188
- 1.8643499612808228,
189
- 1.9163367748260498,
190
- 1.8653056621551514,
191
- 1.8866708278656006,
192
- 1.8857192993164062,
193
- 1.8382318019866943,
194
- 1.8878859281539917,
195
- 1.8952440023422241,
196
- 1.9479080438613892,
197
- 1.8315229415893555,
198
- 1.853525161743164,
199
- 1.7897322177886963,
200
- 1.7672021389007568,
201
- 1.67568838596344,
202
- 1.732727289199829,
203
- 1.6849744319915771,
204
- 1.6561976671218872,
205
- 1.6894086599349976,
206
- 1.5698803663253784,
207
- 1.526742935180664,
208
- 1.5828590393066406,
209
- 1.4620859622955322,
210
- 1.427542805671692,
211
- 1.5105113983154297,
212
- 1.487876296043396,
213
- 1.4823344945907593,
214
- 1.458412766456604,
215
- 1.479615330696106,
216
- 1.3868151903152466,
217
- 1.540739893913269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  ],
219
  "info_loss": [
220
- -1.2530698776245117,
221
- -0.9802732467651367,
222
- -0.4201974868774414,
223
- -0.18035411834716797,
224
- -0.13341093063354492,
225
- -0.07659339904785156,
226
- -0.04645824432373047,
227
- -0.028890609741210938,
228
- -0.016974687576293945,
229
- -0.016387462615966797,
230
- -0.04442238807678223,
231
- -0.047545671463012695,
232
- -0.042586565017700195,
233
- -0.056116580963134766,
234
- -0.04165053367614746,
235
- -0.08310163021087646,
236
- -0.04619729518890381,
237
- -0.03194534778594971,
238
- -0.07237017154693604,
239
- -0.076835036277771,
240
- -0.06701540946960449,
241
- -0.08263790607452393,
242
- -0.06275796890258789,
243
- -0.05613744258880615,
244
- -0.11362743377685547,
245
- -0.07795798778533936,
246
- -0.10230779647827148,
247
- -0.11010193824768066,
248
- -0.07229125499725342,
249
- -0.13019275665283203,
250
- -0.16647255420684814,
251
- -0.16245436668395996,
252
- -0.138879656791687,
253
- -0.17805254459381104,
254
- -0.22295153141021729,
255
- -0.32607901096343994,
256
- -0.3350032567977905,
257
- -0.40525615215301514,
258
- -0.4629077911376953,
259
- -0.46074652671813965,
260
- -0.5033977031707764,
261
- -0.5704048871994019,
262
- -0.6261879205703735,
263
- -0.6387062072753906,
264
- -0.6733493804931641,
265
- -0.7140244245529175,
266
- -0.7230769395828247,
267
- -0.796980619430542,
268
- -0.887682318687439,
269
- -0.8138977289199829,
270
- -0.8508479595184326,
271
- -0.8680267930030823,
272
- -0.8732814192771912,
273
- -0.8783826231956482,
274
- -0.9365927577018738,
275
- -0.9135531187057495,
276
- -0.8946322798728943,
277
- -0.9301126599311829,
278
- -0.8245593905448914,
279
- -0.8180517554283142,
280
- -0.8192586898803711,
281
- -0.734249472618103,
282
- -0.7269594073295593,
283
- -0.7820543646812439,
284
- -0.7622780799865723,
285
- -0.7947943210601807,
286
- -0.7579862475395203,
287
- -0.762677013874054,
288
- -0.7179456353187561,
289
- -0.821166455745697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  ],
291
  "abs_loss": [
292
- 2.301368474960327,
293
- 2.272613763809204,
294
- 2.1877543926239014,
295
- 2.0855491161346436,
296
- 2.003074884414673,
297
- 1.9530082941055298,
298
- 1.8817201852798462,
299
- 1.862039566040039,
300
- 1.8272932767868042,
301
- 1.8210479021072388,
302
- 1.8502413034439087,
303
- 1.8328882455825806,
304
- 1.8329845666885376,
305
- 1.8578821420669556,
306
- 1.8351255655288696,
307
- 1.8775984048843384,
308
- 1.8421878814697266,
309
- 1.8364969491958618,
310
- 1.8568716049194336,
311
- 1.8509222269058228,
312
- 1.840528130531311,
313
- 1.8526434898376465,
314
- 1.8566508293151855,
315
- 1.8434433937072754,
316
- 1.8501802682876587,
317
- 1.8451181650161743,
318
- 1.8511062860488892,
319
- 1.8564246892929077,
320
- 1.8391847610473633,
321
- 1.8499897718429565,
322
- 1.8488394021987915,
323
- 1.8438078165054321,
324
- 1.8546184301376343,
325
- 1.857653021812439,
326
- 1.8427098989486694,
327
- 1.837051272392273,
328
- 1.8287359476089478,
329
- 1.8250846862792969,
330
- 1.8261111974716187,
331
- 1.8157788515090942,
332
- 1.813087821006775,
333
- 1.8383451700210571,
334
- 1.7974052429199219,
335
- 1.8346837759017944,
336
- 1.8273576498031616,
337
- 1.8329792022705078,
338
- 1.8257931470870972,
339
- 1.8227308988571167,
340
- 1.8331661224365234,
341
- 1.8032366037368774,
342
- 1.8001683950424194,
343
- 1.8143806457519531,
344
- 1.8048582077026367,
345
- 1.8383716344833374,
346
- 1.8124042749404907,
347
- 1.7793735265731812,
348
- 1.7747602462768555,
349
- 1.7881097793579102,
350
- 1.7533124685287476,
351
- 1.776807188987732,
352
- 1.735535979270935,
353
- 1.737404465675354,
354
- 1.7615875005722046,
355
- 1.7293590307235718,
356
- 1.7104082107543945,
357
- 1.7312254905700684,
358
- 1.700640082359314,
359
- 1.708137035369873,
360
- 1.7061353921890259,
361
- 1.6751855611801147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  ],
363
  "zipf_loss": [
364
- 9.1323881149292,
365
- 8.093086242675781,
366
- 5.884842872619629,
367
- 4.040685653686523,
368
- 2.9843926429748535,
369
- 2.4605727195739746,
370
- 2.158742666244507,
371
- 1.9081945419311523,
372
- 1.8422138690948486,
373
- 1.8009443283081055,
374
- 1.775629997253418,
375
- 1.75999915599823,
376
- 1.7500418424606323,
377
- 1.7424676418304443,
378
- 1.7306548357009888,
379
- 1.7254811525344849,
380
- 1.7212923765182495,
381
- 1.7180733680725098,
382
- 1.7147266864776611,
383
- 1.711700201034546,
384
- 1.709190011024475,
385
- 1.7051725387573242,
386
- 1.703787922859192,
387
- 1.702664852142334,
388
- 1.7014273405075073,
389
- 1.699961543083191,
390
- 1.6989071369171143,
391
- 1.6979811191558838,
392
- 1.696366548538208,
393
- 1.6964795589447021,
394
- 1.695540428161621,
395
- 1.6949759721755981,
396
- 1.6950308084487915,
397
- 1.6943199634552002,
398
- 1.6954668760299683,
399
- 1.697098970413208,
400
- 1.6974214315414429,
401
- 1.697913408279419,
402
- 1.6974254846572876,
403
- 1.6979317665100098,
404
- 1.697476863861084,
405
- 1.6979395151138306,
406
- 1.6980416774749756,
407
- 1.6990625858306885,
408
- 1.6983184814453125,
409
- 1.6989600658416748,
410
- 1.7003107070922852,
411
- 1.701846718788147,
412
- 1.7009726762771606,
413
- 1.700337290763855,
414
- 1.7012641429901123,
415
- 1.7022359371185303,
416
- 1.7001292705535889,
417
- 1.6994887590408325,
418
- 1.6989268064498901,
419
- 1.7005280256271362,
420
- 1.7006367444992065,
421
- 1.6972121000289917,
422
- 1.6968367099761963,
423
- 1.6971760988235474,
424
- 1.6989400386810303,
425
- 1.6938459873199463,
426
- 1.6927683353424072,
427
- 1.6925078630447388,
428
- 1.6908442974090576,
429
- 1.688198208808899,
430
- 1.687605381011963,
431
- 1.6910079717636108,
432
- 1.6900064945220947,
433
- 1.6886142492294312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
  ],
435
  "denoise_loss": [],
436
  "ortho_loss": [
437
- 0.7410624623298645,
438
- 0.5486676096916199,
439
- 0.4325517416000366,
440
- 0.4293130338191986,
441
- 0.42301344871520996,
442
- 0.43677014112472534,
443
- 0.4322786331176758,
444
- 0.43111148476600647,
445
- 0.42917412519454956,
446
- 0.4202711284160614,
447
- 0.40449219942092896,
448
- 0.38009271025657654,
449
- 0.38311055302619934,
450
- 0.38355082273483276,
451
- 0.3772588074207306,
452
- 0.37517693638801575,
453
- 0.35672980546951294,
454
- 0.3655826151371002,
455
- 0.36484745144844055,
456
- 0.36458733677864075,
457
- 0.3716451823711395,
458
- 0.3796772062778473,
459
- 0.37268635630607605,
460
- 0.36658456921577454,
461
- 0.36402279138565063,
462
- 0.36522912979125977,
463
- 0.3712886571884155,
464
- 0.3701127767562866,
465
- 0.3669368028640747,
466
- 0.35840272903442383,
467
- 0.36034736037254333,
468
- 0.36091148853302,
469
- 0.35899659991264343,
470
- 0.356486439704895,
471
- 0.3690042495727539,
472
- 0.3797115981578827,
473
- 0.3873102366924286,
474
- 0.39992478489875793,
475
- 0.41160136461257935,
476
- 0.42042696475982666,
477
- 0.42867836356163025,
478
- 0.43672993779182434,
479
- 0.4450317621231079,
480
- 0.45142629742622375,
481
- 0.451881468296051,
482
- 0.45120909810066223,
483
- 0.4502365291118622,
484
- 0.44785618782043457,
485
- 0.4534400999546051,
486
- 0.45658400654792786,
487
- 0.45758968591690063,
488
- 0.45804986357688904,
489
- 0.4550400674343109,
490
- 0.451234370470047,
491
- 0.4503278136253357,
492
- 0.45379236340522766,
493
- 0.4566922187805176,
494
- 0.4583297073841095,
495
- 0.46125417947769165,
496
- 0.46176251769065857,
497
- 0.4625169634819031,
498
- 0.46337494254112244,
499
- 0.46481263637542725,
500
- 0.4634028375148773,
501
- 0.4632355272769928,
502
- 0.46351659297943115,
503
- 0.46328744292259216,
504
- 0.4644387364387512,
505
- 0.46469375491142273,
506
- 0.4648001492023468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  ],
508
  "lr": [
509
- 8.376068376068378e-06,
510
- 1.6923076923076924e-05,
 
 
511
  2e-05,
512
  2e-05,
513
  2e-05,
@@ -548,34 +1040,102 @@
548
  2e-05,
549
  2e-05,
550
  2e-05,
551
- 1.9973899288162407e-05,
552
- 1.9380701291853413e-05,
553
- 1.8787503295544426e-05,
554
- 1.8194305299235432e-05,
555
- 1.7601107302926442e-05,
556
- 1.7007909306617455e-05,
557
- 1.6414711310308464e-05,
558
- 1.5335090957026102e-05,
559
- 1.474189296071711e-05,
560
- 1.4148694964408121e-05,
561
- 1.355549696809913e-05,
562
- 1.2962298971790142e-05,
563
- 1.2369100975481152e-05,
564
- 1.1775902979172158e-05,
565
- 1.0696282625889797e-05,
566
- 1.0103084629580805e-05,
567
- 9.50988663327182e-06,
568
- 8.91668863696283e-06,
569
- 8.323490640653837e-06,
570
- 7.730292644344845e-06,
571
- 7.137094648035855e-06,
572
- 6.057474294753492e-06,
573
- 5.4642762984445065e-06,
574
- 4.871078302135514e-06,
575
- 4.277880305826523e-06,
576
- 3.684682309517532e-06,
577
- 3.091484313208541e-06,
578
- 2.4982863168995496e-06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
579
  ],
580
  "emb_lr": [],
581
  "eval_step": [
@@ -588,588 +1148,608 @@
588
  2696,
589
  3087,
590
  3478,
591
- 3869
 
 
 
 
 
 
 
 
 
 
592
  ],
593
  "eval_accuracy": [
 
 
 
 
 
 
 
 
 
 
594
  0.01,
595
  0.0,
596
  0.0,
597
- 0.01,
598
  0.0,
599
  0.0,
600
- 0.01,
 
601
  0.0,
602
  0.0,
603
  0.0
604
  ]
605
  },
606
- "final_accuracy": 0.06291666666666666,
607
  "sft_eval": {
608
  "config": {
609
  "ops": "add_sub",
610
  "K": null,
611
  "mode": "sft",
612
  "n_digits": 6,
613
- "n_per_split": 50
614
  },
615
  "splits": {
616
  "add_S0": {
617
- "full_accuracy": 0.0,
618
- "digit_accuracy": 0.5771428571428572,
619
- "n_examples": 50,
620
  "per_subtask": {
621
  "SA": {
622
- "accuracy": 0.5457627118644067,
623
- "count": 295
624
  },
625
  "SS": {
626
- "accuracy": 0.7454545454545455,
627
- "count": 55
628
  }
629
  }
630
  },
631
  "add_S1": {
632
- "full_accuracy": 0.0,
633
- "digit_accuracy": 0.4142857142857143,
634
- "n_examples": 50,
635
  "per_subtask": {
636
  "SA": {
637
- "accuracy": 0.5079365079365079,
638
- "count": 126
639
  },
640
  "SC": {
641
- "accuracy": 0.2911392405063291,
642
- "count": 79
643
  },
644
  "SS": {
645
- "accuracy": 0.6666666666666666,
646
- "count": 21
647
  },
648
  "UC": {
649
- "accuracy": 0.3548387096774194,
650
- "count": 124
651
  }
652
  }
653
  },
654
  "add_S2": {
655
- "full_accuracy": 0.02,
656
- "digit_accuracy": 0.5028571428571429,
657
- "n_examples": 50,
658
  "per_subtask": {
659
  "SA": {
660
- "accuracy": 0.6133333333333333,
661
- "count": 75
662
  },
663
  "SC": {
664
- "accuracy": 0.3709677419354839,
665
- "count": 62
666
  },
667
  "SS": {
668
- "accuracy": 0.6923076923076923,
669
- "count": 39
670
  },
671
  "UC": {
672
- "accuracy": 0.45045045045045046,
673
- "count": 111
674
  },
675
  "US": {
676
- "accuracy": 0.47619047619047616,
677
- "count": 63
678
  }
679
  }
680
  },
681
  "add_S3": {
682
- "full_accuracy": 0.04,
683
- "digit_accuracy": 0.44571428571428573,
684
- "n_examples": 50,
685
  "per_subtask": {
686
  "SA": {
687
- "accuracy": 0.7166666666666667,
688
- "count": 60
689
  },
690
  "SC": {
691
- "accuracy": 0.3157894736842105,
692
- "count": 57
693
  },
694
  "SS": {
695
- "accuracy": 0.47368421052631576,
696
- "count": 19
697
  },
698
  "UC": {
699
- "accuracy": 0.4519230769230769,
700
- "count": 104
701
  },
702
  "US": {
703
- "accuracy": 0.35454545454545455,
704
- "count": 110
705
  }
706
  }
707
  },
708
  "add_S4": {
709
- "full_accuracy": 0.0,
710
- "digit_accuracy": 0.3914285714285714,
711
- "n_examples": 50,
712
  "per_subtask": {
713
  "SA": {
714
- "accuracy": 0.625,
715
- "count": 48
716
  },
717
  "SC": {
718
- "accuracy": 0.38461538461538464,
719
- "count": 52
720
  },
721
  "SS": {
722
- "accuracy": 1.0,
723
- "count": 7
724
  },
725
  "UC": {
726
- "accuracy": 0.3595505617977528,
727
- "count": 89
728
  },
729
  "US": {
730
- "accuracy": 0.3116883116883117,
731
- "count": 154
732
  }
733
  }
734
  },
735
  "add_S5": {
736
- "full_accuracy": 0.06,
737
- "digit_accuracy": 0.3342857142857143,
738
- "n_examples": 50,
739
  "per_subtask": {
740
  "SA": {
741
- "accuracy": 0.72,
742
- "count": 50
743
  },
744
  "SC": {
745
- "accuracy": 0.28,
746
- "count": 50
747
  },
748
  "UC": {
749
- "accuracy": 0.38,
750
- "count": 50
751
  },
752
  "US": {
753
- "accuracy": 0.24,
754
- "count": 200
755
  }
756
  }
757
  },
758
  "add_S6": {
759
- "full_accuracy": 0.18,
760
- "digit_accuracy": 0.3628571428571429,
761
- "n_examples": 50,
762
  "per_subtask": {
763
  "SC": {
764
- "accuracy": 0.48,
765
- "count": 50
766
  },
767
  "UC": {
768
- "accuracy": 0.44,
769
- "count": 50
770
  },
771
  "US": {
772
- "accuracy": 0.324,
773
- "count": 250
774
  }
775
  }
776
  },
777
  "add_random": {
778
- "full_accuracy": 0.0,
779
- "digit_accuracy": 0.44142857142857145,
780
  "n_examples": 200,
781
  "per_subtask": {
782
  "SA": {
783
- "accuracy": 0.531322505800464,
784
- "count": 431
785
  },
786
  "SC": {
787
- "accuracy": 0.439873417721519,
788
- "count": 316
789
  },
790
  "SS": {
791
- "accuracy": 0.7948717948717948,
792
- "count": 39
793
  },
794
  "UC": {
795
- "accuracy": 0.3482142857142857,
796
- "count": 560
797
  },
798
  "US": {
799
- "accuracy": 0.4444444444444444,
800
- "count": 54
801
  }
802
  }
803
  },
804
  "add_C1": {
805
- "full_accuracy": 0.0,
806
- "digit_accuracy": 0.5514285714285714,
807
- "n_examples": 50,
808
  "per_subtask": {
809
  "SA": {
810
- "accuracy": 0.66,
811
- "count": 250
812
  },
813
  "SC": {
814
- "accuracy": 0.24,
815
- "count": 50
816
  },
817
  "UC": {
818
- "accuracy": 0.32,
819
- "count": 50
820
  }
821
  }
822
  },
823
  "add_C2": {
824
- "full_accuracy": 0.02,
825
- "digit_accuracy": 0.4714285714285714,
826
- "n_examples": 50,
827
  "per_subtask": {
828
  "SA": {
829
- "accuracy": 0.545,
830
- "count": 200
831
  },
832
  "SC": {
833
- "accuracy": 0.46,
834
- "count": 50
835
  },
836
  "UC": {
837
- "accuracy": 0.3132530120481928,
838
- "count": 83
839
  },
840
  "US": {
841
- "accuracy": 0.4117647058823529,
842
- "count": 17
843
  }
844
  }
845
  },
846
  "add_C3": {
847
- "full_accuracy": 0.0,
848
- "digit_accuracy": 0.42,
849
- "n_examples": 50,
850
  "per_subtask": {
851
  "SA": {
852
- "accuracy": 0.5933333333333334,
853
- "count": 150
854
  },
855
  "SC": {
856
- "accuracy": 0.36,
857
- "count": 50
858
  },
859
  "UC": {
860
- "accuracy": 0.28,
861
- "count": 100
862
  },
863
  "US": {
864
- "accuracy": 0.24,
865
- "count": 50
866
  }
867
  }
868
  },
869
  "add_C4": {
870
- "full_accuracy": 0.0,
871
- "digit_accuracy": 0.46285714285714286,
872
- "n_examples": 50,
873
  "per_subtask": {
874
  "SA": {
875
- "accuracy": 0.71,
876
- "count": 100
877
  },
878
  "SC": {
879
- "accuracy": 0.36,
880
- "count": 50
881
  },
882
  "UC": {
883
- "accuracy": 0.32575757575757575,
884
- "count": 132
885
  },
886
  "US": {
887
- "accuracy": 0.4411764705882353,
888
- "count": 68
889
  }
890
  }
891
  },
892
  "add_C5": {
893
- "full_accuracy": 0.0,
894
- "digit_accuracy": 0.4142857142857143,
895
- "n_examples": 50,
896
  "per_subtask": {
897
  "SA": {
898
- "accuracy": 0.72,
899
- "count": 50
900
  },
901
  "SC": {
902
- "accuracy": 0.38,
903
- "count": 50
904
  },
905
  "UC": {
906
- "accuracy": 0.3287671232876712,
907
- "count": 146
908
  },
909
  "US": {
910
- "accuracy": 0.40384615384615385,
911
- "count": 104
912
  }
913
  }
914
  },
915
  "add_C6": {
916
- "full_accuracy": 0.04,
917
- "digit_accuracy": 0.40285714285714286,
918
- "n_examples": 50,
919
  "per_subtask": {
920
  "SC": {
921
- "accuracy": 0.4,
922
- "count": 50
923
  },
924
  "UC": {
925
- "accuracy": 0.3862433862433862,
926
- "count": 189
927
  },
928
  "US": {
929
- "accuracy": 0.43243243243243246,
930
- "count": 111
931
  }
932
  }
933
  },
934
  "sub_M0": {
935
- "full_accuracy": 0.0,
936
- "digit_accuracy": 0.45714285714285713,
937
- "n_examples": 50,
938
  "per_subtask": {
939
  "MD": {
940
- "accuracy": 0.39933993399339934,
941
- "count": 303
942
  },
943
  "ME": {
944
- "accuracy": 0.8297872340425532,
945
- "count": 47
946
  }
947
  }
948
  },
949
  "sub_M1": {
950
- "full_accuracy": 0.0,
951
- "digit_accuracy": 0.34285714285714286,
952
- "n_examples": 50,
953
  "per_subtask": {
954
  "MD": {
955
- "accuracy": 0.5177304964539007,
956
- "count": 141
957
  },
958
  "MB": {
959
- "accuracy": 0.09722222222222222,
960
- "count": 72
961
  },
962
  "ME": {
963
- "accuracy": 0.8333333333333334,
964
- "count": 18
965
  },
966
  "UB": {
967
- "accuracy": 0.21008403361344538,
968
- "count": 119
969
  }
970
  }
971
  },
972
  "sub_M2": {
973
- "full_accuracy": 0.0,
974
- "digit_accuracy": 0.4228571428571429,
975
- "n_examples": 50,
976
  "per_subtask": {
977
  "MD": {
978
- "accuracy": 0.7053571428571429,
979
- "count": 112
980
  },
981
  "MB": {
982
- "accuracy": 0.11320754716981132,
983
- "count": 53
984
  },
985
  "ME": {
986
- "accuracy": 0.8297872340425532,
987
- "count": 47
988
  },
989
  "UB": {
990
- "accuracy": 0.2235294117647059,
991
- "count": 85
992
  },
993
  "UD": {
994
- "accuracy": 0.09433962264150944,
995
- "count": 53
996
  }
997
  }
998
  },
999
  "sub_M3": {
1000
- "full_accuracy": 0.0,
1001
- "digit_accuracy": 0.37142857142857144,
1002
- "n_examples": 50,
1003
  "per_subtask": {
1004
  "MD": {
1005
- "accuracy": 0.7216494845360825,
1006
- "count": 97
1007
  },
1008
  "MB": {
1009
- "accuracy": 0.058823529411764705,
1010
- "count": 51
1011
  },
1012
  "ME": {
1013
- "accuracy": 0.8888888888888888,
1014
- "count": 27
1015
  },
1016
  "UB": {
1017
- "accuracy": 0.25675675675675674,
1018
- "count": 74
1019
  },
1020
  "UD": {
1021
- "accuracy": 0.13861386138613863,
1022
- "count": 101
1023
  }
1024
  }
1025
  },
1026
  "sub_M4": {
1027
  "full_accuracy": 0.0,
1028
- "digit_accuracy": 0.2857142857142857,
1029
- "n_examples": 50,
1030
  "per_subtask": {
1031
  "MD": {
1032
- "accuracy": 0.57,
1033
- "count": 100
1034
  },
1035
  "MB": {
1036
- "accuracy": 0.1,
1037
- "count": 50
1038
  },
1039
  "UB": {
1040
- "accuracy": 0.6,
1041
- "count": 50
1042
  },
1043
  "UD": {
1044
- "accuracy": 0.05333333333333334,
1045
- "count": 150
1046
  }
1047
  }
1048
  },
1049
  "sub_M5": {
1050
  "full_accuracy": 0.0,
1051
- "digit_accuracy": 0.2542857142857143,
1052
- "n_examples": 50,
1053
  "per_subtask": {
1054
  "MD": {
1055
  "accuracy": 1.0,
1056
- "count": 50
1057
  },
1058
  "MB": {
1059
- "accuracy": 0.02,
1060
- "count": 50
1061
  },
1062
  "UB": {
1063
- "accuracy": 0.44,
1064
- "count": 50
1065
  },
1066
  "UD": {
1067
- "accuracy": 0.08,
1068
- "count": 200
1069
  }
1070
  }
1071
  },
1072
  "sub_random": {
1073
- "full_accuracy": 0.0,
1074
- "digit_accuracy": 0.32785714285714285,
1075
  "n_examples": 200,
1076
  "per_subtask": {
1077
  "MD": {
1078
- "accuracy": 0.5596491228070175,
1079
- "count": 570
1080
  },
1081
  "MB": {
1082
- "accuracy": 0.06859205776173286,
1083
- "count": 277
1084
  },
1085
  "ME": {
1086
- "accuracy": 0.7547169811320755,
1087
  "count": 53
1088
  },
1089
  "UB": {
1090
- "accuracy": 0.16348195329087048,
1091
- "count": 471
1092
  },
1093
  "UD": {
1094
- "accuracy": 0.13793103448275862,
1095
- "count": 29
1096
  }
1097
  }
1098
  },
1099
  "sub_B3": {
1100
  "full_accuracy": 0.0,
1101
- "digit_accuracy": 0.28,
1102
- "n_examples": 50,
1103
  "per_subtask": {
1104
  "MD": {
1105
- "accuracy": 0.48,
1106
- "count": 150
1107
  },
1108
  "MB": {
1109
- "accuracy": 0.04,
1110
- "count": 50
1111
  },
1112
  "UB": {
1113
- "accuracy": 0.18811881188118812,
1114
- "count": 101
1115
  },
1116
  "UD": {
1117
- "accuracy": 0.10204081632653061,
1118
- "count": 49
1119
  }
1120
  }
1121
  },
1122
  "sub_B4": {
1123
- "full_accuracy": 0.0,
1124
- "digit_accuracy": 0.25142857142857145,
1125
- "n_examples": 50,
1126
  "per_subtask": {
1127
  "MD": {
1128
- "accuracy": 0.61,
1129
- "count": 100
1130
  },
1131
  "MB": {
1132
- "accuracy": 0.02,
1133
- "count": 50
1134
  },
1135
  "UB": {
1136
- "accuracy": 0.17355371900826447,
1137
- "count": 121
1138
  },
1139
  "UD": {
1140
- "accuracy": 0.06329113924050633,
1141
- "count": 79
1142
  }
1143
  }
1144
  },
1145
  "sub_B5": {
1146
  "full_accuracy": 0.0,
1147
- "digit_accuracy": 0.2742857142857143,
1148
- "n_examples": 50,
1149
  "per_subtask": {
1150
  "MD": {
1151
  "accuracy": 1.0,
1152
- "count": 50
1153
  },
1154
  "MB": {
1155
- "accuracy": 0.04,
1156
- "count": 50
1157
  },
1158
  "UB": {
1159
- "accuracy": 0.21710526315789475,
1160
- "count": 152
1161
  },
1162
  "UD": {
1163
- "accuracy": 0.11224489795918367,
1164
- "count": 98
1165
  }
1166
  }
1167
  }
1168
  },
1169
  "summary": {
1170
- "overall_accuracy": 0.012,
1171
- "digit_accuracy": 0.3922857142857143,
1172
- "total_examples": 1500,
1173
  "n_splits": 24
1174
  }
1175
  },
@@ -1179,569 +1759,569 @@
1179
  "K": 1,
1180
  "mode": "sorl",
1181
  "n_digits": 6,
1182
- "n_per_split": 50
1183
  },
1184
  "splits": {
1185
  "add_S0": {
1186
- "full_accuracy": 0.62,
1187
- "digit_accuracy": 0.9228571428571428,
1188
- "n_examples": 50,
1189
  "per_subtask": {
1190
  "SA": {
1191
- "accuracy": 0.9355932203389831,
1192
- "count": 295
1193
  },
1194
  "SS": {
1195
- "accuracy": 0.8545454545454545,
1196
- "count": 55
1197
  }
1198
  }
1199
  },
1200
  "add_S1": {
1201
- "full_accuracy": 0.02,
1202
- "digit_accuracy": 0.66,
1203
- "n_examples": 50,
1204
  "per_subtask": {
1205
  "SA": {
1206
- "accuracy": 0.9365079365079365,
1207
- "count": 126
1208
  },
1209
  "SC": {
1210
- "accuracy": 0.8607594936708861,
1211
- "count": 79
1212
  },
1213
  "SS": {
1214
- "accuracy": 0.8571428571428571,
1215
- "count": 21
1216
  },
1217
  "UC": {
1218
- "accuracy": 0.21774193548387097,
1219
- "count": 124
1220
  }
1221
  }
1222
  },
1223
  "add_S2": {
1224
- "full_accuracy": 0.0,
1225
- "digit_accuracy": 0.5457142857142857,
1226
- "n_examples": 50,
1227
  "per_subtask": {
1228
  "SA": {
1229
- "accuracy": 0.9333333333333333,
1230
- "count": 75
1231
  },
1232
  "SC": {
1233
- "accuracy": 0.7580645161290323,
1234
- "count": 62
1235
  },
1236
  "SS": {
1237
- "accuracy": 0.8461538461538461,
1238
- "count": 39
1239
  },
1240
  "UC": {
1241
- "accuracy": 0.24324324324324326,
1242
- "count": 111
1243
  },
1244
  "US": {
1245
- "accuracy": 0.2222222222222222,
1246
- "count": 63
1247
  }
1248
  }
1249
  },
1250
  "add_S3": {
1251
- "full_accuracy": 0.0,
1252
- "digit_accuracy": 0.5142857142857142,
1253
- "n_examples": 50,
1254
  "per_subtask": {
1255
  "SA": {
1256
- "accuracy": 0.9666666666666667,
1257
- "count": 60
1258
  },
1259
  "SC": {
1260
- "accuracy": 0.8421052631578947,
1261
- "count": 57
1262
  },
1263
  "SS": {
1264
- "accuracy": 1.0,
1265
- "count": 19
1266
  },
1267
  "UC": {
1268
- "accuracy": 0.3557692307692308,
1269
- "count": 104
1270
  },
1271
  "US": {
1272
- "accuracy": 0.16363636363636364,
1273
- "count": 110
1274
  }
1275
  }
1276
  },
1277
  "add_S4": {
1278
- "full_accuracy": 0.0,
1279
- "digit_accuracy": 0.4,
1280
- "n_examples": 50,
1281
  "per_subtask": {
1282
  "SA": {
1283
- "accuracy": 0.9583333333333334,
1284
- "count": 48
1285
  },
1286
  "SC": {
1287
- "accuracy": 0.9038461538461539,
1288
- "count": 52
1289
  },
1290
  "SS": {
1291
- "accuracy": 0.8571428571428571,
1292
- "count": 7
1293
  },
1294
  "UC": {
1295
- "accuracy": 0.2808988764044944,
1296
- "count": 89
1297
  },
1298
  "US": {
1299
- "accuracy": 0.1038961038961039,
1300
- "count": 154
1301
  }
1302
  }
1303
  },
1304
  "add_S5": {
1305
- "full_accuracy": 0.0,
1306
- "digit_accuracy": 0.2914285714285714,
1307
- "n_examples": 50,
1308
  "per_subtask": {
1309
  "SA": {
1310
- "accuracy": 0.92,
1311
- "count": 50
1312
  },
1313
  "SC": {
1314
- "accuracy": 0.66,
1315
- "count": 50
1316
  },
1317
  "UC": {
1318
- "accuracy": 0.1,
1319
- "count": 50
1320
  },
1321
  "US": {
1322
- "accuracy": 0.09,
1323
- "count": 200
1324
  }
1325
  }
1326
  },
1327
  "add_S6": {
1328
- "full_accuracy": 0.0,
1329
- "digit_accuracy": 0.11142857142857143,
1330
- "n_examples": 50,
1331
  "per_subtask": {
1332
  "SC": {
1333
- "accuracy": 0.4,
1334
- "count": 50
1335
  },
1336
  "UC": {
1337
- "accuracy": 0.1,
1338
- "count": 50
1339
  },
1340
  "US": {
1341
- "accuracy": 0.056,
1342
- "count": 250
1343
  }
1344
  }
1345
  },
1346
  "add_random": {
1347
- "full_accuracy": 0.045,
1348
- "digit_accuracy": 0.6085714285714285,
1349
  "n_examples": 200,
1350
  "per_subtask": {
1351
  "SA": {
1352
- "accuracy": 0.9327146171693735,
1353
- "count": 431
1354
  },
1355
  "SC": {
1356
- "accuracy": 0.8670886075949367,
1357
- "count": 316
1358
  },
1359
  "SS": {
1360
- "accuracy": 0.8461538461538461,
1361
- "count": 39
1362
  },
1363
  "UC": {
1364
- "accuracy": 0.2357142857142857,
1365
- "count": 560
1366
  },
1367
  "US": {
1368
- "accuracy": 0.2037037037037037,
1369
- "count": 54
1370
  }
1371
  }
1372
  },
1373
  "add_C1": {
1374
- "full_accuracy": 0.12,
1375
- "digit_accuracy": 0.8314285714285714,
1376
- "n_examples": 50,
1377
  "per_subtask": {
1378
  "SA": {
1379
- "accuracy": 0.956,
1380
- "count": 250
1381
  },
1382
  "SC": {
1383
- "accuracy": 0.88,
1384
- "count": 50
1385
  },
1386
  "UC": {
1387
- "accuracy": 0.16,
1388
- "count": 50
1389
  }
1390
  }
1391
  },
1392
  "add_C2": {
1393
- "full_accuracy": 0.02,
1394
- "digit_accuracy": 0.7114285714285714,
1395
- "n_examples": 50,
1396
  "per_subtask": {
1397
  "SA": {
1398
- "accuracy": 0.945,
1399
- "count": 200
1400
  },
1401
  "SC": {
1402
- "accuracy": 0.9,
1403
- "count": 50
1404
  },
1405
  "UC": {
1406
- "accuracy": 0.1686746987951807,
1407
- "count": 83
1408
  },
1409
  "US": {
1410
- "accuracy": 0.058823529411764705,
1411
- "count": 17
1412
  }
1413
  }
1414
  },
1415
  "add_C3": {
1416
- "full_accuracy": 0.0,
1417
- "digit_accuracy": 0.5971428571428572,
1418
- "n_examples": 50,
1419
  "per_subtask": {
1420
  "SA": {
1421
- "accuracy": 0.94,
1422
- "count": 150
1423
  },
1424
  "SC": {
1425
- "accuracy": 0.74,
1426
- "count": 50
1427
  },
1428
  "UC": {
1429
- "accuracy": 0.21,
1430
- "count": 100
1431
  },
1432
  "US": {
1433
- "accuracy": 0.2,
1434
- "count": 50
1435
  }
1436
  }
1437
  },
1438
  "add_C4": {
1439
- "full_accuracy": 0.0,
1440
- "digit_accuracy": 0.4742857142857143,
1441
- "n_examples": 50,
1442
  "per_subtask": {
1443
  "SA": {
1444
- "accuracy": 0.95,
1445
- "count": 100
1446
  },
1447
  "SC": {
1448
- "accuracy": 0.8,
1449
- "count": 50
1450
  },
1451
  "UC": {
1452
- "accuracy": 0.20454545454545456,
1453
- "count": 132
1454
  },
1455
  "US": {
1456
- "accuracy": 0.058823529411764705,
1457
- "count": 68
1458
  }
1459
  }
1460
  },
1461
  "add_C5": {
1462
- "full_accuracy": 0.0,
1463
- "digit_accuracy": 0.4,
1464
- "n_examples": 50,
1465
  "per_subtask": {
1466
  "SA": {
1467
- "accuracy": 0.96,
1468
- "count": 50
1469
  },
1470
  "SC": {
1471
- "accuracy": 0.8,
1472
- "count": 50
1473
  },
1474
  "UC": {
1475
- "accuracy": 0.2328767123287671,
1476
- "count": 146
1477
  },
1478
  "US": {
1479
- "accuracy": 0.17307692307692307,
1480
- "count": 104
1481
  }
1482
  }
1483
  },
1484
  "add_C6": {
1485
  "full_accuracy": 0.0,
1486
- "digit_accuracy": 0.28285714285714286,
1487
- "n_examples": 50,
1488
  "per_subtask": {
1489
  "SC": {
1490
- "accuracy": 0.68,
1491
- "count": 50
1492
  },
1493
  "UC": {
1494
- "accuracy": 0.31216931216931215,
1495
- "count": 189
1496
  },
1497
  "US": {
1498
- "accuracy": 0.05405405405405406,
1499
- "count": 111
1500
  }
1501
  }
1502
  },
1503
  "sub_M0": {
1504
- "full_accuracy": 0.6,
1505
- "digit_accuracy": 0.9285714285714286,
1506
- "n_examples": 50,
1507
  "per_subtask": {
1508
  "MD": {
1509
- "accuracy": 0.9174917491749175,
1510
- "count": 303
1511
  },
1512
  "ME": {
1513
- "accuracy": 1.0,
1514
- "count": 47
1515
  }
1516
  }
1517
  },
1518
  "sub_M1": {
1519
- "full_accuracy": 0.02,
1520
- "digit_accuracy": 0.64,
1521
- "n_examples": 50,
1522
  "per_subtask": {
1523
  "MD": {
1524
- "accuracy": 0.9361702127659575,
1525
- "count": 141
1526
  },
1527
  "MB": {
1528
- "accuracy": 0.8194444444444444,
1529
- "count": 72
1530
  },
1531
  "ME": {
1532
- "accuracy": 1.0,
1533
- "count": 18
1534
  },
1535
  "UB": {
1536
- "accuracy": 0.12605042016806722,
1537
- "count": 119
1538
  }
1539
  }
1540
  },
1541
  "sub_M2": {
1542
- "full_accuracy": 0.0,
1543
- "digit_accuracy": 0.5885714285714285,
1544
- "n_examples": 50,
1545
  "per_subtask": {
1546
  "MD": {
1547
- "accuracy": 0.9553571428571429,
1548
- "count": 112
1549
  },
1550
  "MB": {
1551
- "accuracy": 0.660377358490566,
1552
- "count": 53
1553
  },
1554
  "ME": {
1555
- "accuracy": 0.9787234042553191,
1556
- "count": 47
1557
  },
1558
  "UB": {
1559
- "accuracy": 0.16470588235294117,
1560
- "count": 85
1561
  },
1562
  "UD": {
1563
- "accuracy": 0.07547169811320754,
1564
- "count": 53
1565
  }
1566
  }
1567
  },
1568
  "sub_M3": {
1569
- "full_accuracy": 0.0,
1570
- "digit_accuracy": 0.47714285714285715,
1571
- "n_examples": 50,
1572
  "per_subtask": {
1573
  "MD": {
1574
- "accuracy": 0.9381443298969072,
1575
- "count": 97
1576
  },
1577
  "MB": {
1578
- "accuracy": 0.6078431372549019,
1579
- "count": 51
1580
  },
1581
  "ME": {
1582
  "accuracy": 1.0,
1583
- "count": 27
1584
  },
1585
  "UB": {
1586
- "accuracy": 0.16216216216216217,
1587
- "count": 74
1588
  },
1589
  "UD": {
1590
- "accuracy": 0.0594059405940594,
1591
- "count": 101
1592
  }
1593
  }
1594
  },
1595
  "sub_M4": {
1596
- "full_accuracy": 0.0,
1597
- "digit_accuracy": 0.44,
1598
- "n_examples": 50,
1599
  "per_subtask": {
1600
  "MD": {
1601
- "accuracy": 0.95,
1602
- "count": 100
1603
  },
1604
  "MB": {
1605
- "accuracy": 0.8,
1606
- "count": 50
1607
  },
1608
  "UB": {
1609
- "accuracy": 0.2,
1610
- "count": 50
1611
  },
1612
  "UD": {
1613
- "accuracy": 0.06,
1614
- "count": 150
1615
  }
1616
  }
1617
  },
1618
  "sub_M5": {
1619
- "full_accuracy": 0.0,
1620
- "digit_accuracy": 0.28285714285714286,
1621
- "n_examples": 50,
1622
  "per_subtask": {
1623
  "MD": {
1624
  "accuracy": 1.0,
1625
- "count": 50
1626
  },
1627
  "MB": {
1628
- "accuracy": 0.52,
1629
- "count": 50
1630
  },
1631
  "UB": {
1632
- "accuracy": 0.3,
1633
- "count": 50
1634
  },
1635
  "UD": {
1636
- "accuracy": 0.04,
1637
- "count": 200
1638
  }
1639
  }
1640
  },
1641
  "sub_random": {
1642
  "full_accuracy": 0.04,
1643
- "digit_accuracy": 0.6114285714285714,
1644
  "n_examples": 200,
1645
  "per_subtask": {
1646
  "MD": {
1647
- "accuracy": 0.9245614035087719,
1648
- "count": 570
1649
  },
1650
  "MB": {
1651
- "accuracy": 0.7906137184115524,
1652
- "count": 277
1653
  },
1654
  "ME": {
1655
- "accuracy": 0.9433962264150944,
1656
  "count": 53
1657
  },
1658
  "UB": {
1659
- "accuracy": 0.12526539278131635,
1660
- "count": 471
1661
  },
1662
  "UD": {
1663
- "accuracy": 0.034482758620689655,
1664
- "count": 29
1665
  }
1666
  }
1667
  },
1668
  "sub_B3": {
1669
- "full_accuracy": 0.0,
1670
- "digit_accuracy": 0.5685714285714286,
1671
- "n_examples": 50,
1672
  "per_subtask": {
1673
  "MD": {
1674
- "accuracy": 0.9333333333333333,
1675
- "count": 150
1676
  },
1677
  "MB": {
1678
- "accuracy": 0.86,
1679
- "count": 50
1680
  },
1681
  "UB": {
1682
- "accuracy": 0.1188118811881188,
1683
- "count": 101
1684
  },
1685
  "UD": {
1686
- "accuracy": 0.08163265306122448,
1687
- "count": 49
1688
  }
1689
  }
1690
  },
1691
  "sub_B4": {
1692
  "full_accuracy": 0.0,
1693
- "digit_accuracy": 0.4514285714285714,
1694
- "n_examples": 50,
1695
  "per_subtask": {
1696
  "MD": {
1697
- "accuracy": 0.94,
1698
- "count": 100
1699
  },
1700
  "MB": {
1701
- "accuracy": 0.84,
1702
- "count": 50
1703
  },
1704
  "UB": {
1705
- "accuracy": 0.1652892561983471,
1706
- "count": 121
1707
  },
1708
  "UD": {
1709
- "accuracy": 0.02531645569620253,
1710
- "count": 79
1711
  }
1712
  }
1713
  },
1714
  "sub_B5": {
1715
  "full_accuracy": 0.0,
1716
- "digit_accuracy": 0.32857142857142857,
1717
- "n_examples": 50,
1718
  "per_subtask": {
1719
  "MD": {
1720
  "accuracy": 1.0,
1721
- "count": 50
1722
  },
1723
  "MB": {
1724
- "accuracy": 0.68,
1725
- "count": 50
1726
  },
1727
  "UB": {
1728
- "accuracy": 0.14473684210526316,
1729
- "count": 152
1730
  },
1731
  "UD": {
1732
- "accuracy": 0.09183673469387756,
1733
- "count": 98
1734
  }
1735
  }
1736
  }
1737
  },
1738
  "summary": {
1739
- "overall_accuracy": 0.058,
1740
- "digit_accuracy": 0.5440952380952381,
1741
- "total_examples": 1500,
1742
  "n_splits": 24
1743
  }
1744
  },
1745
- "sorl_overall_accuracy": 0.06291666666666666,
1746
- "sft_overall_accuracy": 0.011666666666666667
1747
  }
 
70
  3719,
71
  3769,
72
  3819,
73
+ 3869,
74
+ 3960,
75
+ 4010,
76
+ 4060,
77
+ 4110,
78
+ 4160,
79
+ 4210,
80
+ 4260,
81
+ 4351,
82
+ 4401,
83
+ 4451,
84
+ 4501,
85
+ 4551,
86
+ 4601,
87
+ 4651,
88
+ 4742,
89
+ 4792,
90
+ 4842,
91
+ 4892,
92
+ 4942,
93
+ 4992,
94
+ 5042,
95
+ 5133,
96
+ 5183,
97
+ 5233,
98
+ 5283,
99
+ 5333,
100
+ 5383,
101
+ 5433,
102
+ 5524,
103
+ 5574,
104
+ 5624,
105
+ 5674,
106
+ 5724,
107
+ 5774,
108
+ 5824,
109
+ 5915,
110
+ 5965,
111
+ 6015,
112
+ 6065,
113
+ 6115,
114
+ 6165,
115
+ 6215,
116
+ 6306,
117
+ 6356,
118
+ 6406,
119
+ 6456,
120
+ 6506,
121
+ 6556,
122
+ 6606,
123
+ 6697,
124
+ 6747,
125
+ 6797,
126
+ 6847,
127
+ 6897,
128
+ 6947,
129
+ 6997,
130
+ 7088,
131
+ 7138,
132
+ 7188,
133
+ 7238,
134
+ 7288,
135
+ 7338,
136
+ 7388,
137
+ 7479,
138
+ 7529,
139
+ 7579,
140
+ 7629,
141
+ 7679,
142
+ 7729,
143
+ 7779
144
  ],
145
  "loss": [
146
+ 13.336702346801758,
147
+ 6.918850421905518,
148
+ 8.896209716796875,
149
+ 11.282072067260742,
150
+ 10.738224029541016,
151
+ 9.437276840209961,
152
+ 8.512653350830078,
153
+ 6.82270622253418,
154
+ 6.035636901855469,
155
+ 5.339687824249268,
156
+ 4.746975421905518,
157
+ 4.351130962371826,
158
+ 4.000054359436035,
159
+ 3.8665781021118164,
160
+ 3.4704408645629883,
161
+ 3.5424342155456543,
162
+ 3.418386459350586,
163
+ 3.4378161430358887,
164
+ 3.1492156982421875,
165
+ 3.2364323139190674,
166
+ 3.2923150062561035,
167
+ 3.3853559494018555,
168
+ 3.247429370880127,
169
+ 3.1995742321014404,
170
+ 3.0241119861602783,
171
+ 2.8602428436279297,
172
+ 2.9540209770202637,
173
+ 2.8344078063964844,
174
+ 3.0442986488342285,
175
+ 2.776155948638916,
176
+ 3.210538864135742,
177
+ 2.941742420196533,
178
+ 2.6947340965270996,
179
+ 2.501694917678833,
180
+ 2.366105079650879,
181
+ 2.6169660091400146,
182
+ 2.829813241958618,
183
+ 2.588465690612793,
184
+ 2.4476141929626465,
185
+ 2.5533101558685303,
186
+ 2.3008124828338623,
187
+ 2.4801294803619385,
188
+ 2.1546616554260254,
189
+ 2.3855626583099365,
190
+ 2.0368154048919678,
191
+ 2.2342143058776855,
192
+ 1.8597867488861084,
193
+ 1.8611640930175781,
194
+ 1.6935548782348633,
195
+ 0.8804562091827393,
196
+ 0.11107325553894043,
197
+ -0.9678399562835693,
198
+ -1.721615195274353,
199
+ -1.8610954284667969,
200
+ -2.863313913345337,
201
+ -3.38641357421875,
202
+ -4.709446907043457,
203
+ -5.262845993041992,
204
+ -6.146366119384766,
205
+ -5.718625545501709,
206
+ -6.643040657043457,
207
+ -6.837361812591553,
208
+ -6.452937126159668,
209
+ -7.447319030761719,
210
+ -7.173651218414307,
211
+ -6.313607215881348,
212
+ -6.0170488357543945,
213
+ -6.332575798034668,
214
+ -6.148547172546387,
215
+ -5.757007598876953,
216
+ -5.9003777503967285,
217
+ -5.504100799560547,
218
+ -5.5883588790893555,
219
+ -5.009939670562744,
220
+ -5.31670618057251,
221
+ -4.6090087890625,
222
+ -4.64957332611084,
223
+ -4.408687591552734,
224
+ -3.7260475158691406,
225
+ -4.101681709289551,
226
+ -3.8994011878967285,
227
+ -5.165737152099609,
228
+ -3.7062177658081055,
229
+ -2.60445237159729,
230
+ -2.9606399536132812,
231
+ -3.406674385070801,
232
+ -2.8535101413726807,
233
+ -3.822021961212158,
234
+ -2.915630578994751,
235
+ -2.7904458045959473,
236
+ -1.838794469833374,
237
+ -2.1989693641662598,
238
+ -3.1347155570983887,
239
+ -2.6108860969543457,
240
+ -2.125248908996582,
241
+ -3.6851775646209717,
242
+ -3.1484384536743164,
243
+ -2.594548463821411,
244
+ -1.9205899238586426,
245
+ -1.8759794235229492,
246
+ -1.4699983596801758,
247
+ -2.4754478931427,
248
+ -1.2241714000701904,
249
+ -1.7784931659698486,
250
+ -1.4043972492218018,
251
+ -1.5459847450256348,
252
+ -1.0906867980957031,
253
+ -1.367735743522644,
254
+ -1.1391258239746094,
255
+ -1.3844239711761475,
256
+ -0.9637696743011475,
257
+ -1.7850940227508545,
258
+ -1.3724936246871948,
259
+ -1.0476679801940918,
260
+ -0.7958612442016602,
261
+ -1.0953247547149658,
262
+ -0.923236608505249,
263
+ -0.6573870182037354,
264
+ -0.963498592376709,
265
+ -0.4147820472717285,
266
+ -0.6397067308425903,
267
+ 0.05295729637145996,
268
+ -0.4455200433731079,
269
+ -0.09821927547454834,
270
+ -0.48106658458709717,
271
+ -0.7404338121414185,
272
+ -0.7213606834411621,
273
+ -0.2695554494857788,
274
+ -0.538608193397522,
275
+ -0.25685858726501465,
276
+ -0.5154925584793091,
277
+ -0.575459361076355,
278
+ 0.10156595706939697,
279
+ -0.0369570255279541,
280
+ -0.06800520420074463,
281
+ -0.22091543674468994,
282
+ -0.5098806619644165,
283
+ -0.40339577198028564,
284
+ -0.06847453117370605,
285
+ 0.28865164518356323
286
  ],
287
  "base_loss": [
288
+ 11.838828086853027,
289
+ 11.370939254760742,
290
+ 10.588272094726562,
291
+ 9.626557350158691,
292
+ 8.752111434936523,
293
+ 7.883986949920654,
294
+ 6.886651039123535,
295
+ 5.24971866607666,
296
+ 4.344367027282715,
297
+ 3.6988837718963623,
298
+ 3.1608898639678955,
299
+ 2.7533035278320312,
300
+ 2.4814517498016357,
301
+ 2.2925686836242676,
302
+ 2.2401986122131348,
303
+ 2.186060667037964,
304
+ 2.111450672149658,
305
+ 2.060096025466919,
306
+ 2.0241243839263916,
307
+ 2.045396089553833,
308
+ 2.0305867195129395,
309
+ 1.9218031167984009,
310
+ 2.0242366790771484,
311
+ 1.9403225183486938,
312
+ 1.939099669456482,
313
+ 1.912737488746643,
314
+ 1.9754208326339722,
315
+ 1.9057916402816772,
316
+ 1.9977864027023315,
317
+ 1.9481257200241089,
318
+ 1.875176191329956,
319
+ 1.887980341911316,
320
+ 1.9379119873046875,
321
+ 1.9210156202316284,
322
+ 1.9257110357284546,
323
+ 1.8627643585205078,
324
+ 1.883230209350586,
325
+ 1.9365102052688599,
326
+ 1.8662726879119873,
327
+ 1.9016093015670776,
328
+ 1.8828037977218628,
329
+ 1.8137520551681519,
330
+ 1.8668559789657593,
331
+ 1.9005358219146729,
332
+ 1.8279279470443726,
333
+ 1.821746826171875,
334
+ 1.8822076320648193,
335
+ 1.8654073476791382,
336
+ 1.914265751838684,
337
+ 1.9144601821899414,
338
+ 1.8745934963226318,
339
+ 1.8415658473968506,
340
+ 1.8748531341552734,
341
+ 1.890276551246643,
342
+ 1.8496427536010742,
343
+ 1.8990840911865234,
344
+ 1.9223690032958984,
345
+ 1.8435914516448975,
346
+ 1.903731346130371,
347
+ 1.8424265384674072,
348
+ 1.8502159118652344,
349
+ 1.8517528772354126,
350
+ 1.807663917541504,
351
+ 1.8589733839035034,
352
+ 1.8163392543792725,
353
+ 1.7087652683258057,
354
+ 1.6609666347503662,
355
+ 1.6727839708328247,
356
+ 1.6066601276397705,
357
+ 1.5752772092819214,
358
+ 1.5740200281143188,
359
+ 1.5229791402816772,
360
+ 1.5197570323944092,
361
+ 1.4390777349472046,
362
+ 1.522583246231079,
363
+ 1.4195992946624756,
364
+ 1.3857381343841553,
365
+ 1.3357621431350708,
366
+ 1.2583914995193481,
367
+ 1.2635949850082397,
368
+ 1.2785142660140991,
369
+ 1.433748483657837,
370
+ 1.2576029300689697,
371
+ 1.075629472732544,
372
+ 1.1458590030670166,
373
+ 1.1873705387115479,
374
+ 1.0882703065872192,
375
+ 1.2453869581222534,
376
+ 1.123284935951233,
377
+ 1.067132592201233,
378
+ 0.991763710975647,
379
+ 1.0049587488174438,
380
+ 1.1444469690322876,
381
+ 1.0937929153442383,
382
+ 0.9533916115760803,
383
+ 1.1876996755599976,
384
+ 1.1239768266677856,
385
+ 1.06907320022583,
386
+ 0.9426320791244507,
387
+ 0.9662951231002808,
388
+ 0.9075624346733093,
389
+ 1.0121568441390991,
390
+ 0.8707554936408997,
391
+ 0.9627531170845032,
392
+ 0.87786865234375,
393
+ 0.9018198251724243,
394
+ 0.8150426149368286,
395
+ 0.8885507583618164,
396
+ 0.8482800722122192,
397
+ 0.8568582534790039,
398
+ 0.8357753753662109,
399
+ 0.9259698987007141,
400
+ 0.8818322420120239,
401
+ 0.8244481086730957,
402
+ 0.7622126340866089,
403
+ 0.8197087049484253,
404
+ 0.8249402642250061,
405
+ 0.7731627225875854,
406
+ 0.8350104093551636,
407
+ 0.7889235615730286,
408
+ 0.76270991563797,
409
+ 0.650455892086029,
410
+ 0.7395504713058472,
411
+ 0.6885175704956055,
412
+ 0.7411212921142578,
413
+ 0.7502254247665405,
414
+ 0.7916255593299866,
415
+ 0.7127252221107483,
416
+ 0.7604287266731262,
417
+ 0.696518063545227,
418
+ 0.7437885999679565,
419
+ 0.7705757021903992,
420
+ 0.6630964875221252,
421
+ 0.6888636946678162,
422
+ 0.6941862106323242,
423
+ 0.6990187764167786,
424
+ 0.7434395551681519,
425
+ 0.7088843584060669,
426
+ 0.6719340682029724,
427
+ 0.6367870569229126
428
  ],
429
  "info_loss": [
430
+ -0.7947063446044922,
431
+ -1.3534460067749023,
432
+ -0.974909782409668,
433
+ -0.4544658660888672,
434
+ -0.24358177185058594,
435
+ -0.1736612319946289,
436
+ -0.09863615036010742,
437
+ -0.06042051315307617,
438
+ -0.03946113586425781,
439
+ -0.03833341598510742,
440
+ -0.03979611396789551,
441
+ -0.03621935844421387,
442
+ -0.042429447174072266,
443
+ -0.035790443420410156,
444
+ -0.06885123252868652,
445
+ -0.05568408966064453,
446
+ -0.06048703193664551,
447
+ -0.052857398986816406,
448
+ -0.0776216983795166,
449
+ -0.07078409194946289,
450
+ -0.06339800357818604,
451
+ -0.042737722396850586,
452
+ -0.0667877197265625,
453
+ -0.06311452388763428,
454
+ -0.08029758930206299,
455
+ -0.09371721744537354,
456
+ -0.09059333801269531,
457
+ -0.09547686576843262,
458
+ -0.08352577686309814,
459
+ -0.10552775859832764,
460
+ -0.05450236797332764,
461
+ -0.08229613304138184,
462
+ -0.11237549781799316,
463
+ -0.12987017631530762,
464
+ -0.143774151802063,
465
+ -0.11218726634979248,
466
+ -0.0928647518157959,
467
+ -0.12236285209655762,
468
+ -0.1294715404510498,
469
+ -0.12258577346801758,
470
+ -0.1456829309463501,
471
+ -0.12091708183288574,
472
+ -0.15867328643798828,
473
+ -0.1391061544418335,
474
+ -0.16621637344360352,
475
+ -0.14623987674713135,
476
+ -0.189733624458313,
477
+ -0.1877741813659668,
478
+ -0.20934617519378662,
479
+ -0.29095458984375,
480
+ -0.3639545440673828,
481
+ -0.468471884727478,
482
+ -0.5470935106277466,
483
+ -0.5625057220458984,
484
+ -0.658776044845581,
485
+ -0.7161940336227417,
486
+ -0.8510096073150635,
487
+ -0.8982928395271301,
488
+ -0.9927720427513123,
489
+ -0.9438121914863586,
490
+ -1.0370187759399414,
491
+ -1.0566596984863281,
492
+ -1.013656497001648,
493
+ -1.1180551052093506,
494
+ -1.0859358310699463,
495
+ -0.9888584613800049,
496
+ -0.95399409532547,
497
+ -0.9860228896141052,
498
+ -0.9598300457000732,
499
+ -0.9168238043785095,
500
+ -0.9273094534873962,
501
+ -0.8808233737945557,
502
+ -0.8881385326385498,
503
+ -0.819733202457428,
504
+ -0.8567293286323547,
505
+ -0.7732676267623901,
506
+ -0.7744763493537903,
507
+ -0.7403684258460999,
508
+ -0.6630790829658508,
509
+ -0.70015549659729,
510
+ -0.6803638935089111,
511
+ -0.8223849534988403,
512
+ -0.6571109294891357,
513
+ -0.5277228951454163,
514
+ -0.5673674941062927,
515
+ -0.6172389388084412,
516
+ -0.5512561798095703,
517
+ -0.6607455015182495,
518
+ -0.5600118637084961,
519
+ -0.5406466126441956,
520
+ -0.4401552081108093,
521
+ -0.4710603952407837,
522
+ -0.578697681427002,
523
+ -0.5256502628326416,
524
+ -0.4602707326412201,
525
+ -0.6362345218658447,
526
+ -0.5754833817481995,
527
+ -0.5146251320838928,
528
+ -0.4332084655761719,
529
+ -0.42987990379333496,
530
+ -0.3861679434776306,
531
+ -0.49510324001312256,
532
+ -0.3570902347564697,
533
+ -0.41823434829711914,
534
+ -0.3782045841217041,
535
+ -0.3882964253425598,
536
+ -0.3307146728038788,
537
+ -0.3685101866722107,
538
+ -0.3422055244445801,
539
+ -0.3646612763404846,
540
+ -0.32243889570236206,
541
+ -0.4127432703971863,
542
+ -0.36529260873794556,
543
+ -0.32932430505752563,
544
+ -0.29582005739212036,
545
+ -0.3240930736064911,
546
+ -0.3139222264289856,
547
+ -0.2795545160770416,
548
+ -0.3168061375617981,
549
+ -0.25903958082199097,
550
+ -0.2791879177093506,
551
+ -0.19818127155303955,
552
+ -0.2572588324546814,
553
+ -0.2158147394657135,
554
+ -0.2577873766422272,
555
+ -0.28448453545570374,
556
+ -0.2858678102493286,
557
+ -0.23259294033050537,
558
+ -0.2640204131603241,
559
+ -0.2303641140460968,
560
+ -0.26194649934768677,
561
+ -0.2717556953430176,
562
+ -0.1934002935886383,
563
+ -0.20960962772369385,
564
+ -0.21173539757728577,
565
+ -0.22608724236488342,
566
+ -0.25635597109794617,
567
+ -0.24326494336128235,
568
+ -0.20668640732765198,
569
+ -0.16726526618003845
570
  ],
571
  "abs_loss": [
572
+ 2.302412748336792,
573
+ 2.3009450435638428,
574
+ 2.27870512008667,
575
+ 2.174964189529419,
576
+ 2.084486722946167,
577
+ 1.9960142374038696,
578
+ 1.9213024377822876,
579
+ 1.8566575050354004,
580
+ 1.8366934061050415,
581
+ 1.851310133934021,
582
+ 1.8485580682754517,
583
+ 1.838810920715332,
584
+ 1.8309296369552612,
585
+ 1.8317207098007202,
586
+ 1.8420051336288452,
587
+ 1.8463129997253418,
588
+ 1.8668345212936401,
589
+ 1.853007435798645,
590
+ 1.8500884771347046,
591
+ 1.8505192995071411,
592
+ 1.8474435806274414,
593
+ 1.8385688066482544,
594
+ 1.861424446105957,
595
+ 1.8699045181274414,
596
+ 1.862662672996521,
597
+ 1.8386759757995605,
598
+ 1.850511908531189,
599
+ 1.8497024774551392,
600
+ 1.8500269651412964,
601
+ 1.8689703941345215,
602
+ 1.8515371084213257,
603
+ 1.818575382232666,
604
+ 1.8620004653930664,
605
+ 1.8577165603637695,
606
+ 1.8511066436767578,
607
+ 1.838352084159851,
608
+ 1.8330349922180176,
609
+ 1.84242582321167,
610
+ 1.8524103164672852,
611
+ 1.8676964044570923,
612
+ 1.8425636291503906,
613
+ 1.8451820611953735,
614
+ 1.8523412942886353,
615
+ 1.8677786588668823,
616
+ 1.817083716392517,
617
+ 1.8620718717575073,
618
+ 1.8646188974380493,
619
+ 1.84919273853302,
620
+ 1.845678448677063,
621
+ 1.8551315069198608,
622
+ 1.848828911781311,
623
+ 1.838497281074524,
624
+ 1.828985571861267,
625
+ 1.8191531896591187,
626
+ 1.8209171295166016,
627
+ 1.8266960382461548,
628
+ 1.8298794031143188,
629
+ 1.8037595748901367,
630
+ 1.8071837425231934,
631
+ 1.7994571924209595,
632
+ 1.795587182044983,
633
+ 1.8066879510879517,
634
+ 1.7850884199142456,
635
+ 1.7945705652236938,
636
+ 1.7674384117126465,
637
+ 1.7592116594314575,
638
+ 1.7298604249954224,
639
+ 1.702737808227539,
640
+ 1.6485201120376587,
641
+ 1.5819498300552368,
642
+ 1.511340618133545,
643
+ 1.4684982299804688,
644
+ 1.4436193704605103,
645
+ 1.4264718294143677,
646
+ 1.3067470788955688,
647
+ 1.2788523435592651,
648
+ 1.2897067070007324,
649
+ 1.226319670677185,
650
+ 1.1635907888412476,
651
+ 1.1739449501037598,
652
+ 1.0503578186035156,
653
+ 1.10030996799469,
654
+ 1.0934771299362183,
655
+ 1.0553520917892456,
656
+ 1.0155118703842163,
657
+ 0.9916570782661438,
658
+ 0.9301231503486633,
659
+ 0.8723134994506836,
660
+ 0.8847482204437256,
661
+ 0.9377768039703369,
662
+ 0.8362235426902771,
663
+ 0.8876826167106628,
664
+ 0.8097426295280457,
665
+ 0.7707312703132629,
666
+ 0.783437967300415,
667
+ 0.7612269520759583,
668
+ 0.7890119552612305,
669
+ 0.7253015041351318,
670
+ 0.7303829789161682,
671
+ 0.701012909412384,
672
+ 0.707586944103241,
673
+ 0.7077252268791199,
674
+ 0.6791206002235413,
675
+ 0.6962205767631531,
676
+ 0.649402379989624,
677
+ 0.6630510091781616,
678
+ 0.6399484872817993,
679
+ 0.6480059027671814,
680
+ 0.6101812720298767,
681
+ 0.6286943554878235,
682
+ 0.6452814936637878,
683
+ 0.60471510887146,
684
+ 0.6004258990287781,
685
+ 0.6153071522712708,
686
+ 0.6862974762916565,
687
+ 0.6473205089569092,
688
+ 0.5864364504814148,
689
+ 0.5834178924560547,
690
+ 0.584533154964447,
691
+ 0.6051172614097595,
692
+ 0.6042944192886353,
693
+ 0.5947246551513672,
694
+ 0.6029585003852844,
695
+ 0.5866187214851379,
696
+ 0.5548930168151855,
697
+ 0.5721796751022339,
698
+ 0.5757569670677185,
699
+ 0.6093991994857788,
700
+ 0.545515239238739,
701
+ 0.5773008465766907,
702
+ 0.5797798037528992,
703
+ 0.5739028453826904,
704
+ 0.5971900820732117,
705
+ 0.5974938273429871,
706
+ 0.5584818124771118,
707
+ 0.5314763188362122,
708
+ 0.5410658121109009,
709
+ 0.581427276134491,
710
+ 0.556355893611908,
711
+ 0.5683047771453857
712
  ],
713
  "zipf_loss": [
714
+ 9.214696884155273,
715
+ 8.852276802062988,
716
+ 7.829165458679199,
717
+ 5.982677459716797,
718
+ 4.213481426239014,
719
+ 3.090301036834717,
720
+ 2.420233964920044,
721
+ 1.9915273189544678,
722
+ 1.9022116661071777,
723
+ 1.8390072584152222,
724
+ 1.799190878868103,
725
+ 1.7761399745941162,
726
+ 1.7598042488098145,
727
+ 1.748741865158081,
728
+ 1.7345539331436157,
729
+ 1.7285829782485962,
730
+ 1.7251226902008057,
731
+ 1.7209933996200562,
732
+ 1.7162994146347046,
733
+ 1.7138252258300781,
734
+ 1.7109640836715698,
735
+ 1.7070730924606323,
736
+ 1.7049274444580078,
737
+ 1.7034064531326294,
738
+ 1.7017219066619873,
739
+ 1.7008098363876343,
740
+ 1.6994823217391968,
741
+ 1.69841468334198,
742
+ 1.6967672109603882,
743
+ 1.6964107751846313,
744
+ 1.6952325105667114,
745
+ 1.6948657035827637,
746
+ 1.6943769454956055,
747
+ 1.6936094760894775,
748
+ 1.693024754524231,
749
+ 1.6922390460968018,
750
+ 1.6919270753860474,
751
+ 1.6913414001464844,
752
+ 1.6908159255981445,
753
+ 1.690788984298706,
754
+ 1.6905815601348877,
755
+ 1.6910300254821777,
756
+ 1.6893043518066406,
757
+ 1.6893105506896973,
758
+ 1.689342737197876,
759
+ 1.6886589527130127,
760
+ 1.6884534358978271,
761
+ 1.6885793209075928,
762
+ 1.688183069229126,
763
+ 1.6900287866592407,
764
+ 1.6911423206329346,
765
+ 1.6914634704589844,
766
+ 1.6915680170059204,
767
+ 1.691770076751709,
768
+ 1.6927120685577393,
769
+ 1.6937731504440308,
770
+ 1.695291519165039,
771
+ 1.6961145401000977,
772
+ 1.696904182434082,
773
+ 1.6971238851547241,
774
+ 1.6973726749420166,
775
+ 1.6968129873275757,
776
+ 1.6974554061889648,
777
+ 1.6948020458221436,
778
+ 1.6926236152648926,
779
+ 1.6902903318405151,
780
+ 1.6889389753341675,
781
+ 1.6845951080322266,
782
+ 1.6782419681549072,
783
+ 1.6777575016021729,
784
+ 1.6475625038146973,
785
+ 1.634303331375122,
786
+ 1.6289076805114746,
787
+ 1.6056675910949707,
788
+ 1.5973291397094727,
789
+ 1.5761830806732178,
790
+ 1.5804808139801025,
791
+ 1.536602258682251,
792
+ 1.5299925804138184,
793
+ 1.5188838243484497,
794
+ 1.520687460899353,
795
+ 1.5143325328826904,
796
+ 1.4979405403137207,
797
+ 1.4916117191314697,
798
+ 1.4656249284744263,
799
+ 1.479178547859192,
800
+ 1.477769136428833,
801
+ 1.4528149366378784,
802
+ 1.4727284908294678,
803
+ 1.45510995388031,
804
+ 1.4873716831207275,
805
+ 1.417907476425171,
806
+ 1.4268403053283691,
807
+ 1.4747503995895386,
808
+ 1.445723056793213,
809
+ 1.4133450984954834,
810
+ 1.403517246246338,
811
+ 1.4100992679595947,
812
+ 1.3958241939544678,
813
+ 1.386423110961914,
814
+ 1.4133601188659668,
815
+ 1.392655372619629,
816
+ 1.4080634117126465,
817
+ 1.3714752197265625,
818
+ 1.4348397254943848,
819
+ 1.3688545227050781,
820
+ 1.3374223709106445,
821
+ 1.364014983177185,
822
+ 1.373631238937378,
823
+ 1.3424608707427979,
824
+ 1.3603157997131348,
825
+ 1.3558974266052246,
826
+ 1.3385576009750366,
827
+ 1.3595960140228271,
828
+ 1.3314967155456543,
829
+ 1.2611653804779053,
830
+ 1.3324017524719238,
831
+ 1.3066534996032715,
832
+ 1.3110988140106201,
833
+ 1.3261786699295044,
834
+ 1.3290331363677979,
835
+ 1.3248416185379028,
836
+ 1.3272221088409424,
837
+ 1.312748670578003,
838
+ 1.3001965284347534,
839
+ 1.29696786403656,
840
+ 1.288116216659546,
841
+ 1.2827088832855225,
842
+ 1.2866158485412598,
843
+ 1.292534351348877,
844
+ 1.3022056818008423,
845
+ 1.314131498336792,
846
+ 1.312753438949585,
847
+ 1.3105261325836182,
848
+ 1.2993143796920776,
849
+ 1.2877905368804932,
850
+ 1.256132960319519,
851
+ 1.2622264623641968,
852
+ 1.2708197832107544,
853
+ 1.2676868438720703
854
  ],
855
  "denoise_loss": [],
856
  "ortho_loss": [
857
+ 0.8263990879058838,
858
+ 0.6398636102676392,
859
+ 0.5248991847038269,
860
+ 0.47035273909568787,
861
+ 0.36485227942466736,
862
+ 0.3518960177898407,
863
+ 0.37278884649276733,
864
+ 0.37953755259513855,
865
+ 0.369808554649353,
866
+ 0.35956481099128723,
867
+ 0.34914538264274597,
868
+ 0.34211647510528564,
869
+ 0.35236310958862305,
870
+ 0.34380295872688293,
871
+ 0.3560903072357178,
872
+ 0.36143550276756287,
873
+ 0.35988783836364746,
874
+ 0.3778095841407776,
875
+ 0.36071673035621643,
876
+ 0.3772004246711731,
877
+ 0.3913002610206604,
878
+ 0.4016982316970825,
879
+ 0.3970619738101959,
880
+ 0.38298705220222473,
881
+ 0.38086575269699097,
882
+ 0.3925340473651886,
883
+ 0.39670127630233765,
884
+ 0.39084312319755554,
885
+ 0.405480295419693,
886
+ 0.42086395621299744,
887
+ 0.411644846200943,
888
+ 0.41944432258605957,
889
+ 0.4163327217102051,
890
+ 0.42740166187286377,
891
+ 0.4257947504520416,
892
+ 0.4076772928237915,
893
+ 0.3938109576702118,
894
+ 0.3850422501564026,
895
+ 0.3819939196109772,
896
+ 0.4029465615749359,
897
+ 0.4009494483470917,
898
+ 0.4083222448825836,
899
+ 0.41084668040275574,
900
+ 0.41851985454559326,
901
+ 0.42693084478378296,
902
+ 0.4023897051811218,
903
+ 0.39613503217697144,
904
+ 0.38698458671569824,
905
+ 0.4010132849216461,
906
+ 0.391172856092453,
907
+ 0.39587458968162537,
908
+ 0.40000119805336,
909
+ 0.4087057411670685,
910
+ 0.41791754961013794,
911
+ 0.42740243673324585,
912
+ 0.43657243251800537,
913
+ 0.44869521260261536,
914
+ 0.4524320960044861,
915
+ 0.4557957649230957,
916
+ 0.4605225622653961,
917
+ 0.46363839507102966,
918
+ 0.46766993403434753,
919
+ 0.47213712334632874,
920
+ 0.47606295347213745,
921
+ 0.4780144989490509,
922
+ 0.4824178218841553,
923
+ 0.48773542046546936,
924
+ 0.490782767534256,
925
+ 0.491031289100647,
926
+ 0.4955742061138153,
927
+ 0.49922680854797363,
928
+ 0.49962109327316284,
929
+ 0.4963490962982178,
930
+ 0.5031077265739441,
931
+ 0.5041939616203308,
932
+ 0.5056381225585938,
933
+ 0.5040112733840942,
934
+ 0.5016523003578186,
935
+ 0.5010254979133606,
936
+ 0.501035749912262,
937
+ 0.4998578429222107,
938
+ 0.496683806180954,
939
+ 0.4952710270881653,
940
+ 0.49363234639167786,
941
+ 0.4910505712032318,
942
+ 0.48717889189720154,
943
+ 0.4841475784778595,
944
+ 0.4831303656101227,
945
+ 0.4807543158531189,
946
+ 0.47504204511642456,
947
+ 0.4743756353855133,
948
+ 0.4675210118293762,
949
+ 0.4632830321788788,
950
+ 0.459530234336853,
951
+ 0.45602646470069885,
952
+ 0.45379316806793213,
953
+ 0.4504814147949219,
954
+ 0.4467865824699402,
955
+ 0.4409796893596649,
956
+ 0.43924814462661743,
957
+ 0.43782851099967957,
958
+ 0.43564632534980774,
959
+ 0.43388527631759644,
960
+ 0.4302870035171509,
961
+ 0.43054941296577454,
962
+ 0.4253215491771698,
963
+ 0.4239141345024109,
964
+ 0.4234074652194977,
965
+ 0.42124322056770325,
966
+ 0.4201543629169464,
967
+ 0.4194556474685669,
968
+ 0.4171600937843323,
969
+ 0.41740599274635315,
970
+ 0.41401854157447815,
971
+ 0.4137851297855377,
972
+ 0.4160709083080292,
973
+ 0.41565027832984924,
974
+ 0.4148636758327484,
975
+ 0.41444507241249084,
976
+ 0.41314461827278137,
977
+ 0.4130113422870636,
978
+ 0.4124045968055725,
979
+ 0.41070786118507385,
980
+ 0.4096534848213196,
981
+ 0.40888485312461853,
982
+ 0.4067349433898926,
983
+ 0.40550580620765686,
984
+ 0.4057595133781433,
985
+ 0.40574514865875244,
986
+ 0.40414148569107056,
987
+ 0.4031132161617279,
988
+ 0.4023817777633667,
989
+ 0.40334025025367737,
990
+ 0.4025489091873169,
991
+ 0.4025358557701111,
992
+ 0.4025465250015259,
993
+ 0.40197664499282837,
994
+ 0.4014347195625305,
995
+ 0.40142199397087097,
996
+ 0.40162885189056396
997
  ],
998
  "lr": [
999
+ 4.188034188034189e-06,
1000
+ 8.461538461538462e-06,
1001
+ 1.2735042735042738e-05,
1002
+ 1.700854700854701e-05,
1003
  2e-05,
1004
  2e-05,
1005
  2e-05,
 
1040
  2e-05,
1041
  2e-05,
1042
  2e-05,
1043
+ 2e-05,
1044
+ 2e-05,
1045
+ 2e-05,
1046
+ 2e-05,
1047
+ 2e-05,
1048
+ 2e-05,
1049
+ 2e-05,
1050
+ 2e-05,
1051
+ 2e-05,
1052
+ 2e-05,
1053
+ 2e-05,
1054
+ 2e-05,
1055
+ 2e-05,
1056
+ 2e-05,
1057
+ 2e-05,
1058
+ 2e-05,
1059
+ 2e-05,
1060
+ 2e-05,
1061
+ 2e-05,
1062
+ 2e-05,
1063
+ 2e-05,
1064
+ 2e-05,
1065
+ 2e-05,
1066
+ 2e-05,
1067
+ 2e-05,
1068
+ 2e-05,
1069
+ 2e-05,
1070
+ 2e-05,
1071
+ 2e-05,
1072
+ 2e-05,
1073
+ 2e-05,
1074
+ 2e-05,
1075
+ 2e-05,
1076
+ 2e-05,
1077
+ 2e-05,
1078
+ 2e-05,
1079
+ 2e-05,
1080
+ 2e-05,
1081
+ 2e-05,
1082
+ 2e-05,
1083
+ 2e-05,
1084
+ 1.9967967308199317e-05,
1085
+ 1.967136831004482e-05,
1086
+ 1.9374769311890326e-05,
1087
+ 1.907817031373583e-05,
1088
+ 1.8781571315581332e-05,
1089
+ 1.848497231742684e-05,
1090
+ 1.7945162140785656e-05,
1091
+ 1.7648563142631164e-05,
1092
+ 1.735196414447667e-05,
1093
+ 1.705536514632217e-05,
1094
+ 1.675876614816768e-05,
1095
+ 1.6462167150013183e-05,
1096
+ 1.6165568151858688e-05,
1097
+ 1.5625757975217507e-05,
1098
+ 1.532915897706301e-05,
1099
+ 1.5032559978908518e-05,
1100
+ 1.473596098075402e-05,
1101
+ 1.4439361982599525e-05,
1102
+ 1.4142762984445033e-05,
1103
+ 1.3846163986290536e-05,
1104
+ 1.3306353809649356e-05,
1105
+ 1.300975481149486e-05,
1106
+ 1.2713155813340366e-05,
1107
+ 1.2416556815185872e-05,
1108
+ 1.2119957817031374e-05,
1109
+ 1.182335881887688e-05,
1110
+ 1.1526759820722382e-05,
1111
+ 1.0986949644081204e-05,
1112
+ 1.0690350645926705e-05,
1113
+ 1.0393751647772212e-05,
1114
+ 1.009715264961772e-05,
1115
+ 9.800553651463221e-06,
1116
+ 9.50395465330873e-06,
1117
+ 9.207355655154231e-06,
1118
+ 8.66754547851305e-06,
1119
+ 8.370946480358558e-06,
1120
+ 8.07434748220406e-06,
1121
+ 7.777748484049568e-06,
1122
+ 7.481149485895069e-06,
1123
+ 7.184550487740576e-06,
1124
+ 6.887951489586082e-06,
1125
+ 6.3481413129448985e-06,
1126
+ 6.051542314790406e-06,
1127
+ 5.754943316635908e-06,
1128
+ 5.4583443184814145e-06,
1129
+ 5.161745320326917e-06,
1130
+ 4.865146322172423e-06,
1131
+ 4.5685473240179305e-06,
1132
+ 4.028737147376746e-06,
1133
+ 3.732138149222253e-06,
1134
+ 3.435539151067755e-06,
1135
+ 3.1389401529132617e-06,
1136
+ 2.8423411547587686e-06,
1137
+ 2.5457421566042704e-06,
1138
+ 2.2491431584497773e-06
1139
  ],
1140
  "emb_lr": [],
1141
  "eval_step": [
 
1148
  2696,
1149
  3087,
1150
  3478,
1151
+ 3869,
1152
+ 4260,
1153
+ 4651,
1154
+ 5042,
1155
+ 5433,
1156
+ 5824,
1157
+ 6215,
1158
+ 6606,
1159
+ 6997,
1160
+ 7388,
1161
+ 7779
1162
  ],
1163
  "eval_accuracy": [
1164
+ 0.02,
1165
+ 0.02,
1166
+ 0.02,
1167
+ 0.02,
1168
+ 0.02,
1169
+ 0.02,
1170
+ 0.02,
1171
+ 0.02,
1172
+ 0.02,
1173
+ 0.0,
1174
  0.01,
1175
  0.0,
1176
  0.0,
 
1177
  0.0,
1178
  0.0,
1179
+ 0.0,
1180
+ 0.0,
1181
  0.0,
1182
  0.0,
1183
  0.0
1184
  ]
1185
  },
1186
+ "final_accuracy": 0.08461538461538462,
1187
  "sft_eval": {
1188
  "config": {
1189
  "ops": "add_sub",
1190
  "K": null,
1191
  "mode": "sft",
1192
  "n_digits": 6,
1193
+ "n_per_split": 100
1194
  },
1195
  "splits": {
1196
  "add_S0": {
1197
+ "full_accuracy": 0.26,
1198
+ "digit_accuracy": 0.8314285714285714,
1199
+ "n_examples": 100,
1200
  "per_subtask": {
1201
  "SA": {
1202
+ "accuracy": 0.828099173553719,
1203
+ "count": 605
1204
  },
1205
  "SS": {
1206
+ "accuracy": 0.8526315789473684,
1207
+ "count": 95
1208
  }
1209
  }
1210
  },
1211
  "add_S1": {
1212
+ "full_accuracy": 0.03,
1213
+ "digit_accuracy": 0.6657142857142857,
1214
+ "n_examples": 100,
1215
  "per_subtask": {
1216
  "SA": {
1217
+ "accuracy": 0.8529411764705882,
1218
+ "count": 204
1219
  },
1220
  "SC": {
1221
+ "accuracy": 0.9053254437869822,
1222
+ "count": 169
1223
  },
1224
  "SS": {
1225
+ "accuracy": 0.8064516129032258,
1226
+ "count": 31
1227
  },
1228
  "UC": {
1229
+ "accuracy": 0.38513513513513514,
1230
+ "count": 296
1231
  }
1232
  }
1233
  },
1234
  "add_S2": {
1235
+ "full_accuracy": 0.09,
1236
+ "digit_accuracy": 0.7042857142857143,
1237
+ "n_examples": 100,
1238
  "per_subtask": {
1239
  "SA": {
1240
+ "accuracy": 0.9141104294478528,
1241
+ "count": 163
1242
  },
1243
  "SC": {
1244
+ "accuracy": 0.9,
1245
+ "count": 130
1246
  },
1247
  "SS": {
1248
+ "accuracy": 0.7701149425287356,
1249
+ "count": 87
1250
  },
1251
  "UC": {
1252
+ "accuracy": 0.4187192118226601,
1253
+ "count": 203
1254
  },
1255
  "US": {
1256
+ "accuracy": 0.6410256410256411,
1257
+ "count": 117
1258
  }
1259
  }
1260
  },
1261
  "add_S3": {
1262
+ "full_accuracy": 0.01,
1263
+ "digit_accuracy": 0.5542857142857143,
1264
+ "n_examples": 100,
1265
  "per_subtask": {
1266
  "SA": {
1267
+ "accuracy": 0.8347107438016529,
1268
+ "count": 121
1269
  },
1270
  "SC": {
1271
+ "accuracy": 0.8347107438016529,
1272
+ "count": 121
1273
  },
1274
  "SS": {
1275
+ "accuracy": 0.7755102040816326,
1276
+ "count": 49
1277
  },
1278
  "UC": {
1279
+ "accuracy": 0.34408602150537637,
1280
+ "count": 186
1281
  },
1282
  "US": {
1283
+ "accuracy": 0.37668161434977576,
1284
+ "count": 223
1285
  }
1286
  }
1287
  },
1288
  "add_S4": {
1289
+ "full_accuracy": 0.09,
1290
+ "digit_accuracy": 0.5085714285714286,
1291
+ "n_examples": 100,
1292
  "per_subtask": {
1293
  "SA": {
1294
+ "accuracy": 0.8653846153846154,
1295
+ "count": 104
1296
  },
1297
  "SC": {
1298
+ "accuracy": 0.8584905660377359,
1299
+ "count": 106
1300
  },
1301
  "SS": {
1302
+ "accuracy": 0.8695652173913043,
1303
+ "count": 23
1304
  },
1305
  "UC": {
1306
+ "accuracy": 0.34375,
1307
+ "count": 160
1308
  },
1309
  "US": {
1310
+ "accuracy": 0.3257328990228013,
1311
+ "count": 307
1312
  }
1313
  }
1314
  },
1315
  "add_S5": {
1316
+ "full_accuracy": 0.03,
1317
+ "digit_accuracy": 0.35,
1318
+ "n_examples": 100,
1319
  "per_subtask": {
1320
  "SA": {
1321
+ "accuracy": 0.91,
1322
+ "count": 100
1323
  },
1324
  "SC": {
1325
+ "accuracy": 0.79,
1326
+ "count": 100
1327
  },
1328
  "UC": {
1329
+ "accuracy": 0.26,
1330
+ "count": 100
1331
  },
1332
  "US": {
1333
+ "accuracy": 0.1225,
1334
+ "count": 400
1335
  }
1336
  }
1337
  },
1338
  "add_S6": {
1339
+ "full_accuracy": 0.12,
1340
+ "digit_accuracy": 0.36142857142857143,
1341
+ "n_examples": 100,
1342
  "per_subtask": {
1343
  "SC": {
1344
+ "accuracy": 0.86,
1345
+ "count": 100
1346
  },
1347
  "UC": {
1348
+ "accuracy": 0.37,
1349
+ "count": 100
1350
  },
1351
  "US": {
1352
+ "accuracy": 0.26,
1353
+ "count": 500
1354
  }
1355
  }
1356
  },
1357
  "add_random": {
1358
+ "full_accuracy": 0.065,
1359
+ "digit_accuracy": 0.6621428571428571,
1360
  "n_examples": 200,
1361
  "per_subtask": {
1362
  "SA": {
1363
+ "accuracy": 0.8478747203579419,
1364
+ "count": 447
1365
  },
1366
  "SC": {
1367
+ "accuracy": 0.884375,
1368
+ "count": 320
1369
  },
1370
  "SS": {
1371
+ "accuracy": 0.8571428571428571,
1372
+ "count": 56
1373
  },
1374
  "UC": {
1375
+ "accuracy": 0.3724007561436673,
1376
+ "count": 529
1377
  },
1378
  "US": {
1379
+ "accuracy": 0.4166666666666667,
1380
+ "count": 48
1381
  }
1382
  }
1383
  },
1384
  "add_C1": {
1385
+ "full_accuracy": 0.13,
1386
+ "digit_accuracy": 0.7485714285714286,
1387
+ "n_examples": 100,
1388
  "per_subtask": {
1389
  "SA": {
1390
+ "accuracy": 0.808,
1391
+ "count": 500
1392
  },
1393
  "SC": {
1394
+ "accuracy": 0.71,
1395
+ "count": 100
1396
  },
1397
  "UC": {
1398
+ "accuracy": 0.49,
1399
+ "count": 100
1400
  }
1401
  }
1402
  },
1403
  "add_C2": {
1404
+ "full_accuracy": 0.1,
1405
+ "digit_accuracy": 0.7142857142857143,
1406
+ "n_examples": 100,
1407
  "per_subtask": {
1408
  "SA": {
1409
+ "accuracy": 0.82,
1410
+ "count": 400
1411
  },
1412
  "SC": {
1413
+ "accuracy": 0.87,
1414
+ "count": 100
1415
  },
1416
  "UC": {
1417
+ "accuracy": 0.42948717948717946,
1418
+ "count": 156
1419
  },
1420
  "US": {
1421
+ "accuracy": 0.4090909090909091,
1422
+ "count": 44
1423
  }
1424
  }
1425
  },
1426
  "add_C3": {
1427
+ "full_accuracy": 0.02,
1428
+ "digit_accuracy": 0.6485714285714286,
1429
+ "n_examples": 100,
1430
  "per_subtask": {
1431
  "SA": {
1432
+ "accuracy": 0.8233333333333334,
1433
+ "count": 300
1434
  },
1435
  "SC": {
1436
+ "accuracy": 0.96,
1437
+ "count": 100
1438
  },
1439
  "UC": {
1440
+ "accuracy": 0.32160804020100503,
1441
+ "count": 199
1442
  },
1443
  "US": {
1444
+ "accuracy": 0.46534653465346537,
1445
+ "count": 101
1446
  }
1447
  }
1448
  },
1449
  "add_C4": {
1450
+ "full_accuracy": 0.01,
1451
+ "digit_accuracy": 0.5728571428571428,
1452
+ "n_examples": 100,
1453
  "per_subtask": {
1454
  "SA": {
1455
+ "accuracy": 0.825,
1456
+ "count": 200
1457
  },
1458
  "SC": {
1459
+ "accuracy": 0.94,
1460
+ "count": 100
1461
  },
1462
  "UC": {
1463
+ "accuracy": 0.29924242424242425,
1464
+ "count": 264
1465
  },
1466
  "US": {
1467
+ "accuracy": 0.4632352941176471,
1468
+ "count": 136
1469
  }
1470
  }
1471
  },
1472
  "add_C5": {
1473
+ "full_accuracy": 0.01,
1474
+ "digit_accuracy": 0.5328571428571428,
1475
+ "n_examples": 100,
1476
  "per_subtask": {
1477
  "SA": {
1478
+ "accuracy": 0.89,
1479
+ "count": 100
1480
  },
1481
  "SC": {
1482
+ "accuracy": 0.94,
1483
+ "count": 100
1484
  },
1485
  "UC": {
1486
+ "accuracy": 0.33225806451612905,
1487
+ "count": 310
1488
  },
1489
  "US": {
1490
+ "accuracy": 0.45789473684210524,
1491
+ "count": 190
1492
  }
1493
  }
1494
  },
1495
  "add_C6": {
1496
+ "full_accuracy": 0.01,
1497
+ "digit_accuracy": 0.49857142857142855,
1498
+ "n_examples": 100,
1499
  "per_subtask": {
1500
  "SC": {
1501
+ "accuracy": 0.88,
1502
+ "count": 100
1503
  },
1504
  "UC": {
1505
+ "accuracy": 0.34864864864864864,
1506
+ "count": 370
1507
  },
1508
  "US": {
1509
+ "accuracy": 0.5739130434782609,
1510
+ "count": 230
1511
  }
1512
  }
1513
  },
1514
  "sub_M0": {
1515
+ "full_accuracy": 0.17,
1516
+ "digit_accuracy": 0.7985714285714286,
1517
+ "n_examples": 100,
1518
  "per_subtask": {
1519
  "MD": {
1520
+ "accuracy": 0.7853658536585366,
1521
+ "count": 615
1522
  },
1523
  "ME": {
1524
+ "accuracy": 0.8941176470588236,
1525
+ "count": 85
1526
  }
1527
  }
1528
  },
1529
  "sub_M1": {
1530
+ "full_accuracy": 0.01,
1531
+ "digit_accuracy": 0.5885714285714285,
1532
+ "n_examples": 100,
1533
  "per_subtask": {
1534
  "MD": {
1535
+ "accuracy": 0.8082191780821918,
1536
+ "count": 292
1537
  },
1538
  "MB": {
1539
+ "accuracy": 0.6736111111111112,
1540
+ "count": 144
1541
  },
1542
  "ME": {
1543
+ "accuracy": 0.88,
1544
+ "count": 25
1545
  },
1546
  "UB": {
1547
+ "accuracy": 0.2384937238493724,
1548
+ "count": 239
1549
  }
1550
  }
1551
  },
1552
  "sub_M2": {
1553
+ "full_accuracy": 0.01,
1554
+ "digit_accuracy": 0.5742857142857143,
1555
+ "n_examples": 100,
1556
  "per_subtask": {
1557
  "MD": {
1558
+ "accuracy": 0.8862559241706162,
1559
+ "count": 211
1560
  },
1561
  "MB": {
1562
+ "accuracy": 0.591304347826087,
1563
+ "count": 115
1564
  },
1565
  "ME": {
1566
+ "accuracy": 0.9529411764705882,
1567
+ "count": 85
1568
  },
1569
  "UB": {
1570
+ "accuracy": 0.30939226519337015,
1571
+ "count": 181
1572
  },
1573
  "UD": {
1574
+ "accuracy": 0.09259259259259259,
1575
+ "count": 108
1576
  }
1577
  }
1578
  },
1579
  "sub_M3": {
1580
+ "full_accuracy": 0.01,
1581
+ "digit_accuracy": 0.5,
1582
+ "n_examples": 100,
1583
  "per_subtask": {
1584
  "MD": {
1585
+ "accuracy": 0.9329608938547486,
1586
+ "count": 179
1587
  },
1588
  "MB": {
1589
+ "accuracy": 0.6310679611650486,
1590
+ "count": 103
1591
  },
1592
  "ME": {
1593
+ "accuracy": 0.9642857142857143,
1594
+ "count": 56
1595
  },
1596
  "UB": {
1597
+ "accuracy": 0.26174496644295303,
1598
+ "count": 149
1599
  },
1600
  "UD": {
1601
+ "accuracy": 0.11737089201877934,
1602
+ "count": 213
1603
  }
1604
  }
1605
  },
1606
  "sub_M4": {
1607
  "full_accuracy": 0.0,
1608
+ "digit_accuracy": 0.4228571428571429,
1609
+ "n_examples": 100,
1610
  "per_subtask": {
1611
  "MD": {
1612
+ "accuracy": 0.8,
1613
+ "count": 200
1614
  },
1615
  "MB": {
1616
+ "accuracy": 0.75,
1617
+ "count": 100
1618
  },
1619
  "UB": {
1620
+ "accuracy": 0.34,
1621
+ "count": 100
1622
  },
1623
  "UD": {
1624
+ "accuracy": 0.09,
1625
+ "count": 300
1626
  }
1627
  }
1628
  },
1629
  "sub_M5": {
1630
  "full_accuracy": 0.0,
1631
+ "digit_accuracy": 0.37,
1632
+ "n_examples": 100,
1633
  "per_subtask": {
1634
  "MD": {
1635
  "accuracy": 1.0,
1636
+ "count": 100
1637
  },
1638
  "MB": {
1639
+ "accuracy": 0.81,
1640
+ "count": 100
1641
  },
1642
  "UB": {
1643
+ "accuracy": 0.37,
1644
+ "count": 100
1645
  },
1646
  "UD": {
1647
+ "accuracy": 0.1025,
1648
+ "count": 400
1649
  }
1650
  }
1651
  },
1652
  "sub_random": {
1653
+ "full_accuracy": 0.015,
1654
+ "digit_accuracy": 0.57,
1655
  "n_examples": 200,
1656
  "per_subtask": {
1657
  "MD": {
1658
+ "accuracy": 0.8083333333333333,
1659
+ "count": 600
1660
  },
1661
  "MB": {
1662
+ "accuracy": 0.6441947565543071,
1663
+ "count": 267
1664
  },
1665
  "ME": {
1666
+ "accuracy": 0.9245283018867925,
1667
  "count": 53
1668
  },
1669
  "UB": {
1670
+ "accuracy": 0.19817767653758542,
1671
+ "count": 439
1672
  },
1673
  "UD": {
1674
+ "accuracy": 0.12195121951219512,
1675
+ "count": 41
1676
  }
1677
  }
1678
  },
1679
  "sub_B3": {
1680
  "full_accuracy": 0.0,
1681
+ "digit_accuracy": 0.52,
1682
+ "n_examples": 100,
1683
  "per_subtask": {
1684
  "MD": {
1685
+ "accuracy": 0.7366666666666667,
1686
+ "count": 300
1687
  },
1688
  "MB": {
1689
+ "accuracy": 0.71,
1690
+ "count": 100
1691
  },
1692
  "UB": {
1693
+ "accuracy": 0.3096446700507614,
1694
+ "count": 197
1695
  },
1696
  "UD": {
1697
+ "accuracy": 0.10679611650485436,
1698
+ "count": 103
1699
  }
1700
  }
1701
  },
1702
  "sub_B4": {
1703
+ "full_accuracy": 0.01,
1704
+ "digit_accuracy": 0.4742857142857143,
1705
+ "n_examples": 100,
1706
  "per_subtask": {
1707
  "MD": {
1708
+ "accuracy": 0.825,
1709
+ "count": 200
1710
  },
1711
  "MB": {
1712
+ "accuracy": 0.73,
1713
+ "count": 100
1714
  },
1715
  "UB": {
1716
+ "accuracy": 0.2591093117408907,
1717
+ "count": 247
1718
  },
1719
  "UD": {
1720
+ "accuracy": 0.19607843137254902,
1721
+ "count": 153
1722
  }
1723
  }
1724
  },
1725
  "sub_B5": {
1726
  "full_accuracy": 0.0,
1727
+ "digit_accuracy": 0.4114285714285714,
1728
+ "n_examples": 100,
1729
  "per_subtask": {
1730
  "MD": {
1731
  "accuracy": 1.0,
1732
+ "count": 100
1733
  },
1734
  "MB": {
1735
+ "accuracy": 0.82,
1736
+ "count": 100
1737
  },
1738
  "UB": {
1739
+ "accuracy": 0.24496644295302014,
1740
+ "count": 298
1741
  },
1742
  "UD": {
1743
+ "accuracy": 0.16336633663366337,
1744
+ "count": 202
1745
  }
1746
  }
1747
  }
1748
  },
1749
  "summary": {
1750
+ "overall_accuracy": 0.04923076923076923,
1751
+ "digit_accuracy": 0.5696703296703297,
1752
+ "total_examples": 2600,
1753
  "n_splits": 24
1754
  }
1755
  },
 
1759
  "K": 1,
1760
  "mode": "sorl",
1761
  "n_digits": 6,
1762
+ "n_per_split": 100
1763
  },
1764
  "splits": {
1765
  "add_S0": {
1766
+ "full_accuracy": 0.41,
1767
+ "digit_accuracy": 0.87,
1768
+ "n_examples": 100,
1769
  "per_subtask": {
1770
  "SA": {
1771
+ "accuracy": 0.8925619834710744,
1772
+ "count": 605
1773
  },
1774
  "SS": {
1775
+ "accuracy": 0.7263157894736842,
1776
+ "count": 95
1777
  }
1778
  }
1779
  },
1780
  "add_S1": {
1781
+ "full_accuracy": 0.1,
1782
+ "digit_accuracy": 0.7042857142857143,
1783
+ "n_examples": 100,
1784
  "per_subtask": {
1785
  "SA": {
1786
+ "accuracy": 0.9019607843137255,
1787
+ "count": 204
1788
  },
1789
  "SC": {
1790
+ "accuracy": 0.8461538461538461,
1791
+ "count": 169
1792
  },
1793
  "SS": {
1794
+ "accuracy": 0.8709677419354839,
1795
+ "count": 31
1796
  },
1797
  "UC": {
1798
+ "accuracy": 0.46959459459459457,
1799
+ "count": 296
1800
  }
1801
  }
1802
  },
1803
  "add_S2": {
1804
+ "full_accuracy": 0.12,
1805
+ "digit_accuracy": 0.7385714285714285,
1806
+ "n_examples": 100,
1807
  "per_subtask": {
1808
  "SA": {
1809
+ "accuracy": 0.9325153374233128,
1810
+ "count": 163
1811
  },
1812
  "SC": {
1813
+ "accuracy": 0.9,
1814
+ "count": 130
1815
  },
1816
  "SS": {
1817
+ "accuracy": 0.735632183908046,
1818
+ "count": 87
1819
  },
1820
  "UC": {
1821
+ "accuracy": 0.4630541871921182,
1822
+ "count": 203
1823
  },
1824
  "US": {
1825
+ "accuracy": 0.7692307692307693,
1826
+ "count": 117
1827
  }
1828
  }
1829
  },
1830
  "add_S3": {
1831
+ "full_accuracy": 0.13,
1832
+ "digit_accuracy": 0.6657142857142857,
1833
+ "n_examples": 100,
1834
  "per_subtask": {
1835
  "SA": {
1836
+ "accuracy": 0.9504132231404959,
1837
+ "count": 121
1838
  },
1839
  "SC": {
1840
+ "accuracy": 0.859504132231405,
1841
+ "count": 121
1842
  },
1843
  "SS": {
1844
+ "accuracy": 0.8979591836734694,
1845
+ "count": 49
1846
  },
1847
  "UC": {
1848
+ "accuracy": 0.44086021505376344,
1849
+ "count": 186
1850
  },
1851
  "US": {
1852
+ "accuracy": 0.5426008968609866,
1853
+ "count": 223
1854
  }
1855
  }
1856
  },
1857
  "add_S4": {
1858
+ "full_accuracy": 0.11,
1859
+ "digit_accuracy": 0.6028571428571429,
1860
+ "n_examples": 100,
1861
  "per_subtask": {
1862
  "SA": {
1863
+ "accuracy": 0.9711538461538461,
1864
+ "count": 104
1865
  },
1866
  "SC": {
1867
+ "accuracy": 0.8018867924528302,
1868
+ "count": 106
1869
  },
1870
  "SS": {
1871
+ "accuracy": 0.9130434782608695,
1872
+ "count": 23
1873
  },
1874
  "UC": {
1875
+ "accuracy": 0.44375,
1876
+ "count": 160
1877
  },
1878
  "US": {
1879
+ "accuracy": 0.46905537459283386,
1880
+ "count": 307
1881
  }
1882
  }
1883
  },
1884
  "add_S5": {
1885
+ "full_accuracy": 0.13,
1886
+ "digit_accuracy": 0.55,
1887
+ "n_examples": 100,
1888
  "per_subtask": {
1889
  "SA": {
1890
+ "accuracy": 1.0,
1891
+ "count": 100
1892
  },
1893
  "SC": {
1894
+ "accuracy": 0.87,
1895
+ "count": 100
1896
  },
1897
  "UC": {
1898
+ "accuracy": 0.28,
1899
+ "count": 100
1900
  },
1901
  "US": {
1902
+ "accuracy": 0.425,
1903
+ "count": 400
1904
  }
1905
  }
1906
  },
1907
  "add_S6": {
1908
+ "full_accuracy": 0.13,
1909
+ "digit_accuracy": 0.40714285714285714,
1910
+ "n_examples": 100,
1911
  "per_subtask": {
1912
  "SC": {
1913
+ "accuracy": 0.89,
1914
+ "count": 100
1915
  },
1916
  "UC": {
1917
+ "accuracy": 0.3,
1918
+ "count": 100
1919
  },
1920
  "US": {
1921
+ "accuracy": 0.332,
1922
+ "count": 500
1923
  }
1924
  }
1925
  },
1926
  "add_random": {
1927
+ "full_accuracy": 0.085,
1928
+ "digit_accuracy": 0.7257142857142858,
1929
  "n_examples": 200,
1930
  "per_subtask": {
1931
  "SA": {
1932
+ "accuracy": 0.9284116331096197,
1933
+ "count": 447
1934
  },
1935
  "SC": {
1936
+ "accuracy": 0.896875,
1937
+ "count": 320
1938
  },
1939
  "SS": {
1940
+ "accuracy": 0.75,
1941
+ "count": 56
1942
  },
1943
  "UC": {
1944
+ "accuracy": 0.4499054820415879,
1945
+ "count": 529
1946
  },
1947
  "US": {
1948
+ "accuracy": 0.7083333333333334,
1949
+ "count": 48
1950
  }
1951
  }
1952
  },
1953
  "add_C1": {
1954
+ "full_accuracy": 0.2,
1955
+ "digit_accuracy": 0.8142857142857143,
1956
+ "n_examples": 100,
1957
  "per_subtask": {
1958
  "SA": {
1959
+ "accuracy": 0.894,
1960
+ "count": 500
1961
  },
1962
  "SC": {
1963
+ "accuracy": 0.81,
1964
+ "count": 100
1965
  },
1966
  "UC": {
1967
+ "accuracy": 0.42,
1968
+ "count": 100
1969
  }
1970
  }
1971
  },
1972
  "add_C2": {
1973
+ "full_accuracy": 0.16,
1974
+ "digit_accuracy": 0.7828571428571428,
1975
+ "n_examples": 100,
1976
  "per_subtask": {
1977
  "SA": {
1978
+ "accuracy": 0.9325,
1979
+ "count": 400
1980
  },
1981
  "SC": {
1982
+ "accuracy": 0.84,
1983
+ "count": 100
1984
  },
1985
  "UC": {
1986
+ "accuracy": 0.41025641025641024,
1987
+ "count": 156
1988
  },
1989
  "US": {
1990
+ "accuracy": 0.6136363636363636,
1991
+ "count": 44
1992
  }
1993
  }
1994
  },
1995
  "add_C3": {
1996
+ "full_accuracy": 0.08,
1997
+ "digit_accuracy": 0.71,
1998
+ "n_examples": 100,
1999
  "per_subtask": {
2000
  "SA": {
2001
+ "accuracy": 0.8966666666666666,
2002
+ "count": 300
2003
  },
2004
  "SC": {
2005
+ "accuracy": 0.87,
2006
+ "count": 100
2007
  },
2008
  "UC": {
2009
+ "accuracy": 0.36180904522613067,
2010
+ "count": 199
2011
  },
2012
  "US": {
2013
+ "accuracy": 0.6831683168316832,
2014
+ "count": 101
2015
  }
2016
  }
2017
  },
2018
  "add_C4": {
2019
+ "full_accuracy": 0.05,
2020
+ "digit_accuracy": 0.6157142857142858,
2021
+ "n_examples": 100,
2022
  "per_subtask": {
2023
  "SA": {
2024
+ "accuracy": 0.92,
2025
+ "count": 200
2026
  },
2027
  "SC": {
2028
+ "accuracy": 0.85,
2029
+ "count": 100
2030
  },
2031
  "UC": {
2032
+ "accuracy": 0.2916666666666667,
2033
+ "count": 264
2034
  },
2035
  "US": {
2036
+ "accuracy": 0.625,
2037
+ "count": 136
2038
  }
2039
  }
2040
  },
2041
  "add_C5": {
2042
+ "full_accuracy": 0.02,
2043
+ "digit_accuracy": 0.6071428571428571,
2044
+ "n_examples": 100,
2045
  "per_subtask": {
2046
  "SA": {
2047
+ "accuracy": 0.99,
2048
+ "count": 100
2049
  },
2050
  "SC": {
2051
+ "accuracy": 0.95,
2052
+ "count": 100
2053
  },
2054
  "UC": {
2055
+ "accuracy": 0.4161290322580645,
2056
+ "count": 310
2057
  },
2058
  "US": {
2059
+ "accuracy": 0.5368421052631579,
2060
+ "count": 190
2061
  }
2062
  }
2063
  },
2064
  "add_C6": {
2065
  "full_accuracy": 0.0,
2066
+ "digit_accuracy": 0.5742857142857143,
2067
+ "n_examples": 100,
2068
  "per_subtask": {
2069
  "SC": {
2070
+ "accuracy": 0.94,
2071
+ "count": 100
2072
  },
2073
  "UC": {
2074
+ "accuracy": 0.45135135135135135,
2075
+ "count": 370
2076
  },
2077
  "US": {
2078
+ "accuracy": 0.6130434782608696,
2079
+ "count": 230
2080
  }
2081
  }
2082
  },
2083
  "sub_M0": {
2084
+ "full_accuracy": 0.14,
2085
+ "digit_accuracy": 0.7957142857142857,
2086
+ "n_examples": 100,
2087
  "per_subtask": {
2088
  "MD": {
2089
+ "accuracy": 0.7804878048780488,
2090
+ "count": 615
2091
  },
2092
  "ME": {
2093
+ "accuracy": 0.9058823529411765,
2094
+ "count": 85
2095
  }
2096
  }
2097
  },
2098
  "sub_M1": {
2099
+ "full_accuracy": 0.03,
2100
+ "digit_accuracy": 0.6628571428571428,
2101
+ "n_examples": 100,
2102
  "per_subtask": {
2103
  "MD": {
2104
+ "accuracy": 0.7705479452054794,
2105
+ "count": 292
2106
  },
2107
  "MB": {
2108
+ "accuracy": 0.8680555555555556,
2109
+ "count": 144
2110
  },
2111
  "ME": {
2112
+ "accuracy": 0.88,
2113
+ "count": 25
2114
  },
2115
  "UB": {
2116
+ "accuracy": 0.38493723849372385,
2117
+ "count": 239
2118
  }
2119
  }
2120
  },
2121
  "sub_M2": {
2122
+ "full_accuracy": 0.01,
2123
+ "digit_accuracy": 0.6585714285714286,
2124
+ "n_examples": 100,
2125
  "per_subtask": {
2126
  "MD": {
2127
+ "accuracy": 0.8672985781990521,
2128
+ "count": 211
2129
  },
2130
  "MB": {
2131
+ "accuracy": 0.8608695652173913,
2132
+ "count": 115
2133
  },
2134
  "ME": {
2135
+ "accuracy": 0.9764705882352941,
2136
+ "count": 85
2137
  },
2138
  "UB": {
2139
+ "accuracy": 0.430939226519337,
2140
+ "count": 181
2141
  },
2142
  "UD": {
2143
+ "accuracy": 0.16666666666666666,
2144
+ "count": 108
2145
  }
2146
  }
2147
  },
2148
  "sub_M3": {
2149
+ "full_accuracy": 0.03,
2150
+ "digit_accuracy": 0.6,
2151
+ "n_examples": 100,
2152
  "per_subtask": {
2153
  "MD": {
2154
+ "accuracy": 0.9329608938547486,
2155
+ "count": 179
2156
  },
2157
  "MB": {
2158
+ "accuracy": 0.8932038834951457,
2159
+ "count": 103
2160
  },
2161
  "ME": {
2162
  "accuracy": 1.0,
2163
+ "count": 56
2164
  },
2165
  "UB": {
2166
+ "accuracy": 0.4228187919463087,
2167
+ "count": 149
2168
  },
2169
  "UD": {
2170
+ "accuracy": 0.19718309859154928,
2171
+ "count": 213
2172
  }
2173
  }
2174
  },
2175
  "sub_M4": {
2176
+ "full_accuracy": 0.05,
2177
+ "digit_accuracy": 0.5314285714285715,
2178
+ "n_examples": 100,
2179
  "per_subtask": {
2180
  "MD": {
2181
+ "accuracy": 0.86,
2182
+ "count": 200
2183
  },
2184
  "MB": {
2185
+ "accuracy": 0.92,
2186
+ "count": 100
2187
  },
2188
  "UB": {
2189
+ "accuracy": 0.54,
2190
+ "count": 100
2191
  },
2192
  "UD": {
2193
+ "accuracy": 0.18,
2194
+ "count": 300
2195
  }
2196
  }
2197
  },
2198
  "sub_M5": {
2199
+ "full_accuracy": 0.03,
2200
+ "digit_accuracy": 0.3942857142857143,
2201
+ "n_examples": 100,
2202
  "per_subtask": {
2203
  "MD": {
2204
  "accuracy": 1.0,
2205
+ "count": 100
2206
  },
2207
  "MB": {
2208
+ "accuracy": 0.97,
2209
+ "count": 100
2210
  },
2211
  "UB": {
2212
+ "accuracy": 0.51,
2213
+ "count": 100
2214
  },
2215
  "UD": {
2216
+ "accuracy": 0.07,
2217
+ "count": 400
2218
  }
2219
  }
2220
  },
2221
  "sub_random": {
2222
  "full_accuracy": 0.04,
2223
+ "digit_accuracy": 0.6657142857142857,
2224
  "n_examples": 200,
2225
  "per_subtask": {
2226
  "MD": {
2227
+ "accuracy": 0.8083333333333333,
2228
+ "count": 600
2229
  },
2230
  "MB": {
2231
+ "accuracy": 0.8539325842696629,
2232
+ "count": 267
2233
  },
2234
  "ME": {
2235
+ "accuracy": 0.9245283018867925,
2236
  "count": 53
2237
  },
2238
  "UB": {
2239
+ "accuracy": 0.3690205011389522,
2240
+ "count": 439
2241
  },
2242
  "UD": {
2243
+ "accuracy": 0.1951219512195122,
2244
+ "count": 41
2245
  }
2246
  }
2247
  },
2248
  "sub_B3": {
2249
+ "full_accuracy": 0.02,
2250
+ "digit_accuracy": 0.5985714285714285,
2251
+ "n_examples": 100,
2252
  "per_subtask": {
2253
  "MD": {
2254
+ "accuracy": 0.8133333333333334,
2255
+ "count": 300
2256
  },
2257
  "MB": {
2258
+ "accuracy": 0.84,
2259
+ "count": 100
2260
  },
2261
  "UB": {
2262
+ "accuracy": 0.38578680203045684,
2263
+ "count": 197
2264
  },
2265
  "UD": {
2266
+ "accuracy": 0.14563106796116504,
2267
+ "count": 103
2268
  }
2269
  }
2270
  },
2271
  "sub_B4": {
2272
  "full_accuracy": 0.0,
2273
+ "digit_accuracy": 0.5528571428571428,
2274
+ "n_examples": 100,
2275
  "per_subtask": {
2276
  "MD": {
2277
+ "accuracy": 0.875,
2278
+ "count": 200
2279
  },
2280
  "MB": {
2281
+ "accuracy": 0.91,
2282
+ "count": 100
2283
  },
2284
  "UB": {
2285
+ "accuracy": 0.3360323886639676,
2286
+ "count": 247
2287
  },
2288
  "UD": {
2289
+ "accuracy": 0.24836601307189543,
2290
+ "count": 153
2291
  }
2292
  }
2293
  },
2294
  "sub_B5": {
2295
  "full_accuracy": 0.0,
2296
+ "digit_accuracy": 0.5,
2297
+ "n_examples": 100,
2298
  "per_subtask": {
2299
  "MD": {
2300
  "accuracy": 1.0,
2301
+ "count": 100
2302
  },
2303
  "MB": {
2304
+ "accuracy": 1.0,
2305
+ "count": 100
2306
  },
2307
  "UB": {
2308
+ "accuracy": 0.348993288590604,
2309
+ "count": 298
2310
  },
2311
  "UD": {
2312
+ "accuracy": 0.22772277227722773,
2313
+ "count": 202
2314
  }
2315
  }
2316
  }
2317
  },
2318
  "summary": {
2319
+ "overall_accuracy": 0.08461538461538462,
2320
+ "digit_accuracy": 0.642967032967033,
2321
+ "total_examples": 2600,
2322
  "n_splits": 24
2323
  }
2324
  },
2325
+ "sorl_overall_accuracy": 0.08461538461538462,
2326
+ "sft_overall_accuracy": 0.04923076923076923
2327
  }
add_sub_sorl_v1_abs10_K1_25K_1L2H256d/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a10bb1f5b2eb181b8afa8885bc9617b44e94d4a7398d421530a00ee21fe002b3
3
  size 315091124
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47628d01fe602b58dee9cf66e44ae762c69be2e2df852104d940b6c1aca5d2b0
3
  size 315091124
add_sub_sorl_v1_abs10_K1_25K_1L2H256d/train_config.json CHANGED
@@ -20,7 +20,7 @@
20
  "lr": 2e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
- "warmup_steps": 117,
24
  "cooldown_frac": 0.4,
25
  "max_grad_norm": 1.0,
26
  "vq_abs_pretrain_steps": 0,
@@ -30,7 +30,7 @@
30
  "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
  "gradient_accumulation_steps": 1,
33
- "num_epochs": 10,
34
  "emb_warmup_steps": 0,
35
  "log_every": 50,
36
  "eval_every": 390,
@@ -69,16 +69,20 @@
69
  "no_wandb": false,
70
  "n_params": 78696448,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_25K_1L2H256d",
72
- "git_commit": "3ae8ca0d6b88706715f25991b1e1acd0e3a6e0a6",
73
- "timestamp": "2026-04-12T20:10:10.507089+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
 
77
  "model_repo": "thoughtworks/arithmetic-sorl",
78
  "trainer_version": "v1",
79
- "wandb_run_id": "8olk3u7a",
80
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/8olk3u7a",
81
- "final_accuracy": 0.06291666666666666,
82
- "sft_accuracy": 0.011666666666666667,
 
 
 
83
  "eval_method": "ArithmeticEvaluator"
84
  }
 
20
  "lr": 2e-05,
21
  "emb_lr_mult": 1.0,
22
  "weight_decay": 0.01,
23
+ "warmup_steps": 234,
24
  "cooldown_frac": 0.4,
25
  "max_grad_norm": 1.0,
26
  "vq_abs_pretrain_steps": 0,
 
30
  "vq_abs_pretrain_target_vectors": 20000,
31
  "batch_size": 64,
32
  "gradient_accumulation_steps": 1,
33
+ "num_epochs": 20,
34
  "emb_warmup_steps": 0,
35
  "log_every": 50,
36
  "eval_every": 390,
 
69
  "no_wandb": false,
70
  "n_params": 78696448,
71
  "run_name": "add_sub_sorl_v1_abs10_K1_25K_1L2H256d",
72
+ "git_commit": "f835493c19eb98267697007042c9d440cad2afbb",
73
+ "timestamp": "2026-04-15T20:34:04.553671+00:00",
74
  "tokenizer": "Qwen/Qwen3-0.6B",
75
  "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
  "dataset_config": "add_sub_6digit",
77
+ "train_dataset": "fixed_train/train_25K_seed42.pt",
78
  "model_repo": "thoughtworks/arithmetic-sorl",
79
  "trainer_version": "v1",
80
+ "wandb_run_id": "lz3nsjvy",
81
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/lz3nsjvy",
82
+ "eval_final_dataset": "eval_sets/eval_add_sub_6d_N100_seed42.json",
83
+ "eval_epoch_dataset": "eval_sets/eval_add_sub_6d_N25_seed42.json",
84
+ "eval_hf_repo": "thoughtworks/arithmetic-sorl-data",
85
+ "final_accuracy": 0.08461538461538462,
86
+ "sft_accuracy": 0.04923076923076923,
87
  "eval_method": "ArithmeticEvaluator"
88
  }