amirali1985 commited on
Commit
eba78af
·
verified ·
1 Parent(s): 698e661

Delete folder add_sub_baseline_25K_2L1H128d with huggingface_hub

Browse files
add_sub_baseline_25K_2L1H128d/config.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "architectures": [
3
- "SorlModelWrapper"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": null,
8
- "dtype": "float32",
9
- "eos_token_id": null,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 128,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 512,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention"
18
- ],
19
- "max_position_embeddings": 128,
20
- "max_window_layers": 28,
21
- "model_type": "qwen3",
22
- "num_attention_heads": 1,
23
- "num_hidden_layers": 2,
24
- "num_key_value_heads": 1,
25
- "pad_token_id": null,
26
- "rms_norm_eps": 1e-06,
27
- "rope_parameters": {
28
- "rope_theta": 10000.0,
29
- "rope_type": "default"
30
- },
31
- "sliding_window": null,
32
- "tie_word_embeddings": false,
33
- "transformers_version": "5.5.0",
34
- "use_cache": true,
35
- "use_sliding_window": false,
36
- "vocab_size": 151645
37
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
add_sub_baseline_25K_2L1H128d/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "output_attentions": false,
4
- "output_hidden_states": false,
5
- "transformers_version": "5.5.0",
6
- "use_cache": true
7
- }
 
 
 
 
 
 
 
 
add_sub_baseline_25K_2L1H128d/metrics.json DELETED
@@ -1,1207 +0,0 @@
1
- {
2
- "history": {
3
- "step": [
4
- 50,
5
- 100,
6
- 150,
7
- 200,
8
- 250,
9
- 300,
10
- 350,
11
- 400,
12
- 450,
13
- 500,
14
- 550,
15
- 600,
16
- 650,
17
- 700,
18
- 750,
19
- 800,
20
- 850,
21
- 900,
22
- 950,
23
- 1000,
24
- 1050,
25
- 1100,
26
- 1150,
27
- 1200,
28
- 1250,
29
- 1300,
30
- 1350,
31
- 1400,
32
- 1450,
33
- 1500,
34
- 1550,
35
- 1600,
36
- 1650,
37
- 1700,
38
- 1750,
39
- 1800,
40
- 1850,
41
- 1900,
42
- 1950,
43
- 2000,
44
- 2050,
45
- 2100,
46
- 2150,
47
- 2200,
48
- 2250,
49
- 2300,
50
- 2350,
51
- 2400,
52
- 2450,
53
- 2500,
54
- 2550,
55
- 2600,
56
- 2650,
57
- 2700,
58
- 2750,
59
- 2800,
60
- 2850,
61
- 2900,
62
- 2950,
63
- 3000,
64
- 3050,
65
- 3100,
66
- 3150,
67
- 3200,
68
- 3250,
69
- 3300,
70
- 3350,
71
- 3400,
72
- 3450,
73
- 3500,
74
- 3550,
75
- 3600,
76
- 3650,
77
- 3700,
78
- 3750,
79
- 3800,
80
- 3850,
81
- 3900,
82
- 3950,
83
- 4000,
84
- 4050,
85
- 4100,
86
- 4150,
87
- 4200,
88
- 4250,
89
- 4300,
90
- 4350,
91
- 4400,
92
- 4450,
93
- 4500,
94
- 4550,
95
- 4600,
96
- 4650,
97
- 4700,
98
- 4750,
99
- 4800,
100
- 4850,
101
- 4900,
102
- 4950,
103
- 5000,
104
- 5050,
105
- 5100,
106
- 5150,
107
- 5200,
108
- 5250,
109
- 5300,
110
- 5350,
111
- 5400,
112
- 5450,
113
- 5500,
114
- 5550,
115
- 5600,
116
- 5650,
117
- 5700,
118
- 5750,
119
- 5800,
120
- 5850,
121
- 5900,
122
- 5950,
123
- 6000,
124
- 6050,
125
- 6100,
126
- 6150,
127
- 6200,
128
- 6250,
129
- 6300,
130
- 6350,
131
- 6400,
132
- 6450,
133
- 6500,
134
- 6550,
135
- 6600,
136
- 6650,
137
- 6700,
138
- 6750,
139
- 6800,
140
- 6850,
141
- 6900,
142
- 6950,
143
- 7000,
144
- 7050,
145
- 7100,
146
- 7150,
147
- 7200,
148
- 7250,
149
- 7300,
150
- 7350,
151
- 7400,
152
- 7450,
153
- 7500,
154
- 7550,
155
- 7600,
156
- 7650,
157
- 7700,
158
- 7750,
159
- 7800
160
- ],
161
- "loss": [
162
- 11.7634916305542,
163
- 11.45627498626709,
164
- 11.060808181762695,
165
- 10.771638870239258,
166
- 10.56745719909668,
167
- 10.452330589294434,
168
- 10.187796592712402,
169
- 9.892010688781738,
170
- 9.670610427856445,
171
- 9.368887901306152,
172
- 8.985150337219238,
173
- 8.671424865722656,
174
- 8.289552688598633,
175
- 7.789375305175781,
176
- 7.353806018829346,
177
- 6.85903263092041,
178
- 6.285610675811768,
179
- 5.735620975494385,
180
- 5.199695587158203,
181
- 4.512220859527588,
182
- 3.904197931289673,
183
- 3.371004581451416,
184
- 2.821594476699829,
185
- 2.475250482559204,
186
- 2.342219114303589,
187
- 2.2267563343048096,
188
- 2.090350866317749,
189
- 1.9642821550369263,
190
- 1.9541330337524414,
191
- 1.9481629133224487,
192
- 1.8441284894943237,
193
- 1.8009576797485352,
194
- 1.6884515285491943,
195
- 1.7101805210113525,
196
- 1.69532310962677,
197
- 1.5124719142913818,
198
- 1.4401136636734009,
199
- 1.2156561613082886,
200
- 1.0262309312820435,
201
- 0.8835648894309998,
202
- 0.8472915291786194,
203
- 0.7222514152526855,
204
- 0.6780160069465637,
205
- 0.6544712781906128,
206
- 0.6244059205055237,
207
- 0.5614253878593445,
208
- 0.5999141335487366,
209
- 0.5366412401199341,
210
- 0.525538444519043,
211
- 0.4697754383087158,
212
- 0.43578100204467773,
213
- 0.42362627387046814,
214
- 0.4657476842403412,
215
- 0.39314374327659607,
216
- 0.44571369886398315,
217
- 0.46929022669792175,
218
- 0.4601804316043854,
219
- 0.41770702600479126,
220
- 0.3800906240940094,
221
- 0.35692691802978516,
222
- 0.36574602127075195,
223
- 0.36929160356521606,
224
- 0.3116826117038727,
225
- 0.33278241753578186,
226
- 0.3195117115974426,
227
- 0.33156225085258484,
228
- 0.33221402764320374,
229
- 0.28792819380760193,
230
- 0.28109949827194214,
231
- 0.34874439239501953,
232
- 0.3130231499671936,
233
- 0.293014794588089,
234
- 0.3234354853630066,
235
- 0.3052525818347931,
236
- 0.3068646788597107,
237
- 0.27876633405685425,
238
- 0.2666243016719818,
239
- 0.2800367474555969,
240
- 0.2649993300437927,
241
- 0.2613418698310852,
242
- 0.2548615634441376,
243
- 0.2682749629020691,
244
- 0.26009830832481384,
245
- 0.28758829832077026,
246
- 0.2846777141094208,
247
- 0.267145574092865,
248
- 0.2375868707895279,
249
- 0.24300037324428558,
250
- 0.2557847797870636,
251
- 0.27018988132476807,
252
- 0.29427194595336914,
253
- 0.26620692014694214,
254
- 0.21452593803405762,
255
- 0.25236931443214417,
256
- 0.21389567852020264,
257
- 0.23571054637432098,
258
- 0.21729664504528046,
259
- 0.23620560765266418,
260
- 0.244893416762352,
261
- 0.2140817642211914,
262
- 0.1934843510389328,
263
- 0.2416810244321823,
264
- 0.26883748173713684,
265
- 0.25028640031814575,
266
- 0.17800457775592804,
267
- 0.19792746007442474,
268
- 0.21290241181850433,
269
- 0.2256382256746292,
270
- 0.21823109686374664,
271
- 0.18950210511684418,
272
- 0.17954210937023163,
273
- 0.1867464929819107,
274
- 0.21450135111808777,
275
- 0.18878397345542908,
276
- 0.17990292608737946,
277
- 0.18001924455165863,
278
- 0.17861464619636536,
279
- 0.23842141032218933,
280
- 0.16990317404270172,
281
- 0.19019348919391632,
282
- 0.18776902556419373,
283
- 0.16702207922935486,
284
- 0.20157763361930847,
285
- 0.1695648729801178,
286
- 0.1583036333322525,
287
- 0.173870250582695,
288
- 0.1676294356584549,
289
- 0.1616489589214325,
290
- 0.15891310572624207,
291
- 0.1802377700805664,
292
- 0.18001994490623474,
293
- 0.15260060131549835,
294
- 0.1811007410287857,
295
- 0.18948005139827728,
296
- 0.18753862380981445,
297
- 0.14070627093315125,
298
- 0.15743663907051086,
299
- 0.21269260346889496,
300
- 0.14259196817874908,
301
- 0.15691256523132324,
302
- 0.13290558755397797,
303
- 0.15115270018577576,
304
- 0.15428075194358826,
305
- 0.14059321582317352,
306
- 0.16690918803215027,
307
- 0.18262195587158203,
308
- 0.20530763268470764,
309
- 0.1720554083585739,
310
- 0.17429032921791077,
311
- 0.13190022110939026,
312
- 0.1686907261610031,
313
- 0.16373898088932037,
314
- 0.15965792536735535,
315
- 0.17695140838623047,
316
- 0.15248863399028778,
317
- 0.148233100771904
318
- ],
319
- "base_loss": [
320
- 11.7634916305542,
321
- 11.45627498626709,
322
- 11.060808181762695,
323
- 10.771638870239258,
324
- 10.56745719909668,
325
- 10.452330589294434,
326
- 10.187796592712402,
327
- 9.892010688781738,
328
- 9.670610427856445,
329
- 9.368887901306152,
330
- 8.985150337219238,
331
- 8.671424865722656,
332
- 8.289552688598633,
333
- 7.789375305175781,
334
- 7.353806018829346,
335
- 6.85903263092041,
336
- 6.285610675811768,
337
- 5.735620975494385,
338
- 5.199695587158203,
339
- 4.512220859527588,
340
- 3.904197931289673,
341
- 3.371004581451416,
342
- 2.821594476699829,
343
- 2.475250482559204,
344
- 2.342219114303589,
345
- 2.2267563343048096,
346
- 2.090350866317749,
347
- 1.9642821550369263,
348
- 1.9541330337524414,
349
- 1.9481629133224487,
350
- 1.8441284894943237,
351
- 1.8009576797485352,
352
- 1.6884515285491943,
353
- 1.7101805210113525,
354
- 1.69532310962677,
355
- 1.5124719142913818,
356
- 1.4401136636734009,
357
- 1.2156561613082886,
358
- 1.0262309312820435,
359
- 0.8835648894309998,
360
- 0.8472915291786194,
361
- 0.7222514152526855,
362
- 0.6780160069465637,
363
- 0.6544712781906128,
364
- 0.6244059205055237,
365
- 0.5614253878593445,
366
- 0.5999141335487366,
367
- 0.5366412401199341,
368
- 0.525538444519043,
369
- 0.4697754383087158,
370
- 0.43578100204467773,
371
- 0.42362627387046814,
372
- 0.4657476842403412,
373
- 0.39314374327659607,
374
- 0.44571369886398315,
375
- 0.46929022669792175,
376
- 0.4601804316043854,
377
- 0.41770702600479126,
378
- 0.3800906240940094,
379
- 0.35692691802978516,
380
- 0.36574602127075195,
381
- 0.36929160356521606,
382
- 0.3116826117038727,
383
- 0.33278241753578186,
384
- 0.3195117115974426,
385
- 0.33156225085258484,
386
- 0.33221402764320374,
387
- 0.28792819380760193,
388
- 0.28109949827194214,
389
- 0.34874439239501953,
390
- 0.3130231499671936,
391
- 0.293014794588089,
392
- 0.3234354853630066,
393
- 0.3052525818347931,
394
- 0.3068646788597107,
395
- 0.27876633405685425,
396
- 0.2666243016719818,
397
- 0.2800367474555969,
398
- 0.2649993300437927,
399
- 0.2613418698310852,
400
- 0.2548615634441376,
401
- 0.2682749629020691,
402
- 0.26009830832481384,
403
- 0.28758829832077026,
404
- 0.2846777141094208,
405
- 0.267145574092865,
406
- 0.2375868707895279,
407
- 0.24300037324428558,
408
- 0.2557847797870636,
409
- 0.27018988132476807,
410
- 0.29427194595336914,
411
- 0.26620692014694214,
412
- 0.21452593803405762,
413
- 0.25236931443214417,
414
- 0.21389567852020264,
415
- 0.23571054637432098,
416
- 0.21729664504528046,
417
- 0.23620560765266418,
418
- 0.244893416762352,
419
- 0.2140817642211914,
420
- 0.1934843510389328,
421
- 0.2416810244321823,
422
- 0.26883748173713684,
423
- 0.25028640031814575,
424
- 0.17800457775592804,
425
- 0.19792746007442474,
426
- 0.21290241181850433,
427
- 0.2256382256746292,
428
- 0.21823109686374664,
429
- 0.18950210511684418,
430
- 0.17954210937023163,
431
- 0.1867464929819107,
432
- 0.21450135111808777,
433
- 0.18878397345542908,
434
- 0.17990292608737946,
435
- 0.18001924455165863,
436
- 0.17861464619636536,
437
- 0.23842141032218933,
438
- 0.16990317404270172,
439
- 0.19019348919391632,
440
- 0.18776902556419373,
441
- 0.16702207922935486,
442
- 0.20157763361930847,
443
- 0.1695648729801178,
444
- 0.1583036333322525,
445
- 0.173870250582695,
446
- 0.1676294356584549,
447
- 0.1616489589214325,
448
- 0.15891310572624207,
449
- 0.1802377700805664,
450
- 0.18001994490623474,
451
- 0.15260060131549835,
452
- 0.1811007410287857,
453
- 0.18948005139827728,
454
- 0.18753862380981445,
455
- 0.14070627093315125,
456
- 0.15743663907051086,
457
- 0.21269260346889496,
458
- 0.14259196817874908,
459
- 0.15691256523132324,
460
- 0.13290558755397797,
461
- 0.15115270018577576,
462
- 0.15428075194358826,
463
- 0.14059321582317352,
464
- 0.16690918803215027,
465
- 0.18262195587158203,
466
- 0.20530763268470764,
467
- 0.1720554083585739,
468
- 0.17429032921791077,
469
- 0.13190022110939026,
470
- 0.1686907261610031,
471
- 0.16373898088932037,
472
- 0.15965792536735535,
473
- 0.17695140838623047,
474
- 0.15248863399028778,
475
- 0.148233100771904
476
- ],
477
- "lr": [
478
- 2.5063938618925837e-06,
479
- 5.063938618925831e-06,
480
- 7.62148337595908e-06,
481
- 1.0179028132992328e-05,
482
- 1.2736572890025576e-05,
483
- 1.5294117647058822e-05,
484
- 1.7851662404092073e-05,
485
- 2.040920716112532e-05,
486
- 2.296675191815857e-05,
487
- 2.5524296675191817e-05,
488
- 2.8081841432225065e-05,
489
- 3.0639386189258316e-05,
490
- 3.3196930946291564e-05,
491
- 3.575447570332481e-05,
492
- 3.831202046035806e-05,
493
- 4.086956521739131e-05,
494
- 4.3427109974424555e-05,
495
- 4.598465473145781e-05,
496
- 4.854219948849105e-05,
497
- 5.10997442455243e-05,
498
- 5.365728900255755e-05,
499
- 5.62148337595908e-05,
500
- 5.877237851662404e-05,
501
- 6.13299232736573e-05,
502
- 6.388746803069055e-05,
503
- 6.644501278772379e-05,
504
- 6.900255754475704e-05,
505
- 7.156010230179029e-05,
506
- 7.411764705882354e-05,
507
- 7.667519181585678e-05,
508
- 7.923273657289003e-05,
509
- 7.999382181128958e-05,
510
- 7.996356588945887e-05,
511
- 7.990811651495726e-05,
512
- 7.982750864365423e-05,
513
- 7.97217930916005e-05,
514
- 7.9591036502993e-05,
515
- 7.943532130816183e-05,
516
- 7.925474567160515e-05,
517
- 7.904942343010533e-05,
518
- 7.881948402096506e-05,
519
- 7.856507240040864e-05,
520
- 7.828634895220009e-05,
521
- 7.798348938653556e-05,
522
- 7.765668462927371e-05,
523
- 7.730614070157413e-05,
524
- 7.693207859001933e-05,
525
- 7.653473410730253e-05,
526
- 7.611435774356888e-05,
527
- 7.567121450850376e-05,
528
- 7.520558376426795e-05,
529
- 7.471775904938474e-05,
530
- 7.420804789369019e-05,
531
- 7.367677162446306e-05,
532
- 7.312426516385672e-05,
533
- 7.255087681776069e-05,
534
- 7.195696805622496e-05,
535
- 7.13429132855854e-05,
536
- 7.070909961243422e-05,
537
- 7.005592659958366e-05,
538
- 6.938380601417765e-05,
539
- 6.869316156810923e-05,
540
- 6.798442865090831e-05,
541
- 6.725805405526735e-05,
542
- 6.651449569537871e-05,
543
- 6.575422231826058e-05,
544
- 6.497771320825402e-05,
545
- 6.418545788487704e-05,
546
- 6.337795579422628e-05,
547
- 6.255571599412105e-05,
548
- 6.171925683318781e-05,
549
- 6.086910562408781e-05,
550
- 6.0005798311093635e-05,
551
- 5.912987913222422e-05,
552
- 5.824190027615158e-05,
553
- 5.734242153409514e-05,
554
- 5.643200994692358e-05,
555
- 5.55112394476862e-05,
556
- 5.458069049979956e-05,
557
- 5.364094973111714e-05,
558
- 5.269260956411309e-05,
559
- 5.1736267842412726e-05,
560
- 5.077252745390575e-05,
561
- 4.980199595067928e-05,
562
- 4.882528516601063e-05,
563
- 4.784301082866123e-05,
564
- 4.685579217471466e-05,
565
- 4.586425155720376e-05,
566
- 4.4869014053772686e-05,
567
- 4.387070707262142e-05,
568
- 4.286995995698098e-05,
569
- 4.186740358836888e-05,
570
- 4.0863669988874755e-05,
571
- 3.985939192272697e-05,
572
- 3.885520249739142e-05,
573
- 3.785173476445388e-05,
574
- 3.684962132053763e-05,
575
- 3.584949390850793e-05,
576
- 3.485198301921461e-05,
577
- 3.385771749402399e-05,
578
- 3.2867324128390756e-05,
579
- 3.188142727671938e-05,
580
- 3.090064845876465e-05,
581
- 2.9925605967818972e-05,
582
- 2.895691448093382e-05,
583
- 2.799518467142088e-05,
584
- 2.7041022823877087e-05,
585
- 2.6095030451976512e-05,
586
- 2.515780391926975e-05,
587
- 2.4229934063230064e-05,
588
- 2.3312005822783295e-05,
589
- 2.240459786955611e-05,
590
- 2.150828224307534e-05,
591
- 2.0623623990148315e-05,
592
- 1.9751180808651272e-05,
593
- 1.8891502695950898e-05,
594
- 1.8045131602180072e-05,
595
- 1.7212601088586823e-05,
596
- 1.63944359911718e-05,
597
- 1.559115208982597e-05,
598
- 1.48032557831777e-05,
599
- 1.4031243769353617e-05,
600
- 1.3275602732854923e-05,
601
- 1.2536809037746398e-05,
602
- 1.1815328427351398e-05,
603
- 1.1111615730642416e-05,
604
- 1.042611457551213e-05,
605
- 9.759257109105627e-06,
606
- 9.111463725390388e-06,
607
- 8.483142800135428e-06,
608
- 7.87469043346695e-06,
609
- 7.286490200162668e-06,
610
- 6.718912907842181e-06,
611
- 6.1723163632060055e-06,
612
- 5.647045146470409e-06,
613
- 5.143430394140439e-06,
614
- 4.661789590258008e-06,
615
- 4.202426366256558e-06,
616
- 3.7656303095486623e-06,
617
- 3.351676780967026e-06,
618
- 2.960826741174141e-06,
619
- 2.5933265861499514e-06,
620
- 2.2494079918611923e-06,
621
- 1.929287768210473e-06,
622
- 1.6331677223569853e-06,
623
- 1.3612345314951615e-06,
624
- 1.1136596251714304e-06,
625
- 8.905990772131879e-07,
626
- 6.921935073382368e-07,
627
- 5.185679925066245e-07,
628
- 3.698319880708301e-07,
629
- 2.4607925877392135e-07,
630
- 1.4738781963932191e-07,
631
- 7.381988678927255e-08,
632
- 2.5421838223160798e-08,
633
- 2.2241845803394615e-09
634
- ],
635
- "eval_step": [
636
- 390,
637
- 780,
638
- 1170,
639
- 1560,
640
- 1950,
641
- 2340,
642
- 2730,
643
- 3120,
644
- 3510,
645
- 3900,
646
- 4290,
647
- 4680,
648
- 5070,
649
- 5460,
650
- 5850,
651
- 6240,
652
- 6630,
653
- 7020,
654
- 7410,
655
- 7800
656
- ],
657
- "eval_epoch": [
658
- 1,
659
- 2,
660
- 3,
661
- 4,
662
- 5,
663
- 6,
664
- 7,
665
- 8,
666
- 9,
667
- 10,
668
- 11,
669
- 12,
670
- 13,
671
- 14,
672
- 15,
673
- 16,
674
- 17,
675
- 18,
676
- 19,
677
- 20
678
- ],
679
- "eval_accuracy": [
680
- 0.0,
681
- 0.0,
682
- 0.0007142857142857143,
683
- 0.008571428571428572,
684
- 0.009285714285714286,
685
- 0.08928571428571429,
686
- 0.1757142857142857,
687
- 0.22285714285714286,
688
- 0.2392857142857143,
689
- 0.3335714285714286,
690
- 0.3964285714285714,
691
- 0.36357142857142855,
692
- 0.40214285714285714,
693
- 0.455,
694
- 0.41214285714285714,
695
- 0.46714285714285714,
696
- 0.45,
697
- 0.47714285714285715,
698
- 0.5035714285714286,
699
- 0.475
700
- ]
701
- },
702
- "final_accuracy": 0.475,
703
- "sft_eval": {
704
- "config": {
705
- "ops": "add_sub",
706
- "K": null,
707
- "mode": "sft",
708
- "n_digits": 6,
709
- "n_per_split": 50
710
- },
711
- "splits": {
712
- "add_S0": {
713
- "full_accuracy": 0.76,
714
- "n_examples": 50,
715
- "per_subtask": {
716
- "SA": {
717
- "accuracy": 0.9559322033898305,
718
- "count": 295
719
- },
720
- "SS": {
721
- "accuracy": 1.0,
722
- "count": 55
723
- }
724
- }
725
- },
726
- "add_S1": {
727
- "full_accuracy": 0.72,
728
- "n_examples": 50,
729
- "per_subtask": {
730
- "SA": {
731
- "accuracy": 0.9761904761904762,
732
- "count": 126
733
- },
734
- "SC": {
735
- "accuracy": 0.9493670886075949,
736
- "count": 79
737
- },
738
- "SS": {
739
- "accuracy": 0.9523809523809523,
740
- "count": 21
741
- },
742
- "UC": {
743
- "accuracy": 0.9354838709677419,
744
- "count": 124
745
- }
746
- }
747
- },
748
- "add_S2": {
749
- "full_accuracy": 0.34,
750
- "n_examples": 50,
751
- "per_subtask": {
752
- "SA": {
753
- "accuracy": 0.96,
754
- "count": 75
755
- },
756
- "SC": {
757
- "accuracy": 0.9838709677419355,
758
- "count": 62
759
- },
760
- "SS": {
761
- "accuracy": 0.9230769230769231,
762
- "count": 39
763
- },
764
- "UC": {
765
- "accuracy": 0.7207207207207207,
766
- "count": 111
767
- },
768
- "US": {
769
- "accuracy": 1.0,
770
- "count": 63
771
- }
772
- }
773
- },
774
- "add_S3": {
775
- "full_accuracy": 0.3,
776
- "n_examples": 50,
777
- "per_subtask": {
778
- "SA": {
779
- "accuracy": 1.0,
780
- "count": 60
781
- },
782
- "SC": {
783
- "accuracy": 0.9298245614035088,
784
- "count": 57
785
- },
786
- "SS": {
787
- "accuracy": 1.0,
788
- "count": 19
789
- },
790
- "UC": {
791
- "accuracy": 0.7403846153846154,
792
- "count": 104
793
- },
794
- "US": {
795
- "accuracy": 0.8636363636363636,
796
- "count": 110
797
- }
798
- }
799
- },
800
- "add_S4": {
801
- "full_accuracy": 0.38,
802
- "n_examples": 50,
803
- "per_subtask": {
804
- "SA": {
805
- "accuracy": 1.0,
806
- "count": 48
807
- },
808
- "SC": {
809
- "accuracy": 0.9615384615384616,
810
- "count": 52
811
- },
812
- "SS": {
813
- "accuracy": 1.0,
814
- "count": 7
815
- },
816
- "UC": {
817
- "accuracy": 0.6404494382022472,
818
- "count": 89
819
- },
820
- "US": {
821
- "accuracy": 0.7272727272727273,
822
- "count": 154
823
- }
824
- }
825
- },
826
- "add_S5": {
827
- "full_accuracy": 0.28,
828
- "n_examples": 50,
829
- "per_subtask": {
830
- "SA": {
831
- "accuracy": 1.0,
832
- "count": 50
833
- },
834
- "SC": {
835
- "accuracy": 1.0,
836
- "count": 50
837
- },
838
- "UC": {
839
- "accuracy": 0.5,
840
- "count": 50
841
- },
842
- "US": {
843
- "accuracy": 0.49,
844
- "count": 200
845
- }
846
- }
847
- },
848
- "add_S6": {
849
- "full_accuracy": 0.56,
850
- "n_examples": 50,
851
- "per_subtask": {
852
- "SC": {
853
- "accuracy": 1.0,
854
- "count": 50
855
- },
856
- "UC": {
857
- "accuracy": 0.84,
858
- "count": 50
859
- },
860
- "US": {
861
- "accuracy": 0.776,
862
- "count": 250
863
- }
864
- }
865
- },
866
- "add_random": {
867
- "full_accuracy": 0.695,
868
- "n_examples": 200,
869
- "per_subtask": {
870
- "SA": {
871
- "accuracy": 0.9675174013921114,
872
- "count": 431
873
- },
874
- "SC": {
875
- "accuracy": 0.9746835443037974,
876
- "count": 316
877
- },
878
- "SS": {
879
- "accuracy": 1.0,
880
- "count": 39
881
- },
882
- "UC": {
883
- "accuracy": 0.9160714285714285,
884
- "count": 560
885
- },
886
- "US": {
887
- "accuracy": 0.9444444444444444,
888
- "count": 54
889
- }
890
- }
891
- },
892
- "add_C3": {
893
- "full_accuracy": 0.46,
894
- "n_examples": 50,
895
- "per_subtask": {
896
- "SA": {
897
- "accuracy": 1.0,
898
- "count": 150
899
- },
900
- "SC": {
901
- "accuracy": 0.98,
902
- "count": 50
903
- },
904
- "UC": {
905
- "accuracy": 0.7884615384615384,
906
- "count": 104
907
- },
908
- "US": {
909
- "accuracy": 0.8260869565217391,
910
- "count": 46
911
- }
912
- }
913
- },
914
- "add_C4": {
915
- "full_accuracy": 0.34,
916
- "n_examples": 50,
917
- "per_subtask": {
918
- "SA": {
919
- "accuracy": 1.0,
920
- "count": 100
921
- },
922
- "SC": {
923
- "accuracy": 1.0,
924
- "count": 50
925
- },
926
- "UC": {
927
- "accuracy": 0.7235772357723578,
928
- "count": 123
929
- },
930
- "US": {
931
- "accuracy": 0.7142857142857143,
932
- "count": 77
933
- }
934
- }
935
- },
936
- "add_C5": {
937
- "full_accuracy": 0.36,
938
- "n_examples": 50,
939
- "per_subtask": {
940
- "SA": {
941
- "accuracy": 1.0,
942
- "count": 50
943
- },
944
- "SC": {
945
- "accuracy": 1.0,
946
- "count": 50
947
- },
948
- "UC": {
949
- "accuracy": 0.8051948051948052,
950
- "count": 154
951
- },
952
- "US": {
953
- "accuracy": 0.8541666666666666,
954
- "count": 96
955
- }
956
- }
957
- },
958
- "add_C6": {
959
- "full_accuracy": 0.26,
960
- "n_examples": 50,
961
- "per_subtask": {
962
- "SC": {
963
- "accuracy": 1.0,
964
- "count": 50
965
- },
966
- "UC": {
967
- "accuracy": 0.7802197802197802,
968
- "count": 182
969
- },
970
- "US": {
971
- "accuracy": 0.8135593220338984,
972
- "count": 118
973
- }
974
- }
975
- },
976
- "sub_M0": {
977
- "full_accuracy": 0.82,
978
- "n_examples": 50,
979
- "per_subtask": {
980
- "MD": {
981
- "accuracy": 0.9727891156462585,
982
- "count": 294
983
- },
984
- "ME": {
985
- "accuracy": 0.9821428571428571,
986
- "count": 56
987
- }
988
- }
989
- },
990
- "sub_M1": {
991
- "full_accuracy": 0.82,
992
- "n_examples": 50,
993
- "per_subtask": {
994
- "MD": {
995
- "accuracy": 0.993006993006993,
996
- "count": 143
997
- },
998
- "MB": {
999
- "accuracy": 0.9710144927536232,
1000
- "count": 69
1001
- },
1002
- "ME": {
1003
- "accuracy": 1.0,
1004
- "count": 15
1005
- },
1006
- "UB": {
1007
- "accuracy": 0.9512195121951219,
1008
- "count": 123
1009
- }
1010
- }
1011
- },
1012
- "sub_M2": {
1013
- "full_accuracy": 0.4,
1014
- "n_examples": 50,
1015
- "per_subtask": {
1016
- "MD": {
1017
- "accuracy": 0.9722222222222222,
1018
- "count": 108
1019
- },
1020
- "MB": {
1021
- "accuracy": 0.9230769230769231,
1022
- "count": 52
1023
- },
1024
- "ME": {
1025
- "accuracy": 1.0,
1026
- "count": 52
1027
- },
1028
- "UB": {
1029
- "accuracy": 0.6436781609195402,
1030
- "count": 87
1031
- },
1032
- "UD": {
1033
- "accuracy": 1.0,
1034
- "count": 51
1035
- }
1036
- }
1037
- },
1038
- "sub_M3": {
1039
- "full_accuracy": 0.04,
1040
- "n_examples": 50,
1041
- "per_subtask": {
1042
- "MD": {
1043
- "accuracy": 0.9787234042553191,
1044
- "count": 94
1045
- },
1046
- "MB": {
1047
- "accuracy": 0.9803921568627451,
1048
- "count": 51
1049
- },
1050
- "ME": {
1051
- "accuracy": 1.0,
1052
- "count": 25
1053
- },
1054
- "UB": {
1055
- "accuracy": 0.5769230769230769,
1056
- "count": 78
1057
- },
1058
- "UD": {
1059
- "accuracy": 0.7156862745098039,
1060
- "count": 102
1061
- }
1062
- }
1063
- },
1064
- "sub_M4": {
1065
- "full_accuracy": 0.02,
1066
- "n_examples": 50,
1067
- "per_subtask": {
1068
- "MD": {
1069
- "accuracy": 1.0,
1070
- "count": 100
1071
- },
1072
- "MB": {
1073
- "accuracy": 1.0,
1074
- "count": 50
1075
- },
1076
- "UB": {
1077
- "accuracy": 0.4,
1078
- "count": 50
1079
- },
1080
- "UD": {
1081
- "accuracy": 0.49333333333333335,
1082
- "count": 150
1083
- }
1084
- }
1085
- },
1086
- "sub_M5": {
1087
- "full_accuracy": 0.02,
1088
- "n_examples": 50,
1089
- "per_subtask": {
1090
- "MD": {
1091
- "accuracy": 1.0,
1092
- "count": 50
1093
- },
1094
- "MB": {
1095
- "accuracy": 1.0,
1096
- "count": 50
1097
- },
1098
- "UB": {
1099
- "accuracy": 0.3,
1100
- "count": 50
1101
- },
1102
- "UD": {
1103
- "accuracy": 0.345,
1104
- "count": 200
1105
- }
1106
- }
1107
- },
1108
- "sub_random": {
1109
- "full_accuracy": 0.665,
1110
- "n_examples": 200,
1111
- "per_subtask": {
1112
- "MD": {
1113
- "accuracy": 0.9863945578231292,
1114
- "count": 588
1115
- },
1116
- "MB": {
1117
- "accuracy": 0.9402985074626866,
1118
- "count": 268
1119
- },
1120
- "ME": {
1121
- "accuracy": 1.0,
1122
- "count": 60
1123
- },
1124
- "UB": {
1125
- "accuracy": 0.8903803131991052,
1126
- "count": 447
1127
- },
1128
- "UD": {
1129
- "accuracy": 0.9459459459459459,
1130
- "count": 37
1131
- }
1132
- }
1133
- },
1134
- "sub_B3": {
1135
- "full_accuracy": 0.48,
1136
- "n_examples": 50,
1137
- "per_subtask": {
1138
- "MD": {
1139
- "accuracy": 1.0,
1140
- "count": 150
1141
- },
1142
- "MB": {
1143
- "accuracy": 0.96,
1144
- "count": 50
1145
- },
1146
- "UB": {
1147
- "accuracy": 0.7850467289719626,
1148
- "count": 107
1149
- },
1150
- "UD": {
1151
- "accuracy": 0.9302325581395349,
1152
- "count": 43
1153
- }
1154
- }
1155
- },
1156
- "sub_B4": {
1157
- "full_accuracy": 0.28,
1158
- "n_examples": 50,
1159
- "per_subtask": {
1160
- "MD": {
1161
- "accuracy": 1.0,
1162
- "count": 100
1163
- },
1164
- "MB": {
1165
- "accuracy": 1.0,
1166
- "count": 50
1167
- },
1168
- "UB": {
1169
- "accuracy": 0.7280701754385965,
1170
- "count": 114
1171
- },
1172
- "UD": {
1173
- "accuracy": 0.7325581395348837,
1174
- "count": 86
1175
- }
1176
- }
1177
- },
1178
- "sub_B5": {
1179
- "full_accuracy": 0.22,
1180
- "n_examples": 50,
1181
- "per_subtask": {
1182
- "MD": {
1183
- "accuracy": 1.0,
1184
- "count": 50
1185
- },
1186
- "MB": {
1187
- "accuracy": 1.0,
1188
- "count": 50
1189
- },
1190
- "UB": {
1191
- "accuracy": 0.7647058823529411,
1192
- "count": 153
1193
- },
1194
- "UD": {
1195
- "accuracy": 0.6701030927835051,
1196
- "count": 97
1197
- }
1198
- }
1199
- }
1200
- },
1201
- "summary": {
1202
- "overall_accuracy": 0.475,
1203
- "total_examples": 1400,
1204
- "n_splits": 22
1205
- }
1206
- }
1207
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
add_sub_baseline_25K_2L1H128d/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:55759c17a1397757add0fb87992f1bc8233d06311f8877b92abc1920c5912ec2
3
- size 157692826
 
 
 
 
add_sub_baseline_25K_2L1H128d/train_config.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "mode": "baseline",
3
- "ops": "add_sub",
4
- "n_digits": 6,
5
- "n_layer": 2,
6
- "n_head": 1,
7
- "n_embd": 128,
8
- "abs_vocab": 0,
9
- "K": 4,
10
- "alpha_info_gain": 10.0,
11
- "alpha_abs": 0.1,
12
- "alpha_soft_zipf": 1.0,
13
- "batch_size": 64,
14
- "num_epochs": 20,
15
- "dataset_size": 25000,
16
- "lr": 8e-05,
17
- "output_dir": "ckpt/sweep/as_baseline_25K_2L1H128d",
18
- "device": "cuda",
19
- "push_to_hub": true,
20
- "no_wandb": false,
21
- "n_params": 39346560,
22
- "run_name": "add_sub_baseline_25K_2L1H128d",
23
- "git_commit": "800625019270114adcda289bbd550c4f1109a514",
24
- "timestamp": "2026-04-12T03:50:16.264783+00:00",
25
- "tokenizer": "Qwen/Qwen3-0.6B",
26
- "dataset_repo": "thoughtworks/arithmetic-sorl-data",
27
- "dataset_config": "add_sub_6digit",
28
- "model_repo": "thoughtworks/arithmetic-sorl",
29
- "trainer_version": "sft",
30
- "wandb_run_id": "5bc3yvup",
31
- "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/5bc3yvup",
32
- "final_accuracy": 0.475,
33
- "sft_accuracy": 0.475,
34
- "eval_method": "ArithmeticEvaluator"
35
- }