amirali1985 commited on
Commit
62d71b3
·
verified ·
1 Parent(s): abf49a6

Delete folder add_sub_sorl_v1_abs16_1K with huggingface_hub

Browse files
add_sub_sorl_v1_abs16_1K/config.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "architectures": [
3
- "SorlModelWrapper"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": null,
8
- "dtype": "float32",
9
- "eos_token_id": null,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 510,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 2040,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention"
18
- ],
19
- "max_position_embeddings": 128,
20
- "max_window_layers": 28,
21
- "model_type": "qwen3",
22
- "num_attention_heads": 3,
23
- "num_hidden_layers": 2,
24
- "num_key_value_heads": 3,
25
- "pad_token_id": null,
26
- "rms_norm_eps": 1e-06,
27
- "rope_parameters": {
28
- "rope_theta": 10000.0,
29
- "rope_type": "default"
30
- },
31
- "sliding_window": null,
32
- "tie_word_embeddings": false,
33
- "transformers_version": "5.5.0",
34
- "use_cache": true,
35
- "use_sliding_window": false,
36
- "vocab_size": 151660
37
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
add_sub_sorl_v1_abs16_1K/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "output_attentions": false,
4
- "output_hidden_states": false,
5
- "transformers_version": "5.5.0",
6
- "use_cache": true
7
- }
 
 
 
 
 
 
 
 
add_sub_sorl_v1_abs16_1K/metrics.json DELETED
@@ -1,1031 +0,0 @@
1
- {
2
- "history": {
3
- "step": [],
4
- "loss": [],
5
- "base_loss": [],
6
- "info_loss": [],
7
- "abs_loss": [],
8
- "zipf_loss": [],
9
- "denoise_loss": [],
10
- "ortho_loss": [],
11
- "lr": [],
12
- "emb_lr": [],
13
- "eval_step": [
14
- 0
15
- ],
16
- "eval_accuracy": [
17
- 0.01
18
- ]
19
- },
20
- "final_accuracy": 0.0,
21
- "sft_eval": {
22
- "config": {
23
- "ops": "add_sub",
24
- "K": null,
25
- "mode": "sft",
26
- "n_digits": 6,
27
- "n_per_split": 50
28
- },
29
- "splits": {
30
- "add_S0": {
31
- "full_accuracy": 0.0,
32
- "n_examples": 50,
33
- "per_subtask": {
34
- "SA": {
35
- "accuracy": 0.17627118644067796,
36
- "count": 295
37
- },
38
- "SS": {
39
- "accuracy": 0.0,
40
- "count": 55
41
- }
42
- }
43
- },
44
- "add_S1": {
45
- "full_accuracy": 0.0,
46
- "n_examples": 50,
47
- "per_subtask": {
48
- "SA": {
49
- "accuracy": 0.30158730158730157,
50
- "count": 126
51
- },
52
- "SC": {
53
- "accuracy": 0.25316455696202533,
54
- "count": 79
55
- },
56
- "SS": {
57
- "accuracy": 0.0,
58
- "count": 21
59
- },
60
- "UC": {
61
- "accuracy": 0.0,
62
- "count": 124
63
- }
64
- }
65
- },
66
- "add_S2": {
67
- "full_accuracy": 0.0,
68
- "n_examples": 50,
69
- "per_subtask": {
70
- "SA": {
71
- "accuracy": 0.24,
72
- "count": 75
73
- },
74
- "SC": {
75
- "accuracy": 0.2903225806451613,
76
- "count": 62
77
- },
78
- "SS": {
79
- "accuracy": 0.0,
80
- "count": 39
81
- },
82
- "UC": {
83
- "accuracy": 0.0,
84
- "count": 111
85
- },
86
- "US": {
87
- "accuracy": 1.0,
88
- "count": 63
89
- }
90
- }
91
- },
92
- "add_S3": {
93
- "full_accuracy": 0.0,
94
- "n_examples": 50,
95
- "per_subtask": {
96
- "SA": {
97
- "accuracy": 0.26666666666666666,
98
- "count": 60
99
- },
100
- "SC": {
101
- "accuracy": 0.21052631578947367,
102
- "count": 57
103
- },
104
- "SS": {
105
- "accuracy": 0.0,
106
- "count": 19
107
- },
108
- "UC": {
109
- "accuracy": 0.0,
110
- "count": 104
111
- },
112
- "US": {
113
- "accuracy": 1.0,
114
- "count": 110
115
- }
116
- }
117
- },
118
- "add_S4": {
119
- "full_accuracy": 0.0,
120
- "n_examples": 50,
121
- "per_subtask": {
122
- "SA": {
123
- "accuracy": 0.20833333333333334,
124
- "count": 48
125
- },
126
- "SC": {
127
- "accuracy": 0.2692307692307692,
128
- "count": 52
129
- },
130
- "SS": {
131
- "accuracy": 0.0,
132
- "count": 7
133
- },
134
- "UC": {
135
- "accuracy": 0.0,
136
- "count": 89
137
- },
138
- "US": {
139
- "accuracy": 1.0,
140
- "count": 154
141
- }
142
- }
143
- },
144
- "add_S5": {
145
- "full_accuracy": 0.0,
146
- "n_examples": 50,
147
- "per_subtask": {
148
- "SA": {
149
- "accuracy": 0.54,
150
- "count": 50
151
- },
152
- "SC": {
153
- "accuracy": 0.36,
154
- "count": 50
155
- },
156
- "UC": {
157
- "accuracy": 0.0,
158
- "count": 50
159
- },
160
- "US": {
161
- "accuracy": 1.0,
162
- "count": 200
163
- }
164
- }
165
- },
166
- "add_S6": {
167
- "full_accuracy": 0.0,
168
- "n_examples": 50,
169
- "per_subtask": {
170
- "SC": {
171
- "accuracy": 0.34,
172
- "count": 50
173
- },
174
- "UC": {
175
- "accuracy": 0.0,
176
- "count": 50
177
- },
178
- "US": {
179
- "accuracy": 1.0,
180
- "count": 250
181
- }
182
- }
183
- },
184
- "add_random": {
185
- "full_accuracy": 0.0,
186
- "n_examples": 200,
187
- "per_subtask": {
188
- "SA": {
189
- "accuracy": 0.23665893271461716,
190
- "count": 431
191
- },
192
- "SC": {
193
- "accuracy": 0.1962025316455696,
194
- "count": 316
195
- },
196
- "SS": {
197
- "accuracy": 0.0,
198
- "count": 39
199
- },
200
- "UC": {
201
- "accuracy": 0.0,
202
- "count": 560
203
- },
204
- "US": {
205
- "accuracy": 1.0,
206
- "count": 54
207
- }
208
- }
209
- },
210
- "add_C3": {
211
- "full_accuracy": 0.0,
212
- "n_examples": 50,
213
- "per_subtask": {
214
- "SA": {
215
- "accuracy": 0.3466666666666667,
216
- "count": 150
217
- },
218
- "SC": {
219
- "accuracy": 0.22,
220
- "count": 50
221
- },
222
- "UC": {
223
- "accuracy": 0.0,
224
- "count": 104
225
- },
226
- "US": {
227
- "accuracy": 1.0,
228
- "count": 46
229
- }
230
- }
231
- },
232
- "add_C4": {
233
- "full_accuracy": 0.0,
234
- "n_examples": 50,
235
- "per_subtask": {
236
- "SA": {
237
- "accuracy": 0.35,
238
- "count": 100
239
- },
240
- "SC": {
241
- "accuracy": 0.26,
242
- "count": 50
243
- },
244
- "UC": {
245
- "accuracy": 0.0,
246
- "count": 123
247
- },
248
- "US": {
249
- "accuracy": 1.0,
250
- "count": 77
251
- }
252
- }
253
- },
254
- "add_C5": {
255
- "full_accuracy": 0.0,
256
- "n_examples": 50,
257
- "per_subtask": {
258
- "SA": {
259
- "accuracy": 0.42,
260
- "count": 50
261
- },
262
- "SC": {
263
- "accuracy": 0.34,
264
- "count": 50
265
- },
266
- "UC": {
267
- "accuracy": 0.0,
268
- "count": 154
269
- },
270
- "US": {
271
- "accuracy": 1.0,
272
- "count": 96
273
- }
274
- }
275
- },
276
- "add_C6": {
277
- "full_accuracy": 0.0,
278
- "n_examples": 50,
279
- "per_subtask": {
280
- "SC": {
281
- "accuracy": 0.2,
282
- "count": 50
283
- },
284
- "UC": {
285
- "accuracy": 0.0,
286
- "count": 182
287
- },
288
- "US": {
289
- "accuracy": 1.0,
290
- "count": 118
291
- }
292
- }
293
- },
294
- "sub_M0": {
295
- "full_accuracy": 0.0,
296
- "n_examples": 50,
297
- "per_subtask": {
298
- "MD": {
299
- "accuracy": 0.21428571428571427,
300
- "count": 294
301
- },
302
- "ME": {
303
- "accuracy": 1.0,
304
- "count": 56
305
- }
306
- }
307
- },
308
- "sub_M1": {
309
- "full_accuracy": 0.0,
310
- "n_examples": 50,
311
- "per_subtask": {
312
- "MD": {
313
- "accuracy": 0.38461538461538464,
314
- "count": 143
315
- },
316
- "MB": {
317
- "accuracy": 0.0,
318
- "count": 69
319
- },
320
- "ME": {
321
- "accuracy": 1.0,
322
- "count": 15
323
- },
324
- "UB": {
325
- "accuracy": 0.10569105691056911,
326
- "count": 123
327
- }
328
- }
329
- },
330
- "sub_M2": {
331
- "full_accuracy": 0.0,
332
- "n_examples": 50,
333
- "per_subtask": {
334
- "MD": {
335
- "accuracy": 0.6759259259259259,
336
- "count": 108
337
- },
338
- "MB": {
339
- "accuracy": 0.0,
340
- "count": 52
341
- },
342
- "ME": {
343
- "accuracy": 1.0,
344
- "count": 52
345
- },
346
- "UB": {
347
- "accuracy": 0.14942528735632185,
348
- "count": 87
349
- },
350
- "UD": {
351
- "accuracy": 0.0,
352
- "count": 51
353
- }
354
- }
355
- },
356
- "sub_M3": {
357
- "full_accuracy": 0.0,
358
- "n_examples": 50,
359
- "per_subtask": {
360
- "MD": {
361
- "accuracy": 0.6276595744680851,
362
- "count": 94
363
- },
364
- "MB": {
365
- "accuracy": 0.0,
366
- "count": 51
367
- },
368
- "ME": {
369
- "accuracy": 1.0,
370
- "count": 25
371
- },
372
- "UB": {
373
- "accuracy": 0.08974358974358974,
374
- "count": 78
375
- },
376
- "UD": {
377
- "accuracy": 0.0,
378
- "count": 102
379
- }
380
- }
381
- },
382
- "sub_M4": {
383
- "full_accuracy": 0.0,
384
- "n_examples": 50,
385
- "per_subtask": {
386
- "MD": {
387
- "accuracy": 0.5,
388
- "count": 100
389
- },
390
- "MB": {
391
- "accuracy": 0.0,
392
- "count": 50
393
- },
394
- "UB": {
395
- "accuracy": 0.32,
396
- "count": 50
397
- },
398
- "UD": {
399
- "accuracy": 0.0,
400
- "count": 150
401
- }
402
- }
403
- },
404
- "sub_M5": {
405
- "full_accuracy": 0.0,
406
- "n_examples": 50,
407
- "per_subtask": {
408
- "MD": {
409
- "accuracy": 1.0,
410
- "count": 50
411
- },
412
- "MB": {
413
- "accuracy": 0.0,
414
- "count": 50
415
- },
416
- "UB": {
417
- "accuracy": 0.2,
418
- "count": 50
419
- },
420
- "UD": {
421
- "accuracy": 0.0,
422
- "count": 200
423
- }
424
- }
425
- },
426
- "sub_random": {
427
- "full_accuracy": 0.0,
428
- "n_examples": 200,
429
- "per_subtask": {
430
- "MD": {
431
- "accuracy": 0.3758503401360544,
432
- "count": 588
433
- },
434
- "MB": {
435
- "accuracy": 0.0,
436
- "count": 268
437
- },
438
- "ME": {
439
- "accuracy": 1.0,
440
- "count": 60
441
- },
442
- "UB": {
443
- "accuracy": 0.1610738255033557,
444
- "count": 447
445
- },
446
- "UD": {
447
- "accuracy": 0.0,
448
- "count": 37
449
- }
450
- }
451
- },
452
- "sub_B3": {
453
- "full_accuracy": 0.0,
454
- "n_examples": 50,
455
- "per_subtask": {
456
- "MD": {
457
- "accuracy": 0.3333333333333333,
458
- "count": 150
459
- },
460
- "MB": {
461
- "accuracy": 0.0,
462
- "count": 50
463
- },
464
- "UB": {
465
- "accuracy": 0.11214953271028037,
466
- "count": 107
467
- },
468
- "UD": {
469
- "accuracy": 0.0,
470
- "count": 43
471
- }
472
- }
473
- },
474
- "sub_B4": {
475
- "full_accuracy": 0.0,
476
- "n_examples": 50,
477
- "per_subtask": {
478
- "MD": {
479
- "accuracy": 0.5,
480
- "count": 100
481
- },
482
- "MB": {
483
- "accuracy": 0.0,
484
- "count": 50
485
- },
486
- "UB": {
487
- "accuracy": 0.14035087719298245,
488
- "count": 114
489
- },
490
- "UD": {
491
- "accuracy": 0.0,
492
- "count": 86
493
- }
494
- }
495
- },
496
- "sub_B5": {
497
- "full_accuracy": 0.0,
498
- "n_examples": 50,
499
- "per_subtask": {
500
- "MD": {
501
- "accuracy": 1.0,
502
- "count": 50
503
- },
504
- "MB": {
505
- "accuracy": 0.0,
506
- "count": 50
507
- },
508
- "UB": {
509
- "accuracy": 0.09803921568627451,
510
- "count": 153
511
- },
512
- "UD": {
513
- "accuracy": 0.0,
514
- "count": 97
515
- }
516
- }
517
- }
518
- },
519
- "summary": {
520
- "overall_accuracy": 0.0,
521
- "total_examples": 1400,
522
- "n_splits": 22
523
- }
524
- },
525
- "sorl_eval": {
526
- "config": {
527
- "ops": "add_sub",
528
- "K": 4,
529
- "mode": "sorl",
530
- "n_digits": 6,
531
- "n_per_split": 50
532
- },
533
- "splits": {
534
- "add_S0": {
535
- "full_accuracy": 0.0,
536
- "n_examples": 50,
537
- "per_subtask": {
538
- "SA": {
539
- "accuracy": 0.20677966101694914,
540
- "count": 295
541
- },
542
- "SS": {
543
- "accuracy": 0.34545454545454546,
544
- "count": 55
545
- }
546
- }
547
- },
548
- "add_S1": {
549
- "full_accuracy": 0.0,
550
- "n_examples": 50,
551
- "per_subtask": {
552
- "SA": {
553
- "accuracy": 0.2857142857142857,
554
- "count": 126
555
- },
556
- "SC": {
557
- "accuracy": 0.17721518987341772,
558
- "count": 79
559
- },
560
- "SS": {
561
- "accuracy": 0.47619047619047616,
562
- "count": 21
563
- },
564
- "UC": {
565
- "accuracy": 0.03225806451612903,
566
- "count": 124
567
- }
568
- }
569
- },
570
- "add_S2": {
571
- "full_accuracy": 0.0,
572
- "n_examples": 50,
573
- "per_subtask": {
574
- "SA": {
575
- "accuracy": 0.32,
576
- "count": 75
577
- },
578
- "SC": {
579
- "accuracy": 0.1774193548387097,
580
- "count": 62
581
- },
582
- "SS": {
583
- "accuracy": 0.3076923076923077,
584
- "count": 39
585
- },
586
- "UC": {
587
- "accuracy": 0.02702702702702703,
588
- "count": 111
589
- },
590
- "US": {
591
- "accuracy": 0.7301587301587301,
592
- "count": 63
593
- }
594
- }
595
- },
596
- "add_S3": {
597
- "full_accuracy": 0.0,
598
- "n_examples": 50,
599
- "per_subtask": {
600
- "SA": {
601
- "accuracy": 0.43333333333333335,
602
- "count": 60
603
- },
604
- "SC": {
605
- "accuracy": 0.10526315789473684,
606
- "count": 57
607
- },
608
- "SS": {
609
- "accuracy": 0.5263157894736842,
610
- "count": 19
611
- },
612
- "UC": {
613
- "accuracy": 0.019230769230769232,
614
- "count": 104
615
- },
616
- "US": {
617
- "accuracy": 0.4909090909090909,
618
- "count": 110
619
- }
620
- }
621
- },
622
- "add_S4": {
623
- "full_accuracy": 0.0,
624
- "n_examples": 50,
625
- "per_subtask": {
626
- "SA": {
627
- "accuracy": 0.2916666666666667,
628
- "count": 48
629
- },
630
- "SC": {
631
- "accuracy": 0.19230769230769232,
632
- "count": 52
633
- },
634
- "SS": {
635
- "accuracy": 0.0,
636
- "count": 7
637
- },
638
- "UC": {
639
- "accuracy": 0.011235955056179775,
640
- "count": 89
641
- },
642
- "US": {
643
- "accuracy": 0.7012987012987013,
644
- "count": 154
645
- }
646
- }
647
- },
648
- "add_S5": {
649
- "full_accuracy": 0.0,
650
- "n_examples": 50,
651
- "per_subtask": {
652
- "SA": {
653
- "accuracy": 0.5,
654
- "count": 50
655
- },
656
- "SC": {
657
- "accuracy": 0.28,
658
- "count": 50
659
- },
660
- "UC": {
661
- "accuracy": 0.02,
662
- "count": 50
663
- },
664
- "US": {
665
- "accuracy": 0.7,
666
- "count": 200
667
- }
668
- }
669
- },
670
- "add_S6": {
671
- "full_accuracy": 0.0,
672
- "n_examples": 50,
673
- "per_subtask": {
674
- "SC": {
675
- "accuracy": 0.22,
676
- "count": 50
677
- },
678
- "UC": {
679
- "accuracy": 0.0,
680
- "count": 50
681
- },
682
- "US": {
683
- "accuracy": 0.552,
684
- "count": 250
685
- }
686
- }
687
- },
688
- "add_random": {
689
- "full_accuracy": 0.0,
690
- "n_examples": 200,
691
- "per_subtask": {
692
- "SA": {
693
- "accuracy": 0.2482598607888631,
694
- "count": 431
695
- },
696
- "SC": {
697
- "accuracy": 0.10759493670886076,
698
- "count": 316
699
- },
700
- "SS": {
701
- "accuracy": 0.28205128205128205,
702
- "count": 39
703
- },
704
- "UC": {
705
- "accuracy": 0.03571428571428571,
706
- "count": 560
707
- },
708
- "US": {
709
- "accuracy": 0.5925925925925926,
710
- "count": 54
711
- }
712
- }
713
- },
714
- "add_C3": {
715
- "full_accuracy": 0.0,
716
- "n_examples": 50,
717
- "per_subtask": {
718
- "SA": {
719
- "accuracy": 0.3,
720
- "count": 150
721
- },
722
- "SC": {
723
- "accuracy": 0.16,
724
- "count": 50
725
- },
726
- "UC": {
727
- "accuracy": 0.0,
728
- "count": 104
729
- },
730
- "US": {
731
- "accuracy": 0.6086956521739131,
732
- "count": 46
733
- }
734
- }
735
- },
736
- "add_C4": {
737
- "full_accuracy": 0.0,
738
- "n_examples": 50,
739
- "per_subtask": {
740
- "SA": {
741
- "accuracy": 0.34,
742
- "count": 100
743
- },
744
- "SC": {
745
- "accuracy": 0.14,
746
- "count": 50
747
- },
748
- "UC": {
749
- "accuracy": 0.016260162601626018,
750
- "count": 123
751
- },
752
- "US": {
753
- "accuracy": 0.5974025974025974,
754
- "count": 77
755
- }
756
- }
757
- },
758
- "add_C5": {
759
- "full_accuracy": 0.0,
760
- "n_examples": 50,
761
- "per_subtask": {
762
- "SA": {
763
- "accuracy": 0.42,
764
- "count": 50
765
- },
766
- "SC": {
767
- "accuracy": 0.2,
768
- "count": 50
769
- },
770
- "UC": {
771
- "accuracy": 0.0,
772
- "count": 154
773
- },
774
- "US": {
775
- "accuracy": 0.71875,
776
- "count": 96
777
- }
778
- }
779
- },
780
- "add_C6": {
781
- "full_accuracy": 0.0,
782
- "n_examples": 50,
783
- "per_subtask": {
784
- "SC": {
785
- "accuracy": 0.18,
786
- "count": 50
787
- },
788
- "UC": {
789
- "accuracy": 0.0,
790
- "count": 182
791
- },
792
- "US": {
793
- "accuracy": 0.711864406779661,
794
- "count": 118
795
- }
796
- }
797
- },
798
- "sub_M0": {
799
- "full_accuracy": 0.0,
800
- "n_examples": 50,
801
- "per_subtask": {
802
- "MD": {
803
- "accuracy": 0.2108843537414966,
804
- "count": 294
805
- },
806
- "ME": {
807
- "accuracy": 0.9821428571428571,
808
- "count": 56
809
- }
810
- }
811
- },
812
- "sub_M1": {
813
- "full_accuracy": 0.0,
814
- "n_examples": 50,
815
- "per_subtask": {
816
- "MD": {
817
- "accuracy": 0.38461538461538464,
818
- "count": 143
819
- },
820
- "MB": {
821
- "accuracy": 0.0,
822
- "count": 69
823
- },
824
- "ME": {
825
- "accuracy": 1.0,
826
- "count": 15
827
- },
828
- "UB": {
829
- "accuracy": 0.10569105691056911,
830
- "count": 123
831
- }
832
- }
833
- },
834
- "sub_M2": {
835
- "full_accuracy": 0.0,
836
- "n_examples": 50,
837
- "per_subtask": {
838
- "MD": {
839
- "accuracy": 0.6666666666666666,
840
- "count": 108
841
- },
842
- "MB": {
843
- "accuracy": 0.0,
844
- "count": 52
845
- },
846
- "ME": {
847
- "accuracy": 0.9807692307692307,
848
- "count": 52
849
- },
850
- "UB": {
851
- "accuracy": 0.14942528735632185,
852
- "count": 87
853
- },
854
- "UD": {
855
- "accuracy": 0.0,
856
- "count": 51
857
- }
858
- }
859
- },
860
- "sub_M3": {
861
- "full_accuracy": 0.0,
862
- "n_examples": 50,
863
- "per_subtask": {
864
- "MD": {
865
- "accuracy": 0.6276595744680851,
866
- "count": 94
867
- },
868
- "MB": {
869
- "accuracy": 0.0196078431372549,
870
- "count": 51
871
- },
872
- "ME": {
873
- "accuracy": 1.0,
874
- "count": 25
875
- },
876
- "UB": {
877
- "accuracy": 0.08974358974358974,
878
- "count": 78
879
- },
880
- "UD": {
881
- "accuracy": 0.00980392156862745,
882
- "count": 102
883
- }
884
- }
885
- },
886
- "sub_M4": {
887
- "full_accuracy": 0.0,
888
- "n_examples": 50,
889
- "per_subtask": {
890
- "MD": {
891
- "accuracy": 0.5,
892
- "count": 100
893
- },
894
- "MB": {
895
- "accuracy": 0.02,
896
- "count": 50
897
- },
898
- "UB": {
899
- "accuracy": 0.32,
900
- "count": 50
901
- },
902
- "UD": {
903
- "accuracy": 0.006666666666666667,
904
- "count": 150
905
- }
906
- }
907
- },
908
- "sub_M5": {
909
- "full_accuracy": 0.0,
910
- "n_examples": 50,
911
- "per_subtask": {
912
- "MD": {
913
- "accuracy": 1.0,
914
- "count": 50
915
- },
916
- "MB": {
917
- "accuracy": 0.0,
918
- "count": 50
919
- },
920
- "UB": {
921
- "accuracy": 0.2,
922
- "count": 50
923
- },
924
- "UD": {
925
- "accuracy": 0.02,
926
- "count": 200
927
- }
928
- }
929
- },
930
- "sub_random": {
931
- "full_accuracy": 0.0,
932
- "n_examples": 200,
933
- "per_subtask": {
934
- "MD": {
935
- "accuracy": 0.37755102040816324,
936
- "count": 588
937
- },
938
- "MB": {
939
- "accuracy": 0.0037313432835820895,
940
- "count": 268
941
- },
942
- "ME": {
943
- "accuracy": 0.9666666666666667,
944
- "count": 60
945
- },
946
- "UB": {
947
- "accuracy": 0.1476510067114094,
948
- "count": 447
949
- },
950
- "UD": {
951
- "accuracy": 0.0,
952
- "count": 37
953
- }
954
- }
955
- },
956
- "sub_B3": {
957
- "full_accuracy": 0.0,
958
- "n_examples": 50,
959
- "per_subtask": {
960
- "MD": {
961
- "accuracy": 0.3333333333333333,
962
- "count": 150
963
- },
964
- "MB": {
965
- "accuracy": 0.0,
966
- "count": 50
967
- },
968
- "UB": {
969
- "accuracy": 0.102803738317757,
970
- "count": 107
971
- },
972
- "UD": {
973
- "accuracy": 0.06976744186046512,
974
- "count": 43
975
- }
976
- }
977
- },
978
- "sub_B4": {
979
- "full_accuracy": 0.0,
980
- "n_examples": 50,
981
- "per_subtask": {
982
- "MD": {
983
- "accuracy": 0.5,
984
- "count": 100
985
- },
986
- "MB": {
987
- "accuracy": 0.0,
988
- "count": 50
989
- },
990
- "UB": {
991
- "accuracy": 0.13157894736842105,
992
- "count": 114
993
- },
994
- "UD": {
995
- "accuracy": 0.03488372093023256,
996
- "count": 86
997
- }
998
- }
999
- },
1000
- "sub_B5": {
1001
- "full_accuracy": 0.0,
1002
- "n_examples": 50,
1003
- "per_subtask": {
1004
- "MD": {
1005
- "accuracy": 1.0,
1006
- "count": 50
1007
- },
1008
- "MB": {
1009
- "accuracy": 0.02,
1010
- "count": 50
1011
- },
1012
- "UB": {
1013
- "accuracy": 0.0784313725490196,
1014
- "count": 153
1015
- },
1016
- "UD": {
1017
- "accuracy": 0.041237113402061855,
1018
- "count": 97
1019
- }
1020
- }
1021
- }
1022
- },
1023
- "summary": {
1024
- "overall_accuracy": 0.0,
1025
- "total_examples": 1400,
1026
- "n_splits": 22
1027
- }
1028
- },
1029
- "sorl_overall_accuracy": 0.0,
1030
- "sft_overall_accuracy": 0.0
1031
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
add_sub_sorl_v1_abs16_1K/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:43959975cf6b60c99df3f5ee2d6977064967c94752ab5550e4d46030cdbb8767
3
- size 650328152
 
 
 
 
add_sub_sorl_v1_abs16_1K/train_config.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "mode": "sorl",
3
- "ops": "add_sub",
4
- "n_digits": 6,
5
- "n_layer": 2,
6
- "n_head": 3,
7
- "n_embd": 510,
8
- "abs_vocab": 16,
9
- "K": 4,
10
- "alpha_info_gain": 10.0,
11
- "alpha_abs": 0.1,
12
- "alpha_soft_zipf": 1.0,
13
- "batch_size": 64,
14
- "num_epochs": 1,
15
- "dataset_size": 1000,
16
- "lr": 8e-05,
17
- "output_dir": "ckpt/smoke_sorl",
18
- "device": "cuda:0",
19
- "push_to_hub": true,
20
- "no_wandb": true,
21
- "n_params": 162505382,
22
- "run_name": "add_sub_sorl_v1_abs16_1K",
23
- "git_commit": "800625019270114adcda289bbd550c4f1109a514",
24
- "timestamp": "2026-04-12T01:47:31.805763+00:00",
25
- "tokenizer": "Qwen/Qwen3-0.6B",
26
- "dataset_repo": "thoughtworks/arithmetic-sorl-data",
27
- "dataset_config": "add_sub_6digit",
28
- "model_repo": "thoughtworks/arithmetic-sorl",
29
- "trainer_version": "v1",
30
- "wandb_run_id": null,
31
- "wandb_url": null,
32
- "final_accuracy": 0.0,
33
- "sft_accuracy": 0.0,
34
- "eval_method": "ArithmeticEvaluator"
35
- }