amirali1985 commited on
Commit
ec8377b
·
verified ·
1 Parent(s): 73e6b5f

Upload add_sub_baseline_25K

Browse files
add_sub_baseline_25K/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 510,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2040,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention"
18
+ ],
19
+ "max_position_embeddings": 128,
20
+ "max_window_layers": 28,
21
+ "model_type": "qwen3",
22
+ "num_attention_heads": 3,
23
+ "num_hidden_layers": 2,
24
+ "num_key_value_heads": 3,
25
+ "pad_token_id": null,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_parameters": {
28
+ "rope_theta": 10000.0,
29
+ "rope_type": "default"
30
+ },
31
+ "sliding_window": null,
32
+ "tie_word_embeddings": false,
33
+ "transformers_version": "5.5.0",
34
+ "use_cache": true,
35
+ "use_sliding_window": false,
36
+ "vocab_size": 151645
37
+ }
add_sub_baseline_25K/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_baseline_25K/metrics.json ADDED
@@ -0,0 +1,1207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 400,
12
+ 450,
13
+ 500,
14
+ 550,
15
+ 600,
16
+ 650,
17
+ 700,
18
+ 750,
19
+ 800,
20
+ 850,
21
+ 900,
22
+ 950,
23
+ 1000,
24
+ 1050,
25
+ 1100,
26
+ 1150,
27
+ 1200,
28
+ 1250,
29
+ 1300,
30
+ 1350,
31
+ 1400,
32
+ 1450,
33
+ 1500,
34
+ 1550,
35
+ 1600,
36
+ 1650,
37
+ 1700,
38
+ 1750,
39
+ 1800,
40
+ 1850,
41
+ 1900,
42
+ 1950,
43
+ 2000,
44
+ 2050,
45
+ 2100,
46
+ 2150,
47
+ 2200,
48
+ 2250,
49
+ 2300,
50
+ 2350,
51
+ 2400,
52
+ 2450,
53
+ 2500,
54
+ 2550,
55
+ 2600,
56
+ 2650,
57
+ 2700,
58
+ 2750,
59
+ 2800,
60
+ 2850,
61
+ 2900,
62
+ 2950,
63
+ 3000,
64
+ 3050,
65
+ 3100,
66
+ 3150,
67
+ 3200,
68
+ 3250,
69
+ 3300,
70
+ 3350,
71
+ 3400,
72
+ 3450,
73
+ 3500,
74
+ 3550,
75
+ 3600,
76
+ 3650,
77
+ 3700,
78
+ 3750,
79
+ 3800,
80
+ 3850,
81
+ 3900,
82
+ 3950,
83
+ 4000,
84
+ 4050,
85
+ 4100,
86
+ 4150,
87
+ 4200,
88
+ 4250,
89
+ 4300,
90
+ 4350,
91
+ 4400,
92
+ 4450,
93
+ 4500,
94
+ 4550,
95
+ 4600,
96
+ 4650,
97
+ 4700,
98
+ 4750,
99
+ 4800,
100
+ 4850,
101
+ 4900,
102
+ 4950,
103
+ 5000,
104
+ 5050,
105
+ 5100,
106
+ 5150,
107
+ 5200,
108
+ 5250,
109
+ 5300,
110
+ 5350,
111
+ 5400,
112
+ 5450,
113
+ 5500,
114
+ 5550,
115
+ 5600,
116
+ 5650,
117
+ 5700,
118
+ 5750,
119
+ 5800,
120
+ 5850,
121
+ 5900,
122
+ 5950,
123
+ 6000,
124
+ 6050,
125
+ 6100,
126
+ 6150,
127
+ 6200,
128
+ 6250,
129
+ 6300,
130
+ 6350,
131
+ 6400,
132
+ 6450,
133
+ 6500,
134
+ 6550,
135
+ 6600,
136
+ 6650,
137
+ 6700,
138
+ 6750,
139
+ 6800,
140
+ 6850,
141
+ 6900,
142
+ 6950,
143
+ 7000,
144
+ 7050,
145
+ 7100,
146
+ 7150,
147
+ 7200,
148
+ 7250,
149
+ 7300,
150
+ 7350,
151
+ 7400,
152
+ 7450,
153
+ 7500,
154
+ 7550,
155
+ 7600,
156
+ 7650,
157
+ 7700,
158
+ 7750,
159
+ 7800
160
+ ],
161
+ "loss": [
162
+ 11.06562614440918,
163
+ 9.070802688598633,
164
+ 7.564438819885254,
165
+ 7.00793981552124,
166
+ 6.226614952087402,
167
+ 5.854918479919434,
168
+ 4.9685893058776855,
169
+ 4.255834102630615,
170
+ 3.3195507526397705,
171
+ 2.7683334350585938,
172
+ 2.22994065284729,
173
+ 1.9851258993148804,
174
+ 1.8262479305267334,
175
+ 1.8066816329956055,
176
+ 1.725418210029602,
177
+ 1.7170584201812744,
178
+ 1.760280728340149,
179
+ 1.7368391752243042,
180
+ 1.631116509437561,
181
+ 1.628551721572876,
182
+ 1.4732780456542969,
183
+ 1.0680259466171265,
184
+ 0.8227473497390747,
185
+ 0.6230937242507935,
186
+ 0.5450628995895386,
187
+ 0.41445615887641907,
188
+ 0.37112170457839966,
189
+ 0.2718469500541687,
190
+ 0.25747212767601013,
191
+ 0.2155759632587433,
192
+ 0.250948965549469,
193
+ 0.16987545788288116,
194
+ 0.17013859748840332,
195
+ 0.13095267117023468,
196
+ 0.1641116440296173,
197
+ 0.15072695910930634,
198
+ 0.14167438447475433,
199
+ 0.11960385739803314,
200
+ 0.08400659263134003,
201
+ 0.1277170479297638,
202
+ 0.10943807661533356,
203
+ 0.10479181259870529,
204
+ 0.11750847846269608,
205
+ 0.09830878674983978,
206
+ 0.09297787398099899,
207
+ 0.08759058266878128,
208
+ 0.09592296183109283,
209
+ 0.056994277983903885,
210
+ 0.08595649152994156,
211
+ 0.09432648122310638,
212
+ 0.06118571758270264,
213
+ 0.05625522881746292,
214
+ 0.0965944156050682,
215
+ 0.08749499171972275,
216
+ 0.08019021898508072,
217
+ 0.05185790732502937,
218
+ 0.07028406113386154,
219
+ 0.053000323474407196,
220
+ 0.06424856185913086,
221
+ 0.07609923928976059,
222
+ 0.08353769779205322,
223
+ 0.0732322707772255,
224
+ 0.08481856435537338,
225
+ 0.05480723828077316,
226
+ 0.04933084920048714,
227
+ 0.03942330554127693,
228
+ 0.03680780529975891,
229
+ 0.0338483564555645,
230
+ 0.04500473663210869,
231
+ 0.05092630162835121,
232
+ 0.06185285374522209,
233
+ 0.05888064578175545,
234
+ 0.06084391847252846,
235
+ 0.047385044395923615,
236
+ 0.05746296048164368,
237
+ 0.038728680461645126,
238
+ 0.041216280311346054,
239
+ 0.04032302647829056,
240
+ 0.03952217847108841,
241
+ 0.04412711411714554,
242
+ 0.037145402282476425,
243
+ 0.03596339002251625,
244
+ 0.03715207800269127,
245
+ 0.02417995221912861,
246
+ 0.05104166269302368,
247
+ 0.028953881934285164,
248
+ 0.0322723388671875,
249
+ 0.05351385846734047,
250
+ 0.04065471515059471,
251
+ 0.035228464752435684,
252
+ 0.0399099700152874,
253
+ 0.04321296140551567,
254
+ 0.023695966228842735,
255
+ 0.03870895132422447,
256
+ 0.023070303723216057,
257
+ 0.02995055727660656,
258
+ 0.034223996102809906,
259
+ 0.03115018829703331,
260
+ 0.047400325536727905,
261
+ 0.033650610595941544,
262
+ 0.020459134131669998,
263
+ 0.031521618366241455,
264
+ 0.010215552523732185,
265
+ 0.009668267332017422,
266
+ 0.009839163161814213,
267
+ 0.01371306087821722,
268
+ 0.0191287100315094,
269
+ 0.02995881251990795,
270
+ 0.010273347608745098,
271
+ 0.013487428426742554,
272
+ 0.006214354187250137,
273
+ 0.028749624267220497,
274
+ 0.004388689063489437,
275
+ 0.01441959012299776,
276
+ 0.010049402713775635,
277
+ 0.006188374478369951,
278
+ 0.006518733222037554,
279
+ 0.012474223971366882,
280
+ 0.0018296894850209355,
281
+ 0.009708991274237633,
282
+ 0.012771486304700375,
283
+ 0.006753657478839159,
284
+ 0.006519939284771681,
285
+ 0.0015541197499260306,
286
+ 0.0044125900603830814,
287
+ 0.0019812153186649084,
288
+ 0.009496470913290977,
289
+ 0.014241503551602364,
290
+ 0.004722653888165951,
291
+ 0.003749982686713338,
292
+ 0.009157851338386536,
293
+ 0.002904724795371294,
294
+ 0.002242226619273424,
295
+ 0.012742781080305576,
296
+ 0.002705535152927041,
297
+ 0.0015757112996652722,
298
+ 0.004974766168743372,
299
+ 0.007047231774777174,
300
+ 0.004940888378769159,
301
+ 0.007126574404537678,
302
+ 0.004077407065778971,
303
+ 0.009843333624303341,
304
+ 0.008745082654058933,
305
+ 0.0023113691713660955,
306
+ 0.005770131945610046,
307
+ 0.006909515243023634,
308
+ 0.005357807967811823,
309
+ 0.022677551954984665,
310
+ 0.0067167701199650764,
311
+ 0.002013612538576126,
312
+ 0.00514591159299016,
313
+ 0.004963079001754522,
314
+ 0.0025343645829707384,
315
+ 0.012118958868086338,
316
+ 0.012999330647289753,
317
+ 0.0052215722389519215
318
+ ],
319
+ "base_loss": [
320
+ 11.06562614440918,
321
+ 9.070802688598633,
322
+ 7.564438819885254,
323
+ 7.00793981552124,
324
+ 6.226614952087402,
325
+ 5.854918479919434,
326
+ 4.9685893058776855,
327
+ 4.255834102630615,
328
+ 3.3195507526397705,
329
+ 2.7683334350585938,
330
+ 2.22994065284729,
331
+ 1.9851258993148804,
332
+ 1.8262479305267334,
333
+ 1.8066816329956055,
334
+ 1.725418210029602,
335
+ 1.7170584201812744,
336
+ 1.760280728340149,
337
+ 1.7368391752243042,
338
+ 1.631116509437561,
339
+ 1.628551721572876,
340
+ 1.4732780456542969,
341
+ 1.0680259466171265,
342
+ 0.8227473497390747,
343
+ 0.6230937242507935,
344
+ 0.5450628995895386,
345
+ 0.41445615887641907,
346
+ 0.37112170457839966,
347
+ 0.2718469500541687,
348
+ 0.25747212767601013,
349
+ 0.2155759632587433,
350
+ 0.250948965549469,
351
+ 0.16987545788288116,
352
+ 0.17013859748840332,
353
+ 0.13095267117023468,
354
+ 0.1641116440296173,
355
+ 0.15072695910930634,
356
+ 0.14167438447475433,
357
+ 0.11960385739803314,
358
+ 0.08400659263134003,
359
+ 0.1277170479297638,
360
+ 0.10943807661533356,
361
+ 0.10479181259870529,
362
+ 0.11750847846269608,
363
+ 0.09830878674983978,
364
+ 0.09297787398099899,
365
+ 0.08759058266878128,
366
+ 0.09592296183109283,
367
+ 0.056994277983903885,
368
+ 0.08595649152994156,
369
+ 0.09432648122310638,
370
+ 0.06118571758270264,
371
+ 0.05625522881746292,
372
+ 0.0965944156050682,
373
+ 0.08749499171972275,
374
+ 0.08019021898508072,
375
+ 0.05185790732502937,
376
+ 0.07028406113386154,
377
+ 0.053000323474407196,
378
+ 0.06424856185913086,
379
+ 0.07609923928976059,
380
+ 0.08353769779205322,
381
+ 0.0732322707772255,
382
+ 0.08481856435537338,
383
+ 0.05480723828077316,
384
+ 0.04933084920048714,
385
+ 0.03942330554127693,
386
+ 0.03680780529975891,
387
+ 0.0338483564555645,
388
+ 0.04500473663210869,
389
+ 0.05092630162835121,
390
+ 0.06185285374522209,
391
+ 0.05888064578175545,
392
+ 0.06084391847252846,
393
+ 0.047385044395923615,
394
+ 0.05746296048164368,
395
+ 0.038728680461645126,
396
+ 0.041216280311346054,
397
+ 0.04032302647829056,
398
+ 0.03952217847108841,
399
+ 0.04412711411714554,
400
+ 0.037145402282476425,
401
+ 0.03596339002251625,
402
+ 0.03715207800269127,
403
+ 0.02417995221912861,
404
+ 0.05104166269302368,
405
+ 0.028953881934285164,
406
+ 0.0322723388671875,
407
+ 0.05351385846734047,
408
+ 0.04065471515059471,
409
+ 0.035228464752435684,
410
+ 0.0399099700152874,
411
+ 0.04321296140551567,
412
+ 0.023695966228842735,
413
+ 0.03870895132422447,
414
+ 0.023070303723216057,
415
+ 0.02995055727660656,
416
+ 0.034223996102809906,
417
+ 0.03115018829703331,
418
+ 0.047400325536727905,
419
+ 0.033650610595941544,
420
+ 0.020459134131669998,
421
+ 0.031521618366241455,
422
+ 0.010215552523732185,
423
+ 0.009668267332017422,
424
+ 0.009839163161814213,
425
+ 0.01371306087821722,
426
+ 0.0191287100315094,
427
+ 0.02995881251990795,
428
+ 0.010273347608745098,
429
+ 0.013487428426742554,
430
+ 0.006214354187250137,
431
+ 0.028749624267220497,
432
+ 0.004388689063489437,
433
+ 0.01441959012299776,
434
+ 0.010049402713775635,
435
+ 0.006188374478369951,
436
+ 0.006518733222037554,
437
+ 0.012474223971366882,
438
+ 0.0018296894850209355,
439
+ 0.009708991274237633,
440
+ 0.012771486304700375,
441
+ 0.006753657478839159,
442
+ 0.006519939284771681,
443
+ 0.0015541197499260306,
444
+ 0.0044125900603830814,
445
+ 0.0019812153186649084,
446
+ 0.009496470913290977,
447
+ 0.014241503551602364,
448
+ 0.004722653888165951,
449
+ 0.003749982686713338,
450
+ 0.009157851338386536,
451
+ 0.002904724795371294,
452
+ 0.002242226619273424,
453
+ 0.012742781080305576,
454
+ 0.002705535152927041,
455
+ 0.0015757112996652722,
456
+ 0.004974766168743372,
457
+ 0.007047231774777174,
458
+ 0.004940888378769159,
459
+ 0.007126574404537678,
460
+ 0.004077407065778971,
461
+ 0.009843333624303341,
462
+ 0.008745082654058933,
463
+ 0.0023113691713660955,
464
+ 0.005770131945610046,
465
+ 0.006909515243023634,
466
+ 0.005357807967811823,
467
+ 0.022677551954984665,
468
+ 0.0067167701199650764,
469
+ 0.002013612538576126,
470
+ 0.00514591159299016,
471
+ 0.004963079001754522,
472
+ 0.0025343645829707384,
473
+ 0.012118958868086338,
474
+ 0.012999330647289753,
475
+ 0.0052215722389519215
476
+ ],
477
+ "lr": [
478
+ 2.5063938618925837e-06,
479
+ 5.063938618925831e-06,
480
+ 7.62148337595908e-06,
481
+ 1.0179028132992328e-05,
482
+ 1.2736572890025576e-05,
483
+ 1.5294117647058822e-05,
484
+ 1.7851662404092073e-05,
485
+ 2.040920716112532e-05,
486
+ 2.296675191815857e-05,
487
+ 2.5524296675191817e-05,
488
+ 2.8081841432225065e-05,
489
+ 3.0639386189258316e-05,
490
+ 3.3196930946291564e-05,
491
+ 3.575447570332481e-05,
492
+ 3.831202046035806e-05,
493
+ 4.086956521739131e-05,
494
+ 4.3427109974424555e-05,
495
+ 4.598465473145781e-05,
496
+ 4.854219948849105e-05,
497
+ 5.10997442455243e-05,
498
+ 5.365728900255755e-05,
499
+ 5.62148337595908e-05,
500
+ 5.877237851662404e-05,
501
+ 6.13299232736573e-05,
502
+ 6.388746803069055e-05,
503
+ 6.644501278772379e-05,
504
+ 6.900255754475704e-05,
505
+ 7.156010230179029e-05,
506
+ 7.411764705882354e-05,
507
+ 7.667519181585678e-05,
508
+ 7.923273657289003e-05,
509
+ 7.999382181128958e-05,
510
+ 7.996356588945887e-05,
511
+ 7.990811651495726e-05,
512
+ 7.982750864365423e-05,
513
+ 7.97217930916005e-05,
514
+ 7.9591036502993e-05,
515
+ 7.943532130816183e-05,
516
+ 7.925474567160515e-05,
517
+ 7.904942343010533e-05,
518
+ 7.881948402096506e-05,
519
+ 7.856507240040864e-05,
520
+ 7.828634895220009e-05,
521
+ 7.798348938653556e-05,
522
+ 7.765668462927371e-05,
523
+ 7.730614070157413e-05,
524
+ 7.693207859001933e-05,
525
+ 7.653473410730253e-05,
526
+ 7.611435774356888e-05,
527
+ 7.567121450850376e-05,
528
+ 7.520558376426795e-05,
529
+ 7.471775904938474e-05,
530
+ 7.420804789369019e-05,
531
+ 7.367677162446306e-05,
532
+ 7.312426516385672e-05,
533
+ 7.255087681776069e-05,
534
+ 7.195696805622496e-05,
535
+ 7.13429132855854e-05,
536
+ 7.070909961243422e-05,
537
+ 7.005592659958366e-05,
538
+ 6.938380601417765e-05,
539
+ 6.869316156810923e-05,
540
+ 6.798442865090831e-05,
541
+ 6.725805405526735e-05,
542
+ 6.651449569537871e-05,
543
+ 6.575422231826058e-05,
544
+ 6.497771320825402e-05,
545
+ 6.418545788487704e-05,
546
+ 6.337795579422628e-05,
547
+ 6.255571599412105e-05,
548
+ 6.171925683318781e-05,
549
+ 6.086910562408781e-05,
550
+ 6.0005798311093635e-05,
551
+ 5.912987913222422e-05,
552
+ 5.824190027615158e-05,
553
+ 5.734242153409514e-05,
554
+ 5.643200994692358e-05,
555
+ 5.55112394476862e-05,
556
+ 5.458069049979956e-05,
557
+ 5.364094973111714e-05,
558
+ 5.269260956411309e-05,
559
+ 5.1736267842412726e-05,
560
+ 5.077252745390575e-05,
561
+ 4.980199595067928e-05,
562
+ 4.882528516601063e-05,
563
+ 4.784301082866123e-05,
564
+ 4.685579217471466e-05,
565
+ 4.586425155720376e-05,
566
+ 4.4869014053772686e-05,
567
+ 4.387070707262142e-05,
568
+ 4.286995995698098e-05,
569
+ 4.186740358836888e-05,
570
+ 4.0863669988874755e-05,
571
+ 3.985939192272697e-05,
572
+ 3.885520249739142e-05,
573
+ 3.785173476445388e-05,
574
+ 3.684962132053763e-05,
575
+ 3.584949390850793e-05,
576
+ 3.485198301921461e-05,
577
+ 3.385771749402399e-05,
578
+ 3.2867324128390756e-05,
579
+ 3.188142727671938e-05,
580
+ 3.090064845876465e-05,
581
+ 2.9925605967818972e-05,
582
+ 2.895691448093382e-05,
583
+ 2.799518467142088e-05,
584
+ 2.7041022823877087e-05,
585
+ 2.6095030451976512e-05,
586
+ 2.515780391926975e-05,
587
+ 2.4229934063230064e-05,
588
+ 2.3312005822783295e-05,
589
+ 2.240459786955611e-05,
590
+ 2.150828224307534e-05,
591
+ 2.0623623990148315e-05,
592
+ 1.9751180808651272e-05,
593
+ 1.8891502695950898e-05,
594
+ 1.8045131602180072e-05,
595
+ 1.7212601088586823e-05,
596
+ 1.63944359911718e-05,
597
+ 1.559115208982597e-05,
598
+ 1.48032557831777e-05,
599
+ 1.4031243769353617e-05,
600
+ 1.3275602732854923e-05,
601
+ 1.2536809037746398e-05,
602
+ 1.1815328427351398e-05,
603
+ 1.1111615730642416e-05,
604
+ 1.042611457551213e-05,
605
+ 9.759257109105627e-06,
606
+ 9.111463725390388e-06,
607
+ 8.483142800135428e-06,
608
+ 7.87469043346695e-06,
609
+ 7.286490200162668e-06,
610
+ 6.718912907842181e-06,
611
+ 6.1723163632060055e-06,
612
+ 5.647045146470409e-06,
613
+ 5.143430394140439e-06,
614
+ 4.661789590258008e-06,
615
+ 4.202426366256558e-06,
616
+ 3.7656303095486623e-06,
617
+ 3.351676780967026e-06,
618
+ 2.960826741174141e-06,
619
+ 2.5933265861499514e-06,
620
+ 2.2494079918611923e-06,
621
+ 1.929287768210473e-06,
622
+ 1.6331677223569853e-06,
623
+ 1.3612345314951615e-06,
624
+ 1.1136596251714304e-06,
625
+ 8.905990772131879e-07,
626
+ 6.921935073382368e-07,
627
+ 5.185679925066245e-07,
628
+ 3.698319880708301e-07,
629
+ 2.4607925877392135e-07,
630
+ 1.4738781963932191e-07,
631
+ 7.381988678927255e-08,
632
+ 2.5421838223160798e-08,
633
+ 2.2241845803394615e-09
634
+ ],
635
+ "eval_step": [
636
+ 390,
637
+ 780,
638
+ 1170,
639
+ 1560,
640
+ 1950,
641
+ 2340,
642
+ 2730,
643
+ 3120,
644
+ 3510,
645
+ 3900,
646
+ 4290,
647
+ 4680,
648
+ 5070,
649
+ 5460,
650
+ 5850,
651
+ 6240,
652
+ 6630,
653
+ 7020,
654
+ 7410,
655
+ 7800
656
+ ],
657
+ "eval_epoch": [
658
+ 1,
659
+ 2,
660
+ 3,
661
+ 4,
662
+ 5,
663
+ 6,
664
+ 7,
665
+ 8,
666
+ 9,
667
+ 10,
668
+ 11,
669
+ 12,
670
+ 13,
671
+ 14,
672
+ 15,
673
+ 16,
674
+ 17,
675
+ 18,
676
+ 19,
677
+ 20
678
+ ],
679
+ "eval_accuracy": [
680
+ 0.0064285714285714285,
681
+ 0.007142857142857143,
682
+ 0.085,
683
+ 0.52,
684
+ 0.5935714285714285,
685
+ 0.6385714285714286,
686
+ 0.6514285714285715,
687
+ 0.7364285714285714,
688
+ 0.7307142857142858,
689
+ 0.7328571428571429,
690
+ 0.755,
691
+ 0.7964285714285714,
692
+ 0.8135714285714286,
693
+ 0.89,
694
+ 0.8621428571428571,
695
+ 0.8892857142857142,
696
+ 0.9235714285714286,
697
+ 0.9035714285714286,
698
+ 0.9135714285714286,
699
+ 0.9207142857142857
700
+ ]
701
+ },
702
+ "final_accuracy": 0.9207142857142857,
703
+ "sft_eval": {
704
+ "config": {
705
+ "ops": "add_sub",
706
+ "K": null,
707
+ "mode": "sft",
708
+ "n_digits": 6,
709
+ "n_per_split": 50
710
+ },
711
+ "splits": {
712
+ "add_S0": {
713
+ "full_accuracy": 1.0,
714
+ "n_examples": 50,
715
+ "per_subtask": {
716
+ "SA": {
717
+ "accuracy": 1.0,
718
+ "count": 295
719
+ },
720
+ "SS": {
721
+ "accuracy": 1.0,
722
+ "count": 55
723
+ }
724
+ }
725
+ },
726
+ "add_S1": {
727
+ "full_accuracy": 1.0,
728
+ "n_examples": 50,
729
+ "per_subtask": {
730
+ "SA": {
731
+ "accuracy": 1.0,
732
+ "count": 126
733
+ },
734
+ "SC": {
735
+ "accuracy": 1.0,
736
+ "count": 79
737
+ },
738
+ "SS": {
739
+ "accuracy": 1.0,
740
+ "count": 21
741
+ },
742
+ "UC": {
743
+ "accuracy": 1.0,
744
+ "count": 124
745
+ }
746
+ }
747
+ },
748
+ "add_S2": {
749
+ "full_accuracy": 1.0,
750
+ "n_examples": 50,
751
+ "per_subtask": {
752
+ "SA": {
753
+ "accuracy": 1.0,
754
+ "count": 75
755
+ },
756
+ "SC": {
757
+ "accuracy": 1.0,
758
+ "count": 62
759
+ },
760
+ "SS": {
761
+ "accuracy": 1.0,
762
+ "count": 39
763
+ },
764
+ "UC": {
765
+ "accuracy": 1.0,
766
+ "count": 111
767
+ },
768
+ "US": {
769
+ "accuracy": 1.0,
770
+ "count": 63
771
+ }
772
+ }
773
+ },
774
+ "add_S3": {
775
+ "full_accuracy": 1.0,
776
+ "n_examples": 50,
777
+ "per_subtask": {
778
+ "SA": {
779
+ "accuracy": 1.0,
780
+ "count": 60
781
+ },
782
+ "SC": {
783
+ "accuracy": 1.0,
784
+ "count": 57
785
+ },
786
+ "SS": {
787
+ "accuracy": 1.0,
788
+ "count": 19
789
+ },
790
+ "UC": {
791
+ "accuracy": 1.0,
792
+ "count": 104
793
+ },
794
+ "US": {
795
+ "accuracy": 1.0,
796
+ "count": 110
797
+ }
798
+ }
799
+ },
800
+ "add_S4": {
801
+ "full_accuracy": 0.92,
802
+ "n_examples": 50,
803
+ "per_subtask": {
804
+ "SA": {
805
+ "accuracy": 1.0,
806
+ "count": 48
807
+ },
808
+ "SC": {
809
+ "accuracy": 1.0,
810
+ "count": 52
811
+ },
812
+ "SS": {
813
+ "accuracy": 1.0,
814
+ "count": 7
815
+ },
816
+ "UC": {
817
+ "accuracy": 0.9550561797752809,
818
+ "count": 89
819
+ },
820
+ "US": {
821
+ "accuracy": 0.9935064935064936,
822
+ "count": 154
823
+ }
824
+ }
825
+ },
826
+ "add_S5": {
827
+ "full_accuracy": 0.54,
828
+ "n_examples": 50,
829
+ "per_subtask": {
830
+ "SA": {
831
+ "accuracy": 1.0,
832
+ "count": 50
833
+ },
834
+ "SC": {
835
+ "accuracy": 1.0,
836
+ "count": 50
837
+ },
838
+ "UC": {
839
+ "accuracy": 0.64,
840
+ "count": 50
841
+ },
842
+ "US": {
843
+ "accuracy": 0.91,
844
+ "count": 200
845
+ }
846
+ }
847
+ },
848
+ "add_S6": {
849
+ "full_accuracy": 0.92,
850
+ "n_examples": 50,
851
+ "per_subtask": {
852
+ "SC": {
853
+ "accuracy": 1.0,
854
+ "count": 50
855
+ },
856
+ "UC": {
857
+ "accuracy": 0.94,
858
+ "count": 50
859
+ },
860
+ "US": {
861
+ "accuracy": 0.98,
862
+ "count": 250
863
+ }
864
+ }
865
+ },
866
+ "add_random": {
867
+ "full_accuracy": 0.995,
868
+ "n_examples": 200,
869
+ "per_subtask": {
870
+ "SA": {
871
+ "accuracy": 1.0,
872
+ "count": 431
873
+ },
874
+ "SC": {
875
+ "accuracy": 1.0,
876
+ "count": 316
877
+ },
878
+ "SS": {
879
+ "accuracy": 1.0,
880
+ "count": 39
881
+ },
882
+ "UC": {
883
+ "accuracy": 0.9982142857142857,
884
+ "count": 560
885
+ },
886
+ "US": {
887
+ "accuracy": 1.0,
888
+ "count": 54
889
+ }
890
+ }
891
+ },
892
+ "add_C3": {
893
+ "full_accuracy": 0.98,
894
+ "n_examples": 50,
895
+ "per_subtask": {
896
+ "SA": {
897
+ "accuracy": 1.0,
898
+ "count": 150
899
+ },
900
+ "SC": {
901
+ "accuracy": 1.0,
902
+ "count": 50
903
+ },
904
+ "UC": {
905
+ "accuracy": 0.9903846153846154,
906
+ "count": 104
907
+ },
908
+ "US": {
909
+ "accuracy": 1.0,
910
+ "count": 46
911
+ }
912
+ }
913
+ },
914
+ "add_C4": {
915
+ "full_accuracy": 0.9,
916
+ "n_examples": 50,
917
+ "per_subtask": {
918
+ "SA": {
919
+ "accuracy": 1.0,
920
+ "count": 100
921
+ },
922
+ "SC": {
923
+ "accuracy": 1.0,
924
+ "count": 50
925
+ },
926
+ "UC": {
927
+ "accuracy": 0.967479674796748,
928
+ "count": 123
929
+ },
930
+ "US": {
931
+ "accuracy": 0.974025974025974,
932
+ "count": 77
933
+ }
934
+ }
935
+ },
936
+ "add_C5": {
937
+ "full_accuracy": 0.98,
938
+ "n_examples": 50,
939
+ "per_subtask": {
940
+ "SA": {
941
+ "accuracy": 1.0,
942
+ "count": 50
943
+ },
944
+ "SC": {
945
+ "accuracy": 1.0,
946
+ "count": 50
947
+ },
948
+ "UC": {
949
+ "accuracy": 0.9935064935064936,
950
+ "count": 154
951
+ },
952
+ "US": {
953
+ "accuracy": 1.0,
954
+ "count": 96
955
+ }
956
+ }
957
+ },
958
+ "add_C6": {
959
+ "full_accuracy": 0.96,
960
+ "n_examples": 50,
961
+ "per_subtask": {
962
+ "SC": {
963
+ "accuracy": 1.0,
964
+ "count": 50
965
+ },
966
+ "UC": {
967
+ "accuracy": 0.989010989010989,
968
+ "count": 182
969
+ },
970
+ "US": {
971
+ "accuracy": 0.9915254237288136,
972
+ "count": 118
973
+ }
974
+ }
975
+ },
976
+ "sub_M0": {
977
+ "full_accuracy": 1.0,
978
+ "n_examples": 50,
979
+ "per_subtask": {
980
+ "MD": {
981
+ "accuracy": 1.0,
982
+ "count": 294
983
+ },
984
+ "ME": {
985
+ "accuracy": 1.0,
986
+ "count": 56
987
+ }
988
+ }
989
+ },
990
+ "sub_M1": {
991
+ "full_accuracy": 1.0,
992
+ "n_examples": 50,
993
+ "per_subtask": {
994
+ "MD": {
995
+ "accuracy": 1.0,
996
+ "count": 143
997
+ },
998
+ "MB": {
999
+ "accuracy": 1.0,
1000
+ "count": 69
1001
+ },
1002
+ "ME": {
1003
+ "accuracy": 1.0,
1004
+ "count": 15
1005
+ },
1006
+ "UB": {
1007
+ "accuracy": 1.0,
1008
+ "count": 123
1009
+ }
1010
+ }
1011
+ },
1012
+ "sub_M2": {
1013
+ "full_accuracy": 1.0,
1014
+ "n_examples": 50,
1015
+ "per_subtask": {
1016
+ "MD": {
1017
+ "accuracy": 1.0,
1018
+ "count": 108
1019
+ },
1020
+ "MB": {
1021
+ "accuracy": 1.0,
1022
+ "count": 52
1023
+ },
1024
+ "ME": {
1025
+ "accuracy": 1.0,
1026
+ "count": 52
1027
+ },
1028
+ "UB": {
1029
+ "accuracy": 1.0,
1030
+ "count": 87
1031
+ },
1032
+ "UD": {
1033
+ "accuracy": 1.0,
1034
+ "count": 51
1035
+ }
1036
+ }
1037
+ },
1038
+ "sub_M3": {
1039
+ "full_accuracy": 1.0,
1040
+ "n_examples": 50,
1041
+ "per_subtask": {
1042
+ "MD": {
1043
+ "accuracy": 1.0,
1044
+ "count": 94
1045
+ },
1046
+ "MB": {
1047
+ "accuracy": 1.0,
1048
+ "count": 51
1049
+ },
1050
+ "ME": {
1051
+ "accuracy": 1.0,
1052
+ "count": 25
1053
+ },
1054
+ "UB": {
1055
+ "accuracy": 1.0,
1056
+ "count": 78
1057
+ },
1058
+ "UD": {
1059
+ "accuracy": 1.0,
1060
+ "count": 102
1061
+ }
1062
+ }
1063
+ },
1064
+ "sub_M4": {
1065
+ "full_accuracy": 0.64,
1066
+ "n_examples": 50,
1067
+ "per_subtask": {
1068
+ "MD": {
1069
+ "accuracy": 1.0,
1070
+ "count": 100
1071
+ },
1072
+ "MB": {
1073
+ "accuracy": 1.0,
1074
+ "count": 50
1075
+ },
1076
+ "UB": {
1077
+ "accuracy": 0.68,
1078
+ "count": 50
1079
+ },
1080
+ "UD": {
1081
+ "accuracy": 0.9866666666666667,
1082
+ "count": 150
1083
+ }
1084
+ }
1085
+ },
1086
+ "sub_M5": {
1087
+ "full_accuracy": 0.24,
1088
+ "n_examples": 50,
1089
+ "per_subtask": {
1090
+ "MD": {
1091
+ "accuracy": 1.0,
1092
+ "count": 50
1093
+ },
1094
+ "MB": {
1095
+ "accuracy": 1.0,
1096
+ "count": 50
1097
+ },
1098
+ "UB": {
1099
+ "accuracy": 0.52,
1100
+ "count": 50
1101
+ },
1102
+ "UD": {
1103
+ "accuracy": 0.885,
1104
+ "count": 200
1105
+ }
1106
+ }
1107
+ },
1108
+ "sub_random": {
1109
+ "full_accuracy": 1.0,
1110
+ "n_examples": 200,
1111
+ "per_subtask": {
1112
+ "MD": {
1113
+ "accuracy": 1.0,
1114
+ "count": 588
1115
+ },
1116
+ "MB": {
1117
+ "accuracy": 1.0,
1118
+ "count": 268
1119
+ },
1120
+ "ME": {
1121
+ "accuracy": 1.0,
1122
+ "count": 60
1123
+ },
1124
+ "UB": {
1125
+ "accuracy": 1.0,
1126
+ "count": 447
1127
+ },
1128
+ "UD": {
1129
+ "accuracy": 1.0,
1130
+ "count": 37
1131
+ }
1132
+ }
1133
+ },
1134
+ "sub_B3": {
1135
+ "full_accuracy": 1.0,
1136
+ "n_examples": 50,
1137
+ "per_subtask": {
1138
+ "MD": {
1139
+ "accuracy": 1.0,
1140
+ "count": 150
1141
+ },
1142
+ "MB": {
1143
+ "accuracy": 1.0,
1144
+ "count": 50
1145
+ },
1146
+ "UB": {
1147
+ "accuracy": 1.0,
1148
+ "count": 107
1149
+ },
1150
+ "UD": {
1151
+ "accuracy": 1.0,
1152
+ "count": 43
1153
+ }
1154
+ }
1155
+ },
1156
+ "sub_B4": {
1157
+ "full_accuracy": 0.9,
1158
+ "n_examples": 50,
1159
+ "per_subtask": {
1160
+ "MD": {
1161
+ "accuracy": 1.0,
1162
+ "count": 100
1163
+ },
1164
+ "MB": {
1165
+ "accuracy": 1.0,
1166
+ "count": 50
1167
+ },
1168
+ "UB": {
1169
+ "accuracy": 0.956140350877193,
1170
+ "count": 114
1171
+ },
1172
+ "UD": {
1173
+ "accuracy": 1.0,
1174
+ "count": 86
1175
+ }
1176
+ }
1177
+ },
1178
+ "sub_B5": {
1179
+ "full_accuracy": 0.82,
1180
+ "n_examples": 50,
1181
+ "per_subtask": {
1182
+ "MD": {
1183
+ "accuracy": 1.0,
1184
+ "count": 50
1185
+ },
1186
+ "MB": {
1187
+ "accuracy": 1.0,
1188
+ "count": 50
1189
+ },
1190
+ "UB": {
1191
+ "accuracy": 0.9477124183006536,
1192
+ "count": 153
1193
+ },
1194
+ "UD": {
1195
+ "accuracy": 0.979381443298969,
1196
+ "count": 97
1197
+ }
1198
+ }
1199
+ }
1200
+ },
1201
+ "summary": {
1202
+ "overall_accuracy": 0.9207142857142857,
1203
+ "total_examples": 1400,
1204
+ "n_splits": 22
1205
+ }
1206
+ }
1207
+ }
add_sub_baseline_25K/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bf525d5b1ad9ce7ab59b46e66c0d4b353f9086c36262ae393e08d901ea70fb2
3
+ size 650266922
add_sub_baseline_25K/train_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "baseline",
3
+ "ops": "add_sub",
4
+ "n_digits": 6,
5
+ "n_layer": 2,
6
+ "n_head": 3,
7
+ "n_embd": 510,
8
+ "abs_vocab": 0,
9
+ "K": 4,
10
+ "alpha_info_gain": 10.0,
11
+ "alpha_abs": 0.1,
12
+ "alpha_soft_zipf": 1.0,
13
+ "batch_size": 64,
14
+ "num_epochs": 20,
15
+ "dataset_size": 25000,
16
+ "lr": 8e-05,
17
+ "output_dir": "ckpt/sweep/add_sub_baseline_25K",
18
+ "device": "cuda",
19
+ "push_to_hub": true,
20
+ "no_wandb": false,
21
+ "n_params": 162490082,
22
+ "run_name": "add_sub_baseline_25K",
23
+ "git_commit": "800625019270114adcda289bbd550c4f1109a514",
24
+ "timestamp": "2026-04-12T01:58:10.719272+00:00",
25
+ "tokenizer": "Qwen/Qwen3-0.6B",
26
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
27
+ "dataset_config": "add_sub_6digit",
28
+ "model_repo": "thoughtworks/arithmetic-sorl",
29
+ "trainer_version": "sft",
30
+ "wandb_run_id": "hccg8mv4",
31
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/hccg8mv4",
32
+ "final_accuracy": 0.9207142857142857,
33
+ "sft_accuracy": 0.9207142857142857,
34
+ "eval_method": "ArithmeticEvaluator"
35
+ }