amirali1985 commited on
Commit
23816ca
·
verified ·
1 Parent(s): 509a855

Upload add_sub_baseline_25K_1L3H510d

Browse files
add_sub_baseline_25K_1L3H510d/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 510,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2040,
15
+ "layer_types": [
16
+ "full_attention"
17
+ ],
18
+ "max_position_embeddings": 128,
19
+ "max_window_layers": 28,
20
+ "model_type": "qwen3",
21
+ "num_attention_heads": 3,
22
+ "num_hidden_layers": 1,
23
+ "num_key_value_heads": 3,
24
+ "pad_token_id": null,
25
+ "rms_norm_eps": 1e-06,
26
+ "rope_parameters": {
27
+ "rope_theta": 10000.0,
28
+ "rope_type": "default"
29
+ },
30
+ "sliding_window": null,
31
+ "tie_word_embeddings": false,
32
+ "transformers_version": "5.5.0",
33
+ "use_cache": true,
34
+ "use_sliding_window": false,
35
+ "vocab_size": 151645
36
+ }
add_sub_baseline_25K_1L3H510d/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_baseline_25K_1L3H510d/metrics.json ADDED
@@ -0,0 +1,1207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 400,
12
+ 450,
13
+ 500,
14
+ 550,
15
+ 600,
16
+ 650,
17
+ 700,
18
+ 750,
19
+ 800,
20
+ 850,
21
+ 900,
22
+ 950,
23
+ 1000,
24
+ 1050,
25
+ 1100,
26
+ 1150,
27
+ 1200,
28
+ 1250,
29
+ 1300,
30
+ 1350,
31
+ 1400,
32
+ 1450,
33
+ 1500,
34
+ 1550,
35
+ 1600,
36
+ 1650,
37
+ 1700,
38
+ 1750,
39
+ 1800,
40
+ 1850,
41
+ 1900,
42
+ 1950,
43
+ 2000,
44
+ 2050,
45
+ 2100,
46
+ 2150,
47
+ 2200,
48
+ 2250,
49
+ 2300,
50
+ 2350,
51
+ 2400,
52
+ 2450,
53
+ 2500,
54
+ 2550,
55
+ 2600,
56
+ 2650,
57
+ 2700,
58
+ 2750,
59
+ 2800,
60
+ 2850,
61
+ 2900,
62
+ 2950,
63
+ 3000,
64
+ 3050,
65
+ 3100,
66
+ 3150,
67
+ 3200,
68
+ 3250,
69
+ 3300,
70
+ 3350,
71
+ 3400,
72
+ 3450,
73
+ 3500,
74
+ 3550,
75
+ 3600,
76
+ 3650,
77
+ 3700,
78
+ 3750,
79
+ 3800,
80
+ 3850,
81
+ 3900,
82
+ 3950,
83
+ 4000,
84
+ 4050,
85
+ 4100,
86
+ 4150,
87
+ 4200,
88
+ 4250,
89
+ 4300,
90
+ 4350,
91
+ 4400,
92
+ 4450,
93
+ 4500,
94
+ 4550,
95
+ 4600,
96
+ 4650,
97
+ 4700,
98
+ 4750,
99
+ 4800,
100
+ 4850,
101
+ 4900,
102
+ 4950,
103
+ 5000,
104
+ 5050,
105
+ 5100,
106
+ 5150,
107
+ 5200,
108
+ 5250,
109
+ 5300,
110
+ 5350,
111
+ 5400,
112
+ 5450,
113
+ 5500,
114
+ 5550,
115
+ 5600,
116
+ 5650,
117
+ 5700,
118
+ 5750,
119
+ 5800,
120
+ 5850,
121
+ 5900,
122
+ 5950,
123
+ 6000,
124
+ 6050,
125
+ 6100,
126
+ 6150,
127
+ 6200,
128
+ 6250,
129
+ 6300,
130
+ 6350,
131
+ 6400,
132
+ 6450,
133
+ 6500,
134
+ 6550,
135
+ 6600,
136
+ 6650,
137
+ 6700,
138
+ 6750,
139
+ 6800,
140
+ 6850,
141
+ 6900,
142
+ 6950,
143
+ 7000,
144
+ 7050,
145
+ 7100,
146
+ 7150,
147
+ 7200,
148
+ 7250,
149
+ 7300,
150
+ 7350,
151
+ 7400,
152
+ 7450,
153
+ 7500,
154
+ 7550,
155
+ 7600,
156
+ 7650,
157
+ 7700,
158
+ 7750,
159
+ 7800
160
+ ],
161
+ "loss": [
162
+ 10.19568157196045,
163
+ 7.192863464355469,
164
+ 6.217032432556152,
165
+ 4.67427921295166,
166
+ 2.9682857990264893,
167
+ 2.0980846881866455,
168
+ 1.8946998119354248,
169
+ 1.9396218061447144,
170
+ 1.8249216079711914,
171
+ 1.7901777029037476,
172
+ 1.7934296131134033,
173
+ 1.7734571695327759,
174
+ 1.7680686712265015,
175
+ 1.6811468601226807,
176
+ 1.6904586553573608,
177
+ 1.6290004253387451,
178
+ 1.5940027236938477,
179
+ 1.5358794927597046,
180
+ 1.278952717781067,
181
+ 1.1427963972091675,
182
+ 0.9103876352310181,
183
+ 0.7655139565467834,
184
+ 0.7879643440246582,
185
+ 0.704889714717865,
186
+ 0.7027565240859985,
187
+ 0.7044745683670044,
188
+ 0.6972912549972534,
189
+ 0.6401001811027527,
190
+ 0.6621316075325012,
191
+ 0.6303651928901672,
192
+ 0.5821676254272461,
193
+ 0.6099020838737488,
194
+ 0.5351438522338867,
195
+ 0.5087913870811462,
196
+ 0.541701078414917,
197
+ 0.47329190373420715,
198
+ 0.4839247167110443,
199
+ 0.47691941261291504,
200
+ 0.4968743920326233,
201
+ 0.45558029413223267,
202
+ 0.4464017450809479,
203
+ 0.45957401394844055,
204
+ 0.4282552897930145,
205
+ 0.41863182187080383,
206
+ 0.4324481785297394,
207
+ 0.3976905047893524,
208
+ 0.3744252026081085,
209
+ 0.3938696086406708,
210
+ 0.36305326223373413,
211
+ 0.3572494089603424,
212
+ 0.39805254340171814,
213
+ 0.38645902276039124,
214
+ 0.3354526162147522,
215
+ 0.3297174274921417,
216
+ 0.37510284781455994,
217
+ 0.3538384437561035,
218
+ 0.3419480621814728,
219
+ 0.31991156935691833,
220
+ 0.30549606680870056,
221
+ 0.3192532956600189,
222
+ 0.29912063479423523,
223
+ 0.3299557864665985,
224
+ 0.3237224221229553,
225
+ 0.3043464720249176,
226
+ 0.29594531655311584,
227
+ 0.2924143970012665,
228
+ 0.3392622172832489,
229
+ 0.29438456892967224,
230
+ 0.2825162708759308,
231
+ 0.2688557803630829,
232
+ 0.27867329120635986,
233
+ 0.26582691073417664,
234
+ 0.2573307156562805,
235
+ 0.2639952301979065,
236
+ 0.2442159354686737,
237
+ 0.30415433645248413,
238
+ 0.28714659810066223,
239
+ 0.2530474364757538,
240
+ 0.2529540956020355,
241
+ 0.26841235160827637,
242
+ 0.2817437946796417,
243
+ 0.25053316354751587,
244
+ 0.2483910471200943,
245
+ 0.2930961549282074,
246
+ 0.28800302743911743,
247
+ 0.25981736183166504,
248
+ 0.23965764045715332,
249
+ 0.24082252383232117,
250
+ 0.2665214538574219,
251
+ 0.24315674602985382,
252
+ 0.23547802865505219,
253
+ 0.2577818036079407,
254
+ 0.2551196217536926,
255
+ 0.2866048812866211,
256
+ 0.22905278205871582,
257
+ 0.2948581576347351,
258
+ 0.22631767392158508,
259
+ 0.23360957205295563,
260
+ 0.23361513018608093,
261
+ 0.2707379162311554,
262
+ 0.2082580029964447,
263
+ 0.259878545999527,
264
+ 0.23401972651481628,
265
+ 0.22835519909858704,
266
+ 0.24494406580924988,
267
+ 0.2527218461036682,
268
+ 0.23783628642559052,
269
+ 0.2323145866394043,
270
+ 0.2299260050058365,
271
+ 0.22768929600715637,
272
+ 0.22232380509376526,
273
+ 0.23369023203849792,
274
+ 0.2056972235441208,
275
+ 0.19805261492729187,
276
+ 0.2061125934123993,
277
+ 0.22895146906375885,
278
+ 0.2325371354818344,
279
+ 0.20736224949359894,
280
+ 0.2153971642255783,
281
+ 0.22081783413887024,
282
+ 0.21203044056892395,
283
+ 0.21354515850543976,
284
+ 0.24314843118190765,
285
+ 0.21091873943805695,
286
+ 0.22135458886623383,
287
+ 0.22974269092082977,
288
+ 0.20833232998847961,
289
+ 0.20342501997947693,
290
+ 0.21650569140911102,
291
+ 0.2234790027141571,
292
+ 0.21649529039859772,
293
+ 0.20806384086608887,
294
+ 0.201065331697464,
295
+ 0.21765287220478058,
296
+ 0.199764221906662,
297
+ 0.21314558386802673,
298
+ 0.18189337849617004,
299
+ 0.1983877569437027,
300
+ 0.20944280922412872,
301
+ 0.20895661413669586,
302
+ 0.20532403886318207,
303
+ 0.20170608162879944,
304
+ 0.2062072455883026,
305
+ 0.21582531929016113,
306
+ 0.20745429396629333,
307
+ 0.22070619463920593,
308
+ 0.20926928520202637,
309
+ 0.18736205995082855,
310
+ 0.19164054095745087,
311
+ 0.2283153533935547,
312
+ 0.19645021855831146,
313
+ 0.2316746711730957,
314
+ 0.21589398384094238,
315
+ 0.19139589369297028,
316
+ 0.218153715133667,
317
+ 0.2088916301727295
318
+ ],
319
+ "base_loss": [
320
+ 10.19568157196045,
321
+ 7.192863464355469,
322
+ 6.217032432556152,
323
+ 4.67427921295166,
324
+ 2.9682857990264893,
325
+ 2.0980846881866455,
326
+ 1.8946998119354248,
327
+ 1.9396218061447144,
328
+ 1.8249216079711914,
329
+ 1.7901777029037476,
330
+ 1.7934296131134033,
331
+ 1.7734571695327759,
332
+ 1.7680686712265015,
333
+ 1.6811468601226807,
334
+ 1.6904586553573608,
335
+ 1.6290004253387451,
336
+ 1.5940027236938477,
337
+ 1.5358794927597046,
338
+ 1.278952717781067,
339
+ 1.1427963972091675,
340
+ 0.9103876352310181,
341
+ 0.7655139565467834,
342
+ 0.7879643440246582,
343
+ 0.704889714717865,
344
+ 0.7027565240859985,
345
+ 0.7044745683670044,
346
+ 0.6972912549972534,
347
+ 0.6401001811027527,
348
+ 0.6621316075325012,
349
+ 0.6303651928901672,
350
+ 0.5821676254272461,
351
+ 0.6099020838737488,
352
+ 0.5351438522338867,
353
+ 0.5087913870811462,
354
+ 0.541701078414917,
355
+ 0.47329190373420715,
356
+ 0.4839247167110443,
357
+ 0.47691941261291504,
358
+ 0.4968743920326233,
359
+ 0.45558029413223267,
360
+ 0.4464017450809479,
361
+ 0.45957401394844055,
362
+ 0.4282552897930145,
363
+ 0.41863182187080383,
364
+ 0.4324481785297394,
365
+ 0.3976905047893524,
366
+ 0.3744252026081085,
367
+ 0.3938696086406708,
368
+ 0.36305326223373413,
369
+ 0.3572494089603424,
370
+ 0.39805254340171814,
371
+ 0.38645902276039124,
372
+ 0.3354526162147522,
373
+ 0.3297174274921417,
374
+ 0.37510284781455994,
375
+ 0.3538384437561035,
376
+ 0.3419480621814728,
377
+ 0.31991156935691833,
378
+ 0.30549606680870056,
379
+ 0.3192532956600189,
380
+ 0.29912063479423523,
381
+ 0.3299557864665985,
382
+ 0.3237224221229553,
383
+ 0.3043464720249176,
384
+ 0.29594531655311584,
385
+ 0.2924143970012665,
386
+ 0.3392622172832489,
387
+ 0.29438456892967224,
388
+ 0.2825162708759308,
389
+ 0.2688557803630829,
390
+ 0.27867329120635986,
391
+ 0.26582691073417664,
392
+ 0.2573307156562805,
393
+ 0.2639952301979065,
394
+ 0.2442159354686737,
395
+ 0.30415433645248413,
396
+ 0.28714659810066223,
397
+ 0.2530474364757538,
398
+ 0.2529540956020355,
399
+ 0.26841235160827637,
400
+ 0.2817437946796417,
401
+ 0.25053316354751587,
402
+ 0.2483910471200943,
403
+ 0.2930961549282074,
404
+ 0.28800302743911743,
405
+ 0.25981736183166504,
406
+ 0.23965764045715332,
407
+ 0.24082252383232117,
408
+ 0.2665214538574219,
409
+ 0.24315674602985382,
410
+ 0.23547802865505219,
411
+ 0.2577818036079407,
412
+ 0.2551196217536926,
413
+ 0.2866048812866211,
414
+ 0.22905278205871582,
415
+ 0.2948581576347351,
416
+ 0.22631767392158508,
417
+ 0.23360957205295563,
418
+ 0.23361513018608093,
419
+ 0.2707379162311554,
420
+ 0.2082580029964447,
421
+ 0.259878545999527,
422
+ 0.23401972651481628,
423
+ 0.22835519909858704,
424
+ 0.24494406580924988,
425
+ 0.2527218461036682,
426
+ 0.23783628642559052,
427
+ 0.2323145866394043,
428
+ 0.2299260050058365,
429
+ 0.22768929600715637,
430
+ 0.22232380509376526,
431
+ 0.23369023203849792,
432
+ 0.2056972235441208,
433
+ 0.19805261492729187,
434
+ 0.2061125934123993,
435
+ 0.22895146906375885,
436
+ 0.2325371354818344,
437
+ 0.20736224949359894,
438
+ 0.2153971642255783,
439
+ 0.22081783413887024,
440
+ 0.21203044056892395,
441
+ 0.21354515850543976,
442
+ 0.24314843118190765,
443
+ 0.21091873943805695,
444
+ 0.22135458886623383,
445
+ 0.22974269092082977,
446
+ 0.20833232998847961,
447
+ 0.20342501997947693,
448
+ 0.21650569140911102,
449
+ 0.2234790027141571,
450
+ 0.21649529039859772,
451
+ 0.20806384086608887,
452
+ 0.201065331697464,
453
+ 0.21765287220478058,
454
+ 0.199764221906662,
455
+ 0.21314558386802673,
456
+ 0.18189337849617004,
457
+ 0.1983877569437027,
458
+ 0.20944280922412872,
459
+ 0.20895661413669586,
460
+ 0.20532403886318207,
461
+ 0.20170608162879944,
462
+ 0.2062072455883026,
463
+ 0.21582531929016113,
464
+ 0.20745429396629333,
465
+ 0.22070619463920593,
466
+ 0.20926928520202637,
467
+ 0.18736205995082855,
468
+ 0.19164054095745087,
469
+ 0.2283153533935547,
470
+ 0.19645021855831146,
471
+ 0.2316746711730957,
472
+ 0.21589398384094238,
473
+ 0.19139589369297028,
474
+ 0.218153715133667,
475
+ 0.2088916301727295
476
+ ],
477
+ "lr": [
478
+ 8.376068376068378e-06,
479
+ 1.6923076923076924e-05,
480
+ 2.5470085470085475e-05,
481
+ 3.401709401709402e-05,
482
+ 3.99996141174052e-05,
483
+ 3.9992754396617386e-05,
484
+ 3.997732289238075e-05,
485
+ 3.9953326220867826e-05,
486
+ 3.99207746705195e-05,
487
+ 3.987968219763389e-05,
488
+ 3.9830066420382645e-05,
489
+ 3.97719486112573e-05,
490
+ 3.9705353687948734e-05,
491
+ 3.9630310202663935e-05,
492
+ 3.9546850329884316e-05,
493
+ 3.945500985257116e-05,
494
+ 3.9354828146823805e-05,
495
+ 3.924634816499739e-05,
496
+ 3.9129616417287294e-05,
497
+ 3.900468295178809e-05,
498
+ 3.887160133303572e-05,
499
+ 3.8730428619042037e-05,
500
+ 3.858122533683144e-05,
501
+ 3.842405545649026e-05,
502
+ 3.825898636373997e-05,
503
+ 3.808608883104587e-05,
504
+ 3.790543698727386e-05,
505
+ 3.7717108285908e-05,
506
+ 3.752118347184284e-05,
507
+ 3.7317746546764446e-05,
508
+ 3.710688473313514e-05,
509
+ 3.688868843679738e-05,
510
+ 3.666325120821272e-05,
511
+ 3.643066970235256e-05,
512
+ 3.619104363725791e-05,
513
+ 3.5944475751285765e-05,
514
+ 3.569107175906064e-05,
515
+ 3.5430940306149985e-05,
516
+ 3.516419292248301e-05,
517
+ 3.489094397453285e-05,
518
+ 3.461131061628253e-05,
519
+ 3.4325412738995875e-05,
520
+ 3.403337291981479e-05,
521
+ 3.373531636920496e-05,
522
+ 3.343137087727258e-05,
523
+ 3.3121666758975014e-05,
524
+ 3.280633679824903e-05,
525
+ 3.248551619108036e-05,
526
+ 3.21593424875392e-05,
527
+ 3.1827955532806334e-05,
528
+ 3.149149740721537e-05,
529
+ 3.115011236533647e-05,
530
+ 3.080394677412806e-05,
531
+ 3.045314905018279e-05,
532
+ 3.0097869596094755e-05,
533
+ 2.9738260735975154e-05,
534
+ 2.9374476650144227e-05,
535
+ 2.9006673309027263e-05,
536
+ 2.8635008406283132e-05,
537
+ 2.8259641291194015e-05,
538
+ 2.7880732900345262e-05,
539
+ 2.7498445688624725e-05,
540
+ 2.711294355957111e-05,
541
+ 2.6724391795101242e-05,
542
+ 2.633295698464635e-05,
543
+ 2.5938806953727778e-05,
544
+ 2.5542110692002687e-05,
545
+ 2.5143038280810704e-05,
546
+ 2.4741760820252464e-05,
547
+ 2.433845035583137e-05,
548
+ 2.3933279804690064e-05,
549
+ 2.352642288147312e-05,
550
+ 2.3118054023847876e-05,
551
+ 2.2708348317715223e-05,
552
+ 2.2297481422142487e-05,
553
+ 2.188562949405059e-05,
554
+ 2.1472969112687716e-05,
555
+ 2.1059677203921957e-05,
556
+ 2.064593096438528e-05,
557
+ 2.0231907785501493e-05,
558
+ 1.9817785177430605e-05,
559
+ 1.9403740692962372e-05,
560
+ 1.898995185139145e-05,
561
+ 1.8576596062407023e-05,
562
+ 1.816385055002938e-05,
563
+ 1.7751892276626043e-05,
564
+ 1.7340897867040178e-05,
565
+ 1.6931043532863625e-05,
566
+ 1.6522504996887168e-05,
567
+ 1.611545741776037e-05,
568
+ 1.5710075314893218e-05,
569
+ 1.5306532493631954e-05,
570
+ 1.4905001970740918e-05,
571
+ 1.4505655900222602e-05,
572
+ 1.4108665499507574e-05,
573
+ 1.371420097604592e-05,
574
+ 1.332243145433177e-05,
575
+ 1.2933524903392054e-05,
576
+ 1.2547648064770774e-05,
577
+ 1.2164966381039404e-05,
578
+ 1.178564392486436e-05,
579
+ 1.140984332866172e-05,
580
+ 1.1037725714869483e-05,
581
+ 1.066945062686719e-05,
582
+ 1.0305175960572616e-05,
583
+ 9.945057896744699e-06,
584
+ 9.589250834021969e-06,
585
+ 9.237907322724944e-06,
586
+ 8.891177999451028e-06,
587
+ 8.549211522489898e-06,
588
+ 8.212154508087055e-06,
589
+ 7.88015146758299e-06,
590
+ 7.553344745454641e-06,
591
+ 7.231874458286057e-06,
592
+ 6.915878434694157e-06,
593
+ 6.605492156235467e-06,
594
+ 6.30084869931916e-06,
595
+ 6.002078678151244e-06,
596
+ 5.709310188734507e-06,
597
+ 5.422668753947975e-06,
598
+ 5.14227726972974e-06,
599
+ 4.868255952385965e-06,
600
+ 4.600722287048818e-06,
601
+ 4.339790977305362e-06,
602
+ 4.085573896019013e-06,
603
+ 3.838180037364703e-06,
604
+ 3.5977154700981752e-06,
605
+ 3.364283292079631e-06,
606
+ 3.137983586071065e-06,
607
+ 2.9189133768263488e-06,
608
+ 2.707166589492387e-06,
609
+ 2.5028340093392257e-06,
610
+ 2.3060032428363876e-06,
611
+ 2.1167586800920613e-06,
612
+ 1.9351814586713113e-06,
613
+ 1.7613494288088008e-06,
614
+ 1.5953371200309199e-06,
615
+ 1.437215709201667e-06,
616
+ 1.2870529900059636e-06,
617
+ 1.1449133438834802e-06,
618
+ 1.0108577124254482e-06,
619
+ 8.849435712462972e-07,
620
+ 7.67224905341275e-07,
621
+ 6.57752185940721e-07,
622
+ 5.565723488707586e-07,
623
+ 4.637287744298502e-07,
624
+ 3.79261268789719e-07,
625
+ 3.032060469286724e-07,
626
+ 2.3559571710463747e-07,
627
+ 1.7645926687452908e-07,
628
+ 1.2582205066603127e-07,
629
+ 8.370577890698173e-08,
630
+ 5.012850871717989e-08,
631
+ 2.5104636166479735e-08,
632
+ 8.64489010255598e-09,
633
+ 7.563275509769874e-10
634
+ ],
635
+ "eval_step": [
636
+ 390,
637
+ 780,
638
+ 1170,
639
+ 1560,
640
+ 1950,
641
+ 2340,
642
+ 2730,
643
+ 3120,
644
+ 3510,
645
+ 3900,
646
+ 4290,
647
+ 4680,
648
+ 5070,
649
+ 5460,
650
+ 5850,
651
+ 6240,
652
+ 6630,
653
+ 7020,
654
+ 7410,
655
+ 7800
656
+ ],
657
+ "eval_epoch": [
658
+ 1,
659
+ 2,
660
+ 3,
661
+ 4,
662
+ 5,
663
+ 6,
664
+ 7,
665
+ 8,
666
+ 9,
667
+ 10,
668
+ 11,
669
+ 12,
670
+ 13,
671
+ 14,
672
+ 15,
673
+ 16,
674
+ 17,
675
+ 18,
676
+ 19,
677
+ 20
678
+ ],
679
+ "eval_accuracy": [
680
+ 0.0033333333333333335,
681
+ 0.0033333333333333335,
682
+ 0.028888888888888888,
683
+ 0.07444444444444444,
684
+ 0.14888888888888888,
685
+ 0.24555555555555555,
686
+ 0.28444444444444444,
687
+ 0.32555555555555554,
688
+ 0.4177777777777778,
689
+ 0.4411111111111111,
690
+ 0.42333333333333334,
691
+ 0.4166666666666667,
692
+ 0.4677777777777778,
693
+ 0.46444444444444444,
694
+ 0.5133333333333333,
695
+ 0.5,
696
+ 0.5111111111111111,
697
+ 0.4922222222222222,
698
+ 0.5,
699
+ 0.5077777777777778
700
+ ]
701
+ },
702
+ "final_accuracy": 0.395,
703
+ "sft_eval": {
704
+ "config": {
705
+ "ops": "add_sub",
706
+ "K": null,
707
+ "mode": "sft",
708
+ "n_digits": 6,
709
+ "n_per_split": 100
710
+ },
711
+ "splits": {
712
+ "add_S0": {
713
+ "full_accuracy": 0.69,
714
+ "n_examples": 100,
715
+ "per_subtask": {
716
+ "SA": {
717
+ "accuracy": 0.943801652892562,
718
+ "count": 605
719
+ },
720
+ "SS": {
721
+ "accuracy": 0.9578947368421052,
722
+ "count": 95
723
+ }
724
+ }
725
+ },
726
+ "add_S1": {
727
+ "full_accuracy": 0.62,
728
+ "n_examples": 100,
729
+ "per_subtask": {
730
+ "SA": {
731
+ "accuracy": 0.9656862745098039,
732
+ "count": 204
733
+ },
734
+ "SC": {
735
+ "accuracy": 0.9704142011834319,
736
+ "count": 169
737
+ },
738
+ "SS": {
739
+ "accuracy": 0.9354838709677419,
740
+ "count": 31
741
+ },
742
+ "UC": {
743
+ "accuracy": 0.8986486486486487,
744
+ "count": 296
745
+ }
746
+ }
747
+ },
748
+ "add_S2": {
749
+ "full_accuracy": 0.45,
750
+ "n_examples": 100,
751
+ "per_subtask": {
752
+ "SA": {
753
+ "accuracy": 0.9570552147239264,
754
+ "count": 163
755
+ },
756
+ "SC": {
757
+ "accuracy": 0.9307692307692308,
758
+ "count": 130
759
+ },
760
+ "SS": {
761
+ "accuracy": 0.896551724137931,
762
+ "count": 87
763
+ },
764
+ "UC": {
765
+ "accuracy": 0.7733990147783252,
766
+ "count": 203
767
+ },
768
+ "US": {
769
+ "accuracy": 0.9572649572649573,
770
+ "count": 117
771
+ }
772
+ }
773
+ },
774
+ "add_S3": {
775
+ "full_accuracy": 0.21,
776
+ "n_examples": 100,
777
+ "per_subtask": {
778
+ "SA": {
779
+ "accuracy": 0.9752066115702479,
780
+ "count": 121
781
+ },
782
+ "SC": {
783
+ "accuracy": 0.9504132231404959,
784
+ "count": 121
785
+ },
786
+ "SS": {
787
+ "accuracy": 0.9591836734693877,
788
+ "count": 49
789
+ },
790
+ "UC": {
791
+ "accuracy": 0.6236559139784946,
792
+ "count": 186
793
+ },
794
+ "US": {
795
+ "accuracy": 0.7085201793721974,
796
+ "count": 223
797
+ }
798
+ }
799
+ },
800
+ "add_S4": {
801
+ "full_accuracy": 0.32,
802
+ "n_examples": 100,
803
+ "per_subtask": {
804
+ "SA": {
805
+ "accuracy": 1.0,
806
+ "count": 104
807
+ },
808
+ "SC": {
809
+ "accuracy": 0.9622641509433962,
810
+ "count": 106
811
+ },
812
+ "SS": {
813
+ "accuracy": 1.0,
814
+ "count": 23
815
+ },
816
+ "UC": {
817
+ "accuracy": 0.725,
818
+ "count": 160
819
+ },
820
+ "US": {
821
+ "accuracy": 0.5602605863192183,
822
+ "count": 307
823
+ }
824
+ }
825
+ },
826
+ "add_S5": {
827
+ "full_accuracy": 0.3,
828
+ "n_examples": 100,
829
+ "per_subtask": {
830
+ "SA": {
831
+ "accuracy": 1.0,
832
+ "count": 100
833
+ },
834
+ "SC": {
835
+ "accuracy": 0.98,
836
+ "count": 100
837
+ },
838
+ "UC": {
839
+ "accuracy": 0.46,
840
+ "count": 100
841
+ },
842
+ "US": {
843
+ "accuracy": 0.5475,
844
+ "count": 400
845
+ }
846
+ }
847
+ },
848
+ "add_S6": {
849
+ "full_accuracy": 0.4,
850
+ "n_examples": 100,
851
+ "per_subtask": {
852
+ "SC": {
853
+ "accuracy": 1.0,
854
+ "count": 100
855
+ },
856
+ "UC": {
857
+ "accuracy": 0.44,
858
+ "count": 100
859
+ },
860
+ "US": {
861
+ "accuracy": 0.494,
862
+ "count": 500
863
+ }
864
+ }
865
+ },
866
+ "add_random": {
867
+ "full_accuracy": 0.675,
868
+ "n_examples": 200,
869
+ "per_subtask": {
870
+ "SA": {
871
+ "accuracy": 0.9664429530201343,
872
+ "count": 447
873
+ },
874
+ "SC": {
875
+ "accuracy": 0.95625,
876
+ "count": 320
877
+ },
878
+ "SS": {
879
+ "accuracy": 0.9821428571428571,
880
+ "count": 56
881
+ },
882
+ "UC": {
883
+ "accuracy": 0.9168241965973535,
884
+ "count": 529
885
+ },
886
+ "US": {
887
+ "accuracy": 0.8958333333333334,
888
+ "count": 48
889
+ }
890
+ }
891
+ },
892
+ "add_C3": {
893
+ "full_accuracy": 0.42,
894
+ "n_examples": 100,
895
+ "per_subtask": {
896
+ "SA": {
897
+ "accuracy": 0.9833333333333333,
898
+ "count": 300
899
+ },
900
+ "SC": {
901
+ "accuracy": 1.0,
902
+ "count": 100
903
+ },
904
+ "UC": {
905
+ "accuracy": 0.7409326424870466,
906
+ "count": 193
907
+ },
908
+ "US": {
909
+ "accuracy": 0.8037383177570093,
910
+ "count": 107
911
+ }
912
+ }
913
+ },
914
+ "add_C4": {
915
+ "full_accuracy": 0.42,
916
+ "n_examples": 100,
917
+ "per_subtask": {
918
+ "SA": {
919
+ "accuracy": 0.99,
920
+ "count": 200
921
+ },
922
+ "SC": {
923
+ "accuracy": 0.99,
924
+ "count": 100
925
+ },
926
+ "UC": {
927
+ "accuracy": 0.7734375,
928
+ "count": 256
929
+ },
930
+ "US": {
931
+ "accuracy": 0.7569444444444444,
932
+ "count": 144
933
+ }
934
+ }
935
+ },
936
+ "add_C5": {
937
+ "full_accuracy": 0.37,
938
+ "n_examples": 100,
939
+ "per_subtask": {
940
+ "SA": {
941
+ "accuracy": 1.0,
942
+ "count": 100
943
+ },
944
+ "SC": {
945
+ "accuracy": 0.97,
946
+ "count": 100
947
+ },
948
+ "UC": {
949
+ "accuracy": 0.7908496732026143,
950
+ "count": 306
951
+ },
952
+ "US": {
953
+ "accuracy": 0.7474226804123711,
954
+ "count": 194
955
+ }
956
+ }
957
+ },
958
+ "add_C6": {
959
+ "full_accuracy": 0.32,
960
+ "n_examples": 100,
961
+ "per_subtask": {
962
+ "SC": {
963
+ "accuracy": 1.0,
964
+ "count": 100
965
+ },
966
+ "UC": {
967
+ "accuracy": 0.7923497267759563,
968
+ "count": 366
969
+ },
970
+ "US": {
971
+ "accuracy": 0.8034188034188035,
972
+ "count": 234
973
+ }
974
+ }
975
+ },
976
+ "sub_M0": {
977
+ "full_accuracy": 0.77,
978
+ "n_examples": 100,
979
+ "per_subtask": {
980
+ "MD": {
981
+ "accuracy": 0.9584026622296173,
982
+ "count": 601
983
+ },
984
+ "ME": {
985
+ "accuracy": 0.98989898989899,
986
+ "count": 99
987
+ }
988
+ }
989
+ },
990
+ "sub_M1": {
991
+ "full_accuracy": 0.62,
992
+ "n_examples": 100,
993
+ "per_subtask": {
994
+ "MD": {
995
+ "accuracy": 0.974910394265233,
996
+ "count": 279
997
+ },
998
+ "MB": {
999
+ "accuracy": 0.9586206896551724,
1000
+ "count": 145
1001
+ },
1002
+ "ME": {
1003
+ "accuracy": 1.0,
1004
+ "count": 24
1005
+ },
1006
+ "UB": {
1007
+ "accuracy": 0.8849206349206349,
1008
+ "count": 252
1009
+ }
1010
+ }
1011
+ },
1012
+ "sub_M2": {
1013
+ "full_accuracy": 0.28,
1014
+ "n_examples": 100,
1015
+ "per_subtask": {
1016
+ "MD": {
1017
+ "accuracy": 0.971830985915493,
1018
+ "count": 213
1019
+ },
1020
+ "MB": {
1021
+ "accuracy": 0.9823008849557522,
1022
+ "count": 113
1023
+ },
1024
+ "ME": {
1025
+ "accuracy": 0.9529411764705882,
1026
+ "count": 85
1027
+ },
1028
+ "UB": {
1029
+ "accuracy": 0.6574585635359116,
1030
+ "count": 181
1031
+ },
1032
+ "UD": {
1033
+ "accuracy": 0.8240740740740741,
1034
+ "count": 108
1035
+ }
1036
+ }
1037
+ },
1038
+ "sub_M3": {
1039
+ "full_accuracy": 0.09,
1040
+ "n_examples": 100,
1041
+ "per_subtask": {
1042
+ "MD": {
1043
+ "accuracy": 0.994413407821229,
1044
+ "count": 179
1045
+ },
1046
+ "MB": {
1047
+ "accuracy": 0.9514563106796117,
1048
+ "count": 103
1049
+ },
1050
+ "ME": {
1051
+ "accuracy": 1.0,
1052
+ "count": 56
1053
+ },
1054
+ "UB": {
1055
+ "accuracy": 0.46308724832214765,
1056
+ "count": 149
1057
+ },
1058
+ "UD": {
1059
+ "accuracy": 0.7230046948356808,
1060
+ "count": 213
1061
+ }
1062
+ }
1063
+ },
1064
+ "sub_M4": {
1065
+ "full_accuracy": 0.05,
1066
+ "n_examples": 100,
1067
+ "per_subtask": {
1068
+ "MD": {
1069
+ "accuracy": 1.0,
1070
+ "count": 200
1071
+ },
1072
+ "MB": {
1073
+ "accuracy": 0.98,
1074
+ "count": 100
1075
+ },
1076
+ "UB": {
1077
+ "accuracy": 0.38,
1078
+ "count": 100
1079
+ },
1080
+ "UD": {
1081
+ "accuracy": 0.35333333333333333,
1082
+ "count": 300
1083
+ }
1084
+ }
1085
+ },
1086
+ "sub_M5": {
1087
+ "full_accuracy": 0.03,
1088
+ "n_examples": 100,
1089
+ "per_subtask": {
1090
+ "MD": {
1091
+ "accuracy": 1.0,
1092
+ "count": 100
1093
+ },
1094
+ "MB": {
1095
+ "accuracy": 1.0,
1096
+ "count": 100
1097
+ },
1098
+ "UB": {
1099
+ "accuracy": 0.41,
1100
+ "count": 100
1101
+ },
1102
+ "UD": {
1103
+ "accuracy": 0.285,
1104
+ "count": 400
1105
+ }
1106
+ }
1107
+ },
1108
+ "sub_random": {
1109
+ "full_accuracy": 0.605,
1110
+ "n_examples": 200,
1111
+ "per_subtask": {
1112
+ "MD": {
1113
+ "accuracy": 0.965,
1114
+ "count": 600
1115
+ },
1116
+ "MB": {
1117
+ "accuracy": 0.9438202247191011,
1118
+ "count": 267
1119
+ },
1120
+ "ME": {
1121
+ "accuracy": 1.0,
1122
+ "count": 53
1123
+ },
1124
+ "UB": {
1125
+ "accuracy": 0.8883826879271071,
1126
+ "count": 439
1127
+ },
1128
+ "UD": {
1129
+ "accuracy": 0.9024390243902439,
1130
+ "count": 41
1131
+ }
1132
+ }
1133
+ },
1134
+ "sub_B3": {
1135
+ "full_accuracy": 0.27,
1136
+ "n_examples": 100,
1137
+ "per_subtask": {
1138
+ "MD": {
1139
+ "accuracy": 0.9833333333333333,
1140
+ "count": 300
1141
+ },
1142
+ "MB": {
1143
+ "accuracy": 0.97,
1144
+ "count": 100
1145
+ },
1146
+ "UB": {
1147
+ "accuracy": 0.6040609137055838,
1148
+ "count": 197
1149
+ },
1150
+ "UD": {
1151
+ "accuracy": 0.6699029126213593,
1152
+ "count": 103
1153
+ }
1154
+ }
1155
+ },
1156
+ "sub_B4": {
1157
+ "full_accuracy": 0.12,
1158
+ "n_examples": 100,
1159
+ "per_subtask": {
1160
+ "MD": {
1161
+ "accuracy": 0.99,
1162
+ "count": 200
1163
+ },
1164
+ "MB": {
1165
+ "accuracy": 1.0,
1166
+ "count": 100
1167
+ },
1168
+ "UB": {
1169
+ "accuracy": 0.6072874493927125,
1170
+ "count": 247
1171
+ },
1172
+ "UD": {
1173
+ "accuracy": 0.5620915032679739,
1174
+ "count": 153
1175
+ }
1176
+ }
1177
+ },
1178
+ "sub_B5": {
1179
+ "full_accuracy": 0.17,
1180
+ "n_examples": 100,
1181
+ "per_subtask": {
1182
+ "MD": {
1183
+ "accuracy": 1.0,
1184
+ "count": 100
1185
+ },
1186
+ "MB": {
1187
+ "accuracy": 1.0,
1188
+ "count": 100
1189
+ },
1190
+ "UB": {
1191
+ "accuracy": 0.6879194630872483,
1192
+ "count": 298
1193
+ },
1194
+ "UD": {
1195
+ "accuracy": 0.5594059405940595,
1196
+ "count": 202
1197
+ }
1198
+ }
1199
+ }
1200
+ },
1201
+ "summary": {
1202
+ "overall_accuracy": 0.395,
1203
+ "total_examples": 2400,
1204
+ "n_splits": 22
1205
+ }
1206
+ }
1207
+ }
add_sub_baseline_25K_1L3H510d/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b98be00f1db73dc27014b4c1caa5d84a53eb6d87420aaf5f5b520c01b59bab30
3
+ size 634642298
add_sub_baseline_25K_1L3H510d/train_config.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_rollouts": 4,
3
+ "K": 4,
4
+ "max_iterations": 2,
5
+ "memory_span_abs": 1792,
6
+ "memory_span_traj": 1792,
7
+ "temperature": 1.0,
8
+ "ar_search": false,
9
+ "response_only_abs": false,
10
+ "alpha_info_gain": 10.0,
11
+ "alpha_abs": 0.1,
12
+ "alpha_soft_zipf": 1.0,
13
+ "alpha_ortho": 0.0,
14
+ "alpha_anchor": 0.0,
15
+ "alpha_jacobi": 0.0,
16
+ "decay": 0.8,
17
+ "target_vocab_util": 0.8,
18
+ "min_abs_ppl": 0.0,
19
+ "zipf_alpha": 1.0,
20
+ "lr": 4e-05,
21
+ "emb_lr_mult": 1.0,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 234,
24
+ "cooldown_frac": 0.4,
25
+ "max_grad_norm": 1.0,
26
+ "vq_abs_pretrain_steps": 0,
27
+ "vq_abs_pretrain_lr": 0.001,
28
+ "vq_abs_pretrain_layer": -1,
29
+ "vq_abs_pretrain_batch_size": 256,
30
+ "vq_abs_pretrain_target_vectors": 20000,
31
+ "batch_size": 64,
32
+ "gradient_accumulation_steps": 1,
33
+ "num_epochs": 20,
34
+ "emb_warmup_steps": 0,
35
+ "log_every": 50,
36
+ "eval_every": 390,
37
+ "save_every": 999999,
38
+ "eval_samples": 100,
39
+ "output_dir": "ckpt/sweep/as_baseline_25K_1L3H510d",
40
+ "eval_K": 4,
41
+ "alpha_traj": 0.0,
42
+ "corrupt_method": "shuffle",
43
+ "corrupt_ratio": 0.3,
44
+ "alpha_contrastive": 1.0,
45
+ "gamma_contrastive": 0.5,
46
+ "alpha_masked_traj": 0.0,
47
+ "mask_nl_ratio": 0.3,
48
+ "mask_nl_mode": "fixed",
49
+ "mask_nl_fixed_id": 0,
50
+ "use_ste": true,
51
+ "n_inner": 1,
52
+ "random_K": null,
53
+ "strip_suffix": null,
54
+ "compress_prefix": null,
55
+ "random_mem_span": null,
56
+ "warmup_ratio": 0.03,
57
+ "beta2": 0.999,
58
+ "seed": 42,
59
+ "n_digits": 6,
60
+ "n_layer": 1,
61
+ "n_head": 3,
62
+ "n_embd": 510,
63
+ "ops": "add_sub",
64
+ "abs_vocab": 0,
65
+ "dataset_size": 25000,
66
+ "mode": "baseline",
67
+ "device": "cuda",
68
+ "push_to_hub": true,
69
+ "no_wandb": false,
70
+ "n_params": 158584246,
71
+ "run_name": "add_sub_baseline_25K_1L3H510d",
72
+ "git_commit": "f447da529caceac8c7d256cbb2cd185cbc50feac",
73
+ "timestamp": "2026-04-12T18:11:00.255428+00:00",
74
+ "tokenizer": "Qwen/Qwen3-0.6B",
75
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
+ "dataset_config": "add_sub_6digit",
77
+ "model_repo": "thoughtworks/arithmetic-sorl",
78
+ "trainer_version": "sft",
79
+ "wandb_run_id": "jqxb8wih",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/jqxb8wih",
81
+ "final_accuracy": 0.395,
82
+ "sft_accuracy": 0.395,
83
+ "eval_method": "ArithmeticEvaluator"
84
+ }