amirali1985 commited on
Commit
fb6692e
·
verified ·
1 Parent(s): 8021cf7

Upload add_sub_sorl_v1_abs10_10K

Browse files
add_sub_sorl_v1_abs10_10K/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 510,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2040,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention"
18
+ ],
19
+ "max_position_embeddings": 128,
20
+ "max_window_layers": 28,
21
+ "model_type": "qwen3",
22
+ "num_attention_heads": 3,
23
+ "num_hidden_layers": 2,
24
+ "num_key_value_heads": 3,
25
+ "pad_token_id": null,
26
+ "rms_norm_eps": 1e-06,
27
+ "rope_parameters": {
28
+ "rope_theta": 10000.0,
29
+ "rope_type": "default"
30
+ },
31
+ "sliding_window": null,
32
+ "tie_word_embeddings": false,
33
+ "transformers_version": "5.5.0",
34
+ "use_cache": true,
35
+ "use_sliding_window": false,
36
+ "vocab_size": 151654
37
+ }
add_sub_sorl_v1_abs10_10K/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_sorl_v1_abs10_10K/metrics.json ADDED
@@ -0,0 +1,1297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 207,
8
+ 257,
9
+ 307,
10
+ 364,
11
+ 414,
12
+ 464,
13
+ 521,
14
+ 571,
15
+ 621,
16
+ 678,
17
+ 728,
18
+ 778,
19
+ 835,
20
+ 885,
21
+ 935,
22
+ 992,
23
+ 1042,
24
+ 1092,
25
+ 1149,
26
+ 1199,
27
+ 1249,
28
+ 1306,
29
+ 1356,
30
+ 1406,
31
+ 1463,
32
+ 1513,
33
+ 1563
34
+ ],
35
+ "loss": [
36
+ 13.724945068359375,
37
+ 7.864343166351318,
38
+ 5.013725280761719,
39
+ 2.741554021835327,
40
+ 2.641360282897949,
41
+ 2.458710193634033,
42
+ 2.5750374794006348,
43
+ 2.7018885612487793,
44
+ 2.5789809226989746,
45
+ 2.3690404891967773,
46
+ 1.7872730493545532,
47
+ 1.8722014427185059,
48
+ 1.4984050989151,
49
+ 1.2242059707641602,
50
+ 1.2751460075378418,
51
+ 1.0107839107513428,
52
+ 1.6164339780807495,
53
+ 0.9860085248947144,
54
+ 0.9987770915031433,
55
+ 1.071738600730896,
56
+ 0.49998074769973755,
57
+ 0.004859551787376404,
58
+ -0.1695592701435089,
59
+ 0.0747871994972229,
60
+ -0.27059921622276306,
61
+ -0.13733237981796265,
62
+ 0.056721221655607224,
63
+ 0.051335956901311874,
64
+ -0.45392414927482605,
65
+ -0.3348205089569092
66
+ ],
67
+ "base_loss": [
68
+ 7.9022040367126465,
69
+ 5.812614917755127,
70
+ 3.858829975128174,
71
+ 2.114943742752075,
72
+ 1.96530020236969,
73
+ 1.9036047458648682,
74
+ 1.893314242362976,
75
+ 1.8308947086334229,
76
+ 1.8680362701416016,
77
+ 1.7989755868911743,
78
+ 1.8180350065231323,
79
+ 1.8677423000335693,
80
+ 1.862571120262146,
81
+ 1.8560221195220947,
82
+ 1.838614583015442,
83
+ 1.7637040615081787,
84
+ 1.8746064901351929,
85
+ 1.8687989711761475,
86
+ 1.8472496271133423,
87
+ 1.8745392560958862,
88
+ 1.8059179782867432,
89
+ 1.755523920059204,
90
+ 1.7115023136138916,
91
+ 1.7793734073638916,
92
+ 1.759653091430664,
93
+ 1.7252968549728394,
94
+ 1.7201368808746338,
95
+ 1.7096773386001587,
96
+ 1.7525498867034912,
97
+ 1.710257887840271
98
+ ],
99
+ "info_loss": [
100
+ -0.2345585823059082,
101
+ -0.18186569213867188,
102
+ -0.11677074432373047,
103
+ -0.13591337203979492,
104
+ -0.12461328506469727,
105
+ -0.13464367389678955,
106
+ -0.12099599838256836,
107
+ -0.10128498077392578,
108
+ -0.11677730083465576,
109
+ -0.13061606884002686,
110
+ -0.1908276081085205,
111
+ -0.18726766109466553,
112
+ -0.2237793207168579,
113
+ -0.25037193298339844,
114
+ -0.24307358264923096,
115
+ -0.26141083240509033,
116
+ -0.2112140655517578,
117
+ -0.2714040279388428,
118
+ -0.2562897205352783,
119
+ -0.21821486949920654,
120
+ -0.1999720335006714,
121
+ -0.19893121719360352,
122
+ -0.20633161067962646,
123
+ -0.1859527826309204,
124
+ -0.21592140197753906,
125
+ -0.19767189025878906,
126
+ -0.17773842811584473,
127
+ -0.17674016952514648,
128
+ -0.2308502197265625,
129
+ -0.21450459957122803
130
+ ],
131
+ "abs_loss": [
132
+ 2.2656421661376953,
133
+ 2.060791254043579,
134
+ 1.8972232341766357,
135
+ 1.8288743495941162,
136
+ 1.8518760204315186,
137
+ 1.8478550910949707,
138
+ 1.8548732995986938,
139
+ 1.8174316883087158,
140
+ 1.8104970455169678,
141
+ 1.8124628067016602,
142
+ 1.8303263187408447,
143
+ 1.860059380531311,
144
+ 1.8033771514892578,
145
+ 1.8484278917312622,
146
+ 1.7977077960968018,
147
+ 1.7986536026000977,
148
+ 1.8080449104309082,
149
+ 1.7756034135818481,
150
+ 1.678572177886963,
151
+ 1.2864962816238403,
152
+ 0.798673152923584,
153
+ 0.611975371837616,
154
+ 0.5902649164199829,
155
+ 0.584165632724762,
156
+ 0.5656200647354126,
157
+ 0.5475164651870728,
158
+ 0.5673137903213501,
159
+ 0.5301952362060547,
160
+ 0.4829615652561188,
161
+ 0.5067222118377686
162
+ ],
163
+ "zipf_loss": [
164
+ 7.941762447357178,
165
+ 3.664306163787842,
166
+ 2.132880449295044,
167
+ 1.8028565645217896,
168
+ 1.7370052337646484,
169
+ 1.7167567014694214,
170
+ 1.7061958312988281,
171
+ 1.7021006345748901,
172
+ 1.6976680755615234,
173
+ 1.694979190826416,
174
+ 1.6944814920425415,
175
+ 1.6911298036575317,
176
+ 1.6932895183563232,
177
+ 1.6870603561401367,
178
+ 1.6874964237213135,
179
+ 1.6813228130340576,
180
+ 1.673163652420044,
181
+ 1.6536895036697388,
182
+ 1.546567440032959,
183
+ 1.250698447227478,
184
+ 0.6139158010482788,
185
+ 0.1774502694606781,
186
+ 0.1232280433177948,
187
+ 0.09652505815029144,
188
+ 0.07239970564842224,
189
+ 0.059338025748729706,
190
+ 0.05723724141716957,
191
+ 0.05604078993201256,
192
+ 0.053732018917798996,
193
+ 0.04929535463452339
194
+ ],
195
+ "denoise_loss": [],
196
+ "ortho_loss": [
197
+ 0.5284707546234131,
198
+ 0.21797625720500946,
199
+ 0.12806634604930878,
200
+ 0.09031304717063904,
201
+ 0.08532247692346573,
202
+ 0.07537004351615906,
203
+ 0.07645349949598312,
204
+ 0.07695373892784119,
205
+ 0.07142108678817749,
206
+ 0.08406423032283783,
207
+ 0.08843859285116196,
208
+ 0.07815887778997421,
209
+ 0.07982385903596878,
210
+ 0.07472201436758041,
211
+ 0.07963292300701141,
212
+ 0.07144557684659958,
213
+ 0.07161661982536316,
214
+ 0.07070355117321014,
215
+ 0.06902579218149185,
216
+ 0.06851960718631744,
217
+ 0.06775054335594177,
218
+ 0.0655888020992279,
219
+ 0.059131793677806854,
220
+ 0.05900496616959572,
221
+ 0.056693185120821,
222
+ 0.05708722025156021,
223
+ 0.058303531259298325,
224
+ 0.059081800282001495,
225
+ 0.059238508343696594,
226
+ 0.059114448726177216
227
+ ],
228
+ "lr": [
229
+ 1.9600000000000002e-05,
230
+ 3.96e-05,
231
+ 4e-05,
232
+ 4e-05,
233
+ 4e-05,
234
+ 4e-05,
235
+ 4e-05,
236
+ 4e-05,
237
+ 4e-05,
238
+ 4e-05,
239
+ 4e-05,
240
+ 4e-05,
241
+ 4e-05,
242
+ 4e-05,
243
+ 4e-05,
244
+ 4e-05,
245
+ 4e-05,
246
+ 4e-05,
247
+ 3.944897959183673e-05,
248
+ 3.638775510204082e-05,
249
+ 3.3326530612244897e-05,
250
+ 2.983673469387755e-05,
251
+ 2.6775510204081637e-05,
252
+ 2.3714285714285713e-05,
253
+ 2.022448979591837e-05,
254
+ 1.7163265306122454e-05,
255
+ 1.4102040816326535e-05,
256
+ 1.0612244897959182e-05,
257
+ 7.551020408163262e-06,
258
+ 4.48979591836735e-06
259
+ ],
260
+ "emb_lr": [],
261
+ "eval_step": [
262
+ 150,
263
+ 307,
264
+ 464,
265
+ 621,
266
+ 778,
267
+ 935,
268
+ 1092,
269
+ 1199,
270
+ 1356,
271
+ 1513
272
+ ],
273
+ "eval_accuracy": [
274
+ 0.01,
275
+ 0.0,
276
+ 0.0,
277
+ 0.0,
278
+ 0.0,
279
+ 0.01,
280
+ 0.0,
281
+ 0.02,
282
+ 0.01,
283
+ 0.01
284
+ ]
285
+ },
286
+ "final_accuracy": 0.008333333333333333,
287
+ "sft_eval": {
288
+ "config": {
289
+ "ops": "add_sub",
290
+ "K": null,
291
+ "mode": "sft",
292
+ "n_digits": 6,
293
+ "n_per_split": 100
294
+ },
295
+ "splits": {
296
+ "add_S0": {
297
+ "full_accuracy": 0.0,
298
+ "n_examples": 100,
299
+ "per_subtask": {
300
+ "SA": {
301
+ "accuracy": 0.24297520661157024,
302
+ "count": 605
303
+ },
304
+ "SS": {
305
+ "accuracy": 0.8736842105263158,
306
+ "count": 95
307
+ }
308
+ }
309
+ },
310
+ "add_S1": {
311
+ "full_accuracy": 0.0,
312
+ "n_examples": 100,
313
+ "per_subtask": {
314
+ "SA": {
315
+ "accuracy": 0.28921568627450983,
316
+ "count": 204
317
+ },
318
+ "SC": {
319
+ "accuracy": 0.14792899408284024,
320
+ "count": 169
321
+ },
322
+ "SS": {
323
+ "accuracy": 0.8064516129032258,
324
+ "count": 31
325
+ },
326
+ "UC": {
327
+ "accuracy": 0.23648648648648649,
328
+ "count": 296
329
+ }
330
+ }
331
+ },
332
+ "add_S2": {
333
+ "full_accuracy": 0.0,
334
+ "n_examples": 100,
335
+ "per_subtask": {
336
+ "SA": {
337
+ "accuracy": 0.3619631901840491,
338
+ "count": 163
339
+ },
340
+ "SC": {
341
+ "accuracy": 0.1,
342
+ "count": 130
343
+ },
344
+ "SS": {
345
+ "accuracy": 0.4827586206896552,
346
+ "count": 87
347
+ },
348
+ "UC": {
349
+ "accuracy": 0.33497536945812806,
350
+ "count": 203
351
+ },
352
+ "US": {
353
+ "accuracy": 0.5384615384615384,
354
+ "count": 117
355
+ }
356
+ }
357
+ },
358
+ "add_S3": {
359
+ "full_accuracy": 0.0,
360
+ "n_examples": 100,
361
+ "per_subtask": {
362
+ "SA": {
363
+ "accuracy": 0.4132231404958678,
364
+ "count": 121
365
+ },
366
+ "SC": {
367
+ "accuracy": 0.06611570247933884,
368
+ "count": 121
369
+ },
370
+ "SS": {
371
+ "accuracy": 0.46938775510204084,
372
+ "count": 49
373
+ },
374
+ "UC": {
375
+ "accuracy": 0.34946236559139787,
376
+ "count": 186
377
+ },
378
+ "US": {
379
+ "accuracy": 0.5650224215246636,
380
+ "count": 223
381
+ }
382
+ }
383
+ },
384
+ "add_S4": {
385
+ "full_accuracy": 0.0,
386
+ "n_examples": 100,
387
+ "per_subtask": {
388
+ "SA": {
389
+ "accuracy": 0.4519230769230769,
390
+ "count": 104
391
+ },
392
+ "SC": {
393
+ "accuracy": 0.08490566037735849,
394
+ "count": 106
395
+ },
396
+ "SS": {
397
+ "accuracy": 0.6086956521739131,
398
+ "count": 23
399
+ },
400
+ "UC": {
401
+ "accuracy": 0.40625,
402
+ "count": 160
403
+ },
404
+ "US": {
405
+ "accuracy": 0.4560260586319218,
406
+ "count": 307
407
+ }
408
+ }
409
+ },
410
+ "add_S5": {
411
+ "full_accuracy": 0.02,
412
+ "n_examples": 100,
413
+ "per_subtask": {
414
+ "SA": {
415
+ "accuracy": 0.43,
416
+ "count": 100
417
+ },
418
+ "SC": {
419
+ "accuracy": 0.06,
420
+ "count": 100
421
+ },
422
+ "UC": {
423
+ "accuracy": 0.51,
424
+ "count": 100
425
+ },
426
+ "US": {
427
+ "accuracy": 0.36,
428
+ "count": 400
429
+ }
430
+ }
431
+ },
432
+ "add_S6": {
433
+ "full_accuracy": 0.05,
434
+ "n_examples": 100,
435
+ "per_subtask": {
436
+ "SC": {
437
+ "accuracy": 0.05,
438
+ "count": 100
439
+ },
440
+ "UC": {
441
+ "accuracy": 0.53,
442
+ "count": 100
443
+ },
444
+ "US": {
445
+ "accuracy": 0.502,
446
+ "count": 500
447
+ }
448
+ }
449
+ },
450
+ "add_random": {
451
+ "full_accuracy": 0.0,
452
+ "n_examples": 200,
453
+ "per_subtask": {
454
+ "SA": {
455
+ "accuracy": 0.27293064876957496,
456
+ "count": 447
457
+ },
458
+ "SC": {
459
+ "accuracy": 0.1125,
460
+ "count": 320
461
+ },
462
+ "SS": {
463
+ "accuracy": 0.6964285714285714,
464
+ "count": 56
465
+ },
466
+ "UC": {
467
+ "accuracy": 0.2665406427221172,
468
+ "count": 529
469
+ },
470
+ "US": {
471
+ "accuracy": 0.5,
472
+ "count": 48
473
+ }
474
+ }
475
+ },
476
+ "add_C3": {
477
+ "full_accuracy": 0.0,
478
+ "n_examples": 100,
479
+ "per_subtask": {
480
+ "SA": {
481
+ "accuracy": 0.31666666666666665,
482
+ "count": 300
483
+ },
484
+ "SC": {
485
+ "accuracy": 0.07,
486
+ "count": 100
487
+ },
488
+ "UC": {
489
+ "accuracy": 0.19689119170984457,
490
+ "count": 193
491
+ },
492
+ "US": {
493
+ "accuracy": 0.2523364485981308,
494
+ "count": 107
495
+ }
496
+ }
497
+ },
498
+ "add_C4": {
499
+ "full_accuracy": 0.0,
500
+ "n_examples": 100,
501
+ "per_subtask": {
502
+ "SA": {
503
+ "accuracy": 0.405,
504
+ "count": 200
505
+ },
506
+ "SC": {
507
+ "accuracy": 0.05,
508
+ "count": 100
509
+ },
510
+ "UC": {
511
+ "accuracy": 0.1484375,
512
+ "count": 256
513
+ },
514
+ "US": {
515
+ "accuracy": 0.2986111111111111,
516
+ "count": 144
517
+ }
518
+ }
519
+ },
520
+ "add_C5": {
521
+ "full_accuracy": 0.02,
522
+ "n_examples": 100,
523
+ "per_subtask": {
524
+ "SA": {
525
+ "accuracy": 0.54,
526
+ "count": 100
527
+ },
528
+ "SC": {
529
+ "accuracy": 0.1,
530
+ "count": 100
531
+ },
532
+ "UC": {
533
+ "accuracy": 0.24836601307189543,
534
+ "count": 306
535
+ },
536
+ "US": {
537
+ "accuracy": 0.5,
538
+ "count": 194
539
+ }
540
+ }
541
+ },
542
+ "add_C6": {
543
+ "full_accuracy": 0.0,
544
+ "n_examples": 100,
545
+ "per_subtask": {
546
+ "SC": {
547
+ "accuracy": 0.14,
548
+ "count": 100
549
+ },
550
+ "UC": {
551
+ "accuracy": 0.2459016393442623,
552
+ "count": 366
553
+ },
554
+ "US": {
555
+ "accuracy": 0.6709401709401709,
556
+ "count": 234
557
+ }
558
+ }
559
+ },
560
+ "sub_M0": {
561
+ "full_accuracy": 0.0,
562
+ "n_examples": 100,
563
+ "per_subtask": {
564
+ "MD": {
565
+ "accuracy": 0.23960066555740434,
566
+ "count": 601
567
+ },
568
+ "ME": {
569
+ "accuracy": 0.8282828282828283,
570
+ "count": 99
571
+ }
572
+ }
573
+ },
574
+ "sub_M1": {
575
+ "full_accuracy": 0.0,
576
+ "n_examples": 100,
577
+ "per_subtask": {
578
+ "MD": {
579
+ "accuracy": 0.4444444444444444,
580
+ "count": 279
581
+ },
582
+ "MB": {
583
+ "accuracy": 0.034482758620689655,
584
+ "count": 145
585
+ },
586
+ "ME": {
587
+ "accuracy": 0.875,
588
+ "count": 24
589
+ },
590
+ "UB": {
591
+ "accuracy": 0.1388888888888889,
592
+ "count": 252
593
+ }
594
+ }
595
+ },
596
+ "sub_M2": {
597
+ "full_accuracy": 0.0,
598
+ "n_examples": 100,
599
+ "per_subtask": {
600
+ "MD": {
601
+ "accuracy": 0.6291079812206573,
602
+ "count": 213
603
+ },
604
+ "MB": {
605
+ "accuracy": 0.035398230088495575,
606
+ "count": 113
607
+ },
608
+ "ME": {
609
+ "accuracy": 0.8705882352941177,
610
+ "count": 85
611
+ },
612
+ "UB": {
613
+ "accuracy": 0.19337016574585636,
614
+ "count": 181
615
+ },
616
+ "UD": {
617
+ "accuracy": 0.1388888888888889,
618
+ "count": 108
619
+ }
620
+ }
621
+ },
622
+ "sub_M3": {
623
+ "full_accuracy": 0.0,
624
+ "n_examples": 100,
625
+ "per_subtask": {
626
+ "MD": {
627
+ "accuracy": 0.7374301675977654,
628
+ "count": 179
629
+ },
630
+ "MB": {
631
+ "accuracy": 0.02912621359223301,
632
+ "count": 103
633
+ },
634
+ "ME": {
635
+ "accuracy": 0.8928571428571429,
636
+ "count": 56
637
+ },
638
+ "UB": {
639
+ "accuracy": 0.2214765100671141,
640
+ "count": 149
641
+ },
642
+ "UD": {
643
+ "accuracy": 0.13145539906103287,
644
+ "count": 213
645
+ }
646
+ }
647
+ },
648
+ "sub_M4": {
649
+ "full_accuracy": 0.0,
650
+ "n_examples": 100,
651
+ "per_subtask": {
652
+ "MD": {
653
+ "accuracy": 0.55,
654
+ "count": 200
655
+ },
656
+ "MB": {
657
+ "accuracy": 0.06,
658
+ "count": 100
659
+ },
660
+ "UB": {
661
+ "accuracy": 0.35,
662
+ "count": 100
663
+ },
664
+ "UD": {
665
+ "accuracy": 0.16666666666666666,
666
+ "count": 300
667
+ }
668
+ }
669
+ },
670
+ "sub_M5": {
671
+ "full_accuracy": 0.01,
672
+ "n_examples": 100,
673
+ "per_subtask": {
674
+ "MD": {
675
+ "accuracy": 1.0,
676
+ "count": 100
677
+ },
678
+ "MB": {
679
+ "accuracy": 0.04,
680
+ "count": 100
681
+ },
682
+ "UB": {
683
+ "accuracy": 0.47,
684
+ "count": 100
685
+ },
686
+ "UD": {
687
+ "accuracy": 0.16,
688
+ "count": 400
689
+ }
690
+ }
691
+ },
692
+ "sub_random": {
693
+ "full_accuracy": 0.0,
694
+ "n_examples": 200,
695
+ "per_subtask": {
696
+ "MD": {
697
+ "accuracy": 0.4066666666666667,
698
+ "count": 600
699
+ },
700
+ "MB": {
701
+ "accuracy": 0.011235955056179775,
702
+ "count": 267
703
+ },
704
+ "ME": {
705
+ "accuracy": 0.7547169811320755,
706
+ "count": 53
707
+ },
708
+ "UB": {
709
+ "accuracy": 0.1662870159453303,
710
+ "count": 439
711
+ },
712
+ "UD": {
713
+ "accuracy": 0.21951219512195122,
714
+ "count": 41
715
+ }
716
+ }
717
+ },
718
+ "sub_B3": {
719
+ "full_accuracy": 0.0,
720
+ "n_examples": 100,
721
+ "per_subtask": {
722
+ "MD": {
723
+ "accuracy": 0.38,
724
+ "count": 300
725
+ },
726
+ "MB": {
727
+ "accuracy": 0.09,
728
+ "count": 100
729
+ },
730
+ "UB": {
731
+ "accuracy": 0.16243654822335024,
732
+ "count": 197
733
+ },
734
+ "UD": {
735
+ "accuracy": 0.20388349514563106,
736
+ "count": 103
737
+ }
738
+ }
739
+ },
740
+ "sub_B4": {
741
+ "full_accuracy": 0.0,
742
+ "n_examples": 100,
743
+ "per_subtask": {
744
+ "MD": {
745
+ "accuracy": 0.55,
746
+ "count": 200
747
+ },
748
+ "MB": {
749
+ "accuracy": 0.07,
750
+ "count": 100
751
+ },
752
+ "UB": {
753
+ "accuracy": 0.17408906882591094,
754
+ "count": 247
755
+ },
756
+ "UD": {
757
+ "accuracy": 0.1568627450980392,
758
+ "count": 153
759
+ }
760
+ }
761
+ },
762
+ "sub_B5": {
763
+ "full_accuracy": 0.0,
764
+ "n_examples": 100,
765
+ "per_subtask": {
766
+ "MD": {
767
+ "accuracy": 1.0,
768
+ "count": 100
769
+ },
770
+ "MB": {
771
+ "accuracy": 0.04,
772
+ "count": 100
773
+ },
774
+ "UB": {
775
+ "accuracy": 0.1644295302013423,
776
+ "count": 298
777
+ },
778
+ "UD": {
779
+ "accuracy": 0.1188118811881188,
780
+ "count": 202
781
+ }
782
+ }
783
+ }
784
+ },
785
+ "summary": {
786
+ "overall_accuracy": 0.004166666666666667,
787
+ "total_examples": 2400,
788
+ "n_splits": 22
789
+ }
790
+ },
791
+ "sorl_eval": {
792
+ "config": {
793
+ "ops": "add_sub",
794
+ "K": 4,
795
+ "mode": "sorl",
796
+ "n_digits": 6,
797
+ "n_per_split": 100
798
+ },
799
+ "splits": {
800
+ "add_S0": {
801
+ "full_accuracy": 0.0,
802
+ "n_examples": 100,
803
+ "per_subtask": {
804
+ "SA": {
805
+ "accuracy": 0.3256198347107438,
806
+ "count": 605
807
+ },
808
+ "SS": {
809
+ "accuracy": 0.968421052631579,
810
+ "count": 95
811
+ }
812
+ }
813
+ },
814
+ "add_S1": {
815
+ "full_accuracy": 0.0,
816
+ "n_examples": 100,
817
+ "per_subtask": {
818
+ "SA": {
819
+ "accuracy": 0.3431372549019608,
820
+ "count": 204
821
+ },
822
+ "SC": {
823
+ "accuracy": 0.23076923076923078,
824
+ "count": 169
825
+ },
826
+ "SS": {
827
+ "accuracy": 0.8387096774193549,
828
+ "count": 31
829
+ },
830
+ "UC": {
831
+ "accuracy": 0.32094594594594594,
832
+ "count": 296
833
+ }
834
+ }
835
+ },
836
+ "add_S2": {
837
+ "full_accuracy": 0.0,
838
+ "n_examples": 100,
839
+ "per_subtask": {
840
+ "SA": {
841
+ "accuracy": 0.4785276073619632,
842
+ "count": 163
843
+ },
844
+ "SC": {
845
+ "accuracy": 0.13076923076923078,
846
+ "count": 130
847
+ },
848
+ "SS": {
849
+ "accuracy": 0.40229885057471265,
850
+ "count": 87
851
+ },
852
+ "UC": {
853
+ "accuracy": 0.4039408866995074,
854
+ "count": 203
855
+ },
856
+ "US": {
857
+ "accuracy": 0.5897435897435898,
858
+ "count": 117
859
+ }
860
+ }
861
+ },
862
+ "add_S3": {
863
+ "full_accuracy": 0.0,
864
+ "n_examples": 100,
865
+ "per_subtask": {
866
+ "SA": {
867
+ "accuracy": 0.5289256198347108,
868
+ "count": 121
869
+ },
870
+ "SC": {
871
+ "accuracy": 0.0743801652892562,
872
+ "count": 121
873
+ },
874
+ "SS": {
875
+ "accuracy": 0.4897959183673469,
876
+ "count": 49
877
+ },
878
+ "UC": {
879
+ "accuracy": 0.41935483870967744,
880
+ "count": 186
881
+ },
882
+ "US": {
883
+ "accuracy": 0.5964125560538116,
884
+ "count": 223
885
+ }
886
+ }
887
+ },
888
+ "add_S4": {
889
+ "full_accuracy": 0.0,
890
+ "n_examples": 100,
891
+ "per_subtask": {
892
+ "SA": {
893
+ "accuracy": 0.5480769230769231,
894
+ "count": 104
895
+ },
896
+ "SC": {
897
+ "accuracy": 0.08490566037735849,
898
+ "count": 106
899
+ },
900
+ "SS": {
901
+ "accuracy": 0.6521739130434783,
902
+ "count": 23
903
+ },
904
+ "UC": {
905
+ "accuracy": 0.41875,
906
+ "count": 160
907
+ },
908
+ "US": {
909
+ "accuracy": 0.5374592833876222,
910
+ "count": 307
911
+ }
912
+ }
913
+ },
914
+ "add_S5": {
915
+ "full_accuracy": 0.02,
916
+ "n_examples": 100,
917
+ "per_subtask": {
918
+ "SA": {
919
+ "accuracy": 0.46,
920
+ "count": 100
921
+ },
922
+ "SC": {
923
+ "accuracy": 0.05,
924
+ "count": 100
925
+ },
926
+ "UC": {
927
+ "accuracy": 0.46,
928
+ "count": 100
929
+ },
930
+ "US": {
931
+ "accuracy": 0.3175,
932
+ "count": 400
933
+ }
934
+ }
935
+ },
936
+ "add_S6": {
937
+ "full_accuracy": 0.12,
938
+ "n_examples": 100,
939
+ "per_subtask": {
940
+ "SC": {
941
+ "accuracy": 0.12,
942
+ "count": 100
943
+ },
944
+ "UC": {
945
+ "accuracy": 0.48,
946
+ "count": 100
947
+ },
948
+ "US": {
949
+ "accuracy": 0.448,
950
+ "count": 500
951
+ }
952
+ }
953
+ },
954
+ "add_random": {
955
+ "full_accuracy": 0.0,
956
+ "n_examples": 200,
957
+ "per_subtask": {
958
+ "SA": {
959
+ "accuracy": 0.36017897091722595,
960
+ "count": 447
961
+ },
962
+ "SC": {
963
+ "accuracy": 0.18125,
964
+ "count": 320
965
+ },
966
+ "SS": {
967
+ "accuracy": 0.625,
968
+ "count": 56
969
+ },
970
+ "UC": {
971
+ "accuracy": 0.3534971644612476,
972
+ "count": 529
973
+ },
974
+ "US": {
975
+ "accuracy": 0.5,
976
+ "count": 48
977
+ }
978
+ }
979
+ },
980
+ "add_C3": {
981
+ "full_accuracy": 0.0,
982
+ "n_examples": 100,
983
+ "per_subtask": {
984
+ "SA": {
985
+ "accuracy": 0.4,
986
+ "count": 300
987
+ },
988
+ "SC": {
989
+ "accuracy": 0.07,
990
+ "count": 100
991
+ },
992
+ "UC": {
993
+ "accuracy": 0.22279792746113988,
994
+ "count": 193
995
+ },
996
+ "US": {
997
+ "accuracy": 0.37383177570093457,
998
+ "count": 107
999
+ }
1000
+ }
1001
+ },
1002
+ "add_C4": {
1003
+ "full_accuracy": 0.0,
1004
+ "n_examples": 100,
1005
+ "per_subtask": {
1006
+ "SA": {
1007
+ "accuracy": 0.53,
1008
+ "count": 200
1009
+ },
1010
+ "SC": {
1011
+ "accuracy": 0.04,
1012
+ "count": 100
1013
+ },
1014
+ "UC": {
1015
+ "accuracy": 0.19921875,
1016
+ "count": 256
1017
+ },
1018
+ "US": {
1019
+ "accuracy": 0.4513888888888889,
1020
+ "count": 144
1021
+ }
1022
+ }
1023
+ },
1024
+ "add_C5": {
1025
+ "full_accuracy": 0.01,
1026
+ "n_examples": 100,
1027
+ "per_subtask": {
1028
+ "SA": {
1029
+ "accuracy": 0.57,
1030
+ "count": 100
1031
+ },
1032
+ "SC": {
1033
+ "accuracy": 0.09,
1034
+ "count": 100
1035
+ },
1036
+ "UC": {
1037
+ "accuracy": 0.30392156862745096,
1038
+ "count": 306
1039
+ },
1040
+ "US": {
1041
+ "accuracy": 0.4948453608247423,
1042
+ "count": 194
1043
+ }
1044
+ }
1045
+ },
1046
+ "add_C6": {
1047
+ "full_accuracy": 0.0,
1048
+ "n_examples": 100,
1049
+ "per_subtask": {
1050
+ "SC": {
1051
+ "accuracy": 0.09,
1052
+ "count": 100
1053
+ },
1054
+ "UC": {
1055
+ "accuracy": 0.31693989071038253,
1056
+ "count": 366
1057
+ },
1058
+ "US": {
1059
+ "accuracy": 0.7521367521367521,
1060
+ "count": 234
1061
+ }
1062
+ }
1063
+ },
1064
+ "sub_M0": {
1065
+ "full_accuracy": 0.0,
1066
+ "n_examples": 100,
1067
+ "per_subtask": {
1068
+ "MD": {
1069
+ "accuracy": 0.3277870216306156,
1070
+ "count": 601
1071
+ },
1072
+ "ME": {
1073
+ "accuracy": 0.9090909090909091,
1074
+ "count": 99
1075
+ }
1076
+ }
1077
+ },
1078
+ "sub_M1": {
1079
+ "full_accuracy": 0.0,
1080
+ "n_examples": 100,
1081
+ "per_subtask": {
1082
+ "MD": {
1083
+ "accuracy": 0.5483870967741935,
1084
+ "count": 279
1085
+ },
1086
+ "MB": {
1087
+ "accuracy": 0.05517241379310345,
1088
+ "count": 145
1089
+ },
1090
+ "ME": {
1091
+ "accuracy": 0.6666666666666666,
1092
+ "count": 24
1093
+ },
1094
+ "UB": {
1095
+ "accuracy": 0.2222222222222222,
1096
+ "count": 252
1097
+ }
1098
+ }
1099
+ },
1100
+ "sub_M2": {
1101
+ "full_accuracy": 0.0,
1102
+ "n_examples": 100,
1103
+ "per_subtask": {
1104
+ "MD": {
1105
+ "accuracy": 0.7183098591549296,
1106
+ "count": 213
1107
+ },
1108
+ "MB": {
1109
+ "accuracy": 0.017699115044247787,
1110
+ "count": 113
1111
+ },
1112
+ "ME": {
1113
+ "accuracy": 0.9058823529411765,
1114
+ "count": 85
1115
+ },
1116
+ "UB": {
1117
+ "accuracy": 0.23204419889502761,
1118
+ "count": 181
1119
+ },
1120
+ "UD": {
1121
+ "accuracy": 0.2037037037037037,
1122
+ "count": 108
1123
+ }
1124
+ }
1125
+ },
1126
+ "sub_M3": {
1127
+ "full_accuracy": 0.0,
1128
+ "n_examples": 100,
1129
+ "per_subtask": {
1130
+ "MD": {
1131
+ "accuracy": 0.7932960893854749,
1132
+ "count": 179
1133
+ },
1134
+ "MB": {
1135
+ "accuracy": 0.02912621359223301,
1136
+ "count": 103
1137
+ },
1138
+ "ME": {
1139
+ "accuracy": 0.9285714285714286,
1140
+ "count": 56
1141
+ },
1142
+ "UB": {
1143
+ "accuracy": 0.33557046979865773,
1144
+ "count": 149
1145
+ },
1146
+ "UD": {
1147
+ "accuracy": 0.2112676056338028,
1148
+ "count": 213
1149
+ }
1150
+ }
1151
+ },
1152
+ "sub_M4": {
1153
+ "full_accuracy": 0.0,
1154
+ "n_examples": 100,
1155
+ "per_subtask": {
1156
+ "MD": {
1157
+ "accuracy": 0.725,
1158
+ "count": 200
1159
+ },
1160
+ "MB": {
1161
+ "accuracy": 0.1,
1162
+ "count": 100
1163
+ },
1164
+ "UB": {
1165
+ "accuracy": 0.42,
1166
+ "count": 100
1167
+ },
1168
+ "UD": {
1169
+ "accuracy": 0.31,
1170
+ "count": 300
1171
+ }
1172
+ }
1173
+ },
1174
+ "sub_M5": {
1175
+ "full_accuracy": 0.03,
1176
+ "n_examples": 100,
1177
+ "per_subtask": {
1178
+ "MD": {
1179
+ "accuracy": 1.0,
1180
+ "count": 100
1181
+ },
1182
+ "MB": {
1183
+ "accuracy": 0.03,
1184
+ "count": 100
1185
+ },
1186
+ "UB": {
1187
+ "accuracy": 0.71,
1188
+ "count": 100
1189
+ },
1190
+ "UD": {
1191
+ "accuracy": 0.285,
1192
+ "count": 400
1193
+ }
1194
+ }
1195
+ },
1196
+ "sub_random": {
1197
+ "full_accuracy": 0.0,
1198
+ "n_examples": 200,
1199
+ "per_subtask": {
1200
+ "MD": {
1201
+ "accuracy": 0.53,
1202
+ "count": 600
1203
+ },
1204
+ "MB": {
1205
+ "accuracy": 0.03745318352059925,
1206
+ "count": 267
1207
+ },
1208
+ "ME": {
1209
+ "accuracy": 0.7358490566037735,
1210
+ "count": 53
1211
+ },
1212
+ "UB": {
1213
+ "accuracy": 0.23917995444191345,
1214
+ "count": 439
1215
+ },
1216
+ "UD": {
1217
+ "accuracy": 0.17073170731707318,
1218
+ "count": 41
1219
+ }
1220
+ }
1221
+ },
1222
+ "sub_B3": {
1223
+ "full_accuracy": 0.0,
1224
+ "n_examples": 100,
1225
+ "per_subtask": {
1226
+ "MD": {
1227
+ "accuracy": 0.51,
1228
+ "count": 300
1229
+ },
1230
+ "MB": {
1231
+ "accuracy": 0.13,
1232
+ "count": 100
1233
+ },
1234
+ "UB": {
1235
+ "accuracy": 0.18274111675126903,
1236
+ "count": 197
1237
+ },
1238
+ "UD": {
1239
+ "accuracy": 0.39805825242718446,
1240
+ "count": 103
1241
+ }
1242
+ }
1243
+ },
1244
+ "sub_B4": {
1245
+ "full_accuracy": 0.0,
1246
+ "n_examples": 100,
1247
+ "per_subtask": {
1248
+ "MD": {
1249
+ "accuracy": 0.665,
1250
+ "count": 200
1251
+ },
1252
+ "MB": {
1253
+ "accuracy": 0.1,
1254
+ "count": 100
1255
+ },
1256
+ "UB": {
1257
+ "accuracy": 0.21862348178137653,
1258
+ "count": 247
1259
+ },
1260
+ "UD": {
1261
+ "accuracy": 0.38562091503267976,
1262
+ "count": 153
1263
+ }
1264
+ }
1265
+ },
1266
+ "sub_B5": {
1267
+ "full_accuracy": 0.02,
1268
+ "n_examples": 100,
1269
+ "per_subtask": {
1270
+ "MD": {
1271
+ "accuracy": 1.0,
1272
+ "count": 100
1273
+ },
1274
+ "MB": {
1275
+ "accuracy": 0.07,
1276
+ "count": 100
1277
+ },
1278
+ "UB": {
1279
+ "accuracy": 0.2953020134228188,
1280
+ "count": 298
1281
+ },
1282
+ "UD": {
1283
+ "accuracy": 0.33663366336633666,
1284
+ "count": 202
1285
+ }
1286
+ }
1287
+ }
1288
+ },
1289
+ "summary": {
1290
+ "overall_accuracy": 0.008333333333333333,
1291
+ "total_examples": 2400,
1292
+ "n_splits": 22
1293
+ }
1294
+ },
1295
+ "sorl_overall_accuracy": 0.008333333333333333,
1296
+ "sft_overall_accuracy": 0.004166666666666667
1297
+ }
add_sub_sorl_v1_abs10_10K/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5539570ac41c2434a03e526e01a1a87e8aa66fcfadd6151a66949f140ded418d
3
+ size 650303660
add_sub_sorl_v1_abs10_10K/train_config.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_rollouts": 4,
3
+ "K": 4,
4
+ "max_iterations": 2,
5
+ "memory_span_abs": 1792,
6
+ "memory_span_traj": 1792,
7
+ "temperature": 1.0,
8
+ "ar_search": false,
9
+ "response_only_abs": false,
10
+ "alpha_info_gain": 10.0,
11
+ "alpha_abs": 0.1,
12
+ "alpha_soft_zipf": 1.0,
13
+ "alpha_ortho": 0.0,
14
+ "alpha_anchor": 0.0,
15
+ "alpha_jacobi": 0.0,
16
+ "decay": 0.8,
17
+ "target_vocab_util": 0.8,
18
+ "min_abs_ppl": 0.0,
19
+ "zipf_alpha": 1.0,
20
+ "lr": 4e-05,
21
+ "emb_lr_mult": 1.0,
22
+ "weight_decay": 0.01,
23
+ "warmup_steps": 100,
24
+ "cooldown_frac": 0.4,
25
+ "max_grad_norm": 1.0,
26
+ "vq_abs_pretrain_steps": 0,
27
+ "vq_abs_pretrain_lr": 0.001,
28
+ "vq_abs_pretrain_layer": -1,
29
+ "vq_abs_pretrain_batch_size": 256,
30
+ "vq_abs_pretrain_target_vectors": 20000,
31
+ "batch_size": 64,
32
+ "gradient_accumulation_steps": 1,
33
+ "num_epochs": 10,
34
+ "emb_warmup_steps": 0,
35
+ "log_every": 50,
36
+ "eval_every": 156,
37
+ "save_every": 999999,
38
+ "eval_samples": 100,
39
+ "output_dir": "ckpt/sweep/as_sorl_abs10_K4_10K",
40
+ "eval_K": 4,
41
+ "alpha_traj": 0.0,
42
+ "corrupt_method": "shuffle",
43
+ "corrupt_ratio": 0.3,
44
+ "alpha_contrastive": 1.0,
45
+ "gamma_contrastive": 0.5,
46
+ "alpha_masked_traj": 0.0,
47
+ "mask_nl_ratio": 0.3,
48
+ "mask_nl_mode": "fixed",
49
+ "mask_nl_fixed_id": 0,
50
+ "use_ste": true,
51
+ "n_inner": 1,
52
+ "random_K": null,
53
+ "strip_suffix": null,
54
+ "compress_prefix": null,
55
+ "random_mem_span": null,
56
+ "warmup_ratio": 0.03,
57
+ "beta2": 0.999,
58
+ "seed": 42,
59
+ "n_digits": 6,
60
+ "n_layer": 2,
61
+ "n_head": 3,
62
+ "n_embd": 510,
63
+ "ops": "add_sub",
64
+ "abs_vocab": 10,
65
+ "dataset_size": 10000,
66
+ "mode": "sorl",
67
+ "device": "cuda",
68
+ "push_to_hub": true,
69
+ "no_wandb": false,
70
+ "n_params": 162499262,
71
+ "run_name": "add_sub_sorl_v1_abs10_10K",
72
+ "git_commit": "78d46f8665a87f4b44bd5894bd34f393f2dea51f",
73
+ "timestamp": "2026-04-12T08:59:09.036996+00:00",
74
+ "tokenizer": "Qwen/Qwen3-0.6B",
75
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
76
+ "dataset_config": "add_sub_6digit",
77
+ "model_repo": "thoughtworks/arithmetic-sorl",
78
+ "trainer_version": "v1",
79
+ "wandb_run_id": "f2ajfc34",
80
+ "wandb_url": "https://wandb.ai/nlp_and_interpretability/sorl-arithmetic/runs/f2ajfc34",
81
+ "final_accuracy": 0.008333333333333333,
82
+ "sft_accuracy": 0.004166666666666667,
83
+ "eval_method": "ArithmeticEvaluator"
84
+ }