K2triinK commited on
Commit
cf1151b
·
verified ·
1 Parent(s): 9199d2c

Retrained model Qwen3_14B_Base

Browse files
Files changed (41) hide show
  1. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/README.md +2 -2
  2. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1206/adapter_config.json +5 -5
  3. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1206/adapter_model.safetensors +1 -1
  4. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1206/trainer_state.json +158 -158
  5. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1206/training_args.bin +1 -1
  6. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1608/adapter_config.json +5 -5
  7. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1608/adapter_model.safetensors +1 -1
  8. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1608/trainer_state.json +210 -210
  9. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1608/training_args.bin +1 -1
  10. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2010/adapter_config.json +5 -5
  11. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2010/adapter_model.safetensors +1 -1
  12. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2010/trainer_state.json +262 -262
  13. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2010/training_args.bin +1 -1
  14. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2412/adapter_config.json +5 -5
  15. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2412/adapter_model.safetensors +1 -1
  16. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2412/trainer_state.json +312 -312
  17. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2412/training_args.bin +1 -1
  18. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2814/adapter_config.json +5 -5
  19. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2814/adapter_model.safetensors +1 -1
  20. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2814/trainer_state.json +364 -364
  21. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2814/training_args.bin +1 -1
  22. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3216/adapter_config.json +5 -5
  23. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3216/adapter_model.safetensors +1 -1
  24. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3216/trainer_state.json +416 -416
  25. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3216/training_args.bin +1 -1
  26. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3618/adapter_config.json +5 -5
  27. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3618/adapter_model.safetensors +1 -1
  28. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3618/trainer_state.json +468 -468
  29. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3618/training_args.bin +1 -1
  30. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-402/adapter_config.json +5 -5
  31. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-402/adapter_model.safetensors +1 -1
  32. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-402/trainer_state.json +54 -54
  33. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-402/training_args.bin +1 -1
  34. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-4020/adapter_config.json +5 -5
  35. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-4020/adapter_model.safetensors +1 -1
  36. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-4020/trainer_state.json +520 -520
  37. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-4020/training_args.bin +1 -1
  38. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-804/adapter_config.json +5 -5
  39. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-804/adapter_model.safetensors +1 -1
  40. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-804/trainer_state.json +106 -106
  41. substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-804/training_args.bin +1 -1
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/README.md CHANGED
@@ -4,8 +4,8 @@ library_name: transformers
4
  model_name: Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1
5
  tags:
6
  - generated_from_trainer
7
- - sft
8
  - trl
 
9
  licence: license
10
  ---
11
 
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/q9yyd3lj)
31
 
32
 
33
 
 
4
  model_name: Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
+ - sft
9
  licence: license
10
  ---
11
 
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/8n422oft)
31
 
32
 
33
 
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1206/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1206/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a7fa0cbcc4014800c9436dbb090d14e70b5e7fcdbe9736a023ffb09c88cfc49
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:984ac46ac781a731b273b9c88cf1b4a3c13339cc6d367d4780df48014fa31bf8
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1206/trainer_state.json CHANGED
@@ -10,309 +10,309 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  },
114
  {
115
- "entropy": 0.5431136073488178,
116
  "epoch": 1.1195516811955168,
117
- "grad_norm": 0.6364207863807678,
118
- "learning_rate": 0.0003023597373031377,
119
- "loss": 0.5128697204589844,
120
- "mean_token_accuracy": 0.8424755226482045,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
- "entropy": 0.5492669984698295,
126
  "epoch": 1.244084682440847,
127
- "grad_norm": 0.7887948155403137,
128
- "learning_rate": 0.00030194951160062724,
129
- "loss": 0.5205921554565429,
130
- "mean_token_accuracy": 0.8417876678705215,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
- "entropy": 0.8709675747156144,
136
  "epoch": 1.3686176836861768,
137
- "grad_norm": 0.734591007232666,
138
- "learning_rate": 0.00030125525414139597,
139
- "loss": 0.9426271057128907,
140
- "mean_token_accuracy": 0.7773911562561989,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
- "entropy": 0.5833489724993706,
146
  "epoch": 1.4931506849315068,
147
- "grad_norm": 1.1496635675430298,
148
- "learning_rate": 0.000300278273368912,
149
- "loss": 0.560246467590332,
150
- "mean_token_accuracy": 0.8333927792310715,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
- "entropy": 3.3383523294329644,
156
  "epoch": 1.6176836861768369,
157
- "grad_norm": 4.88291597366333,
158
- "learning_rate": 0.0002990204105656753,
159
- "loss": 3.895135803222656,
160
- "mean_token_accuracy": 0.4006860972382128,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
- "entropy": 4.406278321743011,
166
  "epoch": 1.7422166874221667,
167
- "grad_norm": 2.488743305206299,
168
- "learning_rate": 0.0002974840363830152,
169
- "loss": 4.424278259277344,
170
- "mean_token_accuracy": 0.2545871638506651,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
- "entropy": 3.0789780139923097,
176
  "epoch": 1.8667496886674968,
177
- "grad_norm": 2.5833027362823486,
178
- "learning_rate": 0.00029567204637320413,
179
- "loss": 3.105696716308594,
180
- "mean_token_accuracy": 0.4513031896948814,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
- "entropy": 2.616127333641052,
186
  "epoch": 1.9912826899128269,
187
- "grad_norm": 1.0389364957809448,
188
- "learning_rate": 0.00029358785553230884,
189
- "loss": 2.6003131103515624,
190
- "mean_token_accuracy": 0.5205484080314636,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
- "eval_entropy": 2.5684494819752004,
197
- "eval_mean_token_accuracy": 0.547617181095966,
198
- "eval_not_syn_loss": 2.458150863647461,
199
- "eval_not_syn_runtime": 95.9896,
200
- "eval_not_syn_samples_per_second": 14.324,
201
- "eval_not_syn_steps_per_second": 1.792,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_entropy": 2.5337963104248047,
208
- "eval_mean_token_accuracy": 0.5359611507765082,
209
  "eval_num_tokens": 1998780.0,
210
- "eval_syn_loss": 2.417705535888672,
211
- "eval_syn_runtime": 99.0288,
212
- "eval_syn_samples_per_second": 13.885,
213
- "eval_syn_steps_per_second": 1.737,
214
  "step": 804
215
  },
216
  {
217
- "entropy": 2.380885266294383,
218
  "epoch": 2.1145703611457036,
219
- "grad_norm": 0.7961218953132629,
220
- "learning_rate": 0.00029123539186406294,
221
- "loss": 2.3589501953125,
222
- "mean_token_accuracy": 0.550012742630159,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
- "entropy": 2.3139565205574035,
228
  "epoch": 2.2391033623910337,
229
- "grad_norm": 0.7128121256828308,
230
- "learning_rate": 0.00028861908897689166,
231
- "loss": 2.2914646911621093,
232
- "mean_token_accuracy": 0.5581977427005768,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
- "entropy": 2.247106301784515,
238
  "epoch": 2.3636363636363638,
239
- "grad_norm": 0.7539874315261841,
240
- "learning_rate": 0.00028574387772804067,
241
- "loss": 2.212442169189453,
242
- "mean_token_accuracy": 0.5661728882789612,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
- "entropy": 2.177862950563431,
248
  "epoch": 2.488169364881694,
249
- "grad_norm": 1.169326663017273,
250
- "learning_rate": 0.00028261517693055664,
251
- "loss": 2.1524928283691405,
252
- "mean_token_accuracy": 0.574029511809349,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
- "entropy": 2.11708885550499,
258
  "epoch": 2.6127023661270234,
259
- "grad_norm": 0.6508749723434448,
260
- "learning_rate": 0.0002792388831406343,
261
- "loss": 2.0897698974609376,
262
- "mean_token_accuracy": 0.5808322209119797,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
- "entropy": 2.0785110569000245,
268
  "epoch": 2.7372353673723535,
269
- "grad_norm": 1.4157167673110962,
270
- "learning_rate": 0.0002756213595445772,
271
- "loss": 2.0574497985839844,
272
- "mean_token_accuracy": 0.5843770909309387,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
- "entropy": 2.091529769897461,
278
  "epoch": 2.8617683686176836,
279
- "grad_norm": 1.6840300559997559,
280
- "learning_rate": 0.00027176942396631626,
281
- "loss": 2.062296142578125,
282
- "mean_token_accuracy": 0.58568293094635,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
- "entropy": 2.0315397584438326,
288
  "epoch": 2.9863013698630136,
289
- "grad_norm": 1.153908133506775,
290
- "learning_rate": 0.0002676903360180885,
291
- "loss": 2.0053924560546874,
292
- "mean_token_accuracy": 0.5878721660375595,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
- "eval_entropy": 2.0182155738043233,
299
- "eval_mean_token_accuracy": 0.5854449913252232,
300
- "eval_not_syn_loss": 2.0105812549591064,
301
- "eval_not_syn_runtime": 95.9278,
302
- "eval_not_syn_samples_per_second": 14.334,
303
- "eval_not_syn_steps_per_second": 1.793,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
- "eval_entropy": 1.9446905400863914,
310
- "eval_mean_token_accuracy": 0.5995188099007274,
311
  "eval_num_tokens": 2998170.0,
312
- "eval_syn_loss": 1.9396028518676758,
313
- "eval_syn_runtime": 98.8737,
314
- "eval_syn_samples_per_second": 13.907,
315
- "eval_syn_steps_per_second": 1.74,
316
  "step": 1206
317
  }
318
  ],
@@ -333,8 +333,8 @@
333
  "attributes": {}
334
  }
335
  },
336
- "total_flos": 6.155969099958989e+17,
337
- "train_batch_size": 8,
338
  "trial_name": null,
339
  "trial_params": null
340
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  },
114
  {
115
+ "entropy": 0.5251132612577593,
116
  "epoch": 1.1195516811955168,
117
+ "grad_norm": 0.409588098526001,
118
+ "learning_rate": 0.00019826081236739378,
119
+ "loss": 0.48452903747558596,
120
+ "mean_token_accuracy": 0.8485486621808525,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
+ "entropy": 0.5255133463442325,
126
  "epoch": 1.244084682440847,
127
+ "grad_norm": 0.4035375416278839,
128
+ "learning_rate": 0.00019799182258138876,
129
+ "loss": 0.48237808227539064,
130
+ "mean_token_accuracy": 0.8493754121661187,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
+ "entropy": 0.5253566785156727,
136
  "epoch": 1.3686176836861768,
137
+ "grad_norm": 0.42013707756996155,
138
+ "learning_rate": 0.000197536589854019,
139
+ "loss": 0.4848367691040039,
140
+ "mean_token_accuracy": 0.8477037993073463,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
+ "entropy": 0.520346013456583,
146
  "epoch": 1.4931506849315068,
147
+ "grad_norm": 0.3776898980140686,
148
+ "learning_rate": 0.00019689597214695372,
149
+ "loss": 0.483323860168457,
150
+ "mean_token_accuracy": 0.8489549401402473,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
+ "entropy": 0.5263906873762607,
156
  "epoch": 1.6176836861768369,
157
+ "grad_norm": 0.7503569722175598,
158
+ "learning_rate": 0.00019607117681064075,
159
+ "loss": 0.4831574630737305,
160
+ "mean_token_accuracy": 0.8504172155261039,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
+ "entropy": 0.5092925234138965,
166
  "epoch": 1.7422166874221667,
167
+ "grad_norm": 0.3184664249420166,
168
+ "learning_rate": 0.00019506375830885428,
169
+ "loss": 0.4734436798095703,
170
+ "mean_token_accuracy": 0.8517335096001625,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
+ "entropy": 0.504551088064909,
176
  "epoch": 1.8667496886674968,
177
+ "grad_norm": 0.381900429725647,
178
+ "learning_rate": 0.00019387561528904946,
179
+ "loss": 0.4666786193847656,
180
+ "mean_token_accuracy": 0.8514578703045845,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
+ "entropy": 0.5108272171020508,
186
  "epoch": 1.9912826899128269,
187
+ "grad_norm": 0.3435378670692444,
188
+ "learning_rate": 0.00019250898700404634,
189
+ "loss": 0.4672324752807617,
190
+ "mean_token_accuracy": 0.8505285969376564,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
+ "eval_entropy": 0.515417086177094,
197
+ "eval_mean_token_accuracy": 0.8423887543206992,
198
+ "eval_not_syn_loss": 0.49809959530830383,
199
+ "eval_not_syn_runtime": 95.8874,
200
+ "eval_not_syn_samples_per_second": 14.34,
201
+ "eval_not_syn_steps_per_second": 1.794,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_entropy": 0.4982072594900464,
208
+ "eval_mean_token_accuracy": 0.8627851702446161,
209
  "eval_num_tokens": 1998780.0,
210
+ "eval_syn_loss": 0.47842374444007874,
211
+ "eval_syn_runtime": 98.9429,
212
+ "eval_syn_samples_per_second": 13.897,
213
+ "eval_syn_steps_per_second": 1.738,
214
  "step": 804
215
  },
216
  {
217
+ "entropy": 0.4303537303149098,
218
  "epoch": 2.1145703611457036,
219
+ "grad_norm": 0.35502323508262634,
220
+ "learning_rate": 0.00019096644909178583,
221
+ "loss": 0.38930774688720704,
222
+ "mean_token_accuracy": 0.8710548519486129,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
+ "entropy": 0.431627052128315,
228
  "epoch": 2.2391033623910337,
229
+ "grad_norm": 0.4276222288608551,
230
+ "learning_rate": 0.00018925090872111246,
231
+ "loss": 0.38792034149169924,
232
+ "mean_token_accuracy": 0.8717547968029976,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
+ "entropy": 0.43496647477149963,
238
  "epoch": 2.3636363636363638,
239
+ "grad_norm": 0.3478545546531677,
240
+ "learning_rate": 0.00018736559911273178,
241
+ "loss": 0.3930927276611328,
242
+ "mean_token_accuracy": 0.8688642394542694,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
+ "entropy": 0.4360816968977451,
248
  "epoch": 2.488169364881694,
249
+ "grad_norm": 0.4233720898628235,
250
+ "learning_rate": 0.00018531407344566915,
251
+ "loss": 0.39427772521972654,
252
+ "mean_token_accuracy": 0.867885695695877,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
+ "entropy": 0.43084816545248034,
258
  "epoch": 2.6127023661270234,
259
+ "grad_norm": 0.3549717664718628,
260
+ "learning_rate": 0.0001831001981607139,
261
+ "loss": 0.3911524200439453,
262
+ "mean_token_accuracy": 0.8704073330760003,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
+ "entropy": 0.42635570630431174,
268
  "epoch": 2.7372353673723535,
269
+ "grad_norm": 0.3825649917125702,
270
+ "learning_rate": 0.00018072814567346936,
271
+ "loss": 0.39040073394775393,
272
+ "mean_token_accuracy": 0.8706874567270279,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
+ "entropy": 0.4457248793542385,
278
  "epoch": 2.8617683686176836,
279
+ "grad_norm": 0.393543541431427,
280
+ "learning_rate": 0.00017820238651074317,
281
+ "loss": 0.40400577545166017,
282
+ "mean_token_accuracy": 0.8659286299347877,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
+ "entropy": 0.44322004675865173,
288
  "epoch": 2.9863013698630136,
289
+ "grad_norm": 0.32433032989501953,
290
+ "learning_rate": 0.0001755276808850968,
291
+ "loss": 0.3956157684326172,
292
+ "mean_token_accuracy": 0.8679120713472366,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
+ "eval_entropy": 0.4656750720947288,
299
+ "eval_mean_token_accuracy": 0.8436482773963795,
300
+ "eval_not_syn_loss": 0.502048671245575,
301
+ "eval_not_syn_runtime": 96.1696,
302
+ "eval_not_syn_samples_per_second": 14.298,
303
+ "eval_not_syn_steps_per_second": 1.789,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
+ "eval_entropy": 0.4488667759091355,
310
+ "eval_mean_token_accuracy": 0.86770307186038,
311
  "eval_num_tokens": 2998170.0,
312
+ "eval_syn_loss": 0.47810930013656616,
313
+ "eval_syn_runtime": 98.894,
314
+ "eval_syn_samples_per_second": 13.904,
315
+ "eval_syn_steps_per_second": 1.739,
316
  "step": 1206
317
  }
318
  ],
 
333
  "attributes": {}
334
  }
335
  },
336
+ "total_flos": 4.926439897662259e+17,
337
+ "train_batch_size": 4,
338
  "trial_name": null,
339
  "trial_params": null
340
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1206/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1608/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1608/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2633d84e9af32accfb364c9dfc552be8d2fc44e68460c66daa75524132a9e58e
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5ac0edf81a88525b0f3a34ecc5c5aa48cf35bbd2a544ec94398e1ffec140d9d
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1608/trainer_state.json CHANGED
@@ -10,411 +10,411 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  },
114
  {
115
- "entropy": 0.5431136073488178,
116
  "epoch": 1.1195516811955168,
117
- "grad_norm": 0.6364207863807678,
118
- "learning_rate": 0.0003023597373031377,
119
- "loss": 0.5128697204589844,
120
- "mean_token_accuracy": 0.8424755226482045,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
- "entropy": 0.5492669984698295,
126
  "epoch": 1.244084682440847,
127
- "grad_norm": 0.7887948155403137,
128
- "learning_rate": 0.00030194951160062724,
129
- "loss": 0.5205921554565429,
130
- "mean_token_accuracy": 0.8417876678705215,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
- "entropy": 0.8709675747156144,
136
  "epoch": 1.3686176836861768,
137
- "grad_norm": 0.734591007232666,
138
- "learning_rate": 0.00030125525414139597,
139
- "loss": 0.9426271057128907,
140
- "mean_token_accuracy": 0.7773911562561989,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
- "entropy": 0.5833489724993706,
146
  "epoch": 1.4931506849315068,
147
- "grad_norm": 1.1496635675430298,
148
- "learning_rate": 0.000300278273368912,
149
- "loss": 0.560246467590332,
150
- "mean_token_accuracy": 0.8333927792310715,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
- "entropy": 3.3383523294329644,
156
  "epoch": 1.6176836861768369,
157
- "grad_norm": 4.88291597366333,
158
- "learning_rate": 0.0002990204105656753,
159
- "loss": 3.895135803222656,
160
- "mean_token_accuracy": 0.4006860972382128,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
- "entropy": 4.406278321743011,
166
  "epoch": 1.7422166874221667,
167
- "grad_norm": 2.488743305206299,
168
- "learning_rate": 0.0002974840363830152,
169
- "loss": 4.424278259277344,
170
- "mean_token_accuracy": 0.2545871638506651,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
- "entropy": 3.0789780139923097,
176
  "epoch": 1.8667496886674968,
177
- "grad_norm": 2.5833027362823486,
178
- "learning_rate": 0.00029567204637320413,
179
- "loss": 3.105696716308594,
180
- "mean_token_accuracy": 0.4513031896948814,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
- "entropy": 2.616127333641052,
186
  "epoch": 1.9912826899128269,
187
- "grad_norm": 1.0389364957809448,
188
- "learning_rate": 0.00029358785553230884,
189
- "loss": 2.6003131103515624,
190
- "mean_token_accuracy": 0.5205484080314636,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
- "eval_entropy": 2.5684494819752004,
197
- "eval_mean_token_accuracy": 0.547617181095966,
198
- "eval_not_syn_loss": 2.458150863647461,
199
- "eval_not_syn_runtime": 95.9896,
200
- "eval_not_syn_samples_per_second": 14.324,
201
- "eval_not_syn_steps_per_second": 1.792,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_entropy": 2.5337963104248047,
208
- "eval_mean_token_accuracy": 0.5359611507765082,
209
  "eval_num_tokens": 1998780.0,
210
- "eval_syn_loss": 2.417705535888672,
211
- "eval_syn_runtime": 99.0288,
212
- "eval_syn_samples_per_second": 13.885,
213
- "eval_syn_steps_per_second": 1.737,
214
  "step": 804
215
  },
216
  {
217
- "entropy": 2.380885266294383,
218
  "epoch": 2.1145703611457036,
219
- "grad_norm": 0.7961218953132629,
220
- "learning_rate": 0.00029123539186406294,
221
- "loss": 2.3589501953125,
222
- "mean_token_accuracy": 0.550012742630159,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
- "entropy": 2.3139565205574035,
228
  "epoch": 2.2391033623910337,
229
- "grad_norm": 0.7128121256828308,
230
- "learning_rate": 0.00028861908897689166,
231
- "loss": 2.2914646911621093,
232
- "mean_token_accuracy": 0.5581977427005768,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
- "entropy": 2.247106301784515,
238
  "epoch": 2.3636363636363638,
239
- "grad_norm": 0.7539874315261841,
240
- "learning_rate": 0.00028574387772804067,
241
- "loss": 2.212442169189453,
242
- "mean_token_accuracy": 0.5661728882789612,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
- "entropy": 2.177862950563431,
248
  "epoch": 2.488169364881694,
249
- "grad_norm": 1.169326663017273,
250
- "learning_rate": 0.00028261517693055664,
251
- "loss": 2.1524928283691405,
252
- "mean_token_accuracy": 0.574029511809349,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
- "entropy": 2.11708885550499,
258
  "epoch": 2.6127023661270234,
259
- "grad_norm": 0.6508749723434448,
260
- "learning_rate": 0.0002792388831406343,
261
- "loss": 2.0897698974609376,
262
- "mean_token_accuracy": 0.5808322209119797,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
- "entropy": 2.0785110569000245,
268
  "epoch": 2.7372353673723535,
269
- "grad_norm": 1.4157167673110962,
270
- "learning_rate": 0.0002756213595445772,
271
- "loss": 2.0574497985839844,
272
- "mean_token_accuracy": 0.5843770909309387,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
- "entropy": 2.091529769897461,
278
  "epoch": 2.8617683686176836,
279
- "grad_norm": 1.6840300559997559,
280
- "learning_rate": 0.00027176942396631626,
281
- "loss": 2.062296142578125,
282
- "mean_token_accuracy": 0.58568293094635,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
- "entropy": 2.0315397584438326,
288
  "epoch": 2.9863013698630136,
289
- "grad_norm": 1.153908133506775,
290
- "learning_rate": 0.0002676903360180885,
291
- "loss": 2.0053924560546874,
292
- "mean_token_accuracy": 0.5878721660375595,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
- "eval_entropy": 2.0182155738043233,
299
- "eval_mean_token_accuracy": 0.5854449913252232,
300
- "eval_not_syn_loss": 2.0105812549591064,
301
- "eval_not_syn_runtime": 95.9278,
302
- "eval_not_syn_samples_per_second": 14.334,
303
- "eval_not_syn_steps_per_second": 1.793,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
- "eval_entropy": 1.9446905400863914,
310
- "eval_mean_token_accuracy": 0.5995188099007274,
311
  "eval_num_tokens": 2998170.0,
312
- "eval_syn_loss": 1.9396028518676758,
313
- "eval_syn_runtime": 98.8737,
314
- "eval_syn_samples_per_second": 13.907,
315
- "eval_syn_steps_per_second": 1.74,
316
  "step": 1206
317
  },
318
  {
319
- "entropy": 1.9578519951213489,
320
  "epoch": 3.1095890410958904,
321
- "grad_norm": 0.6903329491615295,
322
- "learning_rate": 0.00026339178341849265,
323
- "loss": 1.9289002990722657,
324
- "mean_token_accuracy": 0.5986791457792725,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
- "entropy": 1.9225856459140778,
330
  "epoch": 3.2341220423412205,
331
- "grad_norm": 0.9334422945976257,
332
- "learning_rate": 0.00025888186750370616,
333
- "loss": 1.9024064636230469,
334
- "mean_token_accuracy": 0.6029617834091187,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
- "entropy": 1.8814079523086549,
340
  "epoch": 3.3586550435865505,
341
- "grad_norm": 0.779586911201477,
342
- "learning_rate": 0.00025416908795917244,
343
- "loss": 1.8609922790527345,
344
- "mean_token_accuracy": 0.6097110098600388,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
- "entropy": 1.8681990671157838,
350
  "epoch": 3.4831880448318806,
351
- "grad_norm": 0.941205620765686,
352
- "learning_rate": 0.0002492623268005321,
353
- "loss": 1.8448422241210938,
354
- "mean_token_accuracy": 0.6135021150112152,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
- "entropy": 1.8034737646579742,
360
  "epoch": 3.6077210460772102,
361
- "grad_norm": 1.2094346284866333,
362
- "learning_rate": 0.0002441708316339893,
363
- "loss": 1.7913978576660157,
364
- "mean_token_accuracy": 0.622687805891037,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
- "entropy": 1.8016248047351837,
370
  "epoch": 3.7322540473225407,
371
- "grad_norm": 0.7303705215454102,
372
- "learning_rate": 0.00023890419822766254,
373
- "loss": 1.7701612854003905,
374
- "mean_token_accuracy": 0.6246249091625213,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
- "entropy": 1.7500162029266357,
380
  "epoch": 3.8567870485678704,
381
- "grad_norm": 0.954683244228363,
382
- "learning_rate": 0.0002334723524267661,
383
- "loss": 1.7250523376464844,
384
- "mean_token_accuracy": 0.6312138819694519,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
- "entropy": 1.6828746914863586,
390
  "epoch": 3.9813200498132004,
391
- "grad_norm": 0.8929846882820129,
392
- "learning_rate": 0.0002278855314467064,
393
- "loss": 1.6539949035644532,
394
- "mean_token_accuracy": 0.6412730294466019,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
- "eval_entropy": 1.7406537913998892,
401
- "eval_mean_token_accuracy": 0.6226560525422873,
402
- "eval_not_syn_loss": 1.707612156867981,
403
- "eval_not_syn_runtime": 95.956,
404
- "eval_not_syn_samples_per_second": 14.329,
405
- "eval_not_syn_steps_per_second": 1.792,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
- "eval_entropy": 1.6662698221761127,
412
- "eval_mean_token_accuracy": 0.6587355476479198,
413
  "eval_num_tokens": 3997560.0,
414
- "eval_syn_loss": 1.6402223110198975,
415
- "eval_syn_runtime": 99.0503,
416
- "eval_syn_samples_per_second": 13.882,
417
- "eval_syn_steps_per_second": 1.736,
418
  "step": 1608
419
  }
420
  ],
@@ -435,8 +435,8 @@
435
  "attributes": {}
436
  }
437
  },
438
- "total_flos": 8.198780834091725e+17,
439
- "train_batch_size": 8,
440
  "trial_name": null,
441
  "trial_params": null
442
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  },
114
  {
115
+ "entropy": 0.5251132612577593,
116
  "epoch": 1.1195516811955168,
117
+ "grad_norm": 0.409588098526001,
118
+ "learning_rate": 0.00019826081236739378,
119
+ "loss": 0.48452903747558596,
120
+ "mean_token_accuracy": 0.8485486621808525,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
+ "entropy": 0.5255133463442325,
126
  "epoch": 1.244084682440847,
127
+ "grad_norm": 0.4035375416278839,
128
+ "learning_rate": 0.00019799182258138876,
129
+ "loss": 0.48237808227539064,
130
+ "mean_token_accuracy": 0.8493754121661187,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
+ "entropy": 0.5253566785156727,
136
  "epoch": 1.3686176836861768,
137
+ "grad_norm": 0.42013707756996155,
138
+ "learning_rate": 0.000197536589854019,
139
+ "loss": 0.4848367691040039,
140
+ "mean_token_accuracy": 0.8477037993073463,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
+ "entropy": 0.520346013456583,
146
  "epoch": 1.4931506849315068,
147
+ "grad_norm": 0.3776898980140686,
148
+ "learning_rate": 0.00019689597214695372,
149
+ "loss": 0.483323860168457,
150
+ "mean_token_accuracy": 0.8489549401402473,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
+ "entropy": 0.5263906873762607,
156
  "epoch": 1.6176836861768369,
157
+ "grad_norm": 0.7503569722175598,
158
+ "learning_rate": 0.00019607117681064075,
159
+ "loss": 0.4831574630737305,
160
+ "mean_token_accuracy": 0.8504172155261039,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
+ "entropy": 0.5092925234138965,
166
  "epoch": 1.7422166874221667,
167
+ "grad_norm": 0.3184664249420166,
168
+ "learning_rate": 0.00019506375830885428,
169
+ "loss": 0.4734436798095703,
170
+ "mean_token_accuracy": 0.8517335096001625,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
+ "entropy": 0.504551088064909,
176
  "epoch": 1.8667496886674968,
177
+ "grad_norm": 0.381900429725647,
178
+ "learning_rate": 0.00019387561528904946,
179
+ "loss": 0.4666786193847656,
180
+ "mean_token_accuracy": 0.8514578703045845,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
+ "entropy": 0.5108272171020508,
186
  "epoch": 1.9912826899128269,
187
+ "grad_norm": 0.3435378670692444,
188
+ "learning_rate": 0.00019250898700404634,
189
+ "loss": 0.4672324752807617,
190
+ "mean_token_accuracy": 0.8505285969376564,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
+ "eval_entropy": 0.515417086177094,
197
+ "eval_mean_token_accuracy": 0.8423887543206992,
198
+ "eval_not_syn_loss": 0.49809959530830383,
199
+ "eval_not_syn_runtime": 95.8874,
200
+ "eval_not_syn_samples_per_second": 14.34,
201
+ "eval_not_syn_steps_per_second": 1.794,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_entropy": 0.4982072594900464,
208
+ "eval_mean_token_accuracy": 0.8627851702446161,
209
  "eval_num_tokens": 1998780.0,
210
+ "eval_syn_loss": 0.47842374444007874,
211
+ "eval_syn_runtime": 98.9429,
212
+ "eval_syn_samples_per_second": 13.897,
213
+ "eval_syn_steps_per_second": 1.738,
214
  "step": 804
215
  },
216
  {
217
+ "entropy": 0.4303537303149098,
218
  "epoch": 2.1145703611457036,
219
+ "grad_norm": 0.35502323508262634,
220
+ "learning_rate": 0.00019096644909178583,
221
+ "loss": 0.38930774688720704,
222
+ "mean_token_accuracy": 0.8710548519486129,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
+ "entropy": 0.431627052128315,
228
  "epoch": 2.2391033623910337,
229
+ "grad_norm": 0.4276222288608551,
230
+ "learning_rate": 0.00018925090872111246,
231
+ "loss": 0.38792034149169924,
232
+ "mean_token_accuracy": 0.8717547968029976,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
+ "entropy": 0.43496647477149963,
238
  "epoch": 2.3636363636363638,
239
+ "grad_norm": 0.3478545546531677,
240
+ "learning_rate": 0.00018736559911273178,
241
+ "loss": 0.3930927276611328,
242
+ "mean_token_accuracy": 0.8688642394542694,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
+ "entropy": 0.4360816968977451,
248
  "epoch": 2.488169364881694,
249
+ "grad_norm": 0.4233720898628235,
250
+ "learning_rate": 0.00018531407344566915,
251
+ "loss": 0.39427772521972654,
252
+ "mean_token_accuracy": 0.867885695695877,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
+ "entropy": 0.43084816545248034,
258
  "epoch": 2.6127023661270234,
259
+ "grad_norm": 0.3549717664718628,
260
+ "learning_rate": 0.0001831001981607139,
261
+ "loss": 0.3911524200439453,
262
+ "mean_token_accuracy": 0.8704073330760003,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
+ "entropy": 0.42635570630431174,
268
  "epoch": 2.7372353673723535,
269
+ "grad_norm": 0.3825649917125702,
270
+ "learning_rate": 0.00018072814567346936,
271
+ "loss": 0.39040073394775393,
272
+ "mean_token_accuracy": 0.8706874567270279,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
+ "entropy": 0.4457248793542385,
278
  "epoch": 2.8617683686176836,
279
+ "grad_norm": 0.393543541431427,
280
+ "learning_rate": 0.00017820238651074317,
281
+ "loss": 0.40400577545166017,
282
+ "mean_token_accuracy": 0.8659286299347877,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
+ "entropy": 0.44322004675865173,
288
  "epoch": 2.9863013698630136,
289
+ "grad_norm": 0.32433032989501953,
290
+ "learning_rate": 0.0001755276808850968,
291
+ "loss": 0.3956157684326172,
292
+ "mean_token_accuracy": 0.8679120713472366,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
+ "eval_entropy": 0.4656750720947288,
299
+ "eval_mean_token_accuracy": 0.8436482773963795,
300
+ "eval_not_syn_loss": 0.502048671245575,
301
+ "eval_not_syn_runtime": 96.1696,
302
+ "eval_not_syn_samples_per_second": 14.298,
303
+ "eval_not_syn_steps_per_second": 1.789,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
+ "eval_entropy": 0.4488667759091355,
310
+ "eval_mean_token_accuracy": 0.86770307186038,
311
  "eval_num_tokens": 2998170.0,
312
+ "eval_syn_loss": 0.47810930013656616,
313
+ "eval_syn_runtime": 98.894,
314
+ "eval_syn_samples_per_second": 13.904,
315
+ "eval_syn_steps_per_second": 1.739,
316
  "step": 1206
317
  },
318
  {
319
+ "entropy": 0.34708237489967636,
320
  "epoch": 3.1095890410958904,
321
+ "grad_norm": 0.36334800720214844,
322
+ "learning_rate": 0.00017270906972343466,
323
+ "loss": 0.306053466796875,
324
+ "mean_token_accuracy": 0.8936621320970131,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
+ "entropy": 0.3444848913699389,
330
  "epoch": 3.2341220423412205,
331
+ "grad_norm": 0.3762398660182953,
332
+ "learning_rate": 0.00016975186516654035,
333
+ "loss": 0.30163915634155275,
334
+ "mean_token_accuracy": 0.8941574484109879,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
+ "entropy": 0.3394099518656731,
340
  "epoch": 3.3586550435865505,
341
+ "grad_norm": 0.4982982873916626,
342
+ "learning_rate": 0.00016666164055746508,
343
+ "loss": 0.2992570495605469,
344
+ "mean_token_accuracy": 0.8951553416252136,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
+ "entropy": 0.3437681415677071,
350
  "epoch": 3.4831880448318806,
351
+ "grad_norm": 0.48411592841148376,
352
+ "learning_rate": 0.00016344421993763733,
353
+ "loss": 0.30575496673583985,
354
+ "mean_token_accuracy": 0.8930543160438538,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
+ "entropy": 0.34468906089663504,
360
  "epoch": 3.6077210460772102,
361
+ "grad_norm": 0.4013198912143707,
362
+ "learning_rate": 0.00016010566707048957,
363
+ "loss": 0.3041123008728027,
364
+ "mean_token_accuracy": 0.8934586471319199,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
+ "entropy": 0.3555850328505039,
370
  "epoch": 3.7322540473225407,
371
+ "grad_norm": 0.3677787184715271,
372
+ "learning_rate": 0.00015665227401328915,
373
+ "loss": 0.31248834609985354,
374
+ "mean_token_accuracy": 0.8907824546098709,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
+ "entropy": 0.3529116767644882,
380
  "epoch": 3.8567870485678704,
381
+ "grad_norm": 0.3733390271663666,
382
+ "learning_rate": 0.00015309054925871163,
383
+ "loss": 0.3145001220703125,
384
+ "mean_token_accuracy": 0.8914457809925079,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
+ "entropy": 0.3519477976113558,
390
  "epoch": 3.9813200498132004,
391
+ "grad_norm": 0.4491842985153198,
392
+ "learning_rate": 0.0001494272054685054,
393
+ "loss": 0.31178840637207034,
394
+ "mean_token_accuracy": 0.8901231214404106,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
+ "eval_entropy": 0.4111844826229783,
401
+ "eval_mean_token_accuracy": 0.8401426447685375,
402
+ "eval_not_syn_loss": 0.5313173532485962,
403
+ "eval_not_syn_runtime": 95.84,
404
+ "eval_not_syn_samples_per_second": 14.347,
405
+ "eval_not_syn_steps_per_second": 1.795,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
+ "eval_entropy": 0.39694498886549195,
412
+ "eval_mean_token_accuracy": 0.8641166465226994,
413
  "eval_num_tokens": 3997560.0,
414
+ "eval_syn_loss": 0.5080230832099915,
415
+ "eval_syn_runtime": 98.944,
416
+ "eval_syn_samples_per_second": 13.897,
417
+ "eval_syn_steps_per_second": 1.738,
418
  "step": 1608
419
  }
420
  ],
 
435
  "attributes": {}
436
  }
437
  },
438
+ "total_flos": 6.557655587098829e+17,
439
+ "train_batch_size": 4,
440
  "trial_name": null,
441
  "trial_params": null
442
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-1608/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2010/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2010/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8744fba0c42f30c651ebb41300658386d096002d77f02f6198a09ae8c04509e
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:955028c0f839459bb48b0b40d0a2a6fd5bf484d3ff2ef74a57b663f80f02f033
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2010/trainer_state.json CHANGED
@@ -10,513 +10,513 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  },
114
  {
115
- "entropy": 0.5431136073488178,
116
  "epoch": 1.1195516811955168,
117
- "grad_norm": 0.6364207863807678,
118
- "learning_rate": 0.0003023597373031377,
119
- "loss": 0.5128697204589844,
120
- "mean_token_accuracy": 0.8424755226482045,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
- "entropy": 0.5492669984698295,
126
  "epoch": 1.244084682440847,
127
- "grad_norm": 0.7887948155403137,
128
- "learning_rate": 0.00030194951160062724,
129
- "loss": 0.5205921554565429,
130
- "mean_token_accuracy": 0.8417876678705215,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
- "entropy": 0.8709675747156144,
136
  "epoch": 1.3686176836861768,
137
- "grad_norm": 0.734591007232666,
138
- "learning_rate": 0.00030125525414139597,
139
- "loss": 0.9426271057128907,
140
- "mean_token_accuracy": 0.7773911562561989,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
- "entropy": 0.5833489724993706,
146
  "epoch": 1.4931506849315068,
147
- "grad_norm": 1.1496635675430298,
148
- "learning_rate": 0.000300278273368912,
149
- "loss": 0.560246467590332,
150
- "mean_token_accuracy": 0.8333927792310715,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
- "entropy": 3.3383523294329644,
156
  "epoch": 1.6176836861768369,
157
- "grad_norm": 4.88291597366333,
158
- "learning_rate": 0.0002990204105656753,
159
- "loss": 3.895135803222656,
160
- "mean_token_accuracy": 0.4006860972382128,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
- "entropy": 4.406278321743011,
166
  "epoch": 1.7422166874221667,
167
- "grad_norm": 2.488743305206299,
168
- "learning_rate": 0.0002974840363830152,
169
- "loss": 4.424278259277344,
170
- "mean_token_accuracy": 0.2545871638506651,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
- "entropy": 3.0789780139923097,
176
  "epoch": 1.8667496886674968,
177
- "grad_norm": 2.5833027362823486,
178
- "learning_rate": 0.00029567204637320413,
179
- "loss": 3.105696716308594,
180
- "mean_token_accuracy": 0.4513031896948814,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
- "entropy": 2.616127333641052,
186
  "epoch": 1.9912826899128269,
187
- "grad_norm": 1.0389364957809448,
188
- "learning_rate": 0.00029358785553230884,
189
- "loss": 2.6003131103515624,
190
- "mean_token_accuracy": 0.5205484080314636,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
- "eval_entropy": 2.5684494819752004,
197
- "eval_mean_token_accuracy": 0.547617181095966,
198
- "eval_not_syn_loss": 2.458150863647461,
199
- "eval_not_syn_runtime": 95.9896,
200
- "eval_not_syn_samples_per_second": 14.324,
201
- "eval_not_syn_steps_per_second": 1.792,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_entropy": 2.5337963104248047,
208
- "eval_mean_token_accuracy": 0.5359611507765082,
209
  "eval_num_tokens": 1998780.0,
210
- "eval_syn_loss": 2.417705535888672,
211
- "eval_syn_runtime": 99.0288,
212
- "eval_syn_samples_per_second": 13.885,
213
- "eval_syn_steps_per_second": 1.737,
214
  "step": 804
215
  },
216
  {
217
- "entropy": 2.380885266294383,
218
  "epoch": 2.1145703611457036,
219
- "grad_norm": 0.7961218953132629,
220
- "learning_rate": 0.00029123539186406294,
221
- "loss": 2.3589501953125,
222
- "mean_token_accuracy": 0.550012742630159,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
- "entropy": 2.3139565205574035,
228
  "epoch": 2.2391033623910337,
229
- "grad_norm": 0.7128121256828308,
230
- "learning_rate": 0.00028861908897689166,
231
- "loss": 2.2914646911621093,
232
- "mean_token_accuracy": 0.5581977427005768,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
- "entropy": 2.247106301784515,
238
  "epoch": 2.3636363636363638,
239
- "grad_norm": 0.7539874315261841,
240
- "learning_rate": 0.00028574387772804067,
241
- "loss": 2.212442169189453,
242
- "mean_token_accuracy": 0.5661728882789612,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
- "entropy": 2.177862950563431,
248
  "epoch": 2.488169364881694,
249
- "grad_norm": 1.169326663017273,
250
- "learning_rate": 0.00028261517693055664,
251
- "loss": 2.1524928283691405,
252
- "mean_token_accuracy": 0.574029511809349,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
- "entropy": 2.11708885550499,
258
  "epoch": 2.6127023661270234,
259
- "grad_norm": 0.6508749723434448,
260
- "learning_rate": 0.0002792388831406343,
261
- "loss": 2.0897698974609376,
262
- "mean_token_accuracy": 0.5808322209119797,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
- "entropy": 2.0785110569000245,
268
  "epoch": 2.7372353673723535,
269
- "grad_norm": 1.4157167673110962,
270
- "learning_rate": 0.0002756213595445772,
271
- "loss": 2.0574497985839844,
272
- "mean_token_accuracy": 0.5843770909309387,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
- "entropy": 2.091529769897461,
278
  "epoch": 2.8617683686176836,
279
- "grad_norm": 1.6840300559997559,
280
- "learning_rate": 0.00027176942396631626,
281
- "loss": 2.062296142578125,
282
- "mean_token_accuracy": 0.58568293094635,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
- "entropy": 2.0315397584438326,
288
  "epoch": 2.9863013698630136,
289
- "grad_norm": 1.153908133506775,
290
- "learning_rate": 0.0002676903360180885,
291
- "loss": 2.0053924560546874,
292
- "mean_token_accuracy": 0.5878721660375595,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
- "eval_entropy": 2.0182155738043233,
299
- "eval_mean_token_accuracy": 0.5854449913252232,
300
- "eval_not_syn_loss": 2.0105812549591064,
301
- "eval_not_syn_runtime": 95.9278,
302
- "eval_not_syn_samples_per_second": 14.334,
303
- "eval_not_syn_steps_per_second": 1.793,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
- "eval_entropy": 1.9446905400863914,
310
- "eval_mean_token_accuracy": 0.5995188099007274,
311
  "eval_num_tokens": 2998170.0,
312
- "eval_syn_loss": 1.9396028518676758,
313
- "eval_syn_runtime": 98.8737,
314
- "eval_syn_samples_per_second": 13.907,
315
- "eval_syn_steps_per_second": 1.74,
316
  "step": 1206
317
  },
318
  {
319
- "entropy": 1.9578519951213489,
320
  "epoch": 3.1095890410958904,
321
- "grad_norm": 0.6903329491615295,
322
- "learning_rate": 0.00026339178341849265,
323
- "loss": 1.9289002990722657,
324
- "mean_token_accuracy": 0.5986791457792725,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
- "entropy": 1.9225856459140778,
330
  "epoch": 3.2341220423412205,
331
- "grad_norm": 0.9334422945976257,
332
- "learning_rate": 0.00025888186750370616,
333
- "loss": 1.9024064636230469,
334
- "mean_token_accuracy": 0.6029617834091187,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
- "entropy": 1.8814079523086549,
340
  "epoch": 3.3586550435865505,
341
- "grad_norm": 0.779586911201477,
342
- "learning_rate": 0.00025416908795917244,
343
- "loss": 1.8609922790527345,
344
- "mean_token_accuracy": 0.6097110098600388,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
- "entropy": 1.8681990671157838,
350
  "epoch": 3.4831880448318806,
351
- "grad_norm": 0.941205620765686,
352
- "learning_rate": 0.0002492623268005321,
353
- "loss": 1.8448422241210938,
354
- "mean_token_accuracy": 0.6135021150112152,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
- "entropy": 1.8034737646579742,
360
  "epoch": 3.6077210460772102,
361
- "grad_norm": 1.2094346284866333,
362
- "learning_rate": 0.0002441708316339893,
363
- "loss": 1.7913978576660157,
364
- "mean_token_accuracy": 0.622687805891037,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
- "entropy": 1.8016248047351837,
370
  "epoch": 3.7322540473225407,
371
- "grad_norm": 0.7303705215454102,
372
- "learning_rate": 0.00023890419822766254,
373
- "loss": 1.7701612854003905,
374
- "mean_token_accuracy": 0.6246249091625213,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
- "entropy": 1.7500162029266357,
380
  "epoch": 3.8567870485678704,
381
- "grad_norm": 0.954683244228363,
382
- "learning_rate": 0.0002334723524267661,
383
- "loss": 1.7250523376464844,
384
- "mean_token_accuracy": 0.6312138819694519,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
- "entropy": 1.6828746914863586,
390
  "epoch": 3.9813200498132004,
391
- "grad_norm": 0.8929846882820129,
392
- "learning_rate": 0.0002278855314467064,
393
- "loss": 1.6539949035644532,
394
- "mean_token_accuracy": 0.6412730294466019,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
- "eval_entropy": 1.7406537913998892,
401
- "eval_mean_token_accuracy": 0.6226560525422873,
402
- "eval_not_syn_loss": 1.707612156867981,
403
- "eval_not_syn_runtime": 95.956,
404
- "eval_not_syn_samples_per_second": 14.329,
405
- "eval_not_syn_steps_per_second": 1.792,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
- "eval_entropy": 1.6662698221761127,
412
- "eval_mean_token_accuracy": 0.6587355476479198,
413
  "eval_num_tokens": 3997560.0,
414
- "eval_syn_loss": 1.6402223110198975,
415
- "eval_syn_runtime": 99.0503,
416
- "eval_syn_samples_per_second": 13.882,
417
- "eval_syn_steps_per_second": 1.736,
418
  "step": 1608
419
  },
420
  {
421
- "entropy": 1.634241136637601,
422
  "epoch": 4.104607721046078,
423
- "grad_norm": 0.7396109700202942,
424
- "learning_rate": 0.0002221542645793497,
425
- "loss": 1.610531768798828,
426
- "mean_token_accuracy": 0.6454936681371747,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
- "entropy": 1.588758965730667,
432
  "epoch": 4.229140722291407,
433
- "grad_norm": 0.779146134853363,
434
- "learning_rate": 0.00021628935334882304,
435
- "loss": 1.5645944213867187,
436
- "mean_token_accuracy": 0.6525067055225372,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
- "entropy": 1.5439102852344513,
442
  "epoch": 4.353673723536737,
443
- "grad_norm": 0.614967942237854,
444
- "learning_rate": 0.00021030185115424846,
445
- "loss": 1.5180734252929688,
446
- "mean_token_accuracy": 0.6614933741092682,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
- "entropy": 1.5201536059379577,
452
  "epoch": 4.478206724782067,
453
- "grad_norm": 0.6725050210952759,
454
- "learning_rate": 0.00020420304243777673,
455
- "loss": 1.4964521789550782,
456
- "mean_token_accuracy": 0.6630363047122956,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
- "entropy": 1.503999525308609,
462
  "epoch": 4.602739726027397,
463
- "grad_norm": 0.7740678191184998,
464
- "learning_rate": 0.00019800442141718245,
465
- "loss": 1.4821170043945313,
466
- "mean_token_accuracy": 0.6658095389604568,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
- "entropy": 1.4707296586036682,
472
  "epoch": 4.7272727272727275,
473
- "grad_norm": 0.7613642811775208,
474
- "learning_rate": 0.00019171767042310182,
475
- "loss": 1.451023406982422,
476
- "mean_token_accuracy": 0.6706610345840454,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
- "entropy": 1.449094181060791,
482
  "epoch": 4.851805728518057,
483
- "grad_norm": 0.7195038199424744,
484
- "learning_rate": 0.00018535463788174053,
485
- "loss": 1.4288282775878907,
486
- "mean_token_accuracy": 0.6740301263332367,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
- "entropy": 1.4346733844280244,
492
  "epoch": 4.976338729763388,
493
- "grad_norm": 0.7584077715873718,
494
- "learning_rate": 0.00017892731598454726,
495
- "loss": 1.4155799865722656,
496
- "mean_token_accuracy": 0.6788010179996491,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
- "eval_entropy": 1.4948647777701534,
503
- "eval_mean_token_accuracy": 0.661396715876668,
504
- "eval_not_syn_loss": 1.4358112812042236,
505
- "eval_not_syn_runtime": 95.9987,
506
- "eval_not_syn_samples_per_second": 14.323,
507
- "eval_not_syn_steps_per_second": 1.792,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
- "eval_entropy": 1.4439531952835793,
514
- "eval_mean_token_accuracy": 0.6999529783808908,
515
  "eval_num_tokens": 4996950.0,
516
- "eval_syn_loss": 1.3766233921051025,
517
- "eval_syn_runtime": 99.0602,
518
- "eval_syn_samples_per_second": 13.88,
519
- "eval_syn_steps_per_second": 1.736,
520
  "step": 2010
521
  }
522
  ],
@@ -537,8 +537,8 @@
537
  "attributes": {}
538
  }
539
  },
540
- "total_flos": 1.0251185354115686e+18,
541
- "train_batch_size": 8,
542
  "trial_name": null,
543
  "trial_params": null
544
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  },
114
  {
115
+ "entropy": 0.5251132612577593,
116
  "epoch": 1.1195516811955168,
117
+ "grad_norm": 0.409588098526001,
118
+ "learning_rate": 0.00019826081236739378,
119
+ "loss": 0.48452903747558596,
120
+ "mean_token_accuracy": 0.8485486621808525,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
+ "entropy": 0.5255133463442325,
126
  "epoch": 1.244084682440847,
127
+ "grad_norm": 0.4035375416278839,
128
+ "learning_rate": 0.00019799182258138876,
129
+ "loss": 0.48237808227539064,
130
+ "mean_token_accuracy": 0.8493754121661187,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
+ "entropy": 0.5253566785156727,
136
  "epoch": 1.3686176836861768,
137
+ "grad_norm": 0.42013707756996155,
138
+ "learning_rate": 0.000197536589854019,
139
+ "loss": 0.4848367691040039,
140
+ "mean_token_accuracy": 0.8477037993073463,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
+ "entropy": 0.520346013456583,
146
  "epoch": 1.4931506849315068,
147
+ "grad_norm": 0.3776898980140686,
148
+ "learning_rate": 0.00019689597214695372,
149
+ "loss": 0.483323860168457,
150
+ "mean_token_accuracy": 0.8489549401402473,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
+ "entropy": 0.5263906873762607,
156
  "epoch": 1.6176836861768369,
157
+ "grad_norm": 0.7503569722175598,
158
+ "learning_rate": 0.00019607117681064075,
159
+ "loss": 0.4831574630737305,
160
+ "mean_token_accuracy": 0.8504172155261039,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
+ "entropy": 0.5092925234138965,
166
  "epoch": 1.7422166874221667,
167
+ "grad_norm": 0.3184664249420166,
168
+ "learning_rate": 0.00019506375830885428,
169
+ "loss": 0.4734436798095703,
170
+ "mean_token_accuracy": 0.8517335096001625,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
+ "entropy": 0.504551088064909,
176
  "epoch": 1.8667496886674968,
177
+ "grad_norm": 0.381900429725647,
178
+ "learning_rate": 0.00019387561528904946,
179
+ "loss": 0.4666786193847656,
180
+ "mean_token_accuracy": 0.8514578703045845,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
+ "entropy": 0.5108272171020508,
186
  "epoch": 1.9912826899128269,
187
+ "grad_norm": 0.3435378670692444,
188
+ "learning_rate": 0.00019250898700404634,
189
+ "loss": 0.4672324752807617,
190
+ "mean_token_accuracy": 0.8505285969376564,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
+ "eval_entropy": 0.515417086177094,
197
+ "eval_mean_token_accuracy": 0.8423887543206992,
198
+ "eval_not_syn_loss": 0.49809959530830383,
199
+ "eval_not_syn_runtime": 95.8874,
200
+ "eval_not_syn_samples_per_second": 14.34,
201
+ "eval_not_syn_steps_per_second": 1.794,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_entropy": 0.4982072594900464,
208
+ "eval_mean_token_accuracy": 0.8627851702446161,
209
  "eval_num_tokens": 1998780.0,
210
+ "eval_syn_loss": 0.47842374444007874,
211
+ "eval_syn_runtime": 98.9429,
212
+ "eval_syn_samples_per_second": 13.897,
213
+ "eval_syn_steps_per_second": 1.738,
214
  "step": 804
215
  },
216
  {
217
+ "entropy": 0.4303537303149098,
218
  "epoch": 2.1145703611457036,
219
+ "grad_norm": 0.35502323508262634,
220
+ "learning_rate": 0.00019096644909178583,
221
+ "loss": 0.38930774688720704,
222
+ "mean_token_accuracy": 0.8710548519486129,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
+ "entropy": 0.431627052128315,
228
  "epoch": 2.2391033623910337,
229
+ "grad_norm": 0.4276222288608551,
230
+ "learning_rate": 0.00018925090872111246,
231
+ "loss": 0.38792034149169924,
232
+ "mean_token_accuracy": 0.8717547968029976,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
+ "entropy": 0.43496647477149963,
238
  "epoch": 2.3636363636363638,
239
+ "grad_norm": 0.3478545546531677,
240
+ "learning_rate": 0.00018736559911273178,
241
+ "loss": 0.3930927276611328,
242
+ "mean_token_accuracy": 0.8688642394542694,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
+ "entropy": 0.4360816968977451,
248
  "epoch": 2.488169364881694,
249
+ "grad_norm": 0.4233720898628235,
250
+ "learning_rate": 0.00018531407344566915,
251
+ "loss": 0.39427772521972654,
252
+ "mean_token_accuracy": 0.867885695695877,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
+ "entropy": 0.43084816545248034,
258
  "epoch": 2.6127023661270234,
259
+ "grad_norm": 0.3549717664718628,
260
+ "learning_rate": 0.0001831001981607139,
261
+ "loss": 0.3911524200439453,
262
+ "mean_token_accuracy": 0.8704073330760003,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
+ "entropy": 0.42635570630431174,
268
  "epoch": 2.7372353673723535,
269
+ "grad_norm": 0.3825649917125702,
270
+ "learning_rate": 0.00018072814567346936,
271
+ "loss": 0.39040073394775393,
272
+ "mean_token_accuracy": 0.8706874567270279,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
+ "entropy": 0.4457248793542385,
278
  "epoch": 2.8617683686176836,
279
+ "grad_norm": 0.393543541431427,
280
+ "learning_rate": 0.00017820238651074317,
281
+ "loss": 0.40400577545166017,
282
+ "mean_token_accuracy": 0.8659286299347877,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
+ "entropy": 0.44322004675865173,
288
  "epoch": 2.9863013698630136,
289
+ "grad_norm": 0.32433032989501953,
290
+ "learning_rate": 0.0001755276808850968,
291
+ "loss": 0.3956157684326172,
292
+ "mean_token_accuracy": 0.8679120713472366,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
+ "eval_entropy": 0.4656750720947288,
299
+ "eval_mean_token_accuracy": 0.8436482773963795,
300
+ "eval_not_syn_loss": 0.502048671245575,
301
+ "eval_not_syn_runtime": 96.1696,
302
+ "eval_not_syn_samples_per_second": 14.298,
303
+ "eval_not_syn_steps_per_second": 1.789,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
+ "eval_entropy": 0.4488667759091355,
310
+ "eval_mean_token_accuracy": 0.86770307186038,
311
  "eval_num_tokens": 2998170.0,
312
+ "eval_syn_loss": 0.47810930013656616,
313
+ "eval_syn_runtime": 98.894,
314
+ "eval_syn_samples_per_second": 13.904,
315
+ "eval_syn_steps_per_second": 1.739,
316
  "step": 1206
317
  },
318
  {
319
+ "entropy": 0.34708237489967636,
320
  "epoch": 3.1095890410958904,
321
+ "grad_norm": 0.36334800720214844,
322
+ "learning_rate": 0.00017270906972343466,
323
+ "loss": 0.306053466796875,
324
+ "mean_token_accuracy": 0.8936621320970131,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
+ "entropy": 0.3444848913699389,
330
  "epoch": 3.2341220423412205,
331
+ "grad_norm": 0.3762398660182953,
332
+ "learning_rate": 0.00016975186516654035,
333
+ "loss": 0.30163915634155275,
334
+ "mean_token_accuracy": 0.8941574484109879,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
+ "entropy": 0.3394099518656731,
340
  "epoch": 3.3586550435865505,
341
+ "grad_norm": 0.4982982873916626,
342
+ "learning_rate": 0.00016666164055746508,
343
+ "loss": 0.2992570495605469,
344
+ "mean_token_accuracy": 0.8951553416252136,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
+ "entropy": 0.3437681415677071,
350
  "epoch": 3.4831880448318806,
351
+ "grad_norm": 0.48411592841148376,
352
+ "learning_rate": 0.00016344421993763733,
353
+ "loss": 0.30575496673583985,
354
+ "mean_token_accuracy": 0.8930543160438538,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
+ "entropy": 0.34468906089663504,
360
  "epoch": 3.6077210460772102,
361
+ "grad_norm": 0.4013198912143707,
362
+ "learning_rate": 0.00016010566707048957,
363
+ "loss": 0.3041123008728027,
364
+ "mean_token_accuracy": 0.8934586471319199,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
+ "entropy": 0.3555850328505039,
370
  "epoch": 3.7322540473225407,
371
+ "grad_norm": 0.3677787184715271,
372
+ "learning_rate": 0.00015665227401328915,
373
+ "loss": 0.31248834609985354,
374
+ "mean_token_accuracy": 0.8907824546098709,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
+ "entropy": 0.3529116767644882,
380
  "epoch": 3.8567870485678704,
381
+ "grad_norm": 0.3733390271663666,
382
+ "learning_rate": 0.00015309054925871163,
383
+ "loss": 0.3145001220703125,
384
+ "mean_token_accuracy": 0.8914457809925079,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
+ "entropy": 0.3519477976113558,
390
  "epoch": 3.9813200498132004,
391
+ "grad_norm": 0.4491842985153198,
392
+ "learning_rate": 0.0001494272054685054,
393
+ "loss": 0.31178840637207034,
394
+ "mean_token_accuracy": 0.8901231214404106,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
+ "eval_entropy": 0.4111844826229783,
401
+ "eval_mean_token_accuracy": 0.8401426447685375,
402
+ "eval_not_syn_loss": 0.5313173532485962,
403
+ "eval_not_syn_runtime": 95.84,
404
+ "eval_not_syn_samples_per_second": 14.347,
405
+ "eval_not_syn_steps_per_second": 1.795,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
+ "eval_entropy": 0.39694498886549195,
412
+ "eval_mean_token_accuracy": 0.8641166465226994,
413
  "eval_num_tokens": 3997560.0,
414
+ "eval_syn_loss": 0.5080230832099915,
415
+ "eval_syn_runtime": 98.944,
416
+ "eval_syn_samples_per_second": 13.897,
417
+ "eval_syn_steps_per_second": 1.738,
418
  "step": 1608
419
  },
420
  {
421
+ "entropy": 0.26439598014559407,
422
  "epoch": 4.104607721046078,
423
+ "grad_norm": 0.4967460632324219,
424
+ "learning_rate": 0.0001456691468223661,
425
+ "loss": 0.22732419967651368,
426
+ "mean_token_accuracy": 0.9187829334928532,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
+ "entropy": 0.2497277507185936,
432
  "epoch": 4.229140722291407,
433
+ "grad_norm": 0.3764539659023285,
434
+ "learning_rate": 0.00014182345600586332,
435
+ "loss": 0.21029539108276368,
436
+ "mean_token_accuracy": 0.9221936762332916,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
+ "entropy": 0.2486180279403925,
442
  "epoch": 4.353673723536737,
443
+ "grad_norm": 0.37183400988578796,
444
+ "learning_rate": 0.0001378973808619437,
445
+ "loss": 0.21009698867797852,
446
+ "mean_token_accuracy": 0.9225871834158897,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
+ "entropy": 0.26017799742519854,
452
  "epoch": 4.478206724782067,
453
+ "grad_norm": 0.43216991424560547,
454
+ "learning_rate": 0.00013389832073116724,
455
+ "loss": 0.2177184295654297,
456
+ "mean_token_accuracy": 0.9205843013525009,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
+ "entropy": 0.26708075165748596,
462
  "epoch": 4.602739726027397,
463
+ "grad_norm": 0.37948140501976013,
464
+ "learning_rate": 0.00012983381250642132,
465
+ "loss": 0.22203298568725585,
466
+ "mean_token_accuracy": 0.9184661367535591,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
+ "entropy": 0.25188380032777785,
472
  "epoch": 4.7272727272727275,
473
+ "grad_norm": 0.4874045252799988,
474
+ "learning_rate": 0.00012571151642839446,
475
+ "loss": 0.21441835403442383,
476
+ "mean_token_accuracy": 0.9194883331656456,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
+ "entropy": 0.2515877375751734,
482
  "epoch": 4.851805728518057,
483
+ "grad_norm": 0.43732911348342896,
484
+ "learning_rate": 0.00012153920164858083,
485
+ "loss": 0.21374931335449218,
486
+ "mean_token_accuracy": 0.921038504242897,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
+ "entropy": 0.254078243970871,
492
  "epoch": 4.976338729763388,
493
+ "grad_norm": 0.41199925541877747,
494
+ "learning_rate": 0.00011732473158702397,
495
+ "loss": 0.21298355102539063,
496
+ "mean_token_accuracy": 0.9217021322250366,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
+ "eval_entropy": 0.31937412129238596,
503
+ "eval_mean_token_accuracy": 0.8401600659586662,
504
+ "eval_not_syn_loss": 0.6070574522018433,
505
+ "eval_not_syn_runtime": 95.8354,
506
+ "eval_not_syn_samples_per_second": 14.348,
507
+ "eval_not_syn_steps_per_second": 1.795,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
+ "eval_entropy": 0.30796578786400863,
514
+ "eval_mean_token_accuracy": 0.8628802147022513,
515
  "eval_num_tokens": 4996950.0,
516
+ "eval_syn_loss": 0.5800920128822327,
517
+ "eval_syn_runtime": 98.9566,
518
+ "eval_syn_samples_per_second": 13.895,
519
+ "eval_syn_steps_per_second": 1.738,
520
  "step": 2010
521
  }
522
  ],
 
537
  "attributes": {}
538
  }
539
  },
540
+ "total_flos": 8.195668813557658e+17,
541
+ "train_batch_size": 4,
542
  "trial_name": null,
543
  "trial_params": null
544
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2010/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2412/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2412/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:149f6b3a2d3a05bc60fa87032bbad932350de2daacf8ebee6eda07fb5d39e107
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eabb9930fbd0d966b39c4afd562574aa9dfcc917f507e0db6d1909d38d983fa7
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2412/trainer_state.json CHANGED
@@ -10,613 +10,613 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  },
114
  {
115
- "entropy": 0.5431136073488178,
116
  "epoch": 1.1195516811955168,
117
- "grad_norm": 0.6364207863807678,
118
- "learning_rate": 0.0003023597373031377,
119
- "loss": 0.5128697204589844,
120
- "mean_token_accuracy": 0.8424755226482045,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
- "entropy": 0.5492669984698295,
126
  "epoch": 1.244084682440847,
127
- "grad_norm": 0.7887948155403137,
128
- "learning_rate": 0.00030194951160062724,
129
- "loss": 0.5205921554565429,
130
- "mean_token_accuracy": 0.8417876678705215,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
- "entropy": 0.8709675747156144,
136
  "epoch": 1.3686176836861768,
137
- "grad_norm": 0.734591007232666,
138
- "learning_rate": 0.00030125525414139597,
139
- "loss": 0.9426271057128907,
140
- "mean_token_accuracy": 0.7773911562561989,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
- "entropy": 0.5833489724993706,
146
  "epoch": 1.4931506849315068,
147
- "grad_norm": 1.1496635675430298,
148
- "learning_rate": 0.000300278273368912,
149
- "loss": 0.560246467590332,
150
- "mean_token_accuracy": 0.8333927792310715,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
- "entropy": 3.3383523294329644,
156
  "epoch": 1.6176836861768369,
157
- "grad_norm": 4.88291597366333,
158
- "learning_rate": 0.0002990204105656753,
159
- "loss": 3.895135803222656,
160
- "mean_token_accuracy": 0.4006860972382128,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
- "entropy": 4.406278321743011,
166
  "epoch": 1.7422166874221667,
167
- "grad_norm": 2.488743305206299,
168
- "learning_rate": 0.0002974840363830152,
169
- "loss": 4.424278259277344,
170
- "mean_token_accuracy": 0.2545871638506651,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
- "entropy": 3.0789780139923097,
176
  "epoch": 1.8667496886674968,
177
- "grad_norm": 2.5833027362823486,
178
- "learning_rate": 0.00029567204637320413,
179
- "loss": 3.105696716308594,
180
- "mean_token_accuracy": 0.4513031896948814,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
- "entropy": 2.616127333641052,
186
  "epoch": 1.9912826899128269,
187
- "grad_norm": 1.0389364957809448,
188
- "learning_rate": 0.00029358785553230884,
189
- "loss": 2.6003131103515624,
190
- "mean_token_accuracy": 0.5205484080314636,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
- "eval_entropy": 2.5684494819752004,
197
- "eval_mean_token_accuracy": 0.547617181095966,
198
- "eval_not_syn_loss": 2.458150863647461,
199
- "eval_not_syn_runtime": 95.9896,
200
- "eval_not_syn_samples_per_second": 14.324,
201
- "eval_not_syn_steps_per_second": 1.792,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_entropy": 2.5337963104248047,
208
- "eval_mean_token_accuracy": 0.5359611507765082,
209
  "eval_num_tokens": 1998780.0,
210
- "eval_syn_loss": 2.417705535888672,
211
- "eval_syn_runtime": 99.0288,
212
- "eval_syn_samples_per_second": 13.885,
213
- "eval_syn_steps_per_second": 1.737,
214
  "step": 804
215
  },
216
  {
217
- "entropy": 2.380885266294383,
218
  "epoch": 2.1145703611457036,
219
- "grad_norm": 0.7961218953132629,
220
- "learning_rate": 0.00029123539186406294,
221
- "loss": 2.3589501953125,
222
- "mean_token_accuracy": 0.550012742630159,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
- "entropy": 2.3139565205574035,
228
  "epoch": 2.2391033623910337,
229
- "grad_norm": 0.7128121256828308,
230
- "learning_rate": 0.00028861908897689166,
231
- "loss": 2.2914646911621093,
232
- "mean_token_accuracy": 0.5581977427005768,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
- "entropy": 2.247106301784515,
238
  "epoch": 2.3636363636363638,
239
- "grad_norm": 0.7539874315261841,
240
- "learning_rate": 0.00028574387772804067,
241
- "loss": 2.212442169189453,
242
- "mean_token_accuracy": 0.5661728882789612,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
- "entropy": 2.177862950563431,
248
  "epoch": 2.488169364881694,
249
- "grad_norm": 1.169326663017273,
250
- "learning_rate": 0.00028261517693055664,
251
- "loss": 2.1524928283691405,
252
- "mean_token_accuracy": 0.574029511809349,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
- "entropy": 2.11708885550499,
258
  "epoch": 2.6127023661270234,
259
- "grad_norm": 0.6508749723434448,
260
- "learning_rate": 0.0002792388831406343,
261
- "loss": 2.0897698974609376,
262
- "mean_token_accuracy": 0.5808322209119797,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
- "entropy": 2.0785110569000245,
268
  "epoch": 2.7372353673723535,
269
- "grad_norm": 1.4157167673110962,
270
- "learning_rate": 0.0002756213595445772,
271
- "loss": 2.0574497985839844,
272
- "mean_token_accuracy": 0.5843770909309387,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
- "entropy": 2.091529769897461,
278
  "epoch": 2.8617683686176836,
279
- "grad_norm": 1.6840300559997559,
280
- "learning_rate": 0.00027176942396631626,
281
- "loss": 2.062296142578125,
282
- "mean_token_accuracy": 0.58568293094635,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
- "entropy": 2.0315397584438326,
288
  "epoch": 2.9863013698630136,
289
- "grad_norm": 1.153908133506775,
290
- "learning_rate": 0.0002676903360180885,
291
- "loss": 2.0053924560546874,
292
- "mean_token_accuracy": 0.5878721660375595,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
- "eval_entropy": 2.0182155738043233,
299
- "eval_mean_token_accuracy": 0.5854449913252232,
300
- "eval_not_syn_loss": 2.0105812549591064,
301
- "eval_not_syn_runtime": 95.9278,
302
- "eval_not_syn_samples_per_second": 14.334,
303
- "eval_not_syn_steps_per_second": 1.793,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
- "eval_entropy": 1.9446905400863914,
310
- "eval_mean_token_accuracy": 0.5995188099007274,
311
  "eval_num_tokens": 2998170.0,
312
- "eval_syn_loss": 1.9396028518676758,
313
- "eval_syn_runtime": 98.8737,
314
- "eval_syn_samples_per_second": 13.907,
315
- "eval_syn_steps_per_second": 1.74,
316
  "step": 1206
317
  },
318
  {
319
- "entropy": 1.9578519951213489,
320
  "epoch": 3.1095890410958904,
321
- "grad_norm": 0.6903329491615295,
322
- "learning_rate": 0.00026339178341849265,
323
- "loss": 1.9289002990722657,
324
- "mean_token_accuracy": 0.5986791457792725,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
- "entropy": 1.9225856459140778,
330
  "epoch": 3.2341220423412205,
331
- "grad_norm": 0.9334422945976257,
332
- "learning_rate": 0.00025888186750370616,
333
- "loss": 1.9024064636230469,
334
- "mean_token_accuracy": 0.6029617834091187,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
- "entropy": 1.8814079523086549,
340
  "epoch": 3.3586550435865505,
341
- "grad_norm": 0.779586911201477,
342
- "learning_rate": 0.00025416908795917244,
343
- "loss": 1.8609922790527345,
344
- "mean_token_accuracy": 0.6097110098600388,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
- "entropy": 1.8681990671157838,
350
  "epoch": 3.4831880448318806,
351
- "grad_norm": 0.941205620765686,
352
- "learning_rate": 0.0002492623268005321,
353
- "loss": 1.8448422241210938,
354
- "mean_token_accuracy": 0.6135021150112152,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
- "entropy": 1.8034737646579742,
360
  "epoch": 3.6077210460772102,
361
- "grad_norm": 1.2094346284866333,
362
- "learning_rate": 0.0002441708316339893,
363
- "loss": 1.7913978576660157,
364
- "mean_token_accuracy": 0.622687805891037,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
- "entropy": 1.8016248047351837,
370
  "epoch": 3.7322540473225407,
371
- "grad_norm": 0.7303705215454102,
372
- "learning_rate": 0.00023890419822766254,
373
- "loss": 1.7701612854003905,
374
- "mean_token_accuracy": 0.6246249091625213,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
- "entropy": 1.7500162029266357,
380
  "epoch": 3.8567870485678704,
381
- "grad_norm": 0.954683244228363,
382
- "learning_rate": 0.0002334723524267661,
383
- "loss": 1.7250523376464844,
384
- "mean_token_accuracy": 0.6312138819694519,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
- "entropy": 1.6828746914863586,
390
  "epoch": 3.9813200498132004,
391
- "grad_norm": 0.8929846882820129,
392
- "learning_rate": 0.0002278855314467064,
393
- "loss": 1.6539949035644532,
394
- "mean_token_accuracy": 0.6412730294466019,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
- "eval_entropy": 1.7406537913998892,
401
- "eval_mean_token_accuracy": 0.6226560525422873,
402
- "eval_not_syn_loss": 1.707612156867981,
403
- "eval_not_syn_runtime": 95.956,
404
- "eval_not_syn_samples_per_second": 14.329,
405
- "eval_not_syn_steps_per_second": 1.792,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
- "eval_entropy": 1.6662698221761127,
412
- "eval_mean_token_accuracy": 0.6587355476479198,
413
  "eval_num_tokens": 3997560.0,
414
- "eval_syn_loss": 1.6402223110198975,
415
- "eval_syn_runtime": 99.0503,
416
- "eval_syn_samples_per_second": 13.882,
417
- "eval_syn_steps_per_second": 1.736,
418
  "step": 1608
419
  },
420
  {
421
- "entropy": 1.634241136637601,
422
  "epoch": 4.104607721046078,
423
- "grad_norm": 0.7396109700202942,
424
- "learning_rate": 0.0002221542645793497,
425
- "loss": 1.610531768798828,
426
- "mean_token_accuracy": 0.6454936681371747,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
- "entropy": 1.588758965730667,
432
  "epoch": 4.229140722291407,
433
- "grad_norm": 0.779146134853363,
434
- "learning_rate": 0.00021628935334882304,
435
- "loss": 1.5645944213867187,
436
- "mean_token_accuracy": 0.6525067055225372,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
- "entropy": 1.5439102852344513,
442
  "epoch": 4.353673723536737,
443
- "grad_norm": 0.614967942237854,
444
- "learning_rate": 0.00021030185115424846,
445
- "loss": 1.5180734252929688,
446
- "mean_token_accuracy": 0.6614933741092682,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
- "entropy": 1.5201536059379577,
452
  "epoch": 4.478206724782067,
453
- "grad_norm": 0.6725050210952759,
454
- "learning_rate": 0.00020420304243777673,
455
- "loss": 1.4964521789550782,
456
- "mean_token_accuracy": 0.6630363047122956,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
- "entropy": 1.503999525308609,
462
  "epoch": 4.602739726027397,
463
- "grad_norm": 0.7740678191184998,
464
- "learning_rate": 0.00019800442141718245,
465
- "loss": 1.4821170043945313,
466
- "mean_token_accuracy": 0.6658095389604568,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
- "entropy": 1.4707296586036682,
472
  "epoch": 4.7272727272727275,
473
- "grad_norm": 0.7613642811775208,
474
- "learning_rate": 0.00019171767042310182,
475
- "loss": 1.451023406982422,
476
- "mean_token_accuracy": 0.6706610345840454,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
- "entropy": 1.449094181060791,
482
  "epoch": 4.851805728518057,
483
- "grad_norm": 0.7195038199424744,
484
- "learning_rate": 0.00018535463788174053,
485
- "loss": 1.4288282775878907,
486
- "mean_token_accuracy": 0.6740301263332367,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
- "entropy": 1.4346733844280244,
492
  "epoch": 4.976338729763388,
493
- "grad_norm": 0.7584077715873718,
494
- "learning_rate": 0.00017892731598454726,
495
- "loss": 1.4155799865722656,
496
- "mean_token_accuracy": 0.6788010179996491,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
- "eval_entropy": 1.4948647777701534,
503
- "eval_mean_token_accuracy": 0.661396715876668,
504
- "eval_not_syn_loss": 1.4358112812042236,
505
- "eval_not_syn_runtime": 95.9987,
506
- "eval_not_syn_samples_per_second": 14.323,
507
- "eval_not_syn_steps_per_second": 1.792,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
- "eval_entropy": 1.4439531952835793,
514
- "eval_mean_token_accuracy": 0.6999529783808908,
515
  "eval_num_tokens": 4996950.0,
516
- "eval_syn_loss": 1.3766233921051025,
517
- "eval_syn_runtime": 99.0602,
518
- "eval_syn_samples_per_second": 13.88,
519
- "eval_syn_steps_per_second": 1.736,
520
  "step": 2010
521
  },
522
  {
523
- "entropy": 1.345623204202363,
524
  "epoch": 5.099626400996264,
525
- "grad_norm": 0.8724157214164734,
526
- "learning_rate": 0.00017244781808693755,
527
- "loss": 1.323695831298828,
528
- "mean_token_accuracy": 0.6866910710479274,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
- "entropy": 1.3074172997474671,
534
  "epoch": 5.224159402241594,
535
- "grad_norm": 0.8229172825813293,
536
- "learning_rate": 0.00016592835587866364,
537
- "loss": 1.295655517578125,
538
- "mean_token_accuracy": 0.6931505644321442,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
- "entropy": 1.298924881219864,
544
  "epoch": 5.348692403486924,
545
- "grad_norm": 0.9111100435256958,
546
- "learning_rate": 0.0001593812163688578,
547
- "loss": 1.2771641540527343,
548
- "mean_token_accuracy": 0.6941236406564713,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
- "entropy": 1.3120970189571382,
554
  "epoch": 5.473225404732254,
555
- "grad_norm": 0.7868310213088989,
556
- "learning_rate": 0.000152818738729123,
557
- "loss": 1.2942347717285156,
558
- "mean_token_accuracy": 0.6939822369813919,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
- "entropy": 1.2729843455553054,
564
  "epoch": 5.597758405977584,
565
- "grad_norm": 0.9405992031097412,
566
- "learning_rate": 0.00014625329103831503,
567
- "loss": 1.2503909301757812,
568
- "mean_token_accuracy": 0.7005668586492538,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
- "entropy": 1.2202323937416077,
574
  "epoch": 5.722291407222914,
575
- "grad_norm": 0.7629554271697998,
576
- "learning_rate": 0.00013969724697284394,
577
- "loss": 1.199918212890625,
578
- "mean_token_accuracy": 0.7085251879692077,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
- "entropy": 1.1850077486038209,
584
  "epoch": 5.846824408468244,
585
- "grad_norm": 0.9016424417495728,
586
- "learning_rate": 0.00013316296248642664,
587
- "loss": 1.166585693359375,
588
- "mean_token_accuracy": 0.7123788893222809,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
- "entropy": 1.2040903121232986,
594
  "epoch": 5.971357409713574,
595
- "grad_norm": 0.8533362150192261,
596
- "learning_rate": 0.0001266627525232398,
597
- "loss": 1.1885869598388672,
598
- "mean_token_accuracy": 0.7100468480587006,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
- "eval_entropy": 1.2157157427349756,
605
- "eval_mean_token_accuracy": 0.7150737251653227,
606
- "eval_not_syn_loss": 1.2506903409957886,
607
- "eval_not_syn_runtime": 95.9618,
608
- "eval_not_syn_samples_per_second": 14.329,
609
- "eval_not_syn_steps_per_second": 1.792,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
- "eval_entropy": 1.178627569661584,
616
- "eval_mean_token_accuracy": 0.7016614221556242,
617
  "eval_num_tokens": 5996340.0,
618
- "eval_syn_loss": 1.2114850282669067,
619
- "eval_syn_runtime": 98.9234,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
@@ -639,8 +639,8 @@
639
  "attributes": {}
640
  }
641
  },
642
- "total_flos": 1.2306720169962086e+18,
643
- "train_batch_size": 8,
644
  "trial_name": null,
645
  "trial_params": null
646
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  },
114
  {
115
+ "entropy": 0.5251132612577593,
116
  "epoch": 1.1195516811955168,
117
+ "grad_norm": 0.409588098526001,
118
+ "learning_rate": 0.00019826081236739378,
119
+ "loss": 0.48452903747558596,
120
+ "mean_token_accuracy": 0.8485486621808525,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
+ "entropy": 0.5255133463442325,
126
  "epoch": 1.244084682440847,
127
+ "grad_norm": 0.4035375416278839,
128
+ "learning_rate": 0.00019799182258138876,
129
+ "loss": 0.48237808227539064,
130
+ "mean_token_accuracy": 0.8493754121661187,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
+ "entropy": 0.5253566785156727,
136
  "epoch": 1.3686176836861768,
137
+ "grad_norm": 0.42013707756996155,
138
+ "learning_rate": 0.000197536589854019,
139
+ "loss": 0.4848367691040039,
140
+ "mean_token_accuracy": 0.8477037993073463,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
+ "entropy": 0.520346013456583,
146
  "epoch": 1.4931506849315068,
147
+ "grad_norm": 0.3776898980140686,
148
+ "learning_rate": 0.00019689597214695372,
149
+ "loss": 0.483323860168457,
150
+ "mean_token_accuracy": 0.8489549401402473,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
+ "entropy": 0.5263906873762607,
156
  "epoch": 1.6176836861768369,
157
+ "grad_norm": 0.7503569722175598,
158
+ "learning_rate": 0.00019607117681064075,
159
+ "loss": 0.4831574630737305,
160
+ "mean_token_accuracy": 0.8504172155261039,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
+ "entropy": 0.5092925234138965,
166
  "epoch": 1.7422166874221667,
167
+ "grad_norm": 0.3184664249420166,
168
+ "learning_rate": 0.00019506375830885428,
169
+ "loss": 0.4734436798095703,
170
+ "mean_token_accuracy": 0.8517335096001625,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
+ "entropy": 0.504551088064909,
176
  "epoch": 1.8667496886674968,
177
+ "grad_norm": 0.381900429725647,
178
+ "learning_rate": 0.00019387561528904946,
179
+ "loss": 0.4666786193847656,
180
+ "mean_token_accuracy": 0.8514578703045845,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
+ "entropy": 0.5108272171020508,
186
  "epoch": 1.9912826899128269,
187
+ "grad_norm": 0.3435378670692444,
188
+ "learning_rate": 0.00019250898700404634,
189
+ "loss": 0.4672324752807617,
190
+ "mean_token_accuracy": 0.8505285969376564,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
+ "eval_entropy": 0.515417086177094,
197
+ "eval_mean_token_accuracy": 0.8423887543206992,
198
+ "eval_not_syn_loss": 0.49809959530830383,
199
+ "eval_not_syn_runtime": 95.8874,
200
+ "eval_not_syn_samples_per_second": 14.34,
201
+ "eval_not_syn_steps_per_second": 1.794,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_entropy": 0.4982072594900464,
208
+ "eval_mean_token_accuracy": 0.8627851702446161,
209
  "eval_num_tokens": 1998780.0,
210
+ "eval_syn_loss": 0.47842374444007874,
211
+ "eval_syn_runtime": 98.9429,
212
+ "eval_syn_samples_per_second": 13.897,
213
+ "eval_syn_steps_per_second": 1.738,
214
  "step": 804
215
  },
216
  {
217
+ "entropy": 0.4303537303149098,
218
  "epoch": 2.1145703611457036,
219
+ "grad_norm": 0.35502323508262634,
220
+ "learning_rate": 0.00019096644909178583,
221
+ "loss": 0.38930774688720704,
222
+ "mean_token_accuracy": 0.8710548519486129,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
+ "entropy": 0.431627052128315,
228
  "epoch": 2.2391033623910337,
229
+ "grad_norm": 0.4276222288608551,
230
+ "learning_rate": 0.00018925090872111246,
231
+ "loss": 0.38792034149169924,
232
+ "mean_token_accuracy": 0.8717547968029976,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
+ "entropy": 0.43496647477149963,
238
  "epoch": 2.3636363636363638,
239
+ "grad_norm": 0.3478545546531677,
240
+ "learning_rate": 0.00018736559911273178,
241
+ "loss": 0.3930927276611328,
242
+ "mean_token_accuracy": 0.8688642394542694,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
+ "entropy": 0.4360816968977451,
248
  "epoch": 2.488169364881694,
249
+ "grad_norm": 0.4233720898628235,
250
+ "learning_rate": 0.00018531407344566915,
251
+ "loss": 0.39427772521972654,
252
+ "mean_token_accuracy": 0.867885695695877,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
+ "entropy": 0.43084816545248034,
258
  "epoch": 2.6127023661270234,
259
+ "grad_norm": 0.3549717664718628,
260
+ "learning_rate": 0.0001831001981607139,
261
+ "loss": 0.3911524200439453,
262
+ "mean_token_accuracy": 0.8704073330760003,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
+ "entropy": 0.42635570630431174,
268
  "epoch": 2.7372353673723535,
269
+ "grad_norm": 0.3825649917125702,
270
+ "learning_rate": 0.00018072814567346936,
271
+ "loss": 0.39040073394775393,
272
+ "mean_token_accuracy": 0.8706874567270279,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
+ "entropy": 0.4457248793542385,
278
  "epoch": 2.8617683686176836,
279
+ "grad_norm": 0.393543541431427,
280
+ "learning_rate": 0.00017820238651074317,
281
+ "loss": 0.40400577545166017,
282
+ "mean_token_accuracy": 0.8659286299347877,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
+ "entropy": 0.44322004675865173,
288
  "epoch": 2.9863013698630136,
289
+ "grad_norm": 0.32433032989501953,
290
+ "learning_rate": 0.0001755276808850968,
291
+ "loss": 0.3956157684326172,
292
+ "mean_token_accuracy": 0.8679120713472366,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
+ "eval_entropy": 0.4656750720947288,
299
+ "eval_mean_token_accuracy": 0.8436482773963795,
300
+ "eval_not_syn_loss": 0.502048671245575,
301
+ "eval_not_syn_runtime": 96.1696,
302
+ "eval_not_syn_samples_per_second": 14.298,
303
+ "eval_not_syn_steps_per_second": 1.789,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
+ "eval_entropy": 0.4488667759091355,
310
+ "eval_mean_token_accuracy": 0.86770307186038,
311
  "eval_num_tokens": 2998170.0,
312
+ "eval_syn_loss": 0.47810930013656616,
313
+ "eval_syn_runtime": 98.894,
314
+ "eval_syn_samples_per_second": 13.904,
315
+ "eval_syn_steps_per_second": 1.739,
316
  "step": 1206
317
  },
318
  {
319
+ "entropy": 0.34708237489967636,
320
  "epoch": 3.1095890410958904,
321
+ "grad_norm": 0.36334800720214844,
322
+ "learning_rate": 0.00017270906972343466,
323
+ "loss": 0.306053466796875,
324
+ "mean_token_accuracy": 0.8936621320970131,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
+ "entropy": 0.3444848913699389,
330
  "epoch": 3.2341220423412205,
331
+ "grad_norm": 0.3762398660182953,
332
+ "learning_rate": 0.00016975186516654035,
333
+ "loss": 0.30163915634155275,
334
+ "mean_token_accuracy": 0.8941574484109879,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
+ "entropy": 0.3394099518656731,
340
  "epoch": 3.3586550435865505,
341
+ "grad_norm": 0.4982982873916626,
342
+ "learning_rate": 0.00016666164055746508,
343
+ "loss": 0.2992570495605469,
344
+ "mean_token_accuracy": 0.8951553416252136,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
+ "entropy": 0.3437681415677071,
350
  "epoch": 3.4831880448318806,
351
+ "grad_norm": 0.48411592841148376,
352
+ "learning_rate": 0.00016344421993763733,
353
+ "loss": 0.30575496673583985,
354
+ "mean_token_accuracy": 0.8930543160438538,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
+ "entropy": 0.34468906089663504,
360
  "epoch": 3.6077210460772102,
361
+ "grad_norm": 0.4013198912143707,
362
+ "learning_rate": 0.00016010566707048957,
363
+ "loss": 0.3041123008728027,
364
+ "mean_token_accuracy": 0.8934586471319199,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
+ "entropy": 0.3555850328505039,
370
  "epoch": 3.7322540473225407,
371
+ "grad_norm": 0.3677787184715271,
372
+ "learning_rate": 0.00015665227401328915,
373
+ "loss": 0.31248834609985354,
374
+ "mean_token_accuracy": 0.8907824546098709,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
+ "entropy": 0.3529116767644882,
380
  "epoch": 3.8567870485678704,
381
+ "grad_norm": 0.3733390271663666,
382
+ "learning_rate": 0.00015309054925871163,
383
+ "loss": 0.3145001220703125,
384
+ "mean_token_accuracy": 0.8914457809925079,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
+ "entropy": 0.3519477976113558,
390
  "epoch": 3.9813200498132004,
391
+ "grad_norm": 0.4491842985153198,
392
+ "learning_rate": 0.0001494272054685054,
393
+ "loss": 0.31178840637207034,
394
+ "mean_token_accuracy": 0.8901231214404106,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
+ "eval_entropy": 0.4111844826229783,
401
+ "eval_mean_token_accuracy": 0.8401426447685375,
402
+ "eval_not_syn_loss": 0.5313173532485962,
403
+ "eval_not_syn_runtime": 95.84,
404
+ "eval_not_syn_samples_per_second": 14.347,
405
+ "eval_not_syn_steps_per_second": 1.795,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
+ "eval_entropy": 0.39694498886549195,
412
+ "eval_mean_token_accuracy": 0.8641166465226994,
413
  "eval_num_tokens": 3997560.0,
414
+ "eval_syn_loss": 0.5080230832099915,
415
+ "eval_syn_runtime": 98.944,
416
+ "eval_syn_samples_per_second": 13.897,
417
+ "eval_syn_steps_per_second": 1.738,
418
  "step": 1608
419
  },
420
  {
421
+ "entropy": 0.26439598014559407,
422
  "epoch": 4.104607721046078,
423
+ "grad_norm": 0.4967460632324219,
424
+ "learning_rate": 0.0001456691468223661,
425
+ "loss": 0.22732419967651368,
426
+ "mean_token_accuracy": 0.9187829334928532,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
+ "entropy": 0.2497277507185936,
432
  "epoch": 4.229140722291407,
433
+ "grad_norm": 0.3764539659023285,
434
+ "learning_rate": 0.00014182345600586332,
435
+ "loss": 0.21029539108276368,
436
+ "mean_token_accuracy": 0.9221936762332916,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
+ "entropy": 0.2486180279403925,
442
  "epoch": 4.353673723536737,
443
+ "grad_norm": 0.37183400988578796,
444
+ "learning_rate": 0.0001378973808619437,
445
+ "loss": 0.21009698867797852,
446
+ "mean_token_accuracy": 0.9225871834158897,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
+ "entropy": 0.26017799742519854,
452
  "epoch": 4.478206724782067,
453
+ "grad_norm": 0.43216991424560547,
454
+ "learning_rate": 0.00013389832073116724,
455
+ "loss": 0.2177184295654297,
456
+ "mean_token_accuracy": 0.9205843013525009,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
+ "entropy": 0.26708075165748596,
462
  "epoch": 4.602739726027397,
463
+ "grad_norm": 0.37948140501976013,
464
+ "learning_rate": 0.00012983381250642132,
465
+ "loss": 0.22203298568725585,
466
+ "mean_token_accuracy": 0.9184661367535591,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
+ "entropy": 0.25188380032777785,
472
  "epoch": 4.7272727272727275,
473
+ "grad_norm": 0.4874045252799988,
474
+ "learning_rate": 0.00012571151642839446,
475
+ "loss": 0.21441835403442383,
476
+ "mean_token_accuracy": 0.9194883331656456,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
+ "entropy": 0.2515877375751734,
482
  "epoch": 4.851805728518057,
483
+ "grad_norm": 0.43732911348342896,
484
+ "learning_rate": 0.00012153920164858083,
485
+ "loss": 0.21374931335449218,
486
+ "mean_token_accuracy": 0.921038504242897,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
+ "entropy": 0.254078243970871,
492
  "epoch": 4.976338729763388,
493
+ "grad_norm": 0.41199925541877747,
494
+ "learning_rate": 0.00011732473158702397,
495
+ "loss": 0.21298355102539063,
496
+ "mean_token_accuracy": 0.9217021322250366,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
+ "eval_entropy": 0.31937412129238596,
503
+ "eval_mean_token_accuracy": 0.8401600659586662,
504
+ "eval_not_syn_loss": 0.6070574522018433,
505
+ "eval_not_syn_runtime": 95.8354,
506
+ "eval_not_syn_samples_per_second": 14.348,
507
+ "eval_not_syn_steps_per_second": 1.795,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
+ "eval_entropy": 0.30796578786400863,
514
+ "eval_mean_token_accuracy": 0.8628802147022513,
515
  "eval_num_tokens": 4996950.0,
516
+ "eval_syn_loss": 0.5800920128822327,
517
+ "eval_syn_runtime": 98.9566,
518
+ "eval_syn_samples_per_second": 13.895,
519
+ "eval_syn_steps_per_second": 1.738,
520
  "step": 2010
521
  },
522
  {
523
+ "entropy": 0.19132825570425602,
524
  "epoch": 5.099626400996264,
525
+ "grad_norm": 0.45538511872291565,
526
+ "learning_rate": 0.0001130760491123961,
527
+ "loss": 0.153853759765625,
528
+ "mean_token_accuracy": 0.9434747304579224,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
+ "entropy": 0.17422323439270257,
534
  "epoch": 5.224159402241594,
535
+ "grad_norm": 0.4580552279949188,
536
+ "learning_rate": 0.00010880116157234302,
537
+ "loss": 0.13612208366394044,
538
+ "mean_token_accuracy": 0.9487342464923859,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
+ "entropy": 0.1799074411019683,
544
  "epoch": 5.348692403486924,
545
+ "grad_norm": 0.39040836691856384,
546
+ "learning_rate": 0.00010450812570230789,
547
+ "loss": 0.14250579833984375,
548
+ "mean_token_accuracy": 0.9467630323767662,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
+ "entropy": 0.18116022080183028,
554
  "epoch": 5.473225404732254,
555
+ "grad_norm": 0.3805501163005829,
556
+ "learning_rate": 0.00010020503244127544,
557
+ "loss": 0.14014955520629882,
558
+ "mean_token_accuracy": 0.946874064207077,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
+ "entropy": 0.1810251370444894,
564
  "epoch": 5.597758405977584,
565
+ "grad_norm": 0.36377736926078796,
566
+ "learning_rate": 9.589999168305372e-05,
567
+ "loss": 0.14044331550598144,
568
+ "mean_token_accuracy": 0.9467302888631821,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
+ "entropy": 0.17202148117125035,
574
  "epoch": 5.722291407222914,
575
+ "grad_norm": 0.38708847761154175,
576
+ "learning_rate": 9.16011169918326e-05,
577
+ "loss": 0.13840054512023925,
578
+ "mean_token_accuracy": 0.9480020496249199,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
+ "entropy": 0.1769936926662922,
584
  "epoch": 5.846824408468244,
585
+ "grad_norm": 0.4674988389015198,
586
+ "learning_rate": 8.73165103108249e-05,
587
+ "loss": 0.1404121208190918,
588
+ "mean_token_accuracy": 0.9461787036061287,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
+ "entropy": 0.17118842527270317,
594
  "epoch": 5.971357409713574,
595
+ "grad_norm": 0.4454115629196167,
596
+ "learning_rate": 8.305424669280888e-05,
597
+ "loss": 0.1366124439239502,
598
+ "mean_token_accuracy": 0.9484315186738967,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
+ "eval_entropy": 0.2494078171114589,
605
+ "eval_mean_token_accuracy": 0.851767166062843,
606
+ "eval_not_syn_loss": 0.7222095131874084,
607
+ "eval_not_syn_runtime": 95.8715,
608
+ "eval_not_syn_samples_per_second": 14.342,
609
+ "eval_not_syn_steps_per_second": 1.794,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
+ "eval_entropy": 0.239824180079754,
616
+ "eval_mean_token_accuracy": 0.8484749911829482,
617
  "eval_num_tokens": 5996340.0,
618
+ "eval_syn_loss": 0.7036991119384766,
619
+ "eval_syn_runtime": 98.923,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
 
639
  "attributes": {}
640
  }
641
  },
642
+ "total_flos": 9.832542010122854e+17,
643
+ "train_batch_size": 4,
644
  "trial_name": null,
645
  "trial_params": null
646
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2412/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2814/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2814/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d12a47e34a306382397f74cbcf0e39f1683fa5eab3a83931deec91c7a768aab
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:565e0a2c65111239b0973cf73addff4ab61c83d8ea5bda8d6fd11ccd2c60425b
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2814/trainer_state.json CHANGED
@@ -10,717 +10,717 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  },
114
  {
115
- "entropy": 0.5431136073488178,
116
  "epoch": 1.1195516811955168,
117
- "grad_norm": 0.6364207863807678,
118
- "learning_rate": 0.0003023597373031377,
119
- "loss": 0.5128697204589844,
120
- "mean_token_accuracy": 0.8424755226482045,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
- "entropy": 0.5492669984698295,
126
  "epoch": 1.244084682440847,
127
- "grad_norm": 0.7887948155403137,
128
- "learning_rate": 0.00030194951160062724,
129
- "loss": 0.5205921554565429,
130
- "mean_token_accuracy": 0.8417876678705215,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
- "entropy": 0.8709675747156144,
136
  "epoch": 1.3686176836861768,
137
- "grad_norm": 0.734591007232666,
138
- "learning_rate": 0.00030125525414139597,
139
- "loss": 0.9426271057128907,
140
- "mean_token_accuracy": 0.7773911562561989,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
- "entropy": 0.5833489724993706,
146
  "epoch": 1.4931506849315068,
147
- "grad_norm": 1.1496635675430298,
148
- "learning_rate": 0.000300278273368912,
149
- "loss": 0.560246467590332,
150
- "mean_token_accuracy": 0.8333927792310715,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
- "entropy": 3.3383523294329644,
156
  "epoch": 1.6176836861768369,
157
- "grad_norm": 4.88291597366333,
158
- "learning_rate": 0.0002990204105656753,
159
- "loss": 3.895135803222656,
160
- "mean_token_accuracy": 0.4006860972382128,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
- "entropy": 4.406278321743011,
166
  "epoch": 1.7422166874221667,
167
- "grad_norm": 2.488743305206299,
168
- "learning_rate": 0.0002974840363830152,
169
- "loss": 4.424278259277344,
170
- "mean_token_accuracy": 0.2545871638506651,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
- "entropy": 3.0789780139923097,
176
  "epoch": 1.8667496886674968,
177
- "grad_norm": 2.5833027362823486,
178
- "learning_rate": 0.00029567204637320413,
179
- "loss": 3.105696716308594,
180
- "mean_token_accuracy": 0.4513031896948814,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
- "entropy": 2.616127333641052,
186
  "epoch": 1.9912826899128269,
187
- "grad_norm": 1.0389364957809448,
188
- "learning_rate": 0.00029358785553230884,
189
- "loss": 2.6003131103515624,
190
- "mean_token_accuracy": 0.5205484080314636,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
- "eval_entropy": 2.5684494819752004,
197
- "eval_mean_token_accuracy": 0.547617181095966,
198
- "eval_not_syn_loss": 2.458150863647461,
199
- "eval_not_syn_runtime": 95.9896,
200
- "eval_not_syn_samples_per_second": 14.324,
201
- "eval_not_syn_steps_per_second": 1.792,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_entropy": 2.5337963104248047,
208
- "eval_mean_token_accuracy": 0.5359611507765082,
209
  "eval_num_tokens": 1998780.0,
210
- "eval_syn_loss": 2.417705535888672,
211
- "eval_syn_runtime": 99.0288,
212
- "eval_syn_samples_per_second": 13.885,
213
- "eval_syn_steps_per_second": 1.737,
214
  "step": 804
215
  },
216
  {
217
- "entropy": 2.380885266294383,
218
  "epoch": 2.1145703611457036,
219
- "grad_norm": 0.7961218953132629,
220
- "learning_rate": 0.00029123539186406294,
221
- "loss": 2.3589501953125,
222
- "mean_token_accuracy": 0.550012742630159,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
- "entropy": 2.3139565205574035,
228
  "epoch": 2.2391033623910337,
229
- "grad_norm": 0.7128121256828308,
230
- "learning_rate": 0.00028861908897689166,
231
- "loss": 2.2914646911621093,
232
- "mean_token_accuracy": 0.5581977427005768,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
- "entropy": 2.247106301784515,
238
  "epoch": 2.3636363636363638,
239
- "grad_norm": 0.7539874315261841,
240
- "learning_rate": 0.00028574387772804067,
241
- "loss": 2.212442169189453,
242
- "mean_token_accuracy": 0.5661728882789612,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
- "entropy": 2.177862950563431,
248
  "epoch": 2.488169364881694,
249
- "grad_norm": 1.169326663017273,
250
- "learning_rate": 0.00028261517693055664,
251
- "loss": 2.1524928283691405,
252
- "mean_token_accuracy": 0.574029511809349,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
- "entropy": 2.11708885550499,
258
  "epoch": 2.6127023661270234,
259
- "grad_norm": 0.6508749723434448,
260
- "learning_rate": 0.0002792388831406343,
261
- "loss": 2.0897698974609376,
262
- "mean_token_accuracy": 0.5808322209119797,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
- "entropy": 2.0785110569000245,
268
  "epoch": 2.7372353673723535,
269
- "grad_norm": 1.4157167673110962,
270
- "learning_rate": 0.0002756213595445772,
271
- "loss": 2.0574497985839844,
272
- "mean_token_accuracy": 0.5843770909309387,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
- "entropy": 2.091529769897461,
278
  "epoch": 2.8617683686176836,
279
- "grad_norm": 1.6840300559997559,
280
- "learning_rate": 0.00027176942396631626,
281
- "loss": 2.062296142578125,
282
- "mean_token_accuracy": 0.58568293094635,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
- "entropy": 2.0315397584438326,
288
  "epoch": 2.9863013698630136,
289
- "grad_norm": 1.153908133506775,
290
- "learning_rate": 0.0002676903360180885,
291
- "loss": 2.0053924560546874,
292
- "mean_token_accuracy": 0.5878721660375595,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
- "eval_entropy": 2.0182155738043233,
299
- "eval_mean_token_accuracy": 0.5854449913252232,
300
- "eval_not_syn_loss": 2.0105812549591064,
301
- "eval_not_syn_runtime": 95.9278,
302
- "eval_not_syn_samples_per_second": 14.334,
303
- "eval_not_syn_steps_per_second": 1.793,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
- "eval_entropy": 1.9446905400863914,
310
- "eval_mean_token_accuracy": 0.5995188099007274,
311
  "eval_num_tokens": 2998170.0,
312
- "eval_syn_loss": 1.9396028518676758,
313
- "eval_syn_runtime": 98.8737,
314
- "eval_syn_samples_per_second": 13.907,
315
- "eval_syn_steps_per_second": 1.74,
316
  "step": 1206
317
  },
318
  {
319
- "entropy": 1.9578519951213489,
320
  "epoch": 3.1095890410958904,
321
- "grad_norm": 0.6903329491615295,
322
- "learning_rate": 0.00026339178341849265,
323
- "loss": 1.9289002990722657,
324
- "mean_token_accuracy": 0.5986791457792725,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
- "entropy": 1.9225856459140778,
330
  "epoch": 3.2341220423412205,
331
- "grad_norm": 0.9334422945976257,
332
- "learning_rate": 0.00025888186750370616,
333
- "loss": 1.9024064636230469,
334
- "mean_token_accuracy": 0.6029617834091187,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
- "entropy": 1.8814079523086549,
340
  "epoch": 3.3586550435865505,
341
- "grad_norm": 0.779586911201477,
342
- "learning_rate": 0.00025416908795917244,
343
- "loss": 1.8609922790527345,
344
- "mean_token_accuracy": 0.6097110098600388,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
- "entropy": 1.8681990671157838,
350
  "epoch": 3.4831880448318806,
351
- "grad_norm": 0.941205620765686,
352
- "learning_rate": 0.0002492623268005321,
353
- "loss": 1.8448422241210938,
354
- "mean_token_accuracy": 0.6135021150112152,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
- "entropy": 1.8034737646579742,
360
  "epoch": 3.6077210460772102,
361
- "grad_norm": 1.2094346284866333,
362
- "learning_rate": 0.0002441708316339893,
363
- "loss": 1.7913978576660157,
364
- "mean_token_accuracy": 0.622687805891037,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
- "entropy": 1.8016248047351837,
370
  "epoch": 3.7322540473225407,
371
- "grad_norm": 0.7303705215454102,
372
- "learning_rate": 0.00023890419822766254,
373
- "loss": 1.7701612854003905,
374
- "mean_token_accuracy": 0.6246249091625213,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
- "entropy": 1.7500162029266357,
380
  "epoch": 3.8567870485678704,
381
- "grad_norm": 0.954683244228363,
382
- "learning_rate": 0.0002334723524267661,
383
- "loss": 1.7250523376464844,
384
- "mean_token_accuracy": 0.6312138819694519,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
- "entropy": 1.6828746914863586,
390
  "epoch": 3.9813200498132004,
391
- "grad_norm": 0.8929846882820129,
392
- "learning_rate": 0.0002278855314467064,
393
- "loss": 1.6539949035644532,
394
- "mean_token_accuracy": 0.6412730294466019,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
- "eval_entropy": 1.7406537913998892,
401
- "eval_mean_token_accuracy": 0.6226560525422873,
402
- "eval_not_syn_loss": 1.707612156867981,
403
- "eval_not_syn_runtime": 95.956,
404
- "eval_not_syn_samples_per_second": 14.329,
405
- "eval_not_syn_steps_per_second": 1.792,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
- "eval_entropy": 1.6662698221761127,
412
- "eval_mean_token_accuracy": 0.6587355476479198,
413
  "eval_num_tokens": 3997560.0,
414
- "eval_syn_loss": 1.6402223110198975,
415
- "eval_syn_runtime": 99.0503,
416
- "eval_syn_samples_per_second": 13.882,
417
- "eval_syn_steps_per_second": 1.736,
418
  "step": 1608
419
  },
420
  {
421
- "entropy": 1.634241136637601,
422
  "epoch": 4.104607721046078,
423
- "grad_norm": 0.7396109700202942,
424
- "learning_rate": 0.0002221542645793497,
425
- "loss": 1.610531768798828,
426
- "mean_token_accuracy": 0.6454936681371747,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
- "entropy": 1.588758965730667,
432
  "epoch": 4.229140722291407,
433
- "grad_norm": 0.779146134853363,
434
- "learning_rate": 0.00021628935334882304,
435
- "loss": 1.5645944213867187,
436
- "mean_token_accuracy": 0.6525067055225372,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
- "entropy": 1.5439102852344513,
442
  "epoch": 4.353673723536737,
443
- "grad_norm": 0.614967942237854,
444
- "learning_rate": 0.00021030185115424846,
445
- "loss": 1.5180734252929688,
446
- "mean_token_accuracy": 0.6614933741092682,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
- "entropy": 1.5201536059379577,
452
  "epoch": 4.478206724782067,
453
- "grad_norm": 0.6725050210952759,
454
- "learning_rate": 0.00020420304243777673,
455
- "loss": 1.4964521789550782,
456
- "mean_token_accuracy": 0.6630363047122956,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
- "entropy": 1.503999525308609,
462
  "epoch": 4.602739726027397,
463
- "grad_norm": 0.7740678191184998,
464
- "learning_rate": 0.00019800442141718245,
465
- "loss": 1.4821170043945313,
466
- "mean_token_accuracy": 0.6658095389604568,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
- "entropy": 1.4707296586036682,
472
  "epoch": 4.7272727272727275,
473
- "grad_norm": 0.7613642811775208,
474
- "learning_rate": 0.00019171767042310182,
475
- "loss": 1.451023406982422,
476
- "mean_token_accuracy": 0.6706610345840454,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
- "entropy": 1.449094181060791,
482
  "epoch": 4.851805728518057,
483
- "grad_norm": 0.7195038199424744,
484
- "learning_rate": 0.00018535463788174053,
485
- "loss": 1.4288282775878907,
486
- "mean_token_accuracy": 0.6740301263332367,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
- "entropy": 1.4346733844280244,
492
  "epoch": 4.976338729763388,
493
- "grad_norm": 0.7584077715873718,
494
- "learning_rate": 0.00017892731598454726,
495
- "loss": 1.4155799865722656,
496
- "mean_token_accuracy": 0.6788010179996491,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
- "eval_entropy": 1.4948647777701534,
503
- "eval_mean_token_accuracy": 0.661396715876668,
504
- "eval_not_syn_loss": 1.4358112812042236,
505
- "eval_not_syn_runtime": 95.9987,
506
- "eval_not_syn_samples_per_second": 14.323,
507
- "eval_not_syn_steps_per_second": 1.792,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
- "eval_entropy": 1.4439531952835793,
514
- "eval_mean_token_accuracy": 0.6999529783808908,
515
  "eval_num_tokens": 4996950.0,
516
- "eval_syn_loss": 1.3766233921051025,
517
- "eval_syn_runtime": 99.0602,
518
- "eval_syn_samples_per_second": 13.88,
519
- "eval_syn_steps_per_second": 1.736,
520
  "step": 2010
521
  },
522
  {
523
- "entropy": 1.345623204202363,
524
  "epoch": 5.099626400996264,
525
- "grad_norm": 0.8724157214164734,
526
- "learning_rate": 0.00017244781808693755,
527
- "loss": 1.323695831298828,
528
- "mean_token_accuracy": 0.6866910710479274,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
- "entropy": 1.3074172997474671,
534
  "epoch": 5.224159402241594,
535
- "grad_norm": 0.8229172825813293,
536
- "learning_rate": 0.00016592835587866364,
537
- "loss": 1.295655517578125,
538
- "mean_token_accuracy": 0.6931505644321442,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
- "entropy": 1.298924881219864,
544
  "epoch": 5.348692403486924,
545
- "grad_norm": 0.9111100435256958,
546
- "learning_rate": 0.0001593812163688578,
547
- "loss": 1.2771641540527343,
548
- "mean_token_accuracy": 0.6941236406564713,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
- "entropy": 1.3120970189571382,
554
  "epoch": 5.473225404732254,
555
- "grad_norm": 0.7868310213088989,
556
- "learning_rate": 0.000152818738729123,
557
- "loss": 1.2942347717285156,
558
- "mean_token_accuracy": 0.6939822369813919,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
- "entropy": 1.2729843455553054,
564
  "epoch": 5.597758405977584,
565
- "grad_norm": 0.9405992031097412,
566
- "learning_rate": 0.00014625329103831503,
567
- "loss": 1.2503909301757812,
568
- "mean_token_accuracy": 0.7005668586492538,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
- "entropy": 1.2202323937416077,
574
  "epoch": 5.722291407222914,
575
- "grad_norm": 0.7629554271697998,
576
- "learning_rate": 0.00013969724697284394,
577
- "loss": 1.199918212890625,
578
- "mean_token_accuracy": 0.7085251879692077,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
- "entropy": 1.1850077486038209,
584
  "epoch": 5.846824408468244,
585
- "grad_norm": 0.9016424417495728,
586
- "learning_rate": 0.00013316296248642664,
587
- "loss": 1.166585693359375,
588
- "mean_token_accuracy": 0.7123788893222809,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
- "entropy": 1.2040903121232986,
594
  "epoch": 5.971357409713574,
595
- "grad_norm": 0.8533362150192261,
596
- "learning_rate": 0.0001266627525232398,
597
- "loss": 1.1885869598388672,
598
- "mean_token_accuracy": 0.7100468480587006,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
- "eval_entropy": 1.2157157427349756,
605
- "eval_mean_token_accuracy": 0.7150737251653227,
606
- "eval_not_syn_loss": 1.2506903409957886,
607
- "eval_not_syn_runtime": 95.9618,
608
- "eval_not_syn_samples_per_second": 14.329,
609
- "eval_not_syn_steps_per_second": 1.792,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
- "eval_entropy": 1.178627569661584,
616
- "eval_mean_token_accuracy": 0.7016614221556242,
617
  "eval_num_tokens": 5996340.0,
618
- "eval_syn_loss": 1.2114850282669067,
619
- "eval_syn_runtime": 98.9234,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
623
  },
624
  {
625
- "entropy": 1.0982752972178988,
626
  "epoch": 6.094645080946451,
627
- "grad_norm": 1.295694351196289,
628
- "learning_rate": 0.00012020886780836267,
629
- "loss": 1.0774417877197267,
630
- "mean_token_accuracy": 0.7259544272615452,
631
  "num_tokens": 6091666.0,
632
  "step": 2450
633
  },
634
  {
635
- "entropy": 1.0651295524835587,
636
  "epoch": 6.219178082191781,
637
- "grad_norm": 0.9243665337562561,
638
- "learning_rate": 0.0001138134717592517,
639
- "loss": 1.0521366119384765,
640
- "mean_token_accuracy": 0.7293049448728561,
641
  "num_tokens": 6214889.0,
642
  "step": 2500
643
  },
644
  {
645
- "entropy": 1.0358434236049652,
646
  "epoch": 6.34371108343711,
647
- "grad_norm": 0.9286350011825562,
648
- "learning_rate": 0.00010748861756175999,
649
- "loss": 1.018977813720703,
650
- "mean_token_accuracy": 0.7370841908454895,
651
  "num_tokens": 6337795.0,
652
  "step": 2550
653
  },
654
  {
655
- "entropy": 1.0522522723674774,
656
  "epoch": 6.468244084682441,
657
- "grad_norm": 0.9158058762550354,
658
- "learning_rate": 0.00010124622545390751,
659
- "loss": 1.0335795593261718,
660
- "mean_token_accuracy": 0.7348936641216278,
661
  "num_tokens": 6466252.0,
662
  "step": 2600
663
  },
664
  {
665
- "entropy": 1.0202285438776015,
666
  "epoch": 6.592777085927771,
667
- "grad_norm": 0.8085289597511292,
668
- "learning_rate": 9.509806026021276e-05,
669
- "loss": 1.0074135589599609,
670
- "mean_token_accuracy": 0.7410361844301224,
671
  "num_tokens": 6592128.0,
672
  "step": 2650
673
  },
674
  {
675
- "entropy": 0.9939206629991532,
676
  "epoch": 6.717310087173101,
677
- "grad_norm": 0.7527234554290771,
678
- "learning_rate": 8.90557092189276e-05,
679
- "loss": 0.9766032409667968,
680
- "mean_token_accuracy": 0.7435237610340119,
681
  "num_tokens": 6716885.0,
682
  "step": 2700
683
  },
684
  {
685
- "entropy": 0.985169340968132,
686
  "epoch": 6.841843088418431,
687
- "grad_norm": 0.9773848056793213,
688
- "learning_rate": 8.313056014396262e-05,
689
- "loss": 0.9680111694335938,
690
- "mean_token_accuracy": 0.7455707538127899,
691
  "num_tokens": 6836942.0,
692
  "step": 2750
693
  },
694
  {
695
- "entropy": 0.9864326739311218,
696
  "epoch": 6.966376089663761,
697
- "grad_norm": 0.7536692023277283,
698
- "learning_rate": 7.733377996266039e-05,
699
- "loss": 0.9636287689208984,
700
- "mean_token_accuracy": 0.7498565518856048,
701
  "num_tokens": 6963394.0,
702
  "step": 2800
703
  },
704
  {
705
  "epoch": 7.0,
706
- "eval_entropy": 1.0250733893278032,
707
- "eval_mean_token_accuracy": 0.7460000244683997,
708
- "eval_not_syn_loss": 1.0630079507827759,
709
- "eval_not_syn_runtime": 96.0062,
710
- "eval_not_syn_samples_per_second": 14.322,
711
- "eval_not_syn_steps_per_second": 1.792,
712
  "eval_num_tokens": 6995730.0,
713
  "step": 2814
714
  },
715
  {
716
  "epoch": 7.0,
717
- "eval_entropy": 0.9910203045190766,
718
- "eval_mean_token_accuracy": 0.7360078449859175,
719
  "eval_num_tokens": 6995730.0,
720
- "eval_syn_loss": 1.0268242359161377,
721
- "eval_syn_runtime": 99.1365,
722
- "eval_syn_samples_per_second": 13.87,
723
- "eval_syn_steps_per_second": 1.735,
724
  "step": 2814
725
  }
726
  ],
@@ -741,8 +741,8 @@
741
  "attributes": {}
742
  }
743
  },
744
- "total_flos": 1.4366868190744474e+18,
745
- "train_batch_size": 8,
746
  "trial_name": null,
747
  "trial_params": null
748
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  },
114
  {
115
+ "entropy": 0.5251132612577593,
116
  "epoch": 1.1195516811955168,
117
+ "grad_norm": 0.409588098526001,
118
+ "learning_rate": 0.00019826081236739378,
119
+ "loss": 0.48452903747558596,
120
+ "mean_token_accuracy": 0.8485486621808525,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
+ "entropy": 0.5255133463442325,
126
  "epoch": 1.244084682440847,
127
+ "grad_norm": 0.4035375416278839,
128
+ "learning_rate": 0.00019799182258138876,
129
+ "loss": 0.48237808227539064,
130
+ "mean_token_accuracy": 0.8493754121661187,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
+ "entropy": 0.5253566785156727,
136
  "epoch": 1.3686176836861768,
137
+ "grad_norm": 0.42013707756996155,
138
+ "learning_rate": 0.000197536589854019,
139
+ "loss": 0.4848367691040039,
140
+ "mean_token_accuracy": 0.8477037993073463,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
+ "entropy": 0.520346013456583,
146
  "epoch": 1.4931506849315068,
147
+ "grad_norm": 0.3776898980140686,
148
+ "learning_rate": 0.00019689597214695372,
149
+ "loss": 0.483323860168457,
150
+ "mean_token_accuracy": 0.8489549401402473,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
+ "entropy": 0.5263906873762607,
156
  "epoch": 1.6176836861768369,
157
+ "grad_norm": 0.7503569722175598,
158
+ "learning_rate": 0.00019607117681064075,
159
+ "loss": 0.4831574630737305,
160
+ "mean_token_accuracy": 0.8504172155261039,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
+ "entropy": 0.5092925234138965,
166
  "epoch": 1.7422166874221667,
167
+ "grad_norm": 0.3184664249420166,
168
+ "learning_rate": 0.00019506375830885428,
169
+ "loss": 0.4734436798095703,
170
+ "mean_token_accuracy": 0.8517335096001625,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
+ "entropy": 0.504551088064909,
176
  "epoch": 1.8667496886674968,
177
+ "grad_norm": 0.381900429725647,
178
+ "learning_rate": 0.00019387561528904946,
179
+ "loss": 0.4666786193847656,
180
+ "mean_token_accuracy": 0.8514578703045845,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
+ "entropy": 0.5108272171020508,
186
  "epoch": 1.9912826899128269,
187
+ "grad_norm": 0.3435378670692444,
188
+ "learning_rate": 0.00019250898700404634,
189
+ "loss": 0.4672324752807617,
190
+ "mean_token_accuracy": 0.8505285969376564,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
+ "eval_entropy": 0.515417086177094,
197
+ "eval_mean_token_accuracy": 0.8423887543206992,
198
+ "eval_not_syn_loss": 0.49809959530830383,
199
+ "eval_not_syn_runtime": 95.8874,
200
+ "eval_not_syn_samples_per_second": 14.34,
201
+ "eval_not_syn_steps_per_second": 1.794,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_entropy": 0.4982072594900464,
208
+ "eval_mean_token_accuracy": 0.8627851702446161,
209
  "eval_num_tokens": 1998780.0,
210
+ "eval_syn_loss": 0.47842374444007874,
211
+ "eval_syn_runtime": 98.9429,
212
+ "eval_syn_samples_per_second": 13.897,
213
+ "eval_syn_steps_per_second": 1.738,
214
  "step": 804
215
  },
216
  {
217
+ "entropy": 0.4303537303149098,
218
  "epoch": 2.1145703611457036,
219
+ "grad_norm": 0.35502323508262634,
220
+ "learning_rate": 0.00019096644909178583,
221
+ "loss": 0.38930774688720704,
222
+ "mean_token_accuracy": 0.8710548519486129,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
+ "entropy": 0.431627052128315,
228
  "epoch": 2.2391033623910337,
229
+ "grad_norm": 0.4276222288608551,
230
+ "learning_rate": 0.00018925090872111246,
231
+ "loss": 0.38792034149169924,
232
+ "mean_token_accuracy": 0.8717547968029976,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
+ "entropy": 0.43496647477149963,
238
  "epoch": 2.3636363636363638,
239
+ "grad_norm": 0.3478545546531677,
240
+ "learning_rate": 0.00018736559911273178,
241
+ "loss": 0.3930927276611328,
242
+ "mean_token_accuracy": 0.8688642394542694,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
+ "entropy": 0.4360816968977451,
248
  "epoch": 2.488169364881694,
249
+ "grad_norm": 0.4233720898628235,
250
+ "learning_rate": 0.00018531407344566915,
251
+ "loss": 0.39427772521972654,
252
+ "mean_token_accuracy": 0.867885695695877,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
+ "entropy": 0.43084816545248034,
258
  "epoch": 2.6127023661270234,
259
+ "grad_norm": 0.3549717664718628,
260
+ "learning_rate": 0.0001831001981607139,
261
+ "loss": 0.3911524200439453,
262
+ "mean_token_accuracy": 0.8704073330760003,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
+ "entropy": 0.42635570630431174,
268
  "epoch": 2.7372353673723535,
269
+ "grad_norm": 0.3825649917125702,
270
+ "learning_rate": 0.00018072814567346936,
271
+ "loss": 0.39040073394775393,
272
+ "mean_token_accuracy": 0.8706874567270279,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
+ "entropy": 0.4457248793542385,
278
  "epoch": 2.8617683686176836,
279
+ "grad_norm": 0.393543541431427,
280
+ "learning_rate": 0.00017820238651074317,
281
+ "loss": 0.40400577545166017,
282
+ "mean_token_accuracy": 0.8659286299347877,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
+ "entropy": 0.44322004675865173,
288
  "epoch": 2.9863013698630136,
289
+ "grad_norm": 0.32433032989501953,
290
+ "learning_rate": 0.0001755276808850968,
291
+ "loss": 0.3956157684326172,
292
+ "mean_token_accuracy": 0.8679120713472366,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
+ "eval_entropy": 0.4656750720947288,
299
+ "eval_mean_token_accuracy": 0.8436482773963795,
300
+ "eval_not_syn_loss": 0.502048671245575,
301
+ "eval_not_syn_runtime": 96.1696,
302
+ "eval_not_syn_samples_per_second": 14.298,
303
+ "eval_not_syn_steps_per_second": 1.789,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
+ "eval_entropy": 0.4488667759091355,
310
+ "eval_mean_token_accuracy": 0.86770307186038,
311
  "eval_num_tokens": 2998170.0,
312
+ "eval_syn_loss": 0.47810930013656616,
313
+ "eval_syn_runtime": 98.894,
314
+ "eval_syn_samples_per_second": 13.904,
315
+ "eval_syn_steps_per_second": 1.739,
316
  "step": 1206
317
  },
318
  {
319
+ "entropy": 0.34708237489967636,
320
  "epoch": 3.1095890410958904,
321
+ "grad_norm": 0.36334800720214844,
322
+ "learning_rate": 0.00017270906972343466,
323
+ "loss": 0.306053466796875,
324
+ "mean_token_accuracy": 0.8936621320970131,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
+ "entropy": 0.3444848913699389,
330
  "epoch": 3.2341220423412205,
331
+ "grad_norm": 0.3762398660182953,
332
+ "learning_rate": 0.00016975186516654035,
333
+ "loss": 0.30163915634155275,
334
+ "mean_token_accuracy": 0.8941574484109879,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
+ "entropy": 0.3394099518656731,
340
  "epoch": 3.3586550435865505,
341
+ "grad_norm": 0.4982982873916626,
342
+ "learning_rate": 0.00016666164055746508,
343
+ "loss": 0.2992570495605469,
344
+ "mean_token_accuracy": 0.8951553416252136,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
+ "entropy": 0.3437681415677071,
350
  "epoch": 3.4831880448318806,
351
+ "grad_norm": 0.48411592841148376,
352
+ "learning_rate": 0.00016344421993763733,
353
+ "loss": 0.30575496673583985,
354
+ "mean_token_accuracy": 0.8930543160438538,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
+ "entropy": 0.34468906089663504,
360
  "epoch": 3.6077210460772102,
361
+ "grad_norm": 0.4013198912143707,
362
+ "learning_rate": 0.00016010566707048957,
363
+ "loss": 0.3041123008728027,
364
+ "mean_token_accuracy": 0.8934586471319199,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
+ "entropy": 0.3555850328505039,
370
  "epoch": 3.7322540473225407,
371
+ "grad_norm": 0.3677787184715271,
372
+ "learning_rate": 0.00015665227401328915,
373
+ "loss": 0.31248834609985354,
374
+ "mean_token_accuracy": 0.8907824546098709,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
+ "entropy": 0.3529116767644882,
380
  "epoch": 3.8567870485678704,
381
+ "grad_norm": 0.3733390271663666,
382
+ "learning_rate": 0.00015309054925871163,
383
+ "loss": 0.3145001220703125,
384
+ "mean_token_accuracy": 0.8914457809925079,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
+ "entropy": 0.3519477976113558,
390
  "epoch": 3.9813200498132004,
391
+ "grad_norm": 0.4491842985153198,
392
+ "learning_rate": 0.0001494272054685054,
393
+ "loss": 0.31178840637207034,
394
+ "mean_token_accuracy": 0.8901231214404106,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
+ "eval_entropy": 0.4111844826229783,
401
+ "eval_mean_token_accuracy": 0.8401426447685375,
402
+ "eval_not_syn_loss": 0.5313173532485962,
403
+ "eval_not_syn_runtime": 95.84,
404
+ "eval_not_syn_samples_per_second": 14.347,
405
+ "eval_not_syn_steps_per_second": 1.795,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
+ "eval_entropy": 0.39694498886549195,
412
+ "eval_mean_token_accuracy": 0.8641166465226994,
413
  "eval_num_tokens": 3997560.0,
414
+ "eval_syn_loss": 0.5080230832099915,
415
+ "eval_syn_runtime": 98.944,
416
+ "eval_syn_samples_per_second": 13.897,
417
+ "eval_syn_steps_per_second": 1.738,
418
  "step": 1608
419
  },
420
  {
421
+ "entropy": 0.26439598014559407,
422
  "epoch": 4.104607721046078,
423
+ "grad_norm": 0.4967460632324219,
424
+ "learning_rate": 0.0001456691468223661,
425
+ "loss": 0.22732419967651368,
426
+ "mean_token_accuracy": 0.9187829334928532,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
+ "entropy": 0.2497277507185936,
432
  "epoch": 4.229140722291407,
433
+ "grad_norm": 0.3764539659023285,
434
+ "learning_rate": 0.00014182345600586332,
435
+ "loss": 0.21029539108276368,
436
+ "mean_token_accuracy": 0.9221936762332916,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
+ "entropy": 0.2486180279403925,
442
  "epoch": 4.353673723536737,
443
+ "grad_norm": 0.37183400988578796,
444
+ "learning_rate": 0.0001378973808619437,
445
+ "loss": 0.21009698867797852,
446
+ "mean_token_accuracy": 0.9225871834158897,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
+ "entropy": 0.26017799742519854,
452
  "epoch": 4.478206724782067,
453
+ "grad_norm": 0.43216991424560547,
454
+ "learning_rate": 0.00013389832073116724,
455
+ "loss": 0.2177184295654297,
456
+ "mean_token_accuracy": 0.9205843013525009,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
+ "entropy": 0.26708075165748596,
462
  "epoch": 4.602739726027397,
463
+ "grad_norm": 0.37948140501976013,
464
+ "learning_rate": 0.00012983381250642132,
465
+ "loss": 0.22203298568725585,
466
+ "mean_token_accuracy": 0.9184661367535591,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
+ "entropy": 0.25188380032777785,
472
  "epoch": 4.7272727272727275,
473
+ "grad_norm": 0.4874045252799988,
474
+ "learning_rate": 0.00012571151642839446,
475
+ "loss": 0.21441835403442383,
476
+ "mean_token_accuracy": 0.9194883331656456,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
+ "entropy": 0.2515877375751734,
482
  "epoch": 4.851805728518057,
483
+ "grad_norm": 0.43732911348342896,
484
+ "learning_rate": 0.00012153920164858083,
485
+ "loss": 0.21374931335449218,
486
+ "mean_token_accuracy": 0.921038504242897,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
+ "entropy": 0.254078243970871,
492
  "epoch": 4.976338729763388,
493
+ "grad_norm": 0.41199925541877747,
494
+ "learning_rate": 0.00011732473158702397,
495
+ "loss": 0.21298355102539063,
496
+ "mean_token_accuracy": 0.9217021322250366,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
+ "eval_entropy": 0.31937412129238596,
503
+ "eval_mean_token_accuracy": 0.8401600659586662,
504
+ "eval_not_syn_loss": 0.6070574522018433,
505
+ "eval_not_syn_runtime": 95.8354,
506
+ "eval_not_syn_samples_per_second": 14.348,
507
+ "eval_not_syn_steps_per_second": 1.795,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
+ "eval_entropy": 0.30796578786400863,
514
+ "eval_mean_token_accuracy": 0.8628802147022513,
515
  "eval_num_tokens": 4996950.0,
516
+ "eval_syn_loss": 0.5800920128822327,
517
+ "eval_syn_runtime": 98.9566,
518
+ "eval_syn_samples_per_second": 13.895,
519
+ "eval_syn_steps_per_second": 1.738,
520
  "step": 2010
521
  },
522
  {
523
+ "entropy": 0.19132825570425602,
524
  "epoch": 5.099626400996264,
525
+ "grad_norm": 0.45538511872291565,
526
+ "learning_rate": 0.0001130760491123961,
527
+ "loss": 0.153853759765625,
528
+ "mean_token_accuracy": 0.9434747304579224,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
+ "entropy": 0.17422323439270257,
534
  "epoch": 5.224159402241594,
535
+ "grad_norm": 0.4580552279949188,
536
+ "learning_rate": 0.00010880116157234302,
537
+ "loss": 0.13612208366394044,
538
+ "mean_token_accuracy": 0.9487342464923859,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
+ "entropy": 0.1799074411019683,
544
  "epoch": 5.348692403486924,
545
+ "grad_norm": 0.39040836691856384,
546
+ "learning_rate": 0.00010450812570230789,
547
+ "loss": 0.14250579833984375,
548
+ "mean_token_accuracy": 0.9467630323767662,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
+ "entropy": 0.18116022080183028,
554
  "epoch": 5.473225404732254,
555
+ "grad_norm": 0.3805501163005829,
556
+ "learning_rate": 0.00010020503244127544,
557
+ "loss": 0.14014955520629882,
558
+ "mean_token_accuracy": 0.946874064207077,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
+ "entropy": 0.1810251370444894,
564
  "epoch": 5.597758405977584,
565
+ "grad_norm": 0.36377736926078796,
566
+ "learning_rate": 9.589999168305372e-05,
567
+ "loss": 0.14044331550598144,
568
+ "mean_token_accuracy": 0.9467302888631821,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
+ "entropy": 0.17202148117125035,
574
  "epoch": 5.722291407222914,
575
+ "grad_norm": 0.38708847761154175,
576
+ "learning_rate": 9.16011169918326e-05,
577
+ "loss": 0.13840054512023925,
578
+ "mean_token_accuracy": 0.9480020496249199,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
+ "entropy": 0.1769936926662922,
584
  "epoch": 5.846824408468244,
585
+ "grad_norm": 0.4674988389015198,
586
+ "learning_rate": 8.73165103108249e-05,
587
+ "loss": 0.1404121208190918,
588
+ "mean_token_accuracy": 0.9461787036061287,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
+ "entropy": 0.17118842527270317,
594
  "epoch": 5.971357409713574,
595
+ "grad_norm": 0.4454115629196167,
596
+ "learning_rate": 8.305424669280888e-05,
597
+ "loss": 0.1366124439239502,
598
+ "mean_token_accuracy": 0.9484315186738967,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
+ "eval_entropy": 0.2494078171114589,
605
+ "eval_mean_token_accuracy": 0.851767166062843,
606
+ "eval_not_syn_loss": 0.7222095131874084,
607
+ "eval_not_syn_runtime": 95.8715,
608
+ "eval_not_syn_samples_per_second": 14.342,
609
+ "eval_not_syn_steps_per_second": 1.794,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
+ "eval_entropy": 0.239824180079754,
616
+ "eval_mean_token_accuracy": 0.8484749911829482,
617
  "eval_num_tokens": 5996340.0,
618
+ "eval_syn_loss": 0.7036991119384766,
619
+ "eval_syn_runtime": 98.923,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
623
  },
624
  {
625
+ "entropy": 0.14340858902744572,
626
  "epoch": 6.094645080946451,
627
+ "grad_norm": 0.270256370306015,
628
+ "learning_rate": 7.88223590813502e-05,
629
+ "loss": 0.10398475646972656,
630
+ "mean_token_accuracy": 0.9604840600731397,
631
  "num_tokens": 6091666.0,
632
  "step": 2450
633
  },
634
  {
635
+ "entropy": 0.13108037687838078,
636
  "epoch": 6.219178082191781,
637
+ "grad_norm": 0.2870016396045685,
638
+ "learning_rate": 7.462882317138628e-05,
639
+ "loss": 0.09566926002502442,
640
+ "mean_token_accuracy": 0.9636739170551301,
641
  "num_tokens": 6214889.0,
642
  "step": 2500
643
  },
644
  {
645
+ "entropy": 0.1305130884796381,
646
  "epoch": 6.34371108343711,
647
+ "grad_norm": 0.34119686484336853,
648
+ "learning_rate": 7.048154237770433e-05,
649
+ "loss": 0.0943364143371582,
650
+ "mean_token_accuracy": 0.9643464788794518,
651
  "num_tokens": 6337795.0,
652
  "step": 2550
653
  },
654
  {
655
+ "entropy": 0.12516659261658789,
656
  "epoch": 6.468244084682441,
657
+ "grad_norm": 0.2851012051105499,
658
+ "learning_rate": 6.638833293964401e-05,
659
+ "loss": 0.09339779853820801,
660
+ "mean_token_accuracy": 0.9654107868671418,
661
  "num_tokens": 6466252.0,
662
  "step": 2600
663
  },
664
  {
665
+ "entropy": 0.12894487291574477,
666
  "epoch": 6.592777085927771,
667
+ "grad_norm": 0.22259071469306946,
668
+ "learning_rate": 6.235690919009636e-05,
669
+ "loss": 0.09672751426696777,
670
+ "mean_token_accuracy": 0.9641611188650131,
671
  "num_tokens": 6592128.0,
672
  "step": 2650
673
  },
674
  {
675
+ "entropy": 0.13079525250941515,
676
  "epoch": 6.717310087173101,
677
+ "grad_norm": 0.2873247265815735,
678
+ "learning_rate": 5.839486901656255e-05,
679
+ "loss": 0.09482893943786622,
680
+ "mean_token_accuracy": 0.963907478749752,
681
  "num_tokens": 6716885.0,
682
  "step": 2700
683
  },
684
  {
685
+ "entropy": 0.13382539574056865,
686
  "epoch": 6.841843088418431,
687
+ "grad_norm": 0.26538875699043274,
688
+ "learning_rate": 5.450967954167317e-05,
689
+ "loss": 0.09817559242248536,
690
+ "mean_token_accuracy": 0.9620411720871925,
691
  "num_tokens": 6836942.0,
692
  "step": 2750
693
  },
694
  {
695
+ "entropy": 0.12861237980425358,
696
  "epoch": 6.966376089663761,
697
+ "grad_norm": 0.3292505443096161,
698
+ "learning_rate": 5.070866305015545e-05,
699
+ "loss": 0.09485826492309571,
700
+ "mean_token_accuracy": 0.9636287876963615,
701
  "num_tokens": 6963394.0,
702
  "step": 2800
703
  },
704
  {
705
  "epoch": 7.0,
706
+ "eval_entropy": 0.2065339538073817,
707
+ "eval_mean_token_accuracy": 0.8418768654729045,
708
+ "eval_not_syn_loss": 0.8473101854324341,
709
+ "eval_not_syn_runtime": 95.8493,
710
+ "eval_not_syn_samples_per_second": 14.345,
711
+ "eval_not_syn_steps_per_second": 1.794,
712
  "eval_num_tokens": 6995730.0,
713
  "step": 2814
714
  },
715
  {
716
  "epoch": 7.0,
717
+ "eval_entropy": 0.19768535250494645,
718
+ "eval_mean_token_accuracy": 0.8554798219093057,
719
  "eval_num_tokens": 6995730.0,
720
+ "eval_syn_loss": 0.8169878125190735,
721
+ "eval_syn_runtime": 98.9508,
722
+ "eval_syn_samples_per_second": 13.896,
723
+ "eval_syn_steps_per_second": 1.738,
724
  "step": 2814
725
  }
726
  ],
 
741
  "attributes": {}
742
  }
743
  },
744
+ "total_flos": 1.1459814588530381e+18,
745
+ "train_batch_size": 4,
746
  "trial_name": null,
747
  "trial_params": null
748
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-2814/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3216/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3216/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2351094b49b2e98cbe019fb745c46a4f81e1065edbccf9cab85581d857ca1d71
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0505bdf3521dce71d1db6e646a36a0a89a4d874f31e1b35dab9425e8d503662
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3216/trainer_state.json CHANGED
@@ -10,819 +10,819 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  },
114
  {
115
- "entropy": 0.5431136073488178,
116
  "epoch": 1.1195516811955168,
117
- "grad_norm": 0.6364207863807678,
118
- "learning_rate": 0.0003023597373031377,
119
- "loss": 0.5128697204589844,
120
- "mean_token_accuracy": 0.8424755226482045,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
- "entropy": 0.5492669984698295,
126
  "epoch": 1.244084682440847,
127
- "grad_norm": 0.7887948155403137,
128
- "learning_rate": 0.00030194951160062724,
129
- "loss": 0.5205921554565429,
130
- "mean_token_accuracy": 0.8417876678705215,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
- "entropy": 0.8709675747156144,
136
  "epoch": 1.3686176836861768,
137
- "grad_norm": 0.734591007232666,
138
- "learning_rate": 0.00030125525414139597,
139
- "loss": 0.9426271057128907,
140
- "mean_token_accuracy": 0.7773911562561989,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
- "entropy": 0.5833489724993706,
146
  "epoch": 1.4931506849315068,
147
- "grad_norm": 1.1496635675430298,
148
- "learning_rate": 0.000300278273368912,
149
- "loss": 0.560246467590332,
150
- "mean_token_accuracy": 0.8333927792310715,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
- "entropy": 3.3383523294329644,
156
  "epoch": 1.6176836861768369,
157
- "grad_norm": 4.88291597366333,
158
- "learning_rate": 0.0002990204105656753,
159
- "loss": 3.895135803222656,
160
- "mean_token_accuracy": 0.4006860972382128,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
- "entropy": 4.406278321743011,
166
  "epoch": 1.7422166874221667,
167
- "grad_norm": 2.488743305206299,
168
- "learning_rate": 0.0002974840363830152,
169
- "loss": 4.424278259277344,
170
- "mean_token_accuracy": 0.2545871638506651,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
- "entropy": 3.0789780139923097,
176
  "epoch": 1.8667496886674968,
177
- "grad_norm": 2.5833027362823486,
178
- "learning_rate": 0.00029567204637320413,
179
- "loss": 3.105696716308594,
180
- "mean_token_accuracy": 0.4513031896948814,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
- "entropy": 2.616127333641052,
186
  "epoch": 1.9912826899128269,
187
- "grad_norm": 1.0389364957809448,
188
- "learning_rate": 0.00029358785553230884,
189
- "loss": 2.6003131103515624,
190
- "mean_token_accuracy": 0.5205484080314636,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
- "eval_entropy": 2.5684494819752004,
197
- "eval_mean_token_accuracy": 0.547617181095966,
198
- "eval_not_syn_loss": 2.458150863647461,
199
- "eval_not_syn_runtime": 95.9896,
200
- "eval_not_syn_samples_per_second": 14.324,
201
- "eval_not_syn_steps_per_second": 1.792,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_entropy": 2.5337963104248047,
208
- "eval_mean_token_accuracy": 0.5359611507765082,
209
  "eval_num_tokens": 1998780.0,
210
- "eval_syn_loss": 2.417705535888672,
211
- "eval_syn_runtime": 99.0288,
212
- "eval_syn_samples_per_second": 13.885,
213
- "eval_syn_steps_per_second": 1.737,
214
  "step": 804
215
  },
216
  {
217
- "entropy": 2.380885266294383,
218
  "epoch": 2.1145703611457036,
219
- "grad_norm": 0.7961218953132629,
220
- "learning_rate": 0.00029123539186406294,
221
- "loss": 2.3589501953125,
222
- "mean_token_accuracy": 0.550012742630159,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
- "entropy": 2.3139565205574035,
228
  "epoch": 2.2391033623910337,
229
- "grad_norm": 0.7128121256828308,
230
- "learning_rate": 0.00028861908897689166,
231
- "loss": 2.2914646911621093,
232
- "mean_token_accuracy": 0.5581977427005768,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
- "entropy": 2.247106301784515,
238
  "epoch": 2.3636363636363638,
239
- "grad_norm": 0.7539874315261841,
240
- "learning_rate": 0.00028574387772804067,
241
- "loss": 2.212442169189453,
242
- "mean_token_accuracy": 0.5661728882789612,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
- "entropy": 2.177862950563431,
248
  "epoch": 2.488169364881694,
249
- "grad_norm": 1.169326663017273,
250
- "learning_rate": 0.00028261517693055664,
251
- "loss": 2.1524928283691405,
252
- "mean_token_accuracy": 0.574029511809349,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
- "entropy": 2.11708885550499,
258
  "epoch": 2.6127023661270234,
259
- "grad_norm": 0.6508749723434448,
260
- "learning_rate": 0.0002792388831406343,
261
- "loss": 2.0897698974609376,
262
- "mean_token_accuracy": 0.5808322209119797,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
- "entropy": 2.0785110569000245,
268
  "epoch": 2.7372353673723535,
269
- "grad_norm": 1.4157167673110962,
270
- "learning_rate": 0.0002756213595445772,
271
- "loss": 2.0574497985839844,
272
- "mean_token_accuracy": 0.5843770909309387,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
- "entropy": 2.091529769897461,
278
  "epoch": 2.8617683686176836,
279
- "grad_norm": 1.6840300559997559,
280
- "learning_rate": 0.00027176942396631626,
281
- "loss": 2.062296142578125,
282
- "mean_token_accuracy": 0.58568293094635,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
- "entropy": 2.0315397584438326,
288
  "epoch": 2.9863013698630136,
289
- "grad_norm": 1.153908133506775,
290
- "learning_rate": 0.0002676903360180885,
291
- "loss": 2.0053924560546874,
292
- "mean_token_accuracy": 0.5878721660375595,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
- "eval_entropy": 2.0182155738043233,
299
- "eval_mean_token_accuracy": 0.5854449913252232,
300
- "eval_not_syn_loss": 2.0105812549591064,
301
- "eval_not_syn_runtime": 95.9278,
302
- "eval_not_syn_samples_per_second": 14.334,
303
- "eval_not_syn_steps_per_second": 1.793,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
- "eval_entropy": 1.9446905400863914,
310
- "eval_mean_token_accuracy": 0.5995188099007274,
311
  "eval_num_tokens": 2998170.0,
312
- "eval_syn_loss": 1.9396028518676758,
313
- "eval_syn_runtime": 98.8737,
314
- "eval_syn_samples_per_second": 13.907,
315
- "eval_syn_steps_per_second": 1.74,
316
  "step": 1206
317
  },
318
  {
319
- "entropy": 1.9578519951213489,
320
  "epoch": 3.1095890410958904,
321
- "grad_norm": 0.6903329491615295,
322
- "learning_rate": 0.00026339178341849265,
323
- "loss": 1.9289002990722657,
324
- "mean_token_accuracy": 0.5986791457792725,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
- "entropy": 1.9225856459140778,
330
  "epoch": 3.2341220423412205,
331
- "grad_norm": 0.9334422945976257,
332
- "learning_rate": 0.00025888186750370616,
333
- "loss": 1.9024064636230469,
334
- "mean_token_accuracy": 0.6029617834091187,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
- "entropy": 1.8814079523086549,
340
  "epoch": 3.3586550435865505,
341
- "grad_norm": 0.779586911201477,
342
- "learning_rate": 0.00025416908795917244,
343
- "loss": 1.8609922790527345,
344
- "mean_token_accuracy": 0.6097110098600388,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
- "entropy": 1.8681990671157838,
350
  "epoch": 3.4831880448318806,
351
- "grad_norm": 0.941205620765686,
352
- "learning_rate": 0.0002492623268005321,
353
- "loss": 1.8448422241210938,
354
- "mean_token_accuracy": 0.6135021150112152,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
- "entropy": 1.8034737646579742,
360
  "epoch": 3.6077210460772102,
361
- "grad_norm": 1.2094346284866333,
362
- "learning_rate": 0.0002441708316339893,
363
- "loss": 1.7913978576660157,
364
- "mean_token_accuracy": 0.622687805891037,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
- "entropy": 1.8016248047351837,
370
  "epoch": 3.7322540473225407,
371
- "grad_norm": 0.7303705215454102,
372
- "learning_rate": 0.00023890419822766254,
373
- "loss": 1.7701612854003905,
374
- "mean_token_accuracy": 0.6246249091625213,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
- "entropy": 1.7500162029266357,
380
  "epoch": 3.8567870485678704,
381
- "grad_norm": 0.954683244228363,
382
- "learning_rate": 0.0002334723524267661,
383
- "loss": 1.7250523376464844,
384
- "mean_token_accuracy": 0.6312138819694519,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
- "entropy": 1.6828746914863586,
390
  "epoch": 3.9813200498132004,
391
- "grad_norm": 0.8929846882820129,
392
- "learning_rate": 0.0002278855314467064,
393
- "loss": 1.6539949035644532,
394
- "mean_token_accuracy": 0.6412730294466019,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
- "eval_entropy": 1.7406537913998892,
401
- "eval_mean_token_accuracy": 0.6226560525422873,
402
- "eval_not_syn_loss": 1.707612156867981,
403
- "eval_not_syn_runtime": 95.956,
404
- "eval_not_syn_samples_per_second": 14.329,
405
- "eval_not_syn_steps_per_second": 1.792,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
- "eval_entropy": 1.6662698221761127,
412
- "eval_mean_token_accuracy": 0.6587355476479198,
413
  "eval_num_tokens": 3997560.0,
414
- "eval_syn_loss": 1.6402223110198975,
415
- "eval_syn_runtime": 99.0503,
416
- "eval_syn_samples_per_second": 13.882,
417
- "eval_syn_steps_per_second": 1.736,
418
  "step": 1608
419
  },
420
  {
421
- "entropy": 1.634241136637601,
422
  "epoch": 4.104607721046078,
423
- "grad_norm": 0.7396109700202942,
424
- "learning_rate": 0.0002221542645793497,
425
- "loss": 1.610531768798828,
426
- "mean_token_accuracy": 0.6454936681371747,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
- "entropy": 1.588758965730667,
432
  "epoch": 4.229140722291407,
433
- "grad_norm": 0.779146134853363,
434
- "learning_rate": 0.00021628935334882304,
435
- "loss": 1.5645944213867187,
436
- "mean_token_accuracy": 0.6525067055225372,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
- "entropy": 1.5439102852344513,
442
  "epoch": 4.353673723536737,
443
- "grad_norm": 0.614967942237854,
444
- "learning_rate": 0.00021030185115424846,
445
- "loss": 1.5180734252929688,
446
- "mean_token_accuracy": 0.6614933741092682,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
- "entropy": 1.5201536059379577,
452
  "epoch": 4.478206724782067,
453
- "grad_norm": 0.6725050210952759,
454
- "learning_rate": 0.00020420304243777673,
455
- "loss": 1.4964521789550782,
456
- "mean_token_accuracy": 0.6630363047122956,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
- "entropy": 1.503999525308609,
462
  "epoch": 4.602739726027397,
463
- "grad_norm": 0.7740678191184998,
464
- "learning_rate": 0.00019800442141718245,
465
- "loss": 1.4821170043945313,
466
- "mean_token_accuracy": 0.6658095389604568,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
- "entropy": 1.4707296586036682,
472
  "epoch": 4.7272727272727275,
473
- "grad_norm": 0.7613642811775208,
474
- "learning_rate": 0.00019171767042310182,
475
- "loss": 1.451023406982422,
476
- "mean_token_accuracy": 0.6706610345840454,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
- "entropy": 1.449094181060791,
482
  "epoch": 4.851805728518057,
483
- "grad_norm": 0.7195038199424744,
484
- "learning_rate": 0.00018535463788174053,
485
- "loss": 1.4288282775878907,
486
- "mean_token_accuracy": 0.6740301263332367,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
- "entropy": 1.4346733844280244,
492
  "epoch": 4.976338729763388,
493
- "grad_norm": 0.7584077715873718,
494
- "learning_rate": 0.00017892731598454726,
495
- "loss": 1.4155799865722656,
496
- "mean_token_accuracy": 0.6788010179996491,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
- "eval_entropy": 1.4948647777701534,
503
- "eval_mean_token_accuracy": 0.661396715876668,
504
- "eval_not_syn_loss": 1.4358112812042236,
505
- "eval_not_syn_runtime": 95.9987,
506
- "eval_not_syn_samples_per_second": 14.323,
507
- "eval_not_syn_steps_per_second": 1.792,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
- "eval_entropy": 1.4439531952835793,
514
- "eval_mean_token_accuracy": 0.6999529783808908,
515
  "eval_num_tokens": 4996950.0,
516
- "eval_syn_loss": 1.3766233921051025,
517
- "eval_syn_runtime": 99.0602,
518
- "eval_syn_samples_per_second": 13.88,
519
- "eval_syn_steps_per_second": 1.736,
520
  "step": 2010
521
  },
522
  {
523
- "entropy": 1.345623204202363,
524
  "epoch": 5.099626400996264,
525
- "grad_norm": 0.8724157214164734,
526
- "learning_rate": 0.00017244781808693755,
527
- "loss": 1.323695831298828,
528
- "mean_token_accuracy": 0.6866910710479274,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
- "entropy": 1.3074172997474671,
534
  "epoch": 5.224159402241594,
535
- "grad_norm": 0.8229172825813293,
536
- "learning_rate": 0.00016592835587866364,
537
- "loss": 1.295655517578125,
538
- "mean_token_accuracy": 0.6931505644321442,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
- "entropy": 1.298924881219864,
544
  "epoch": 5.348692403486924,
545
- "grad_norm": 0.9111100435256958,
546
- "learning_rate": 0.0001593812163688578,
547
- "loss": 1.2771641540527343,
548
- "mean_token_accuracy": 0.6941236406564713,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
- "entropy": 1.3120970189571382,
554
  "epoch": 5.473225404732254,
555
- "grad_norm": 0.7868310213088989,
556
- "learning_rate": 0.000152818738729123,
557
- "loss": 1.2942347717285156,
558
- "mean_token_accuracy": 0.6939822369813919,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
- "entropy": 1.2729843455553054,
564
  "epoch": 5.597758405977584,
565
- "grad_norm": 0.9405992031097412,
566
- "learning_rate": 0.00014625329103831503,
567
- "loss": 1.2503909301757812,
568
- "mean_token_accuracy": 0.7005668586492538,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
- "entropy": 1.2202323937416077,
574
  "epoch": 5.722291407222914,
575
- "grad_norm": 0.7629554271697998,
576
- "learning_rate": 0.00013969724697284394,
577
- "loss": 1.199918212890625,
578
- "mean_token_accuracy": 0.7085251879692077,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
- "entropy": 1.1850077486038209,
584
  "epoch": 5.846824408468244,
585
- "grad_norm": 0.9016424417495728,
586
- "learning_rate": 0.00013316296248642664,
587
- "loss": 1.166585693359375,
588
- "mean_token_accuracy": 0.7123788893222809,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
- "entropy": 1.2040903121232986,
594
  "epoch": 5.971357409713574,
595
- "grad_norm": 0.8533362150192261,
596
- "learning_rate": 0.0001266627525232398,
597
- "loss": 1.1885869598388672,
598
- "mean_token_accuracy": 0.7100468480587006,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
- "eval_entropy": 1.2157157427349756,
605
- "eval_mean_token_accuracy": 0.7150737251653227,
606
- "eval_not_syn_loss": 1.2506903409957886,
607
- "eval_not_syn_runtime": 95.9618,
608
- "eval_not_syn_samples_per_second": 14.329,
609
- "eval_not_syn_steps_per_second": 1.792,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
- "eval_entropy": 1.178627569661584,
616
- "eval_mean_token_accuracy": 0.7016614221556242,
617
  "eval_num_tokens": 5996340.0,
618
- "eval_syn_loss": 1.2114850282669067,
619
- "eval_syn_runtime": 98.9234,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
623
  },
624
  {
625
- "entropy": 1.0982752972178988,
626
  "epoch": 6.094645080946451,
627
- "grad_norm": 1.295694351196289,
628
- "learning_rate": 0.00012020886780836267,
629
- "loss": 1.0774417877197267,
630
- "mean_token_accuracy": 0.7259544272615452,
631
  "num_tokens": 6091666.0,
632
  "step": 2450
633
  },
634
  {
635
- "entropy": 1.0651295524835587,
636
  "epoch": 6.219178082191781,
637
- "grad_norm": 0.9243665337562561,
638
- "learning_rate": 0.0001138134717592517,
639
- "loss": 1.0521366119384765,
640
- "mean_token_accuracy": 0.7293049448728561,
641
  "num_tokens": 6214889.0,
642
  "step": 2500
643
  },
644
  {
645
- "entropy": 1.0358434236049652,
646
  "epoch": 6.34371108343711,
647
- "grad_norm": 0.9286350011825562,
648
- "learning_rate": 0.00010748861756175999,
649
- "loss": 1.018977813720703,
650
- "mean_token_accuracy": 0.7370841908454895,
651
  "num_tokens": 6337795.0,
652
  "step": 2550
653
  },
654
  {
655
- "entropy": 1.0522522723674774,
656
  "epoch": 6.468244084682441,
657
- "grad_norm": 0.9158058762550354,
658
- "learning_rate": 0.00010124622545390751,
659
- "loss": 1.0335795593261718,
660
- "mean_token_accuracy": 0.7348936641216278,
661
  "num_tokens": 6466252.0,
662
  "step": 2600
663
  },
664
  {
665
- "entropy": 1.0202285438776015,
666
  "epoch": 6.592777085927771,
667
- "grad_norm": 0.8085289597511292,
668
- "learning_rate": 9.509806026021276e-05,
669
- "loss": 1.0074135589599609,
670
- "mean_token_accuracy": 0.7410361844301224,
671
  "num_tokens": 6592128.0,
672
  "step": 2650
673
  },
674
  {
675
- "entropy": 0.9939206629991532,
676
  "epoch": 6.717310087173101,
677
- "grad_norm": 0.7527234554290771,
678
- "learning_rate": 8.90557092189276e-05,
679
- "loss": 0.9766032409667968,
680
- "mean_token_accuracy": 0.7435237610340119,
681
  "num_tokens": 6716885.0,
682
  "step": 2700
683
  },
684
  {
685
- "entropy": 0.985169340968132,
686
  "epoch": 6.841843088418431,
687
- "grad_norm": 0.9773848056793213,
688
- "learning_rate": 8.313056014396262e-05,
689
- "loss": 0.9680111694335938,
690
- "mean_token_accuracy": 0.7455707538127899,
691
  "num_tokens": 6836942.0,
692
  "step": 2750
693
  },
694
  {
695
- "entropy": 0.9864326739311218,
696
  "epoch": 6.966376089663761,
697
- "grad_norm": 0.7536692023277283,
698
- "learning_rate": 7.733377996266039e-05,
699
- "loss": 0.9636287689208984,
700
- "mean_token_accuracy": 0.7498565518856048,
701
  "num_tokens": 6963394.0,
702
  "step": 2800
703
  },
704
  {
705
  "epoch": 7.0,
706
- "eval_entropy": 1.0250733893278032,
707
- "eval_mean_token_accuracy": 0.7460000244683997,
708
- "eval_not_syn_loss": 1.0630079507827759,
709
- "eval_not_syn_runtime": 96.0062,
710
- "eval_not_syn_samples_per_second": 14.322,
711
- "eval_not_syn_steps_per_second": 1.792,
712
  "eval_num_tokens": 6995730.0,
713
  "step": 2814
714
  },
715
  {
716
  "epoch": 7.0,
717
- "eval_entropy": 0.9910203045190766,
718
- "eval_mean_token_accuracy": 0.7360078449859175,
719
  "eval_num_tokens": 6995730.0,
720
- "eval_syn_loss": 1.0268242359161377,
721
- "eval_syn_runtime": 99.1365,
722
- "eval_syn_samples_per_second": 13.87,
723
- "eval_syn_steps_per_second": 1.735,
724
  "step": 2814
725
  },
726
  {
727
- "entropy": 0.8933790849916863,
728
  "epoch": 7.089663760896638,
729
- "grad_norm": 0.9901642799377441,
730
- "learning_rate": 7.16762936698672e-05,
731
- "loss": 0.870389404296875,
732
- "mean_token_accuracy": 0.7670638844220325,
733
  "num_tokens": 7081846.0,
734
  "step": 2850
735
  },
736
  {
737
- "entropy": 0.8327929264307022,
738
  "epoch": 7.214196762141968,
739
- "grad_norm": 0.9863399267196655,
740
- "learning_rate": 6.616876373796547e-05,
741
- "loss": 0.8118631744384766,
742
- "mean_token_accuracy": 0.7749309521913529,
743
  "num_tokens": 7202092.0,
744
  "step": 2900
745
  },
746
  {
747
- "entropy": 0.8421830326318741,
748
  "epoch": 7.338729763387297,
749
- "grad_norm": 0.9192395210266113,
750
- "learning_rate": 6.082157002167449e-05,
751
- "loss": 0.8261887359619141,
752
- "mean_token_accuracy": 0.7743502056598663,
753
  "num_tokens": 7328180.0,
754
  "step": 2950
755
  },
756
  {
757
- "entropy": 0.8124729514122009,
758
  "epoch": 7.463262764632628,
759
- "grad_norm": 0.807341456413269,
760
- "learning_rate": 5.564479019549013e-05,
761
- "loss": 0.79787353515625,
762
- "mean_token_accuracy": 0.7786311388015748,
763
  "num_tokens": 7449031.0,
764
  "step": 3000
765
  },
766
  {
767
- "entropy": 0.8328139960765839,
768
  "epoch": 7.587795765877957,
769
- "grad_norm": 0.8058074116706848,
770
- "learning_rate": 5.064818076063412e-05,
771
- "loss": 0.8112649536132812,
772
- "mean_token_accuracy": 0.7778408378362656,
773
  "num_tokens": 7574561.0,
774
  "step": 3050
775
  },
776
  {
777
- "entropy": 0.8063498467206955,
778
  "epoch": 7.712328767123288,
779
- "grad_norm": 1.1108580827713013,
780
- "learning_rate": 4.584115865730714e-05,
781
- "loss": 0.7904315185546875,
782
- "mean_token_accuracy": 0.7799610030651093,
783
  "num_tokens": 7700071.0,
784
  "step": 3100
785
  },
786
  {
787
- "entropy": 0.8056518918275833,
788
  "epoch": 7.8368617683686175,
789
- "grad_norm": 1.0046204328536987,
790
- "learning_rate": 4.123278351690132e-05,
791
- "loss": 0.7889098358154297,
792
- "mean_token_accuracy": 0.781885308623314,
793
  "num_tokens": 7831577.0,
794
  "step": 3150
795
  },
796
  {
797
- "entropy": 0.7862150323390961,
798
  "epoch": 7.961394769613948,
799
- "grad_norm": 0.8963404893875122,
800
- "learning_rate": 3.683174058762063e-05,
801
- "loss": 0.7703626251220703,
802
- "mean_token_accuracy": 0.7862878423929215,
803
  "num_tokens": 7954099.0,
804
  "step": 3200
805
  },
806
  {
807
  "epoch": 8.0,
808
- "eval_entropy": 0.8138092035471007,
809
- "eval_mean_token_accuracy": 0.7570307982522387,
810
- "eval_not_syn_loss": 0.9443947076797485,
811
- "eval_not_syn_runtime": 96.0009,
812
- "eval_not_syn_samples_per_second": 14.323,
813
- "eval_not_syn_steps_per_second": 1.792,
814
  "eval_num_tokens": 7995120.0,
815
  "step": 3216
816
  },
817
  {
818
  "epoch": 8.0,
819
- "eval_entropy": 0.7856217716322389,
820
- "eval_mean_token_accuracy": 0.7748590402824934,
821
  "eval_num_tokens": 7995120.0,
822
- "eval_syn_loss": 0.9141804575920105,
823
- "eval_syn_runtime": 99.0213,
824
- "eval_syn_samples_per_second": 13.886,
825
- "eval_syn_steps_per_second": 1.737,
826
  "step": 3216
827
  }
828
  ],
@@ -843,8 +843,8 @@
843
  "attributes": {}
844
  }
845
  },
846
- "total_flos": 1.6434446421627494e+18,
847
- "train_batch_size": 8,
848
  "trial_name": null,
849
  "trial_params": null
850
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  },
114
  {
115
+ "entropy": 0.5251132612577593,
116
  "epoch": 1.1195516811955168,
117
+ "grad_norm": 0.409588098526001,
118
+ "learning_rate": 0.00019826081236739378,
119
+ "loss": 0.48452903747558596,
120
+ "mean_token_accuracy": 0.8485486621808525,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
+ "entropy": 0.5255133463442325,
126
  "epoch": 1.244084682440847,
127
+ "grad_norm": 0.4035375416278839,
128
+ "learning_rate": 0.00019799182258138876,
129
+ "loss": 0.48237808227539064,
130
+ "mean_token_accuracy": 0.8493754121661187,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
+ "entropy": 0.5253566785156727,
136
  "epoch": 1.3686176836861768,
137
+ "grad_norm": 0.42013707756996155,
138
+ "learning_rate": 0.000197536589854019,
139
+ "loss": 0.4848367691040039,
140
+ "mean_token_accuracy": 0.8477037993073463,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
+ "entropy": 0.520346013456583,
146
  "epoch": 1.4931506849315068,
147
+ "grad_norm": 0.3776898980140686,
148
+ "learning_rate": 0.00019689597214695372,
149
+ "loss": 0.483323860168457,
150
+ "mean_token_accuracy": 0.8489549401402473,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
+ "entropy": 0.5263906873762607,
156
  "epoch": 1.6176836861768369,
157
+ "grad_norm": 0.7503569722175598,
158
+ "learning_rate": 0.00019607117681064075,
159
+ "loss": 0.4831574630737305,
160
+ "mean_token_accuracy": 0.8504172155261039,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
+ "entropy": 0.5092925234138965,
166
  "epoch": 1.7422166874221667,
167
+ "grad_norm": 0.3184664249420166,
168
+ "learning_rate": 0.00019506375830885428,
169
+ "loss": 0.4734436798095703,
170
+ "mean_token_accuracy": 0.8517335096001625,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
+ "entropy": 0.504551088064909,
176
  "epoch": 1.8667496886674968,
177
+ "grad_norm": 0.381900429725647,
178
+ "learning_rate": 0.00019387561528904946,
179
+ "loss": 0.4666786193847656,
180
+ "mean_token_accuracy": 0.8514578703045845,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
+ "entropy": 0.5108272171020508,
186
  "epoch": 1.9912826899128269,
187
+ "grad_norm": 0.3435378670692444,
188
+ "learning_rate": 0.00019250898700404634,
189
+ "loss": 0.4672324752807617,
190
+ "mean_token_accuracy": 0.8505285969376564,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
+ "eval_entropy": 0.515417086177094,
197
+ "eval_mean_token_accuracy": 0.8423887543206992,
198
+ "eval_not_syn_loss": 0.49809959530830383,
199
+ "eval_not_syn_runtime": 95.8874,
200
+ "eval_not_syn_samples_per_second": 14.34,
201
+ "eval_not_syn_steps_per_second": 1.794,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_entropy": 0.4982072594900464,
208
+ "eval_mean_token_accuracy": 0.8627851702446161,
209
  "eval_num_tokens": 1998780.0,
210
+ "eval_syn_loss": 0.47842374444007874,
211
+ "eval_syn_runtime": 98.9429,
212
+ "eval_syn_samples_per_second": 13.897,
213
+ "eval_syn_steps_per_second": 1.738,
214
  "step": 804
215
  },
216
  {
217
+ "entropy": 0.4303537303149098,
218
  "epoch": 2.1145703611457036,
219
+ "grad_norm": 0.35502323508262634,
220
+ "learning_rate": 0.00019096644909178583,
221
+ "loss": 0.38930774688720704,
222
+ "mean_token_accuracy": 0.8710548519486129,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
+ "entropy": 0.431627052128315,
228
  "epoch": 2.2391033623910337,
229
+ "grad_norm": 0.4276222288608551,
230
+ "learning_rate": 0.00018925090872111246,
231
+ "loss": 0.38792034149169924,
232
+ "mean_token_accuracy": 0.8717547968029976,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
+ "entropy": 0.43496647477149963,
238
  "epoch": 2.3636363636363638,
239
+ "grad_norm": 0.3478545546531677,
240
+ "learning_rate": 0.00018736559911273178,
241
+ "loss": 0.3930927276611328,
242
+ "mean_token_accuracy": 0.8688642394542694,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
+ "entropy": 0.4360816968977451,
248
  "epoch": 2.488169364881694,
249
+ "grad_norm": 0.4233720898628235,
250
+ "learning_rate": 0.00018531407344566915,
251
+ "loss": 0.39427772521972654,
252
+ "mean_token_accuracy": 0.867885695695877,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
+ "entropy": 0.43084816545248034,
258
  "epoch": 2.6127023661270234,
259
+ "grad_norm": 0.3549717664718628,
260
+ "learning_rate": 0.0001831001981607139,
261
+ "loss": 0.3911524200439453,
262
+ "mean_token_accuracy": 0.8704073330760003,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
+ "entropy": 0.42635570630431174,
268
  "epoch": 2.7372353673723535,
269
+ "grad_norm": 0.3825649917125702,
270
+ "learning_rate": 0.00018072814567346936,
271
+ "loss": 0.39040073394775393,
272
+ "mean_token_accuracy": 0.8706874567270279,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
+ "entropy": 0.4457248793542385,
278
  "epoch": 2.8617683686176836,
279
+ "grad_norm": 0.393543541431427,
280
+ "learning_rate": 0.00017820238651074317,
281
+ "loss": 0.40400577545166017,
282
+ "mean_token_accuracy": 0.8659286299347877,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
+ "entropy": 0.44322004675865173,
288
  "epoch": 2.9863013698630136,
289
+ "grad_norm": 0.32433032989501953,
290
+ "learning_rate": 0.0001755276808850968,
291
+ "loss": 0.3956157684326172,
292
+ "mean_token_accuracy": 0.8679120713472366,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
+ "eval_entropy": 0.4656750720947288,
299
+ "eval_mean_token_accuracy": 0.8436482773963795,
300
+ "eval_not_syn_loss": 0.502048671245575,
301
+ "eval_not_syn_runtime": 96.1696,
302
+ "eval_not_syn_samples_per_second": 14.298,
303
+ "eval_not_syn_steps_per_second": 1.789,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
+ "eval_entropy": 0.4488667759091355,
310
+ "eval_mean_token_accuracy": 0.86770307186038,
311
  "eval_num_tokens": 2998170.0,
312
+ "eval_syn_loss": 0.47810930013656616,
313
+ "eval_syn_runtime": 98.894,
314
+ "eval_syn_samples_per_second": 13.904,
315
+ "eval_syn_steps_per_second": 1.739,
316
  "step": 1206
317
  },
318
  {
319
+ "entropy": 0.34708237489967636,
320
  "epoch": 3.1095890410958904,
321
+ "grad_norm": 0.36334800720214844,
322
+ "learning_rate": 0.00017270906972343466,
323
+ "loss": 0.306053466796875,
324
+ "mean_token_accuracy": 0.8936621320970131,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
+ "entropy": 0.3444848913699389,
330
  "epoch": 3.2341220423412205,
331
+ "grad_norm": 0.3762398660182953,
332
+ "learning_rate": 0.00016975186516654035,
333
+ "loss": 0.30163915634155275,
334
+ "mean_token_accuracy": 0.8941574484109879,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
+ "entropy": 0.3394099518656731,
340
  "epoch": 3.3586550435865505,
341
+ "grad_norm": 0.4982982873916626,
342
+ "learning_rate": 0.00016666164055746508,
343
+ "loss": 0.2992570495605469,
344
+ "mean_token_accuracy": 0.8951553416252136,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
+ "entropy": 0.3437681415677071,
350
  "epoch": 3.4831880448318806,
351
+ "grad_norm": 0.48411592841148376,
352
+ "learning_rate": 0.00016344421993763733,
353
+ "loss": 0.30575496673583985,
354
+ "mean_token_accuracy": 0.8930543160438538,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
+ "entropy": 0.34468906089663504,
360
  "epoch": 3.6077210460772102,
361
+ "grad_norm": 0.4013198912143707,
362
+ "learning_rate": 0.00016010566707048957,
363
+ "loss": 0.3041123008728027,
364
+ "mean_token_accuracy": 0.8934586471319199,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
+ "entropy": 0.3555850328505039,
370
  "epoch": 3.7322540473225407,
371
+ "grad_norm": 0.3677787184715271,
372
+ "learning_rate": 0.00015665227401328915,
373
+ "loss": 0.31248834609985354,
374
+ "mean_token_accuracy": 0.8907824546098709,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
+ "entropy": 0.3529116767644882,
380
  "epoch": 3.8567870485678704,
381
+ "grad_norm": 0.3733390271663666,
382
+ "learning_rate": 0.00015309054925871163,
383
+ "loss": 0.3145001220703125,
384
+ "mean_token_accuracy": 0.8914457809925079,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
+ "entropy": 0.3519477976113558,
390
  "epoch": 3.9813200498132004,
391
+ "grad_norm": 0.4491842985153198,
392
+ "learning_rate": 0.0001494272054685054,
393
+ "loss": 0.31178840637207034,
394
+ "mean_token_accuracy": 0.8901231214404106,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
+ "eval_entropy": 0.4111844826229783,
401
+ "eval_mean_token_accuracy": 0.8401426447685375,
402
+ "eval_not_syn_loss": 0.5313173532485962,
403
+ "eval_not_syn_runtime": 95.84,
404
+ "eval_not_syn_samples_per_second": 14.347,
405
+ "eval_not_syn_steps_per_second": 1.795,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
+ "eval_entropy": 0.39694498886549195,
412
+ "eval_mean_token_accuracy": 0.8641166465226994,
413
  "eval_num_tokens": 3997560.0,
414
+ "eval_syn_loss": 0.5080230832099915,
415
+ "eval_syn_runtime": 98.944,
416
+ "eval_syn_samples_per_second": 13.897,
417
+ "eval_syn_steps_per_second": 1.738,
418
  "step": 1608
419
  },
420
  {
421
+ "entropy": 0.26439598014559407,
422
  "epoch": 4.104607721046078,
423
+ "grad_norm": 0.4967460632324219,
424
+ "learning_rate": 0.0001456691468223661,
425
+ "loss": 0.22732419967651368,
426
+ "mean_token_accuracy": 0.9187829334928532,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
+ "entropy": 0.2497277507185936,
432
  "epoch": 4.229140722291407,
433
+ "grad_norm": 0.3764539659023285,
434
+ "learning_rate": 0.00014182345600586332,
435
+ "loss": 0.21029539108276368,
436
+ "mean_token_accuracy": 0.9221936762332916,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
+ "entropy": 0.2486180279403925,
442
  "epoch": 4.353673723536737,
443
+ "grad_norm": 0.37183400988578796,
444
+ "learning_rate": 0.0001378973808619437,
445
+ "loss": 0.21009698867797852,
446
+ "mean_token_accuracy": 0.9225871834158897,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
+ "entropy": 0.26017799742519854,
452
  "epoch": 4.478206724782067,
453
+ "grad_norm": 0.43216991424560547,
454
+ "learning_rate": 0.00013389832073116724,
455
+ "loss": 0.2177184295654297,
456
+ "mean_token_accuracy": 0.9205843013525009,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
+ "entropy": 0.26708075165748596,
462
  "epoch": 4.602739726027397,
463
+ "grad_norm": 0.37948140501976013,
464
+ "learning_rate": 0.00012983381250642132,
465
+ "loss": 0.22203298568725585,
466
+ "mean_token_accuracy": 0.9184661367535591,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
+ "entropy": 0.25188380032777785,
472
  "epoch": 4.7272727272727275,
473
+ "grad_norm": 0.4874045252799988,
474
+ "learning_rate": 0.00012571151642839446,
475
+ "loss": 0.21441835403442383,
476
+ "mean_token_accuracy": 0.9194883331656456,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
+ "entropy": 0.2515877375751734,
482
  "epoch": 4.851805728518057,
483
+ "grad_norm": 0.43732911348342896,
484
+ "learning_rate": 0.00012153920164858083,
485
+ "loss": 0.21374931335449218,
486
+ "mean_token_accuracy": 0.921038504242897,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
+ "entropy": 0.254078243970871,
492
  "epoch": 4.976338729763388,
493
+ "grad_norm": 0.41199925541877747,
494
+ "learning_rate": 0.00011732473158702397,
495
+ "loss": 0.21298355102539063,
496
+ "mean_token_accuracy": 0.9217021322250366,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
+ "eval_entropy": 0.31937412129238596,
503
+ "eval_mean_token_accuracy": 0.8401600659586662,
504
+ "eval_not_syn_loss": 0.6070574522018433,
505
+ "eval_not_syn_runtime": 95.8354,
506
+ "eval_not_syn_samples_per_second": 14.348,
507
+ "eval_not_syn_steps_per_second": 1.795,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
+ "eval_entropy": 0.30796578786400863,
514
+ "eval_mean_token_accuracy": 0.8628802147022513,
515
  "eval_num_tokens": 4996950.0,
516
+ "eval_syn_loss": 0.5800920128822327,
517
+ "eval_syn_runtime": 98.9566,
518
+ "eval_syn_samples_per_second": 13.895,
519
+ "eval_syn_steps_per_second": 1.738,
520
  "step": 2010
521
  },
522
  {
523
+ "entropy": 0.19132825570425602,
524
  "epoch": 5.099626400996264,
525
+ "grad_norm": 0.45538511872291565,
526
+ "learning_rate": 0.0001130760491123961,
527
+ "loss": 0.153853759765625,
528
+ "mean_token_accuracy": 0.9434747304579224,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
+ "entropy": 0.17422323439270257,
534
  "epoch": 5.224159402241594,
535
+ "grad_norm": 0.4580552279949188,
536
+ "learning_rate": 0.00010880116157234302,
537
+ "loss": 0.13612208366394044,
538
+ "mean_token_accuracy": 0.9487342464923859,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
+ "entropy": 0.1799074411019683,
544
  "epoch": 5.348692403486924,
545
+ "grad_norm": 0.39040836691856384,
546
+ "learning_rate": 0.00010450812570230789,
547
+ "loss": 0.14250579833984375,
548
+ "mean_token_accuracy": 0.9467630323767662,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
+ "entropy": 0.18116022080183028,
554
  "epoch": 5.473225404732254,
555
+ "grad_norm": 0.3805501163005829,
556
+ "learning_rate": 0.00010020503244127544,
557
+ "loss": 0.14014955520629882,
558
+ "mean_token_accuracy": 0.946874064207077,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
+ "entropy": 0.1810251370444894,
564
  "epoch": 5.597758405977584,
565
+ "grad_norm": 0.36377736926078796,
566
+ "learning_rate": 9.589999168305372e-05,
567
+ "loss": 0.14044331550598144,
568
+ "mean_token_accuracy": 0.9467302888631821,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
+ "entropy": 0.17202148117125035,
574
  "epoch": 5.722291407222914,
575
+ "grad_norm": 0.38708847761154175,
576
+ "learning_rate": 9.16011169918326e-05,
577
+ "loss": 0.13840054512023925,
578
+ "mean_token_accuracy": 0.9480020496249199,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
+ "entropy": 0.1769936926662922,
584
  "epoch": 5.846824408468244,
585
+ "grad_norm": 0.4674988389015198,
586
+ "learning_rate": 8.73165103108249e-05,
587
+ "loss": 0.1404121208190918,
588
+ "mean_token_accuracy": 0.9461787036061287,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
+ "entropy": 0.17118842527270317,
594
  "epoch": 5.971357409713574,
595
+ "grad_norm": 0.4454115629196167,
596
+ "learning_rate": 8.305424669280888e-05,
597
+ "loss": 0.1366124439239502,
598
+ "mean_token_accuracy": 0.9484315186738967,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
+ "eval_entropy": 0.2494078171114589,
605
+ "eval_mean_token_accuracy": 0.851767166062843,
606
+ "eval_not_syn_loss": 0.7222095131874084,
607
+ "eval_not_syn_runtime": 95.8715,
608
+ "eval_not_syn_samples_per_second": 14.342,
609
+ "eval_not_syn_steps_per_second": 1.794,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
+ "eval_entropy": 0.239824180079754,
616
+ "eval_mean_token_accuracy": 0.8484749911829482,
617
  "eval_num_tokens": 5996340.0,
618
+ "eval_syn_loss": 0.7036991119384766,
619
+ "eval_syn_runtime": 98.923,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
623
  },
624
  {
625
+ "entropy": 0.14340858902744572,
626
  "epoch": 6.094645080946451,
627
+ "grad_norm": 0.270256370306015,
628
+ "learning_rate": 7.88223590813502e-05,
629
+ "loss": 0.10398475646972656,
630
+ "mean_token_accuracy": 0.9604840600731397,
631
  "num_tokens": 6091666.0,
632
  "step": 2450
633
  },
634
  {
635
+ "entropy": 0.13108037687838078,
636
  "epoch": 6.219178082191781,
637
+ "grad_norm": 0.2870016396045685,
638
+ "learning_rate": 7.462882317138628e-05,
639
+ "loss": 0.09566926002502442,
640
+ "mean_token_accuracy": 0.9636739170551301,
641
  "num_tokens": 6214889.0,
642
  "step": 2500
643
  },
644
  {
645
+ "entropy": 0.1305130884796381,
646
  "epoch": 6.34371108343711,
647
+ "grad_norm": 0.34119686484336853,
648
+ "learning_rate": 7.048154237770433e-05,
649
+ "loss": 0.0943364143371582,
650
+ "mean_token_accuracy": 0.9643464788794518,
651
  "num_tokens": 6337795.0,
652
  "step": 2550
653
  },
654
  {
655
+ "entropy": 0.12516659261658789,
656
  "epoch": 6.468244084682441,
657
+ "grad_norm": 0.2851012051105499,
658
+ "learning_rate": 6.638833293964401e-05,
659
+ "loss": 0.09339779853820801,
660
+ "mean_token_accuracy": 0.9654107868671418,
661
  "num_tokens": 6466252.0,
662
  "step": 2600
663
  },
664
  {
665
+ "entropy": 0.12894487291574477,
666
  "epoch": 6.592777085927771,
667
+ "grad_norm": 0.22259071469306946,
668
+ "learning_rate": 6.235690919009636e-05,
669
+ "loss": 0.09672751426696777,
670
+ "mean_token_accuracy": 0.9641611188650131,
671
  "num_tokens": 6592128.0,
672
  "step": 2650
673
  },
674
  {
675
+ "entropy": 0.13079525250941515,
676
  "epoch": 6.717310087173101,
677
+ "grad_norm": 0.2873247265815735,
678
+ "learning_rate": 5.839486901656255e-05,
679
+ "loss": 0.09482893943786622,
680
+ "mean_token_accuracy": 0.963907478749752,
681
  "num_tokens": 6716885.0,
682
  "step": 2700
683
  },
684
  {
685
+ "entropy": 0.13382539574056865,
686
  "epoch": 6.841843088418431,
687
+ "grad_norm": 0.26538875699043274,
688
+ "learning_rate": 5.450967954167317e-05,
689
+ "loss": 0.09817559242248536,
690
+ "mean_token_accuracy": 0.9620411720871925,
691
  "num_tokens": 6836942.0,
692
  "step": 2750
693
  },
694
  {
695
+ "entropy": 0.12861237980425358,
696
  "epoch": 6.966376089663761,
697
+ "grad_norm": 0.3292505443096161,
698
+ "learning_rate": 5.070866305015545e-05,
699
+ "loss": 0.09485826492309571,
700
+ "mean_token_accuracy": 0.9636287876963615,
701
  "num_tokens": 6963394.0,
702
  "step": 2800
703
  },
704
  {
705
  "epoch": 7.0,
706
+ "eval_entropy": 0.2065339538073817,
707
+ "eval_mean_token_accuracy": 0.8418768654729045,
708
+ "eval_not_syn_loss": 0.8473101854324341,
709
+ "eval_not_syn_runtime": 95.8493,
710
+ "eval_not_syn_samples_per_second": 14.345,
711
+ "eval_not_syn_steps_per_second": 1.794,
712
  "eval_num_tokens": 6995730.0,
713
  "step": 2814
714
  },
715
  {
716
  "epoch": 7.0,
717
+ "eval_entropy": 0.19768535250494645,
718
+ "eval_mean_token_accuracy": 0.8554798219093057,
719
  "eval_num_tokens": 6995730.0,
720
+ "eval_syn_loss": 0.8169878125190735,
721
+ "eval_syn_runtime": 98.9508,
722
+ "eval_syn_samples_per_second": 13.896,
723
+ "eval_syn_steps_per_second": 1.738,
724
  "step": 2814
725
  },
726
  {
727
+ "entropy": 0.12029899715097865,
728
  "epoch": 7.089663760896638,
729
+ "grad_norm": 0.18435120582580566,
730
+ "learning_rate": 4.699898318877223e-05,
731
+ "loss": 0.08083279609680176,
732
+ "mean_token_accuracy": 0.9689394840688417,
733
  "num_tokens": 7081846.0,
734
  "step": 2850
735
  },
736
  {
737
+ "entropy": 0.10581521928310395,
738
  "epoch": 7.214196762141968,
739
+ "grad_norm": 0.2981043756008148,
740
+ "learning_rate": 4.338763146523955e-05,
741
+ "loss": 0.07383342266082764,
742
+ "mean_token_accuracy": 0.9726339945197106,
743
  "num_tokens": 7202092.0,
744
  "step": 2900
745
  },
746
  {
747
+ "entropy": 0.10466793559491634,
748
  "epoch": 7.338729763387297,
749
+ "grad_norm": 0.24411630630493164,
750
+ "learning_rate": 3.988141407156982e-05,
751
+ "loss": 0.07236807346343994,
752
+ "mean_token_accuracy": 0.9726200112700463,
753
  "num_tokens": 7328180.0,
754
  "step": 2950
755
  },
756
  {
757
+ "entropy": 0.10792888538911939,
758
  "epoch": 7.463262764632628,
759
+ "grad_norm": 0.15558083355426788,
760
+ "learning_rate": 3.6486939056672404e-05,
761
+ "loss": 0.0738653564453125,
762
+ "mean_token_accuracy": 0.9714727732539177,
763
  "num_tokens": 7449031.0,
764
  "step": 3000
765
  },
766
  {
767
+ "entropy": 0.10735130561515689,
768
  "epoch": 7.587795765877957,
769
+ "grad_norm": 0.1883469820022583,
770
+ "learning_rate": 3.321060387238841e-05,
771
+ "loss": 0.07414769649505615,
772
+ "mean_token_accuracy": 0.9723327186703682,
773
  "num_tokens": 7574561.0,
774
  "step": 3050
775
  },
776
  {
777
+ "entropy": 0.10272138025611639,
778
  "epoch": 7.712328767123288,
779
+ "grad_norm": 0.15312588214874268,
780
+ "learning_rate": 3.0058583316430158e-05,
781
+ "loss": 0.07302286624908447,
782
+ "mean_token_accuracy": 0.9726834264397621,
783
  "num_tokens": 7700071.0,
784
  "step": 3100
785
  },
786
  {
787
+ "entropy": 0.10020780436694622,
788
  "epoch": 7.8368617683686175,
789
+ "grad_norm": 0.15675754845142365,
790
+ "learning_rate": 2.7036817894949623e-05,
791
+ "loss": 0.06891141891479492,
792
+ "mean_token_accuracy": 0.9731867194175721,
793
  "num_tokens": 7831577.0,
794
  "step": 3150
795
  },
796
  {
797
+ "entropy": 0.10623522130772471,
798
  "epoch": 7.961394769613948,
799
+ "grad_norm": 0.28400713205337524,
800
+ "learning_rate": 2.415100262666817e-05,
801
+ "loss": 0.07267386436462403,
802
+ "mean_token_accuracy": 0.9713605433702469,
803
  "num_tokens": 7954099.0,
804
  "step": 3200
805
  },
806
  {
807
  "epoch": 8.0,
808
+ "eval_entropy": 0.18371209263974844,
809
+ "eval_mean_token_accuracy": 0.8438690238913824,
810
+ "eval_not_syn_loss": 0.9368809461593628,
811
+ "eval_not_syn_runtime": 95.8165,
812
+ "eval_not_syn_samples_per_second": 14.35,
813
+ "eval_not_syn_steps_per_second": 1.795,
814
  "eval_num_tokens": 7995120.0,
815
  "step": 3216
816
  },
817
  {
818
  "epoch": 8.0,
819
+ "eval_entropy": 0.17606536935754988,
820
+ "eval_mean_token_accuracy": 0.8541917884072592,
821
  "eval_num_tokens": 7995120.0,
822
+ "eval_syn_loss": 0.905842661857605,
823
+ "eval_syn_runtime": 98.8452,
824
+ "eval_syn_samples_per_second": 13.911,
825
+ "eval_syn_steps_per_second": 1.74,
826
  "step": 3216
827
  }
828
  ],
 
843
  "attributes": {}
844
  }
845
  },
846
+ "total_flos": 1.3106142200946278e+18,
847
+ "train_batch_size": 4,
848
  "trial_name": null,
849
  "trial_params": null
850
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3216/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3618/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3618/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f5ff33740ef685048e8f2403f165f907b8ea0f2064c8153354a3b2e734db35c
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae74308deabc0838dec9482819893f19f9a0c7baf003066c29407ba658bda3fd
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3618/trainer_state.json CHANGED
@@ -10,921 +10,921 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  },
114
  {
115
- "entropy": 0.5431136073488178,
116
  "epoch": 1.1195516811955168,
117
- "grad_norm": 0.6364207863807678,
118
- "learning_rate": 0.0003023597373031377,
119
- "loss": 0.5128697204589844,
120
- "mean_token_accuracy": 0.8424755226482045,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
- "entropy": 0.5492669984698295,
126
  "epoch": 1.244084682440847,
127
- "grad_norm": 0.7887948155403137,
128
- "learning_rate": 0.00030194951160062724,
129
- "loss": 0.5205921554565429,
130
- "mean_token_accuracy": 0.8417876678705215,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
- "entropy": 0.8709675747156144,
136
  "epoch": 1.3686176836861768,
137
- "grad_norm": 0.734591007232666,
138
- "learning_rate": 0.00030125525414139597,
139
- "loss": 0.9426271057128907,
140
- "mean_token_accuracy": 0.7773911562561989,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
- "entropy": 0.5833489724993706,
146
  "epoch": 1.4931506849315068,
147
- "grad_norm": 1.1496635675430298,
148
- "learning_rate": 0.000300278273368912,
149
- "loss": 0.560246467590332,
150
- "mean_token_accuracy": 0.8333927792310715,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
- "entropy": 3.3383523294329644,
156
  "epoch": 1.6176836861768369,
157
- "grad_norm": 4.88291597366333,
158
- "learning_rate": 0.0002990204105656753,
159
- "loss": 3.895135803222656,
160
- "mean_token_accuracy": 0.4006860972382128,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
- "entropy": 4.406278321743011,
166
  "epoch": 1.7422166874221667,
167
- "grad_norm": 2.488743305206299,
168
- "learning_rate": 0.0002974840363830152,
169
- "loss": 4.424278259277344,
170
- "mean_token_accuracy": 0.2545871638506651,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
- "entropy": 3.0789780139923097,
176
  "epoch": 1.8667496886674968,
177
- "grad_norm": 2.5833027362823486,
178
- "learning_rate": 0.00029567204637320413,
179
- "loss": 3.105696716308594,
180
- "mean_token_accuracy": 0.4513031896948814,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
- "entropy": 2.616127333641052,
186
  "epoch": 1.9912826899128269,
187
- "grad_norm": 1.0389364957809448,
188
- "learning_rate": 0.00029358785553230884,
189
- "loss": 2.6003131103515624,
190
- "mean_token_accuracy": 0.5205484080314636,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
- "eval_entropy": 2.5684494819752004,
197
- "eval_mean_token_accuracy": 0.547617181095966,
198
- "eval_not_syn_loss": 2.458150863647461,
199
- "eval_not_syn_runtime": 95.9896,
200
- "eval_not_syn_samples_per_second": 14.324,
201
- "eval_not_syn_steps_per_second": 1.792,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_entropy": 2.5337963104248047,
208
- "eval_mean_token_accuracy": 0.5359611507765082,
209
  "eval_num_tokens": 1998780.0,
210
- "eval_syn_loss": 2.417705535888672,
211
- "eval_syn_runtime": 99.0288,
212
- "eval_syn_samples_per_second": 13.885,
213
- "eval_syn_steps_per_second": 1.737,
214
  "step": 804
215
  },
216
  {
217
- "entropy": 2.380885266294383,
218
  "epoch": 2.1145703611457036,
219
- "grad_norm": 0.7961218953132629,
220
- "learning_rate": 0.00029123539186406294,
221
- "loss": 2.3589501953125,
222
- "mean_token_accuracy": 0.550012742630159,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
- "entropy": 2.3139565205574035,
228
  "epoch": 2.2391033623910337,
229
- "grad_norm": 0.7128121256828308,
230
- "learning_rate": 0.00028861908897689166,
231
- "loss": 2.2914646911621093,
232
- "mean_token_accuracy": 0.5581977427005768,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
- "entropy": 2.247106301784515,
238
  "epoch": 2.3636363636363638,
239
- "grad_norm": 0.7539874315261841,
240
- "learning_rate": 0.00028574387772804067,
241
- "loss": 2.212442169189453,
242
- "mean_token_accuracy": 0.5661728882789612,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
- "entropy": 2.177862950563431,
248
  "epoch": 2.488169364881694,
249
- "grad_norm": 1.169326663017273,
250
- "learning_rate": 0.00028261517693055664,
251
- "loss": 2.1524928283691405,
252
- "mean_token_accuracy": 0.574029511809349,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
- "entropy": 2.11708885550499,
258
  "epoch": 2.6127023661270234,
259
- "grad_norm": 0.6508749723434448,
260
- "learning_rate": 0.0002792388831406343,
261
- "loss": 2.0897698974609376,
262
- "mean_token_accuracy": 0.5808322209119797,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
- "entropy": 2.0785110569000245,
268
  "epoch": 2.7372353673723535,
269
- "grad_norm": 1.4157167673110962,
270
- "learning_rate": 0.0002756213595445772,
271
- "loss": 2.0574497985839844,
272
- "mean_token_accuracy": 0.5843770909309387,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
- "entropy": 2.091529769897461,
278
  "epoch": 2.8617683686176836,
279
- "grad_norm": 1.6840300559997559,
280
- "learning_rate": 0.00027176942396631626,
281
- "loss": 2.062296142578125,
282
- "mean_token_accuracy": 0.58568293094635,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
- "entropy": 2.0315397584438326,
288
  "epoch": 2.9863013698630136,
289
- "grad_norm": 1.153908133506775,
290
- "learning_rate": 0.0002676903360180885,
291
- "loss": 2.0053924560546874,
292
- "mean_token_accuracy": 0.5878721660375595,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
- "eval_entropy": 2.0182155738043233,
299
- "eval_mean_token_accuracy": 0.5854449913252232,
300
- "eval_not_syn_loss": 2.0105812549591064,
301
- "eval_not_syn_runtime": 95.9278,
302
- "eval_not_syn_samples_per_second": 14.334,
303
- "eval_not_syn_steps_per_second": 1.793,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
- "eval_entropy": 1.9446905400863914,
310
- "eval_mean_token_accuracy": 0.5995188099007274,
311
  "eval_num_tokens": 2998170.0,
312
- "eval_syn_loss": 1.9396028518676758,
313
- "eval_syn_runtime": 98.8737,
314
- "eval_syn_samples_per_second": 13.907,
315
- "eval_syn_steps_per_second": 1.74,
316
  "step": 1206
317
  },
318
  {
319
- "entropy": 1.9578519951213489,
320
  "epoch": 3.1095890410958904,
321
- "grad_norm": 0.6903329491615295,
322
- "learning_rate": 0.00026339178341849265,
323
- "loss": 1.9289002990722657,
324
- "mean_token_accuracy": 0.5986791457792725,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
- "entropy": 1.9225856459140778,
330
  "epoch": 3.2341220423412205,
331
- "grad_norm": 0.9334422945976257,
332
- "learning_rate": 0.00025888186750370616,
333
- "loss": 1.9024064636230469,
334
- "mean_token_accuracy": 0.6029617834091187,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
- "entropy": 1.8814079523086549,
340
  "epoch": 3.3586550435865505,
341
- "grad_norm": 0.779586911201477,
342
- "learning_rate": 0.00025416908795917244,
343
- "loss": 1.8609922790527345,
344
- "mean_token_accuracy": 0.6097110098600388,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
- "entropy": 1.8681990671157838,
350
  "epoch": 3.4831880448318806,
351
- "grad_norm": 0.941205620765686,
352
- "learning_rate": 0.0002492623268005321,
353
- "loss": 1.8448422241210938,
354
- "mean_token_accuracy": 0.6135021150112152,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
- "entropy": 1.8034737646579742,
360
  "epoch": 3.6077210460772102,
361
- "grad_norm": 1.2094346284866333,
362
- "learning_rate": 0.0002441708316339893,
363
- "loss": 1.7913978576660157,
364
- "mean_token_accuracy": 0.622687805891037,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
- "entropy": 1.8016248047351837,
370
  "epoch": 3.7322540473225407,
371
- "grad_norm": 0.7303705215454102,
372
- "learning_rate": 0.00023890419822766254,
373
- "loss": 1.7701612854003905,
374
- "mean_token_accuracy": 0.6246249091625213,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
- "entropy": 1.7500162029266357,
380
  "epoch": 3.8567870485678704,
381
- "grad_norm": 0.954683244228363,
382
- "learning_rate": 0.0002334723524267661,
383
- "loss": 1.7250523376464844,
384
- "mean_token_accuracy": 0.6312138819694519,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
- "entropy": 1.6828746914863586,
390
  "epoch": 3.9813200498132004,
391
- "grad_norm": 0.8929846882820129,
392
- "learning_rate": 0.0002278855314467064,
393
- "loss": 1.6539949035644532,
394
- "mean_token_accuracy": 0.6412730294466019,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
- "eval_entropy": 1.7406537913998892,
401
- "eval_mean_token_accuracy": 0.6226560525422873,
402
- "eval_not_syn_loss": 1.707612156867981,
403
- "eval_not_syn_runtime": 95.956,
404
- "eval_not_syn_samples_per_second": 14.329,
405
- "eval_not_syn_steps_per_second": 1.792,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
- "eval_entropy": 1.6662698221761127,
412
- "eval_mean_token_accuracy": 0.6587355476479198,
413
  "eval_num_tokens": 3997560.0,
414
- "eval_syn_loss": 1.6402223110198975,
415
- "eval_syn_runtime": 99.0503,
416
- "eval_syn_samples_per_second": 13.882,
417
- "eval_syn_steps_per_second": 1.736,
418
  "step": 1608
419
  },
420
  {
421
- "entropy": 1.634241136637601,
422
  "epoch": 4.104607721046078,
423
- "grad_norm": 0.7396109700202942,
424
- "learning_rate": 0.0002221542645793497,
425
- "loss": 1.610531768798828,
426
- "mean_token_accuracy": 0.6454936681371747,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
- "entropy": 1.588758965730667,
432
  "epoch": 4.229140722291407,
433
- "grad_norm": 0.779146134853363,
434
- "learning_rate": 0.00021628935334882304,
435
- "loss": 1.5645944213867187,
436
- "mean_token_accuracy": 0.6525067055225372,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
- "entropy": 1.5439102852344513,
442
  "epoch": 4.353673723536737,
443
- "grad_norm": 0.614967942237854,
444
- "learning_rate": 0.00021030185115424846,
445
- "loss": 1.5180734252929688,
446
- "mean_token_accuracy": 0.6614933741092682,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
- "entropy": 1.5201536059379577,
452
  "epoch": 4.478206724782067,
453
- "grad_norm": 0.6725050210952759,
454
- "learning_rate": 0.00020420304243777673,
455
- "loss": 1.4964521789550782,
456
- "mean_token_accuracy": 0.6630363047122956,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
- "entropy": 1.503999525308609,
462
  "epoch": 4.602739726027397,
463
- "grad_norm": 0.7740678191184998,
464
- "learning_rate": 0.00019800442141718245,
465
- "loss": 1.4821170043945313,
466
- "mean_token_accuracy": 0.6658095389604568,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
- "entropy": 1.4707296586036682,
472
  "epoch": 4.7272727272727275,
473
- "grad_norm": 0.7613642811775208,
474
- "learning_rate": 0.00019171767042310182,
475
- "loss": 1.451023406982422,
476
- "mean_token_accuracy": 0.6706610345840454,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
- "entropy": 1.449094181060791,
482
  "epoch": 4.851805728518057,
483
- "grad_norm": 0.7195038199424744,
484
- "learning_rate": 0.00018535463788174053,
485
- "loss": 1.4288282775878907,
486
- "mean_token_accuracy": 0.6740301263332367,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
- "entropy": 1.4346733844280244,
492
  "epoch": 4.976338729763388,
493
- "grad_norm": 0.7584077715873718,
494
- "learning_rate": 0.00017892731598454726,
495
- "loss": 1.4155799865722656,
496
- "mean_token_accuracy": 0.6788010179996491,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
- "eval_entropy": 1.4948647777701534,
503
- "eval_mean_token_accuracy": 0.661396715876668,
504
- "eval_not_syn_loss": 1.4358112812042236,
505
- "eval_not_syn_runtime": 95.9987,
506
- "eval_not_syn_samples_per_second": 14.323,
507
- "eval_not_syn_steps_per_second": 1.792,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
- "eval_entropy": 1.4439531952835793,
514
- "eval_mean_token_accuracy": 0.6999529783808908,
515
  "eval_num_tokens": 4996950.0,
516
- "eval_syn_loss": 1.3766233921051025,
517
- "eval_syn_runtime": 99.0602,
518
- "eval_syn_samples_per_second": 13.88,
519
- "eval_syn_steps_per_second": 1.736,
520
  "step": 2010
521
  },
522
  {
523
- "entropy": 1.345623204202363,
524
  "epoch": 5.099626400996264,
525
- "grad_norm": 0.8724157214164734,
526
- "learning_rate": 0.00017244781808693755,
527
- "loss": 1.323695831298828,
528
- "mean_token_accuracy": 0.6866910710479274,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
- "entropy": 1.3074172997474671,
534
  "epoch": 5.224159402241594,
535
- "grad_norm": 0.8229172825813293,
536
- "learning_rate": 0.00016592835587866364,
537
- "loss": 1.295655517578125,
538
- "mean_token_accuracy": 0.6931505644321442,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
- "entropy": 1.298924881219864,
544
  "epoch": 5.348692403486924,
545
- "grad_norm": 0.9111100435256958,
546
- "learning_rate": 0.0001593812163688578,
547
- "loss": 1.2771641540527343,
548
- "mean_token_accuracy": 0.6941236406564713,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
- "entropy": 1.3120970189571382,
554
  "epoch": 5.473225404732254,
555
- "grad_norm": 0.7868310213088989,
556
- "learning_rate": 0.000152818738729123,
557
- "loss": 1.2942347717285156,
558
- "mean_token_accuracy": 0.6939822369813919,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
- "entropy": 1.2729843455553054,
564
  "epoch": 5.597758405977584,
565
- "grad_norm": 0.9405992031097412,
566
- "learning_rate": 0.00014625329103831503,
567
- "loss": 1.2503909301757812,
568
- "mean_token_accuracy": 0.7005668586492538,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
- "entropy": 1.2202323937416077,
574
  "epoch": 5.722291407222914,
575
- "grad_norm": 0.7629554271697998,
576
- "learning_rate": 0.00013969724697284394,
577
- "loss": 1.199918212890625,
578
- "mean_token_accuracy": 0.7085251879692077,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
- "entropy": 1.1850077486038209,
584
  "epoch": 5.846824408468244,
585
- "grad_norm": 0.9016424417495728,
586
- "learning_rate": 0.00013316296248642664,
587
- "loss": 1.166585693359375,
588
- "mean_token_accuracy": 0.7123788893222809,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
- "entropy": 1.2040903121232986,
594
  "epoch": 5.971357409713574,
595
- "grad_norm": 0.8533362150192261,
596
- "learning_rate": 0.0001266627525232398,
597
- "loss": 1.1885869598388672,
598
- "mean_token_accuracy": 0.7100468480587006,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
- "eval_entropy": 1.2157157427349756,
605
- "eval_mean_token_accuracy": 0.7150737251653227,
606
- "eval_not_syn_loss": 1.2506903409957886,
607
- "eval_not_syn_runtime": 95.9618,
608
- "eval_not_syn_samples_per_second": 14.329,
609
- "eval_not_syn_steps_per_second": 1.792,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
- "eval_entropy": 1.178627569661584,
616
- "eval_mean_token_accuracy": 0.7016614221556242,
617
  "eval_num_tokens": 5996340.0,
618
- "eval_syn_loss": 1.2114850282669067,
619
- "eval_syn_runtime": 98.9234,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
623
  },
624
  {
625
- "entropy": 1.0982752972178988,
626
  "epoch": 6.094645080946451,
627
- "grad_norm": 1.295694351196289,
628
- "learning_rate": 0.00012020886780836267,
629
- "loss": 1.0774417877197267,
630
- "mean_token_accuracy": 0.7259544272615452,
631
  "num_tokens": 6091666.0,
632
  "step": 2450
633
  },
634
  {
635
- "entropy": 1.0651295524835587,
636
  "epoch": 6.219178082191781,
637
- "grad_norm": 0.9243665337562561,
638
- "learning_rate": 0.0001138134717592517,
639
- "loss": 1.0521366119384765,
640
- "mean_token_accuracy": 0.7293049448728561,
641
  "num_tokens": 6214889.0,
642
  "step": 2500
643
  },
644
  {
645
- "entropy": 1.0358434236049652,
646
  "epoch": 6.34371108343711,
647
- "grad_norm": 0.9286350011825562,
648
- "learning_rate": 0.00010748861756175999,
649
- "loss": 1.018977813720703,
650
- "mean_token_accuracy": 0.7370841908454895,
651
  "num_tokens": 6337795.0,
652
  "step": 2550
653
  },
654
  {
655
- "entropy": 1.0522522723674774,
656
  "epoch": 6.468244084682441,
657
- "grad_norm": 0.9158058762550354,
658
- "learning_rate": 0.00010124622545390751,
659
- "loss": 1.0335795593261718,
660
- "mean_token_accuracy": 0.7348936641216278,
661
  "num_tokens": 6466252.0,
662
  "step": 2600
663
  },
664
  {
665
- "entropy": 1.0202285438776015,
666
  "epoch": 6.592777085927771,
667
- "grad_norm": 0.8085289597511292,
668
- "learning_rate": 9.509806026021276e-05,
669
- "loss": 1.0074135589599609,
670
- "mean_token_accuracy": 0.7410361844301224,
671
  "num_tokens": 6592128.0,
672
  "step": 2650
673
  },
674
  {
675
- "entropy": 0.9939206629991532,
676
  "epoch": 6.717310087173101,
677
- "grad_norm": 0.7527234554290771,
678
- "learning_rate": 8.90557092189276e-05,
679
- "loss": 0.9766032409667968,
680
- "mean_token_accuracy": 0.7435237610340119,
681
  "num_tokens": 6716885.0,
682
  "step": 2700
683
  },
684
  {
685
- "entropy": 0.985169340968132,
686
  "epoch": 6.841843088418431,
687
- "grad_norm": 0.9773848056793213,
688
- "learning_rate": 8.313056014396262e-05,
689
- "loss": 0.9680111694335938,
690
- "mean_token_accuracy": 0.7455707538127899,
691
  "num_tokens": 6836942.0,
692
  "step": 2750
693
  },
694
  {
695
- "entropy": 0.9864326739311218,
696
  "epoch": 6.966376089663761,
697
- "grad_norm": 0.7536692023277283,
698
- "learning_rate": 7.733377996266039e-05,
699
- "loss": 0.9636287689208984,
700
- "mean_token_accuracy": 0.7498565518856048,
701
  "num_tokens": 6963394.0,
702
  "step": 2800
703
  },
704
  {
705
  "epoch": 7.0,
706
- "eval_entropy": 1.0250733893278032,
707
- "eval_mean_token_accuracy": 0.7460000244683997,
708
- "eval_not_syn_loss": 1.0630079507827759,
709
- "eval_not_syn_runtime": 96.0062,
710
- "eval_not_syn_samples_per_second": 14.322,
711
- "eval_not_syn_steps_per_second": 1.792,
712
  "eval_num_tokens": 6995730.0,
713
  "step": 2814
714
  },
715
  {
716
  "epoch": 7.0,
717
- "eval_entropy": 0.9910203045190766,
718
- "eval_mean_token_accuracy": 0.7360078449859175,
719
  "eval_num_tokens": 6995730.0,
720
- "eval_syn_loss": 1.0268242359161377,
721
- "eval_syn_runtime": 99.1365,
722
- "eval_syn_samples_per_second": 13.87,
723
- "eval_syn_steps_per_second": 1.735,
724
  "step": 2814
725
  },
726
  {
727
- "entropy": 0.8933790849916863,
728
  "epoch": 7.089663760896638,
729
- "grad_norm": 0.9901642799377441,
730
- "learning_rate": 7.16762936698672e-05,
731
- "loss": 0.870389404296875,
732
- "mean_token_accuracy": 0.7670638844220325,
733
  "num_tokens": 7081846.0,
734
  "step": 2850
735
  },
736
  {
737
- "entropy": 0.8327929264307022,
738
  "epoch": 7.214196762141968,
739
- "grad_norm": 0.9863399267196655,
740
- "learning_rate": 6.616876373796547e-05,
741
- "loss": 0.8118631744384766,
742
- "mean_token_accuracy": 0.7749309521913529,
743
  "num_tokens": 7202092.0,
744
  "step": 2900
745
  },
746
  {
747
- "entropy": 0.8421830326318741,
748
  "epoch": 7.338729763387297,
749
- "grad_norm": 0.9192395210266113,
750
- "learning_rate": 6.082157002167449e-05,
751
- "loss": 0.8261887359619141,
752
- "mean_token_accuracy": 0.7743502056598663,
753
  "num_tokens": 7328180.0,
754
  "step": 2950
755
  },
756
  {
757
- "entropy": 0.8124729514122009,
758
  "epoch": 7.463262764632628,
759
- "grad_norm": 0.807341456413269,
760
- "learning_rate": 5.564479019549013e-05,
761
- "loss": 0.79787353515625,
762
- "mean_token_accuracy": 0.7786311388015748,
763
  "num_tokens": 7449031.0,
764
  "step": 3000
765
  },
766
  {
767
- "entropy": 0.8328139960765839,
768
  "epoch": 7.587795765877957,
769
- "grad_norm": 0.8058074116706848,
770
- "learning_rate": 5.064818076063412e-05,
771
- "loss": 0.8112649536132812,
772
- "mean_token_accuracy": 0.7778408378362656,
773
  "num_tokens": 7574561.0,
774
  "step": 3050
775
  },
776
  {
777
- "entropy": 0.8063498467206955,
778
  "epoch": 7.712328767123288,
779
- "grad_norm": 1.1108580827713013,
780
- "learning_rate": 4.584115865730714e-05,
781
- "loss": 0.7904315185546875,
782
- "mean_token_accuracy": 0.7799610030651093,
783
  "num_tokens": 7700071.0,
784
  "step": 3100
785
  },
786
  {
787
- "entropy": 0.8056518918275833,
788
  "epoch": 7.8368617683686175,
789
- "grad_norm": 1.0046204328536987,
790
- "learning_rate": 4.123278351690132e-05,
791
- "loss": 0.7889098358154297,
792
- "mean_token_accuracy": 0.781885308623314,
793
  "num_tokens": 7831577.0,
794
  "step": 3150
795
  },
796
  {
797
- "entropy": 0.7862150323390961,
798
  "epoch": 7.961394769613948,
799
- "grad_norm": 0.8963404893875122,
800
- "learning_rate": 3.683174058762063e-05,
801
- "loss": 0.7703626251220703,
802
- "mean_token_accuracy": 0.7862878423929215,
803
  "num_tokens": 7954099.0,
804
  "step": 3200
805
  },
806
  {
807
  "epoch": 8.0,
808
- "eval_entropy": 0.8138092035471007,
809
- "eval_mean_token_accuracy": 0.7570307982522387,
810
- "eval_not_syn_loss": 0.9443947076797485,
811
- "eval_not_syn_runtime": 96.0009,
812
- "eval_not_syn_samples_per_second": 14.323,
813
- "eval_not_syn_steps_per_second": 1.792,
814
  "eval_num_tokens": 7995120.0,
815
  "step": 3216
816
  },
817
  {
818
  "epoch": 8.0,
819
- "eval_entropy": 0.7856217716322389,
820
- "eval_mean_token_accuracy": 0.7748590402824934,
821
  "eval_num_tokens": 7995120.0,
822
- "eval_syn_loss": 0.9141804575920105,
823
- "eval_syn_runtime": 99.0213,
824
- "eval_syn_samples_per_second": 13.886,
825
- "eval_syn_steps_per_second": 1.737,
826
  "step": 3216
827
  },
828
  {
829
- "entropy": 0.717324167188972,
830
  "epoch": 8.084682440846825,
831
- "grad_norm": 0.7706292271614075,
832
- "learning_rate": 3.26463243656881e-05,
833
- "loss": 0.6817562103271484,
834
- "mean_token_accuracy": 0.8034305211269495,
835
  "num_tokens": 8081323.0,
836
  "step": 3250
837
  },
838
  {
839
- "entropy": 0.7076752084493637,
840
  "epoch": 8.209215442092155,
841
- "grad_norm": 0.7392331957817078,
842
- "learning_rate": 2.8684422962990643e-05,
843
- "loss": 0.6874848175048828,
844
- "mean_token_accuracy": 0.8040069764852524,
845
  "num_tokens": 8208468.0,
846
  "step": 3300
847
  },
848
  {
849
- "entropy": 0.7056704825162887,
850
  "epoch": 8.333748443337484,
851
- "grad_norm": 0.9019565582275391,
852
- "learning_rate": 2.4953503240622073e-05,
853
- "loss": 0.6827576446533203,
854
- "mean_token_accuracy": 0.8037591743469238,
855
  "num_tokens": 8329436.0,
856
  "step": 3350
857
  },
858
  {
859
- "entropy": 0.6830038994550705,
860
  "epoch": 8.458281444582815,
861
- "grad_norm": 0.9338183999061584,
862
- "learning_rate": 2.146059673634357e-05,
863
- "loss": 0.6555431365966797,
864
- "mean_token_accuracy": 0.80817536175251,
865
  "num_tokens": 8449737.0,
866
  "step": 3400
867
  },
868
  {
869
- "entropy": 0.6841743963956833,
870
  "epoch": 8.582814445828145,
871
- "grad_norm": 0.8519166707992554,
872
- "learning_rate": 1.8212286412483148e-05,
873
- "loss": 0.6660543823242188,
874
- "mean_token_accuracy": 0.8084699755907059,
875
  "num_tokens": 8572556.0,
876
  "step": 3450
877
  },
878
  {
879
- "entropy": 0.6859753090143204,
880
  "epoch": 8.707347447073474,
881
- "grad_norm": 0.8856426477432251,
882
- "learning_rate": 1.5214694249249967e-05,
883
- "loss": 0.6608637237548828,
884
- "mean_token_accuracy": 0.8085391563177109,
885
  "num_tokens": 8703112.0,
886
  "step": 3500
887
  },
888
  {
889
- "entropy": 0.6650526940822601,
890
  "epoch": 8.831880448318804,
891
- "grad_norm": 0.8123352527618408,
892
- "learning_rate": 1.2473469706846106e-05,
893
- "loss": 0.636329345703125,
894
- "mean_token_accuracy": 0.8127367502450943,
895
  "num_tokens": 8829124.0,
896
  "step": 3550
897
  },
898
  {
899
- "entropy": 0.674993228316307,
900
  "epoch": 8.956413449564135,
901
- "grad_norm": 0.8388631343841553,
902
- "learning_rate": 9.993779078120502e-06,
903
- "loss": 0.6549046325683594,
904
- "mean_token_accuracy": 0.8100328993797302,
905
  "num_tokens": 8951338.0,
906
  "step": 3600
907
  },
908
  {
909
  "epoch": 9.0,
910
- "eval_entropy": 0.7291332736264827,
911
- "eval_mean_token_accuracy": 0.7639909365842509,
912
- "eval_not_syn_loss": 0.9260964393615723,
913
- "eval_not_syn_runtime": 96.0349,
914
- "eval_not_syn_samples_per_second": 14.318,
915
- "eval_not_syn_steps_per_second": 1.791,
916
  "eval_num_tokens": 8994510.0,
917
  "step": 3618
918
  },
919
  {
920
  "epoch": 9.0,
921
- "eval_entropy": 0.7045032125572825,
922
- "eval_mean_token_accuracy": 0.7828773402197416,
923
  "eval_num_tokens": 8994510.0,
924
- "eval_syn_loss": 0.891645610332489,
925
- "eval_syn_runtime": 99.041,
926
- "eval_syn_samples_per_second": 13.883,
927
- "eval_syn_steps_per_second": 1.737,
928
  "step": 3618
929
  }
930
  ],
@@ -945,8 +945,8 @@
945
  "attributes": {}
946
  }
947
  },
948
- "total_flos": 1.8473447323008922e+18,
949
- "train_batch_size": 8,
950
  "trial_name": null,
951
  "trial_params": null
952
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  },
114
  {
115
+ "entropy": 0.5251132612577593,
116
  "epoch": 1.1195516811955168,
117
+ "grad_norm": 0.409588098526001,
118
+ "learning_rate": 0.00019826081236739378,
119
+ "loss": 0.48452903747558596,
120
+ "mean_token_accuracy": 0.8485486621808525,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
+ "entropy": 0.5255133463442325,
126
  "epoch": 1.244084682440847,
127
+ "grad_norm": 0.4035375416278839,
128
+ "learning_rate": 0.00019799182258138876,
129
+ "loss": 0.48237808227539064,
130
+ "mean_token_accuracy": 0.8493754121661187,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
+ "entropy": 0.5253566785156727,
136
  "epoch": 1.3686176836861768,
137
+ "grad_norm": 0.42013707756996155,
138
+ "learning_rate": 0.000197536589854019,
139
+ "loss": 0.4848367691040039,
140
+ "mean_token_accuracy": 0.8477037993073463,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
+ "entropy": 0.520346013456583,
146
  "epoch": 1.4931506849315068,
147
+ "grad_norm": 0.3776898980140686,
148
+ "learning_rate": 0.00019689597214695372,
149
+ "loss": 0.483323860168457,
150
+ "mean_token_accuracy": 0.8489549401402473,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
+ "entropy": 0.5263906873762607,
156
  "epoch": 1.6176836861768369,
157
+ "grad_norm": 0.7503569722175598,
158
+ "learning_rate": 0.00019607117681064075,
159
+ "loss": 0.4831574630737305,
160
+ "mean_token_accuracy": 0.8504172155261039,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
+ "entropy": 0.5092925234138965,
166
  "epoch": 1.7422166874221667,
167
+ "grad_norm": 0.3184664249420166,
168
+ "learning_rate": 0.00019506375830885428,
169
+ "loss": 0.4734436798095703,
170
+ "mean_token_accuracy": 0.8517335096001625,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
+ "entropy": 0.504551088064909,
176
  "epoch": 1.8667496886674968,
177
+ "grad_norm": 0.381900429725647,
178
+ "learning_rate": 0.00019387561528904946,
179
+ "loss": 0.4666786193847656,
180
+ "mean_token_accuracy": 0.8514578703045845,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
+ "entropy": 0.5108272171020508,
186
  "epoch": 1.9912826899128269,
187
+ "grad_norm": 0.3435378670692444,
188
+ "learning_rate": 0.00019250898700404634,
189
+ "loss": 0.4672324752807617,
190
+ "mean_token_accuracy": 0.8505285969376564,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
+ "eval_entropy": 0.515417086177094,
197
+ "eval_mean_token_accuracy": 0.8423887543206992,
198
+ "eval_not_syn_loss": 0.49809959530830383,
199
+ "eval_not_syn_runtime": 95.8874,
200
+ "eval_not_syn_samples_per_second": 14.34,
201
+ "eval_not_syn_steps_per_second": 1.794,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_entropy": 0.4982072594900464,
208
+ "eval_mean_token_accuracy": 0.8627851702446161,
209
  "eval_num_tokens": 1998780.0,
210
+ "eval_syn_loss": 0.47842374444007874,
211
+ "eval_syn_runtime": 98.9429,
212
+ "eval_syn_samples_per_second": 13.897,
213
+ "eval_syn_steps_per_second": 1.738,
214
  "step": 804
215
  },
216
  {
217
+ "entropy": 0.4303537303149098,
218
  "epoch": 2.1145703611457036,
219
+ "grad_norm": 0.35502323508262634,
220
+ "learning_rate": 0.00019096644909178583,
221
+ "loss": 0.38930774688720704,
222
+ "mean_token_accuracy": 0.8710548519486129,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
+ "entropy": 0.431627052128315,
228
  "epoch": 2.2391033623910337,
229
+ "grad_norm": 0.4276222288608551,
230
+ "learning_rate": 0.00018925090872111246,
231
+ "loss": 0.38792034149169924,
232
+ "mean_token_accuracy": 0.8717547968029976,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
+ "entropy": 0.43496647477149963,
238
  "epoch": 2.3636363636363638,
239
+ "grad_norm": 0.3478545546531677,
240
+ "learning_rate": 0.00018736559911273178,
241
+ "loss": 0.3930927276611328,
242
+ "mean_token_accuracy": 0.8688642394542694,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
+ "entropy": 0.4360816968977451,
248
  "epoch": 2.488169364881694,
249
+ "grad_norm": 0.4233720898628235,
250
+ "learning_rate": 0.00018531407344566915,
251
+ "loss": 0.39427772521972654,
252
+ "mean_token_accuracy": 0.867885695695877,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
+ "entropy": 0.43084816545248034,
258
  "epoch": 2.6127023661270234,
259
+ "grad_norm": 0.3549717664718628,
260
+ "learning_rate": 0.0001831001981607139,
261
+ "loss": 0.3911524200439453,
262
+ "mean_token_accuracy": 0.8704073330760003,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
+ "entropy": 0.42635570630431174,
268
  "epoch": 2.7372353673723535,
269
+ "grad_norm": 0.3825649917125702,
270
+ "learning_rate": 0.00018072814567346936,
271
+ "loss": 0.39040073394775393,
272
+ "mean_token_accuracy": 0.8706874567270279,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
+ "entropy": 0.4457248793542385,
278
  "epoch": 2.8617683686176836,
279
+ "grad_norm": 0.393543541431427,
280
+ "learning_rate": 0.00017820238651074317,
281
+ "loss": 0.40400577545166017,
282
+ "mean_token_accuracy": 0.8659286299347877,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
+ "entropy": 0.44322004675865173,
288
  "epoch": 2.9863013698630136,
289
+ "grad_norm": 0.32433032989501953,
290
+ "learning_rate": 0.0001755276808850968,
291
+ "loss": 0.3956157684326172,
292
+ "mean_token_accuracy": 0.8679120713472366,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
+ "eval_entropy": 0.4656750720947288,
299
+ "eval_mean_token_accuracy": 0.8436482773963795,
300
+ "eval_not_syn_loss": 0.502048671245575,
301
+ "eval_not_syn_runtime": 96.1696,
302
+ "eval_not_syn_samples_per_second": 14.298,
303
+ "eval_not_syn_steps_per_second": 1.789,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
+ "eval_entropy": 0.4488667759091355,
310
+ "eval_mean_token_accuracy": 0.86770307186038,
311
  "eval_num_tokens": 2998170.0,
312
+ "eval_syn_loss": 0.47810930013656616,
313
+ "eval_syn_runtime": 98.894,
314
+ "eval_syn_samples_per_second": 13.904,
315
+ "eval_syn_steps_per_second": 1.739,
316
  "step": 1206
317
  },
318
  {
319
+ "entropy": 0.34708237489967636,
320
  "epoch": 3.1095890410958904,
321
+ "grad_norm": 0.36334800720214844,
322
+ "learning_rate": 0.00017270906972343466,
323
+ "loss": 0.306053466796875,
324
+ "mean_token_accuracy": 0.8936621320970131,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
+ "entropy": 0.3444848913699389,
330
  "epoch": 3.2341220423412205,
331
+ "grad_norm": 0.3762398660182953,
332
+ "learning_rate": 0.00016975186516654035,
333
+ "loss": 0.30163915634155275,
334
+ "mean_token_accuracy": 0.8941574484109879,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
+ "entropy": 0.3394099518656731,
340
  "epoch": 3.3586550435865505,
341
+ "grad_norm": 0.4982982873916626,
342
+ "learning_rate": 0.00016666164055746508,
343
+ "loss": 0.2992570495605469,
344
+ "mean_token_accuracy": 0.8951553416252136,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
+ "entropy": 0.3437681415677071,
350
  "epoch": 3.4831880448318806,
351
+ "grad_norm": 0.48411592841148376,
352
+ "learning_rate": 0.00016344421993763733,
353
+ "loss": 0.30575496673583985,
354
+ "mean_token_accuracy": 0.8930543160438538,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
+ "entropy": 0.34468906089663504,
360
  "epoch": 3.6077210460772102,
361
+ "grad_norm": 0.4013198912143707,
362
+ "learning_rate": 0.00016010566707048957,
363
+ "loss": 0.3041123008728027,
364
+ "mean_token_accuracy": 0.8934586471319199,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
+ "entropy": 0.3555850328505039,
370
  "epoch": 3.7322540473225407,
371
+ "grad_norm": 0.3677787184715271,
372
+ "learning_rate": 0.00015665227401328915,
373
+ "loss": 0.31248834609985354,
374
+ "mean_token_accuracy": 0.8907824546098709,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
+ "entropy": 0.3529116767644882,
380
  "epoch": 3.8567870485678704,
381
+ "grad_norm": 0.3733390271663666,
382
+ "learning_rate": 0.00015309054925871163,
383
+ "loss": 0.3145001220703125,
384
+ "mean_token_accuracy": 0.8914457809925079,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
+ "entropy": 0.3519477976113558,
390
  "epoch": 3.9813200498132004,
391
+ "grad_norm": 0.4491842985153198,
392
+ "learning_rate": 0.0001494272054685054,
393
+ "loss": 0.31178840637207034,
394
+ "mean_token_accuracy": 0.8901231214404106,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
+ "eval_entropy": 0.4111844826229783,
401
+ "eval_mean_token_accuracy": 0.8401426447685375,
402
+ "eval_not_syn_loss": 0.5313173532485962,
403
+ "eval_not_syn_runtime": 95.84,
404
+ "eval_not_syn_samples_per_second": 14.347,
405
+ "eval_not_syn_steps_per_second": 1.795,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
+ "eval_entropy": 0.39694498886549195,
412
+ "eval_mean_token_accuracy": 0.8641166465226994,
413
  "eval_num_tokens": 3997560.0,
414
+ "eval_syn_loss": 0.5080230832099915,
415
+ "eval_syn_runtime": 98.944,
416
+ "eval_syn_samples_per_second": 13.897,
417
+ "eval_syn_steps_per_second": 1.738,
418
  "step": 1608
419
  },
420
  {
421
+ "entropy": 0.26439598014559407,
422
  "epoch": 4.104607721046078,
423
+ "grad_norm": 0.4967460632324219,
424
+ "learning_rate": 0.0001456691468223661,
425
+ "loss": 0.22732419967651368,
426
+ "mean_token_accuracy": 0.9187829334928532,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
+ "entropy": 0.2497277507185936,
432
  "epoch": 4.229140722291407,
433
+ "grad_norm": 0.3764539659023285,
434
+ "learning_rate": 0.00014182345600586332,
435
+ "loss": 0.21029539108276368,
436
+ "mean_token_accuracy": 0.9221936762332916,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
+ "entropy": 0.2486180279403925,
442
  "epoch": 4.353673723536737,
443
+ "grad_norm": 0.37183400988578796,
444
+ "learning_rate": 0.0001378973808619437,
445
+ "loss": 0.21009698867797852,
446
+ "mean_token_accuracy": 0.9225871834158897,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
+ "entropy": 0.26017799742519854,
452
  "epoch": 4.478206724782067,
453
+ "grad_norm": 0.43216991424560547,
454
+ "learning_rate": 0.00013389832073116724,
455
+ "loss": 0.2177184295654297,
456
+ "mean_token_accuracy": 0.9205843013525009,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
+ "entropy": 0.26708075165748596,
462
  "epoch": 4.602739726027397,
463
+ "grad_norm": 0.37948140501976013,
464
+ "learning_rate": 0.00012983381250642132,
465
+ "loss": 0.22203298568725585,
466
+ "mean_token_accuracy": 0.9184661367535591,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
+ "entropy": 0.25188380032777785,
472
  "epoch": 4.7272727272727275,
473
+ "grad_norm": 0.4874045252799988,
474
+ "learning_rate": 0.00012571151642839446,
475
+ "loss": 0.21441835403442383,
476
+ "mean_token_accuracy": 0.9194883331656456,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
+ "entropy": 0.2515877375751734,
482
  "epoch": 4.851805728518057,
483
+ "grad_norm": 0.43732911348342896,
484
+ "learning_rate": 0.00012153920164858083,
485
+ "loss": 0.21374931335449218,
486
+ "mean_token_accuracy": 0.921038504242897,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
+ "entropy": 0.254078243970871,
492
  "epoch": 4.976338729763388,
493
+ "grad_norm": 0.41199925541877747,
494
+ "learning_rate": 0.00011732473158702397,
495
+ "loss": 0.21298355102539063,
496
+ "mean_token_accuracy": 0.9217021322250366,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
+ "eval_entropy": 0.31937412129238596,
503
+ "eval_mean_token_accuracy": 0.8401600659586662,
504
+ "eval_not_syn_loss": 0.6070574522018433,
505
+ "eval_not_syn_runtime": 95.8354,
506
+ "eval_not_syn_samples_per_second": 14.348,
507
+ "eval_not_syn_steps_per_second": 1.795,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
+ "eval_entropy": 0.30796578786400863,
514
+ "eval_mean_token_accuracy": 0.8628802147022513,
515
  "eval_num_tokens": 4996950.0,
516
+ "eval_syn_loss": 0.5800920128822327,
517
+ "eval_syn_runtime": 98.9566,
518
+ "eval_syn_samples_per_second": 13.895,
519
+ "eval_syn_steps_per_second": 1.738,
520
  "step": 2010
521
  },
522
  {
523
+ "entropy": 0.19132825570425602,
524
  "epoch": 5.099626400996264,
525
+ "grad_norm": 0.45538511872291565,
526
+ "learning_rate": 0.0001130760491123961,
527
+ "loss": 0.153853759765625,
528
+ "mean_token_accuracy": 0.9434747304579224,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
+ "entropy": 0.17422323439270257,
534
  "epoch": 5.224159402241594,
535
+ "grad_norm": 0.4580552279949188,
536
+ "learning_rate": 0.00010880116157234302,
537
+ "loss": 0.13612208366394044,
538
+ "mean_token_accuracy": 0.9487342464923859,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
+ "entropy": 0.1799074411019683,
544
  "epoch": 5.348692403486924,
545
+ "grad_norm": 0.39040836691856384,
546
+ "learning_rate": 0.00010450812570230789,
547
+ "loss": 0.14250579833984375,
548
+ "mean_token_accuracy": 0.9467630323767662,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
+ "entropy": 0.18116022080183028,
554
  "epoch": 5.473225404732254,
555
+ "grad_norm": 0.3805501163005829,
556
+ "learning_rate": 0.00010020503244127544,
557
+ "loss": 0.14014955520629882,
558
+ "mean_token_accuracy": 0.946874064207077,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
+ "entropy": 0.1810251370444894,
564
  "epoch": 5.597758405977584,
565
+ "grad_norm": 0.36377736926078796,
566
+ "learning_rate": 9.589999168305372e-05,
567
+ "loss": 0.14044331550598144,
568
+ "mean_token_accuracy": 0.9467302888631821,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
+ "entropy": 0.17202148117125035,
574
  "epoch": 5.722291407222914,
575
+ "grad_norm": 0.38708847761154175,
576
+ "learning_rate": 9.16011169918326e-05,
577
+ "loss": 0.13840054512023925,
578
+ "mean_token_accuracy": 0.9480020496249199,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
+ "entropy": 0.1769936926662922,
584
  "epoch": 5.846824408468244,
585
+ "grad_norm": 0.4674988389015198,
586
+ "learning_rate": 8.73165103108249e-05,
587
+ "loss": 0.1404121208190918,
588
+ "mean_token_accuracy": 0.9461787036061287,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
+ "entropy": 0.17118842527270317,
594
  "epoch": 5.971357409713574,
595
+ "grad_norm": 0.4454115629196167,
596
+ "learning_rate": 8.305424669280888e-05,
597
+ "loss": 0.1366124439239502,
598
+ "mean_token_accuracy": 0.9484315186738967,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
+ "eval_entropy": 0.2494078171114589,
605
+ "eval_mean_token_accuracy": 0.851767166062843,
606
+ "eval_not_syn_loss": 0.7222095131874084,
607
+ "eval_not_syn_runtime": 95.8715,
608
+ "eval_not_syn_samples_per_second": 14.342,
609
+ "eval_not_syn_steps_per_second": 1.794,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
+ "eval_entropy": 0.239824180079754,
616
+ "eval_mean_token_accuracy": 0.8484749911829482,
617
  "eval_num_tokens": 5996340.0,
618
+ "eval_syn_loss": 0.7036991119384766,
619
+ "eval_syn_runtime": 98.923,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
623
  },
624
  {
625
+ "entropy": 0.14340858902744572,
626
  "epoch": 6.094645080946451,
627
+ "grad_norm": 0.270256370306015,
628
+ "learning_rate": 7.88223590813502e-05,
629
+ "loss": 0.10398475646972656,
630
+ "mean_token_accuracy": 0.9604840600731397,
631
  "num_tokens": 6091666.0,
632
  "step": 2450
633
  },
634
  {
635
+ "entropy": 0.13108037687838078,
636
  "epoch": 6.219178082191781,
637
+ "grad_norm": 0.2870016396045685,
638
+ "learning_rate": 7.462882317138628e-05,
639
+ "loss": 0.09566926002502442,
640
+ "mean_token_accuracy": 0.9636739170551301,
641
  "num_tokens": 6214889.0,
642
  "step": 2500
643
  },
644
  {
645
+ "entropy": 0.1305130884796381,
646
  "epoch": 6.34371108343711,
647
+ "grad_norm": 0.34119686484336853,
648
+ "learning_rate": 7.048154237770433e-05,
649
+ "loss": 0.0943364143371582,
650
+ "mean_token_accuracy": 0.9643464788794518,
651
  "num_tokens": 6337795.0,
652
  "step": 2550
653
  },
654
  {
655
+ "entropy": 0.12516659261658789,
656
  "epoch": 6.468244084682441,
657
+ "grad_norm": 0.2851012051105499,
658
+ "learning_rate": 6.638833293964401e-05,
659
+ "loss": 0.09339779853820801,
660
+ "mean_token_accuracy": 0.9654107868671418,
661
  "num_tokens": 6466252.0,
662
  "step": 2600
663
  },
664
  {
665
+ "entropy": 0.12894487291574477,
666
  "epoch": 6.592777085927771,
667
+ "grad_norm": 0.22259071469306946,
668
+ "learning_rate": 6.235690919009636e-05,
669
+ "loss": 0.09672751426696777,
670
+ "mean_token_accuracy": 0.9641611188650131,
671
  "num_tokens": 6592128.0,
672
  "step": 2650
673
  },
674
  {
675
+ "entropy": 0.13079525250941515,
676
  "epoch": 6.717310087173101,
677
+ "grad_norm": 0.2873247265815735,
678
+ "learning_rate": 5.839486901656255e-05,
679
+ "loss": 0.09482893943786622,
680
+ "mean_token_accuracy": 0.963907478749752,
681
  "num_tokens": 6716885.0,
682
  "step": 2700
683
  },
684
  {
685
+ "entropy": 0.13382539574056865,
686
  "epoch": 6.841843088418431,
687
+ "grad_norm": 0.26538875699043274,
688
+ "learning_rate": 5.450967954167317e-05,
689
+ "loss": 0.09817559242248536,
690
+ "mean_token_accuracy": 0.9620411720871925,
691
  "num_tokens": 6836942.0,
692
  "step": 2750
693
  },
694
  {
695
+ "entropy": 0.12861237980425358,
696
  "epoch": 6.966376089663761,
697
+ "grad_norm": 0.3292505443096161,
698
+ "learning_rate": 5.070866305015545e-05,
699
+ "loss": 0.09485826492309571,
700
+ "mean_token_accuracy": 0.9636287876963615,
701
  "num_tokens": 6963394.0,
702
  "step": 2800
703
  },
704
  {
705
  "epoch": 7.0,
706
+ "eval_entropy": 0.2065339538073817,
707
+ "eval_mean_token_accuracy": 0.8418768654729045,
708
+ "eval_not_syn_loss": 0.8473101854324341,
709
+ "eval_not_syn_runtime": 95.8493,
710
+ "eval_not_syn_samples_per_second": 14.345,
711
+ "eval_not_syn_steps_per_second": 1.794,
712
  "eval_num_tokens": 6995730.0,
713
  "step": 2814
714
  },
715
  {
716
  "epoch": 7.0,
717
+ "eval_entropy": 0.19768535250494645,
718
+ "eval_mean_token_accuracy": 0.8554798219093057,
719
  "eval_num_tokens": 6995730.0,
720
+ "eval_syn_loss": 0.8169878125190735,
721
+ "eval_syn_runtime": 98.9508,
722
+ "eval_syn_samples_per_second": 13.896,
723
+ "eval_syn_steps_per_second": 1.738,
724
  "step": 2814
725
  },
726
  {
727
+ "entropy": 0.12029899715097865,
728
  "epoch": 7.089663760896638,
729
+ "grad_norm": 0.18435120582580566,
730
+ "learning_rate": 4.699898318877223e-05,
731
+ "loss": 0.08083279609680176,
732
+ "mean_token_accuracy": 0.9689394840688417,
733
  "num_tokens": 7081846.0,
734
  "step": 2850
735
  },
736
  {
737
+ "entropy": 0.10581521928310395,
738
  "epoch": 7.214196762141968,
739
+ "grad_norm": 0.2981043756008148,
740
+ "learning_rate": 4.338763146523955e-05,
741
+ "loss": 0.07383342266082764,
742
+ "mean_token_accuracy": 0.9726339945197106,
743
  "num_tokens": 7202092.0,
744
  "step": 2900
745
  },
746
  {
747
+ "entropy": 0.10466793559491634,
748
  "epoch": 7.338729763387297,
749
+ "grad_norm": 0.24411630630493164,
750
+ "learning_rate": 3.988141407156982e-05,
751
+ "loss": 0.07236807346343994,
752
+ "mean_token_accuracy": 0.9726200112700463,
753
  "num_tokens": 7328180.0,
754
  "step": 2950
755
  },
756
  {
757
+ "entropy": 0.10792888538911939,
758
  "epoch": 7.463262764632628,
759
+ "grad_norm": 0.15558083355426788,
760
+ "learning_rate": 3.6486939056672404e-05,
761
+ "loss": 0.0738653564453125,
762
+ "mean_token_accuracy": 0.9714727732539177,
763
  "num_tokens": 7449031.0,
764
  "step": 3000
765
  },
766
  {
767
+ "entropy": 0.10735130561515689,
768
  "epoch": 7.587795765877957,
769
+ "grad_norm": 0.1883469820022583,
770
+ "learning_rate": 3.321060387238841e-05,
771
+ "loss": 0.07414769649505615,
772
+ "mean_token_accuracy": 0.9723327186703682,
773
  "num_tokens": 7574561.0,
774
  "step": 3050
775
  },
776
  {
777
+ "entropy": 0.10272138025611639,
778
  "epoch": 7.712328767123288,
779
+ "grad_norm": 0.15312588214874268,
780
+ "learning_rate": 3.0058583316430158e-05,
781
+ "loss": 0.07302286624908447,
782
+ "mean_token_accuracy": 0.9726834264397621,
783
  "num_tokens": 7700071.0,
784
  "step": 3100
785
  },
786
  {
787
+ "entropy": 0.10020780436694622,
788
  "epoch": 7.8368617683686175,
789
+ "grad_norm": 0.15675754845142365,
790
+ "learning_rate": 2.7036817894949623e-05,
791
+ "loss": 0.06891141891479492,
792
+ "mean_token_accuracy": 0.9731867194175721,
793
  "num_tokens": 7831577.0,
794
  "step": 3150
795
  },
796
  {
797
+ "entropy": 0.10623522130772471,
798
  "epoch": 7.961394769613948,
799
+ "grad_norm": 0.28400713205337524,
800
+ "learning_rate": 2.415100262666817e-05,
801
+ "loss": 0.07267386436462403,
802
+ "mean_token_accuracy": 0.9713605433702469,
803
  "num_tokens": 7954099.0,
804
  "step": 3200
805
  },
806
  {
807
  "epoch": 8.0,
808
+ "eval_entropy": 0.18371209263974844,
809
+ "eval_mean_token_accuracy": 0.8438690238913824,
810
+ "eval_not_syn_loss": 0.9368809461593628,
811
+ "eval_not_syn_runtime": 95.8165,
812
+ "eval_not_syn_samples_per_second": 14.35,
813
+ "eval_not_syn_steps_per_second": 1.795,
814
  "eval_num_tokens": 7995120.0,
815
  "step": 3216
816
  },
817
  {
818
  "epoch": 8.0,
819
+ "eval_entropy": 0.17606536935754988,
820
+ "eval_mean_token_accuracy": 0.8541917884072592,
821
  "eval_num_tokens": 7995120.0,
822
+ "eval_syn_loss": 0.905842661857605,
823
+ "eval_syn_runtime": 98.8452,
824
+ "eval_syn_samples_per_second": 13.911,
825
+ "eval_syn_steps_per_second": 1.74,
826
  "step": 3216
827
  },
828
  {
829
+ "entropy": 0.09641267131600115,
830
  "epoch": 8.084682440846825,
831
+ "grad_norm": 0.10478409379720688,
832
+ "learning_rate": 2.1406576309667927e-05,
833
+ "loss": 0.06422062397003174,
834
+ "mean_token_accuracy": 0.974858273761441,
835
  "num_tokens": 8081323.0,
836
  "step": 3250
837
  },
838
  {
839
+ "entropy": 0.09412769414484501,
840
  "epoch": 8.209215442092155,
841
+ "grad_norm": 0.1294923573732376,
842
+ "learning_rate": 1.8808711271073835e-05,
843
+ "loss": 0.06173128604888916,
844
+ "mean_token_accuracy": 0.9750720876455307,
845
  "num_tokens": 8208468.0,
846
  "step": 3300
847
  },
848
  {
849
+ "entropy": 0.09561639416962862,
850
  "epoch": 8.333748443337484,
851
+ "grad_norm": 0.1887606382369995,
852
+ "learning_rate": 1.6362303618944128e-05,
853
+ "loss": 0.06482413768768311,
854
+ "mean_token_accuracy": 0.9742087376117706,
855
  "num_tokens": 8329436.0,
856
  "step": 3350
857
  },
858
  {
859
+ "entropy": 0.09427102731540798,
860
  "epoch": 8.458281444582815,
861
+ "grad_norm": 0.09633524715900421,
862
+ "learning_rate": 1.407196401474173e-05,
863
+ "loss": 0.06472210884094239,
864
+ "mean_token_accuracy": 0.974975728392601,
865
  "num_tokens": 8449737.0,
866
  "step": 3400
867
  },
868
  {
869
+ "entropy": 0.09517974983900786,
870
  "epoch": 8.582814445828145,
871
+ "grad_norm": 0.09141165018081665,
872
+ "learning_rate": 1.194200898377711e-05,
873
+ "loss": 0.06492462158203124,
874
+ "mean_token_accuracy": 0.9747303375601768,
875
  "num_tokens": 8572556.0,
876
  "step": 3450
877
  },
878
  {
879
+ "entropy": 0.09041798285208642,
880
  "epoch": 8.707347447073474,
881
+ "grad_norm": 0.22126418352127075,
882
+ "learning_rate": 9.976452779999523e-06,
883
+ "loss": 0.061339192390441895,
884
+ "mean_token_accuracy": 0.9752290603518486,
885
  "num_tokens": 8703112.0,
886
  "step": 3500
887
  },
888
  {
889
+ "entropy": 0.09546036465093494,
890
  "epoch": 8.831880448318804,
891
+ "grad_norm": 0.15855053067207336,
892
+ "learning_rate": 8.178999820468898e-06,
893
+ "loss": 0.06384926795959472,
894
+ "mean_token_accuracy": 0.9744252774119377,
895
  "num_tokens": 8829124.0,
896
  "step": 3550
897
  },
898
  {
899
+ "entropy": 0.10058791788294912,
900
  "epoch": 8.956413449564135,
901
+ "grad_norm": 0.10365262627601624,
902
+ "learning_rate": 6.553037703766629e-06,
903
+ "loss": 0.06539971351623536,
904
+ "mean_token_accuracy": 0.972833506166935,
905
  "num_tokens": 8951338.0,
906
  "step": 3600
907
  },
908
  {
909
  "epoch": 9.0,
910
+ "eval_entropy": 0.1707501579509225,
911
+ "eval_mean_token_accuracy": 0.8436840848867283,
912
+ "eval_not_syn_loss": 1.025063157081604,
913
+ "eval_not_syn_runtime": 95.8518,
914
+ "eval_not_syn_samples_per_second": 14.345,
915
+ "eval_not_syn_steps_per_second": 1.794,
916
  "eval_num_tokens": 8994510.0,
917
  "step": 3618
918
  },
919
  {
920
  "epoch": 9.0,
921
+ "eval_entropy": 0.163228454977967,
922
+ "eval_mean_token_accuracy": 0.8539366458737573,
923
  "eval_num_tokens": 8994510.0,
924
+ "eval_syn_loss": 0.985819399356842,
925
+ "eval_syn_runtime": 98.9324,
926
+ "eval_syn_samples_per_second": 13.898,
927
+ "eval_syn_steps_per_second": 1.739,
928
  "step": 3618
929
  }
930
  ],
 
945
  "attributes": {}
946
  }
947
  },
948
+ "total_flos": 1.474001737996677e+18,
949
+ "train_batch_size": 4,
950
  "trial_name": null,
951
  "trial_params": null
952
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-3618/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-402/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-402/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82bf059fef568bdb8d1163d8674420e29b607b7570d5ff862eca2952ec8de6ae
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe24ed9b42c27216cd5d12ca0400d02f41cbd9005692cdd75755209030018078
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-402/trainer_state.json CHANGED
@@ -10,105 +10,105 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  }
114
  ],
@@ -129,8 +129,8 @@
129
  "attributes": {}
130
  }
131
  },
132
- "total_flos": 2.0613594113258496e+17,
133
- "train_batch_size": 8,
134
  "trial_name": null,
135
  "trial_params": null
136
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  }
114
  ],
 
129
  "attributes": {}
130
  }
131
  },
132
+ "total_flos": 1.6443486597608448e+17,
133
+ "train_batch_size": 4,
134
  "trial_name": null,
135
  "trial_params": null
136
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-402/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-4020/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-4020/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50e070bee072e5e4cc08e77783f9c22d1eee1a9c05da39322918bd3d1239158c
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e6e0bcbf66c8d5573af14450a8cdf7e8a8e6c9b590dc9dbae935d312b3e32f2
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-4020/trainer_state.json CHANGED
@@ -10,1023 +10,1023 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  },
114
  {
115
- "entropy": 0.5431136073488178,
116
  "epoch": 1.1195516811955168,
117
- "grad_norm": 0.6364207863807678,
118
- "learning_rate": 0.0003023597373031377,
119
- "loss": 0.5128697204589844,
120
- "mean_token_accuracy": 0.8424755226482045,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
- "entropy": 0.5492669984698295,
126
  "epoch": 1.244084682440847,
127
- "grad_norm": 0.7887948155403137,
128
- "learning_rate": 0.00030194951160062724,
129
- "loss": 0.5205921554565429,
130
- "mean_token_accuracy": 0.8417876678705215,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
- "entropy": 0.8709675747156144,
136
  "epoch": 1.3686176836861768,
137
- "grad_norm": 0.734591007232666,
138
- "learning_rate": 0.00030125525414139597,
139
- "loss": 0.9426271057128907,
140
- "mean_token_accuracy": 0.7773911562561989,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
- "entropy": 0.5833489724993706,
146
  "epoch": 1.4931506849315068,
147
- "grad_norm": 1.1496635675430298,
148
- "learning_rate": 0.000300278273368912,
149
- "loss": 0.560246467590332,
150
- "mean_token_accuracy": 0.8333927792310715,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
- "entropy": 3.3383523294329644,
156
  "epoch": 1.6176836861768369,
157
- "grad_norm": 4.88291597366333,
158
- "learning_rate": 0.0002990204105656753,
159
- "loss": 3.895135803222656,
160
- "mean_token_accuracy": 0.4006860972382128,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
- "entropy": 4.406278321743011,
166
  "epoch": 1.7422166874221667,
167
- "grad_norm": 2.488743305206299,
168
- "learning_rate": 0.0002974840363830152,
169
- "loss": 4.424278259277344,
170
- "mean_token_accuracy": 0.2545871638506651,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
- "entropy": 3.0789780139923097,
176
  "epoch": 1.8667496886674968,
177
- "grad_norm": 2.5833027362823486,
178
- "learning_rate": 0.00029567204637320413,
179
- "loss": 3.105696716308594,
180
- "mean_token_accuracy": 0.4513031896948814,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
- "entropy": 2.616127333641052,
186
  "epoch": 1.9912826899128269,
187
- "grad_norm": 1.0389364957809448,
188
- "learning_rate": 0.00029358785553230884,
189
- "loss": 2.6003131103515624,
190
- "mean_token_accuracy": 0.5205484080314636,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
- "eval_entropy": 2.5684494819752004,
197
- "eval_mean_token_accuracy": 0.547617181095966,
198
- "eval_not_syn_loss": 2.458150863647461,
199
- "eval_not_syn_runtime": 95.9896,
200
- "eval_not_syn_samples_per_second": 14.324,
201
- "eval_not_syn_steps_per_second": 1.792,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_entropy": 2.5337963104248047,
208
- "eval_mean_token_accuracy": 0.5359611507765082,
209
  "eval_num_tokens": 1998780.0,
210
- "eval_syn_loss": 2.417705535888672,
211
- "eval_syn_runtime": 99.0288,
212
- "eval_syn_samples_per_second": 13.885,
213
- "eval_syn_steps_per_second": 1.737,
214
  "step": 804
215
  },
216
  {
217
- "entropy": 2.380885266294383,
218
  "epoch": 2.1145703611457036,
219
- "grad_norm": 0.7961218953132629,
220
- "learning_rate": 0.00029123539186406294,
221
- "loss": 2.3589501953125,
222
- "mean_token_accuracy": 0.550012742630159,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
- "entropy": 2.3139565205574035,
228
  "epoch": 2.2391033623910337,
229
- "grad_norm": 0.7128121256828308,
230
- "learning_rate": 0.00028861908897689166,
231
- "loss": 2.2914646911621093,
232
- "mean_token_accuracy": 0.5581977427005768,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
- "entropy": 2.247106301784515,
238
  "epoch": 2.3636363636363638,
239
- "grad_norm": 0.7539874315261841,
240
- "learning_rate": 0.00028574387772804067,
241
- "loss": 2.212442169189453,
242
- "mean_token_accuracy": 0.5661728882789612,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
- "entropy": 2.177862950563431,
248
  "epoch": 2.488169364881694,
249
- "grad_norm": 1.169326663017273,
250
- "learning_rate": 0.00028261517693055664,
251
- "loss": 2.1524928283691405,
252
- "mean_token_accuracy": 0.574029511809349,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
- "entropy": 2.11708885550499,
258
  "epoch": 2.6127023661270234,
259
- "grad_norm": 0.6508749723434448,
260
- "learning_rate": 0.0002792388831406343,
261
- "loss": 2.0897698974609376,
262
- "mean_token_accuracy": 0.5808322209119797,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
- "entropy": 2.0785110569000245,
268
  "epoch": 2.7372353673723535,
269
- "grad_norm": 1.4157167673110962,
270
- "learning_rate": 0.0002756213595445772,
271
- "loss": 2.0574497985839844,
272
- "mean_token_accuracy": 0.5843770909309387,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
- "entropy": 2.091529769897461,
278
  "epoch": 2.8617683686176836,
279
- "grad_norm": 1.6840300559997559,
280
- "learning_rate": 0.00027176942396631626,
281
- "loss": 2.062296142578125,
282
- "mean_token_accuracy": 0.58568293094635,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
- "entropy": 2.0315397584438326,
288
  "epoch": 2.9863013698630136,
289
- "grad_norm": 1.153908133506775,
290
- "learning_rate": 0.0002676903360180885,
291
- "loss": 2.0053924560546874,
292
- "mean_token_accuracy": 0.5878721660375595,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
- "eval_entropy": 2.0182155738043233,
299
- "eval_mean_token_accuracy": 0.5854449913252232,
300
- "eval_not_syn_loss": 2.0105812549591064,
301
- "eval_not_syn_runtime": 95.9278,
302
- "eval_not_syn_samples_per_second": 14.334,
303
- "eval_not_syn_steps_per_second": 1.793,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
- "eval_entropy": 1.9446905400863914,
310
- "eval_mean_token_accuracy": 0.5995188099007274,
311
  "eval_num_tokens": 2998170.0,
312
- "eval_syn_loss": 1.9396028518676758,
313
- "eval_syn_runtime": 98.8737,
314
- "eval_syn_samples_per_second": 13.907,
315
- "eval_syn_steps_per_second": 1.74,
316
  "step": 1206
317
  },
318
  {
319
- "entropy": 1.9578519951213489,
320
  "epoch": 3.1095890410958904,
321
- "grad_norm": 0.6903329491615295,
322
- "learning_rate": 0.00026339178341849265,
323
- "loss": 1.9289002990722657,
324
- "mean_token_accuracy": 0.5986791457792725,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
- "entropy": 1.9225856459140778,
330
  "epoch": 3.2341220423412205,
331
- "grad_norm": 0.9334422945976257,
332
- "learning_rate": 0.00025888186750370616,
333
- "loss": 1.9024064636230469,
334
- "mean_token_accuracy": 0.6029617834091187,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
- "entropy": 1.8814079523086549,
340
  "epoch": 3.3586550435865505,
341
- "grad_norm": 0.779586911201477,
342
- "learning_rate": 0.00025416908795917244,
343
- "loss": 1.8609922790527345,
344
- "mean_token_accuracy": 0.6097110098600388,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
- "entropy": 1.8681990671157838,
350
  "epoch": 3.4831880448318806,
351
- "grad_norm": 0.941205620765686,
352
- "learning_rate": 0.0002492623268005321,
353
- "loss": 1.8448422241210938,
354
- "mean_token_accuracy": 0.6135021150112152,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
- "entropy": 1.8034737646579742,
360
  "epoch": 3.6077210460772102,
361
- "grad_norm": 1.2094346284866333,
362
- "learning_rate": 0.0002441708316339893,
363
- "loss": 1.7913978576660157,
364
- "mean_token_accuracy": 0.622687805891037,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
- "entropy": 1.8016248047351837,
370
  "epoch": 3.7322540473225407,
371
- "grad_norm": 0.7303705215454102,
372
- "learning_rate": 0.00023890419822766254,
373
- "loss": 1.7701612854003905,
374
- "mean_token_accuracy": 0.6246249091625213,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
- "entropy": 1.7500162029266357,
380
  "epoch": 3.8567870485678704,
381
- "grad_norm": 0.954683244228363,
382
- "learning_rate": 0.0002334723524267661,
383
- "loss": 1.7250523376464844,
384
- "mean_token_accuracy": 0.6312138819694519,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
- "entropy": 1.6828746914863586,
390
  "epoch": 3.9813200498132004,
391
- "grad_norm": 0.8929846882820129,
392
- "learning_rate": 0.0002278855314467064,
393
- "loss": 1.6539949035644532,
394
- "mean_token_accuracy": 0.6412730294466019,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
- "eval_entropy": 1.7406537913998892,
401
- "eval_mean_token_accuracy": 0.6226560525422873,
402
- "eval_not_syn_loss": 1.707612156867981,
403
- "eval_not_syn_runtime": 95.956,
404
- "eval_not_syn_samples_per_second": 14.329,
405
- "eval_not_syn_steps_per_second": 1.792,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
- "eval_entropy": 1.6662698221761127,
412
- "eval_mean_token_accuracy": 0.6587355476479198,
413
  "eval_num_tokens": 3997560.0,
414
- "eval_syn_loss": 1.6402223110198975,
415
- "eval_syn_runtime": 99.0503,
416
- "eval_syn_samples_per_second": 13.882,
417
- "eval_syn_steps_per_second": 1.736,
418
  "step": 1608
419
  },
420
  {
421
- "entropy": 1.634241136637601,
422
  "epoch": 4.104607721046078,
423
- "grad_norm": 0.7396109700202942,
424
- "learning_rate": 0.0002221542645793497,
425
- "loss": 1.610531768798828,
426
- "mean_token_accuracy": 0.6454936681371747,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
- "entropy": 1.588758965730667,
432
  "epoch": 4.229140722291407,
433
- "grad_norm": 0.779146134853363,
434
- "learning_rate": 0.00021628935334882304,
435
- "loss": 1.5645944213867187,
436
- "mean_token_accuracy": 0.6525067055225372,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
- "entropy": 1.5439102852344513,
442
  "epoch": 4.353673723536737,
443
- "grad_norm": 0.614967942237854,
444
- "learning_rate": 0.00021030185115424846,
445
- "loss": 1.5180734252929688,
446
- "mean_token_accuracy": 0.6614933741092682,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
- "entropy": 1.5201536059379577,
452
  "epoch": 4.478206724782067,
453
- "grad_norm": 0.6725050210952759,
454
- "learning_rate": 0.00020420304243777673,
455
- "loss": 1.4964521789550782,
456
- "mean_token_accuracy": 0.6630363047122956,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
- "entropy": 1.503999525308609,
462
  "epoch": 4.602739726027397,
463
- "grad_norm": 0.7740678191184998,
464
- "learning_rate": 0.00019800442141718245,
465
- "loss": 1.4821170043945313,
466
- "mean_token_accuracy": 0.6658095389604568,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
- "entropy": 1.4707296586036682,
472
  "epoch": 4.7272727272727275,
473
- "grad_norm": 0.7613642811775208,
474
- "learning_rate": 0.00019171767042310182,
475
- "loss": 1.451023406982422,
476
- "mean_token_accuracy": 0.6706610345840454,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
- "entropy": 1.449094181060791,
482
  "epoch": 4.851805728518057,
483
- "grad_norm": 0.7195038199424744,
484
- "learning_rate": 0.00018535463788174053,
485
- "loss": 1.4288282775878907,
486
- "mean_token_accuracy": 0.6740301263332367,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
- "entropy": 1.4346733844280244,
492
  "epoch": 4.976338729763388,
493
- "grad_norm": 0.7584077715873718,
494
- "learning_rate": 0.00017892731598454726,
495
- "loss": 1.4155799865722656,
496
- "mean_token_accuracy": 0.6788010179996491,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
- "eval_entropy": 1.4948647777701534,
503
- "eval_mean_token_accuracy": 0.661396715876668,
504
- "eval_not_syn_loss": 1.4358112812042236,
505
- "eval_not_syn_runtime": 95.9987,
506
- "eval_not_syn_samples_per_second": 14.323,
507
- "eval_not_syn_steps_per_second": 1.792,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
- "eval_entropy": 1.4439531952835793,
514
- "eval_mean_token_accuracy": 0.6999529783808908,
515
  "eval_num_tokens": 4996950.0,
516
- "eval_syn_loss": 1.3766233921051025,
517
- "eval_syn_runtime": 99.0602,
518
- "eval_syn_samples_per_second": 13.88,
519
- "eval_syn_steps_per_second": 1.736,
520
  "step": 2010
521
  },
522
  {
523
- "entropy": 1.345623204202363,
524
  "epoch": 5.099626400996264,
525
- "grad_norm": 0.8724157214164734,
526
- "learning_rate": 0.00017244781808693755,
527
- "loss": 1.323695831298828,
528
- "mean_token_accuracy": 0.6866910710479274,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
- "entropy": 1.3074172997474671,
534
  "epoch": 5.224159402241594,
535
- "grad_norm": 0.8229172825813293,
536
- "learning_rate": 0.00016592835587866364,
537
- "loss": 1.295655517578125,
538
- "mean_token_accuracy": 0.6931505644321442,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
- "entropy": 1.298924881219864,
544
  "epoch": 5.348692403486924,
545
- "grad_norm": 0.9111100435256958,
546
- "learning_rate": 0.0001593812163688578,
547
- "loss": 1.2771641540527343,
548
- "mean_token_accuracy": 0.6941236406564713,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
- "entropy": 1.3120970189571382,
554
  "epoch": 5.473225404732254,
555
- "grad_norm": 0.7868310213088989,
556
- "learning_rate": 0.000152818738729123,
557
- "loss": 1.2942347717285156,
558
- "mean_token_accuracy": 0.6939822369813919,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
- "entropy": 1.2729843455553054,
564
  "epoch": 5.597758405977584,
565
- "grad_norm": 0.9405992031097412,
566
- "learning_rate": 0.00014625329103831503,
567
- "loss": 1.2503909301757812,
568
- "mean_token_accuracy": 0.7005668586492538,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
- "entropy": 1.2202323937416077,
574
  "epoch": 5.722291407222914,
575
- "grad_norm": 0.7629554271697998,
576
- "learning_rate": 0.00013969724697284394,
577
- "loss": 1.199918212890625,
578
- "mean_token_accuracy": 0.7085251879692077,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
- "entropy": 1.1850077486038209,
584
  "epoch": 5.846824408468244,
585
- "grad_norm": 0.9016424417495728,
586
- "learning_rate": 0.00013316296248642664,
587
- "loss": 1.166585693359375,
588
- "mean_token_accuracy": 0.7123788893222809,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
- "entropy": 1.2040903121232986,
594
  "epoch": 5.971357409713574,
595
- "grad_norm": 0.8533362150192261,
596
- "learning_rate": 0.0001266627525232398,
597
- "loss": 1.1885869598388672,
598
- "mean_token_accuracy": 0.7100468480587006,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
- "eval_entropy": 1.2157157427349756,
605
- "eval_mean_token_accuracy": 0.7150737251653227,
606
- "eval_not_syn_loss": 1.2506903409957886,
607
- "eval_not_syn_runtime": 95.9618,
608
- "eval_not_syn_samples_per_second": 14.329,
609
- "eval_not_syn_steps_per_second": 1.792,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
- "eval_entropy": 1.178627569661584,
616
- "eval_mean_token_accuracy": 0.7016614221556242,
617
  "eval_num_tokens": 5996340.0,
618
- "eval_syn_loss": 1.2114850282669067,
619
- "eval_syn_runtime": 98.9234,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
623
  },
624
  {
625
- "entropy": 1.0982752972178988,
626
  "epoch": 6.094645080946451,
627
- "grad_norm": 1.295694351196289,
628
- "learning_rate": 0.00012020886780836267,
629
- "loss": 1.0774417877197267,
630
- "mean_token_accuracy": 0.7259544272615452,
631
  "num_tokens": 6091666.0,
632
  "step": 2450
633
  },
634
  {
635
- "entropy": 1.0651295524835587,
636
  "epoch": 6.219178082191781,
637
- "grad_norm": 0.9243665337562561,
638
- "learning_rate": 0.0001138134717592517,
639
- "loss": 1.0521366119384765,
640
- "mean_token_accuracy": 0.7293049448728561,
641
  "num_tokens": 6214889.0,
642
  "step": 2500
643
  },
644
  {
645
- "entropy": 1.0358434236049652,
646
  "epoch": 6.34371108343711,
647
- "grad_norm": 0.9286350011825562,
648
- "learning_rate": 0.00010748861756175999,
649
- "loss": 1.018977813720703,
650
- "mean_token_accuracy": 0.7370841908454895,
651
  "num_tokens": 6337795.0,
652
  "step": 2550
653
  },
654
  {
655
- "entropy": 1.0522522723674774,
656
  "epoch": 6.468244084682441,
657
- "grad_norm": 0.9158058762550354,
658
- "learning_rate": 0.00010124622545390751,
659
- "loss": 1.0335795593261718,
660
- "mean_token_accuracy": 0.7348936641216278,
661
  "num_tokens": 6466252.0,
662
  "step": 2600
663
  },
664
  {
665
- "entropy": 1.0202285438776015,
666
  "epoch": 6.592777085927771,
667
- "grad_norm": 0.8085289597511292,
668
- "learning_rate": 9.509806026021276e-05,
669
- "loss": 1.0074135589599609,
670
- "mean_token_accuracy": 0.7410361844301224,
671
  "num_tokens": 6592128.0,
672
  "step": 2650
673
  },
674
  {
675
- "entropy": 0.9939206629991532,
676
  "epoch": 6.717310087173101,
677
- "grad_norm": 0.7527234554290771,
678
- "learning_rate": 8.90557092189276e-05,
679
- "loss": 0.9766032409667968,
680
- "mean_token_accuracy": 0.7435237610340119,
681
  "num_tokens": 6716885.0,
682
  "step": 2700
683
  },
684
  {
685
- "entropy": 0.985169340968132,
686
  "epoch": 6.841843088418431,
687
- "grad_norm": 0.9773848056793213,
688
- "learning_rate": 8.313056014396262e-05,
689
- "loss": 0.9680111694335938,
690
- "mean_token_accuracy": 0.7455707538127899,
691
  "num_tokens": 6836942.0,
692
  "step": 2750
693
  },
694
  {
695
- "entropy": 0.9864326739311218,
696
  "epoch": 6.966376089663761,
697
- "grad_norm": 0.7536692023277283,
698
- "learning_rate": 7.733377996266039e-05,
699
- "loss": 0.9636287689208984,
700
- "mean_token_accuracy": 0.7498565518856048,
701
  "num_tokens": 6963394.0,
702
  "step": 2800
703
  },
704
  {
705
  "epoch": 7.0,
706
- "eval_entropy": 1.0250733893278032,
707
- "eval_mean_token_accuracy": 0.7460000244683997,
708
- "eval_not_syn_loss": 1.0630079507827759,
709
- "eval_not_syn_runtime": 96.0062,
710
- "eval_not_syn_samples_per_second": 14.322,
711
- "eval_not_syn_steps_per_second": 1.792,
712
  "eval_num_tokens": 6995730.0,
713
  "step": 2814
714
  },
715
  {
716
  "epoch": 7.0,
717
- "eval_entropy": 0.9910203045190766,
718
- "eval_mean_token_accuracy": 0.7360078449859175,
719
  "eval_num_tokens": 6995730.0,
720
- "eval_syn_loss": 1.0268242359161377,
721
- "eval_syn_runtime": 99.1365,
722
- "eval_syn_samples_per_second": 13.87,
723
- "eval_syn_steps_per_second": 1.735,
724
  "step": 2814
725
  },
726
  {
727
- "entropy": 0.8933790849916863,
728
  "epoch": 7.089663760896638,
729
- "grad_norm": 0.9901642799377441,
730
- "learning_rate": 7.16762936698672e-05,
731
- "loss": 0.870389404296875,
732
- "mean_token_accuracy": 0.7670638844220325,
733
  "num_tokens": 7081846.0,
734
  "step": 2850
735
  },
736
  {
737
- "entropy": 0.8327929264307022,
738
  "epoch": 7.214196762141968,
739
- "grad_norm": 0.9863399267196655,
740
- "learning_rate": 6.616876373796547e-05,
741
- "loss": 0.8118631744384766,
742
- "mean_token_accuracy": 0.7749309521913529,
743
  "num_tokens": 7202092.0,
744
  "step": 2900
745
  },
746
  {
747
- "entropy": 0.8421830326318741,
748
  "epoch": 7.338729763387297,
749
- "grad_norm": 0.9192395210266113,
750
- "learning_rate": 6.082157002167449e-05,
751
- "loss": 0.8261887359619141,
752
- "mean_token_accuracy": 0.7743502056598663,
753
  "num_tokens": 7328180.0,
754
  "step": 2950
755
  },
756
  {
757
- "entropy": 0.8124729514122009,
758
  "epoch": 7.463262764632628,
759
- "grad_norm": 0.807341456413269,
760
- "learning_rate": 5.564479019549013e-05,
761
- "loss": 0.79787353515625,
762
- "mean_token_accuracy": 0.7786311388015748,
763
  "num_tokens": 7449031.0,
764
  "step": 3000
765
  },
766
  {
767
- "entropy": 0.8328139960765839,
768
  "epoch": 7.587795765877957,
769
- "grad_norm": 0.8058074116706848,
770
- "learning_rate": 5.064818076063412e-05,
771
- "loss": 0.8112649536132812,
772
- "mean_token_accuracy": 0.7778408378362656,
773
  "num_tokens": 7574561.0,
774
  "step": 3050
775
  },
776
  {
777
- "entropy": 0.8063498467206955,
778
  "epoch": 7.712328767123288,
779
- "grad_norm": 1.1108580827713013,
780
- "learning_rate": 4.584115865730714e-05,
781
- "loss": 0.7904315185546875,
782
- "mean_token_accuracy": 0.7799610030651093,
783
  "num_tokens": 7700071.0,
784
  "step": 3100
785
  },
786
  {
787
- "entropy": 0.8056518918275833,
788
  "epoch": 7.8368617683686175,
789
- "grad_norm": 1.0046204328536987,
790
- "learning_rate": 4.123278351690132e-05,
791
- "loss": 0.7889098358154297,
792
- "mean_token_accuracy": 0.781885308623314,
793
  "num_tokens": 7831577.0,
794
  "step": 3150
795
  },
796
  {
797
- "entropy": 0.7862150323390961,
798
  "epoch": 7.961394769613948,
799
- "grad_norm": 0.8963404893875122,
800
- "learning_rate": 3.683174058762063e-05,
801
- "loss": 0.7703626251220703,
802
- "mean_token_accuracy": 0.7862878423929215,
803
  "num_tokens": 7954099.0,
804
  "step": 3200
805
  },
806
  {
807
  "epoch": 8.0,
808
- "eval_entropy": 0.8138092035471007,
809
- "eval_mean_token_accuracy": 0.7570307982522387,
810
- "eval_not_syn_loss": 0.9443947076797485,
811
- "eval_not_syn_runtime": 96.0009,
812
- "eval_not_syn_samples_per_second": 14.323,
813
- "eval_not_syn_steps_per_second": 1.792,
814
  "eval_num_tokens": 7995120.0,
815
  "step": 3216
816
  },
817
  {
818
  "epoch": 8.0,
819
- "eval_entropy": 0.7856217716322389,
820
- "eval_mean_token_accuracy": 0.7748590402824934,
821
  "eval_num_tokens": 7995120.0,
822
- "eval_syn_loss": 0.9141804575920105,
823
- "eval_syn_runtime": 99.0213,
824
- "eval_syn_samples_per_second": 13.886,
825
- "eval_syn_steps_per_second": 1.737,
826
  "step": 3216
827
  },
828
  {
829
- "entropy": 0.717324167188972,
830
  "epoch": 8.084682440846825,
831
- "grad_norm": 0.7706292271614075,
832
- "learning_rate": 3.26463243656881e-05,
833
- "loss": 0.6817562103271484,
834
- "mean_token_accuracy": 0.8034305211269495,
835
  "num_tokens": 8081323.0,
836
  "step": 3250
837
  },
838
  {
839
- "entropy": 0.7076752084493637,
840
  "epoch": 8.209215442092155,
841
- "grad_norm": 0.7392331957817078,
842
- "learning_rate": 2.8684422962990643e-05,
843
- "loss": 0.6874848175048828,
844
- "mean_token_accuracy": 0.8040069764852524,
845
  "num_tokens": 8208468.0,
846
  "step": 3300
847
  },
848
  {
849
- "entropy": 0.7056704825162887,
850
  "epoch": 8.333748443337484,
851
- "grad_norm": 0.9019565582275391,
852
- "learning_rate": 2.4953503240622073e-05,
853
- "loss": 0.6827576446533203,
854
- "mean_token_accuracy": 0.8037591743469238,
855
  "num_tokens": 8329436.0,
856
  "step": 3350
857
  },
858
  {
859
- "entropy": 0.6830038994550705,
860
  "epoch": 8.458281444582815,
861
- "grad_norm": 0.9338183999061584,
862
- "learning_rate": 2.146059673634357e-05,
863
- "loss": 0.6555431365966797,
864
- "mean_token_accuracy": 0.80817536175251,
865
  "num_tokens": 8449737.0,
866
  "step": 3400
867
  },
868
  {
869
- "entropy": 0.6841743963956833,
870
  "epoch": 8.582814445828145,
871
- "grad_norm": 0.8519166707992554,
872
- "learning_rate": 1.8212286412483148e-05,
873
- "loss": 0.6660543823242188,
874
- "mean_token_accuracy": 0.8084699755907059,
875
  "num_tokens": 8572556.0,
876
  "step": 3450
877
  },
878
  {
879
- "entropy": 0.6859753090143204,
880
  "epoch": 8.707347447073474,
881
- "grad_norm": 0.8856426477432251,
882
- "learning_rate": 1.5214694249249967e-05,
883
- "loss": 0.6608637237548828,
884
- "mean_token_accuracy": 0.8085391563177109,
885
  "num_tokens": 8703112.0,
886
  "step": 3500
887
  },
888
  {
889
- "entropy": 0.6650526940822601,
890
  "epoch": 8.831880448318804,
891
- "grad_norm": 0.8123352527618408,
892
- "learning_rate": 1.2473469706846106e-05,
893
- "loss": 0.636329345703125,
894
- "mean_token_accuracy": 0.8127367502450943,
895
  "num_tokens": 8829124.0,
896
  "step": 3550
897
  },
898
  {
899
- "entropy": 0.674993228316307,
900
  "epoch": 8.956413449564135,
901
- "grad_norm": 0.8388631343841553,
902
- "learning_rate": 9.993779078120502e-06,
903
- "loss": 0.6549046325683594,
904
- "mean_token_accuracy": 0.8100328993797302,
905
  "num_tokens": 8951338.0,
906
  "step": 3600
907
  },
908
  {
909
  "epoch": 9.0,
910
- "eval_entropy": 0.7291332736264827,
911
- "eval_mean_token_accuracy": 0.7639909365842509,
912
- "eval_not_syn_loss": 0.9260964393615723,
913
- "eval_not_syn_runtime": 96.0349,
914
- "eval_not_syn_samples_per_second": 14.318,
915
- "eval_not_syn_steps_per_second": 1.791,
916
  "eval_num_tokens": 8994510.0,
917
  "step": 3618
918
  },
919
  {
920
  "epoch": 9.0,
921
- "eval_entropy": 0.7045032125572825,
922
- "eval_mean_token_accuracy": 0.7828773402197416,
923
  "eval_num_tokens": 8994510.0,
924
- "eval_syn_loss": 0.891645610332489,
925
- "eval_syn_runtime": 99.041,
926
- "eval_syn_samples_per_second": 13.883,
927
- "eval_syn_steps_per_second": 1.737,
928
  "step": 3618
929
  },
930
  {
931
- "entropy": 0.651277188700859,
932
  "epoch": 9.079701120797012,
933
- "grad_norm": 0.7400560975074768,
934
- "learning_rate": 7.780295751832365e-06,
935
- "loss": 0.6093550872802734,
936
- "mean_token_accuracy": 0.8217923815804299,
937
  "num_tokens": 9069880.0,
938
  "step": 3650
939
  },
940
  {
941
- "entropy": 0.6303536769747734,
942
  "epoch": 9.204234122042342,
943
- "grad_norm": 0.7051271796226501,
944
- "learning_rate": 5.8371914048739546e-06,
945
- "loss": 0.594953498840332,
946
- "mean_token_accuracy": 0.8226762419939041,
947
  "num_tokens": 9198937.0,
948
  "step": 3700
949
  },
950
  {
951
- "entropy": 0.6400952243804932,
952
  "epoch": 9.32876712328767,
953
- "grad_norm": 0.6884015202522278,
954
- "learning_rate": 4.168128140052653e-06,
955
- "loss": 0.6061803817749023,
956
- "mean_token_accuracy": 0.8216804820299148,
957
  "num_tokens": 9320870.0,
958
  "step": 3750
959
  },
960
  {
961
- "entropy": 0.6408041715621948,
962
  "epoch": 9.453300124533001,
963
- "grad_norm": 0.7845977544784546,
964
- "learning_rate": 2.7762515842502636e-06,
965
- "loss": 0.601903190612793,
966
- "mean_token_accuracy": 0.8218631267547607,
967
  "num_tokens": 9451044.0,
968
  "step": 3800
969
  },
970
  {
971
- "entropy": 0.6254262709617615,
972
  "epoch": 9.577833125778332,
973
- "grad_norm": 0.6794775128364563,
974
- "learning_rate": 1.6641849599666836e-06,
975
- "loss": 0.5901885986328125,
976
- "mean_token_accuracy": 0.8238948655128479,
977
  "num_tokens": 9569101.0,
978
  "step": 3850
979
  },
980
  {
981
- "entropy": 0.6379631841182709,
982
  "epoch": 9.70236612702366,
983
- "grad_norm": 0.7553540468215942,
984
- "learning_rate": 8.340241414215602e-07,
985
- "loss": 0.599051742553711,
986
- "mean_token_accuracy": 0.8235908967256546,
987
  "num_tokens": 9694849.0,
988
  "step": 3900
989
  },
990
  {
991
- "entropy": 0.6264688801765442,
992
  "epoch": 9.826899128268991,
993
- "grad_norm": 0.6769167184829712,
994
- "learning_rate": 2.8733370453123033e-07,
995
- "loss": 0.5899434280395508,
996
- "mean_token_accuracy": 0.8249167394638062,
997
  "num_tokens": 9824230.0,
998
  "step": 3950
999
  },
1000
  {
1001
- "entropy": 0.6334065935015678,
1002
  "epoch": 9.951432129514322,
1003
- "grad_norm": 0.8305081129074097,
1004
- "learning_rate": 2.514397820565199e-08,
1005
- "loss": 0.5924297714233399,
1006
- "mean_token_accuracy": 0.8238646203279495,
1007
  "num_tokens": 9945501.0,
1008
  "step": 4000
1009
  },
1010
  {
1011
  "epoch": 10.0,
1012
- "eval_entropy": 0.6948450613160466,
1013
- "eval_mean_token_accuracy": 0.7657531517189603,
1014
- "eval_not_syn_loss": 0.9340860247612,
1015
- "eval_not_syn_runtime": 95.9417,
1016
- "eval_not_syn_samples_per_second": 14.332,
1017
- "eval_not_syn_steps_per_second": 1.793,
1018
  "eval_num_tokens": 9993900.0,
1019
  "step": 4020
1020
  },
1021
  {
1022
  "epoch": 10.0,
1023
- "eval_entropy": 0.6709783749524937,
1024
- "eval_mean_token_accuracy": 0.7841822120339371,
1025
  "eval_num_tokens": 9993900.0,
1026
- "eval_syn_loss": 0.90080726146698,
1027
- "eval_syn_runtime": 99.1594,
1028
- "eval_syn_samples_per_second": 13.867,
1029
- "eval_syn_steps_per_second": 1.735,
1030
  "step": 4020
1031
  }
1032
  ],
@@ -1047,8 +1047,8 @@
1047
  "attributes": {}
1048
  }
1049
  },
1050
- "total_flos": 2.0545133142516326e+18,
1051
- "train_batch_size": 8,
1052
  "trial_name": null,
1053
  "trial_params": null
1054
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  },
114
  {
115
+ "entropy": 0.5251132612577593,
116
  "epoch": 1.1195516811955168,
117
+ "grad_norm": 0.409588098526001,
118
+ "learning_rate": 0.00019826081236739378,
119
+ "loss": 0.48452903747558596,
120
+ "mean_token_accuracy": 0.8485486621808525,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
+ "entropy": 0.5255133463442325,
126
  "epoch": 1.244084682440847,
127
+ "grad_norm": 0.4035375416278839,
128
+ "learning_rate": 0.00019799182258138876,
129
+ "loss": 0.48237808227539064,
130
+ "mean_token_accuracy": 0.8493754121661187,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
+ "entropy": 0.5253566785156727,
136
  "epoch": 1.3686176836861768,
137
+ "grad_norm": 0.42013707756996155,
138
+ "learning_rate": 0.000197536589854019,
139
+ "loss": 0.4848367691040039,
140
+ "mean_token_accuracy": 0.8477037993073463,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
+ "entropy": 0.520346013456583,
146
  "epoch": 1.4931506849315068,
147
+ "grad_norm": 0.3776898980140686,
148
+ "learning_rate": 0.00019689597214695372,
149
+ "loss": 0.483323860168457,
150
+ "mean_token_accuracy": 0.8489549401402473,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
+ "entropy": 0.5263906873762607,
156
  "epoch": 1.6176836861768369,
157
+ "grad_norm": 0.7503569722175598,
158
+ "learning_rate": 0.00019607117681064075,
159
+ "loss": 0.4831574630737305,
160
+ "mean_token_accuracy": 0.8504172155261039,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
+ "entropy": 0.5092925234138965,
166
  "epoch": 1.7422166874221667,
167
+ "grad_norm": 0.3184664249420166,
168
+ "learning_rate": 0.00019506375830885428,
169
+ "loss": 0.4734436798095703,
170
+ "mean_token_accuracy": 0.8517335096001625,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
+ "entropy": 0.504551088064909,
176
  "epoch": 1.8667496886674968,
177
+ "grad_norm": 0.381900429725647,
178
+ "learning_rate": 0.00019387561528904946,
179
+ "loss": 0.4666786193847656,
180
+ "mean_token_accuracy": 0.8514578703045845,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
+ "entropy": 0.5108272171020508,
186
  "epoch": 1.9912826899128269,
187
+ "grad_norm": 0.3435378670692444,
188
+ "learning_rate": 0.00019250898700404634,
189
+ "loss": 0.4672324752807617,
190
+ "mean_token_accuracy": 0.8505285969376564,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
+ "eval_entropy": 0.515417086177094,
197
+ "eval_mean_token_accuracy": 0.8423887543206992,
198
+ "eval_not_syn_loss": 0.49809959530830383,
199
+ "eval_not_syn_runtime": 95.8874,
200
+ "eval_not_syn_samples_per_second": 14.34,
201
+ "eval_not_syn_steps_per_second": 1.794,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_entropy": 0.4982072594900464,
208
+ "eval_mean_token_accuracy": 0.8627851702446161,
209
  "eval_num_tokens": 1998780.0,
210
+ "eval_syn_loss": 0.47842374444007874,
211
+ "eval_syn_runtime": 98.9429,
212
+ "eval_syn_samples_per_second": 13.897,
213
+ "eval_syn_steps_per_second": 1.738,
214
  "step": 804
215
  },
216
  {
217
+ "entropy": 0.4303537303149098,
218
  "epoch": 2.1145703611457036,
219
+ "grad_norm": 0.35502323508262634,
220
+ "learning_rate": 0.00019096644909178583,
221
+ "loss": 0.38930774688720704,
222
+ "mean_token_accuracy": 0.8710548519486129,
223
  "num_tokens": 2114788.0,
224
  "step": 850
225
  },
226
  {
227
+ "entropy": 0.431627052128315,
228
  "epoch": 2.2391033623910337,
229
+ "grad_norm": 0.4276222288608551,
230
+ "learning_rate": 0.00018925090872111246,
231
+ "loss": 0.38792034149169924,
232
+ "mean_token_accuracy": 0.8717547968029976,
233
  "num_tokens": 2239579.0,
234
  "step": 900
235
  },
236
  {
237
+ "entropy": 0.43496647477149963,
238
  "epoch": 2.3636363636363638,
239
+ "grad_norm": 0.3478545546531677,
240
+ "learning_rate": 0.00018736559911273178,
241
+ "loss": 0.3930927276611328,
242
+ "mean_token_accuracy": 0.8688642394542694,
243
  "num_tokens": 2358308.0,
244
  "step": 950
245
  },
246
  {
247
+ "entropy": 0.4360816968977451,
248
  "epoch": 2.488169364881694,
249
+ "grad_norm": 0.4233720898628235,
250
+ "learning_rate": 0.00018531407344566915,
251
+ "loss": 0.39427772521972654,
252
+ "mean_token_accuracy": 0.867885695695877,
253
  "num_tokens": 2484444.0,
254
  "step": 1000
255
  },
256
  {
257
+ "entropy": 0.43084816545248034,
258
  "epoch": 2.6127023661270234,
259
+ "grad_norm": 0.3549717664718628,
260
+ "learning_rate": 0.0001831001981607139,
261
+ "loss": 0.3911524200439453,
262
+ "mean_token_accuracy": 0.8704073330760003,
263
  "num_tokens": 2609374.0,
264
  "step": 1050
265
  },
266
  {
267
+ "entropy": 0.42635570630431174,
268
  "epoch": 2.7372353673723535,
269
+ "grad_norm": 0.3825649917125702,
270
+ "learning_rate": 0.00018072814567346936,
271
+ "loss": 0.39040073394775393,
272
+ "mean_token_accuracy": 0.8706874567270279,
273
  "num_tokens": 2738852.0,
274
  "step": 1100
275
  },
276
  {
277
+ "entropy": 0.4457248793542385,
278
  "epoch": 2.8617683686176836,
279
+ "grad_norm": 0.393543541431427,
280
+ "learning_rate": 0.00017820238651074317,
281
+ "loss": 0.40400577545166017,
282
+ "mean_token_accuracy": 0.8659286299347877,
283
  "num_tokens": 2860031.0,
284
  "step": 1150
285
  },
286
  {
287
+ "entropy": 0.44322004675865173,
288
  "epoch": 2.9863013698630136,
289
+ "grad_norm": 0.32433032989501953,
290
+ "learning_rate": 0.0001755276808850968,
291
+ "loss": 0.3956157684326172,
292
+ "mean_token_accuracy": 0.8679120713472366,
293
  "num_tokens": 2984557.0,
294
  "step": 1200
295
  },
296
  {
297
  "epoch": 3.0,
298
+ "eval_entropy": 0.4656750720947288,
299
+ "eval_mean_token_accuracy": 0.8436482773963795,
300
+ "eval_not_syn_loss": 0.502048671245575,
301
+ "eval_not_syn_runtime": 96.1696,
302
+ "eval_not_syn_samples_per_second": 14.298,
303
+ "eval_not_syn_steps_per_second": 1.789,
304
  "eval_num_tokens": 2998170.0,
305
  "step": 1206
306
  },
307
  {
308
  "epoch": 3.0,
309
+ "eval_entropy": 0.4488667759091355,
310
+ "eval_mean_token_accuracy": 0.86770307186038,
311
  "eval_num_tokens": 2998170.0,
312
+ "eval_syn_loss": 0.47810930013656616,
313
+ "eval_syn_runtime": 98.894,
314
+ "eval_syn_samples_per_second": 13.904,
315
+ "eval_syn_steps_per_second": 1.739,
316
  "step": 1206
317
  },
318
  {
319
+ "entropy": 0.34708237489967636,
320
  "epoch": 3.1095890410958904,
321
+ "grad_norm": 0.36334800720214844,
322
+ "learning_rate": 0.00017270906972343466,
323
+ "loss": 0.306053466796875,
324
+ "mean_token_accuracy": 0.8936621320970131,
325
  "num_tokens": 3110419.0,
326
  "step": 1250
327
  },
328
  {
329
+ "entropy": 0.3444848913699389,
330
  "epoch": 3.2341220423412205,
331
+ "grad_norm": 0.3762398660182953,
332
+ "learning_rate": 0.00016975186516654035,
333
+ "loss": 0.30163915634155275,
334
+ "mean_token_accuracy": 0.8941574484109879,
335
  "num_tokens": 3233570.0,
336
  "step": 1300
337
  },
338
  {
339
+ "entropy": 0.3394099518656731,
340
  "epoch": 3.3586550435865505,
341
+ "grad_norm": 0.4982982873916626,
342
+ "learning_rate": 0.00016666164055746508,
343
+ "loss": 0.2992570495605469,
344
+ "mean_token_accuracy": 0.8951553416252136,
345
  "num_tokens": 3360467.0,
346
  "step": 1350
347
  },
348
  {
349
+ "entropy": 0.3437681415677071,
350
  "epoch": 3.4831880448318806,
351
+ "grad_norm": 0.48411592841148376,
352
+ "learning_rate": 0.00016344421993763733,
353
+ "loss": 0.30575496673583985,
354
+ "mean_token_accuracy": 0.8930543160438538,
355
  "num_tokens": 3485765.0,
356
  "step": 1400
357
  },
358
  {
359
+ "entropy": 0.34468906089663504,
360
  "epoch": 3.6077210460772102,
361
+ "grad_norm": 0.4013198912143707,
362
+ "learning_rate": 0.00016010566707048957,
363
+ "loss": 0.3041123008728027,
364
+ "mean_token_accuracy": 0.8934586471319199,
365
  "num_tokens": 3609897.0,
366
  "step": 1450
367
  },
368
  {
369
+ "entropy": 0.3555850328505039,
370
  "epoch": 3.7322540473225407,
371
+ "grad_norm": 0.3677787184715271,
372
+ "learning_rate": 0.00015665227401328915,
373
+ "loss": 0.31248834609985354,
374
+ "mean_token_accuracy": 0.8907824546098709,
375
  "num_tokens": 3734361.0,
376
  "step": 1500
377
  },
378
  {
379
+ "entropy": 0.3529116767644882,
380
  "epoch": 3.8567870485678704,
381
+ "grad_norm": 0.3733390271663666,
382
+ "learning_rate": 0.00015309054925871163,
383
+ "loss": 0.3145001220703125,
384
+ "mean_token_accuracy": 0.8914457809925079,
385
  "num_tokens": 3857540.0,
386
  "step": 1550
387
  },
388
  {
389
+ "entropy": 0.3519477976113558,
390
  "epoch": 3.9813200498132004,
391
+ "grad_norm": 0.4491842985153198,
392
+ "learning_rate": 0.0001494272054685054,
393
+ "loss": 0.31178840637207034,
394
+ "mean_token_accuracy": 0.8901231214404106,
395
  "num_tokens": 3978269.0,
396
  "step": 1600
397
  },
398
  {
399
  "epoch": 4.0,
400
+ "eval_entropy": 0.4111844826229783,
401
+ "eval_mean_token_accuracy": 0.8401426447685375,
402
+ "eval_not_syn_loss": 0.5313173532485962,
403
+ "eval_not_syn_runtime": 95.84,
404
+ "eval_not_syn_samples_per_second": 14.347,
405
+ "eval_not_syn_steps_per_second": 1.795,
406
  "eval_num_tokens": 3997560.0,
407
  "step": 1608
408
  },
409
  {
410
  "epoch": 4.0,
411
+ "eval_entropy": 0.39694498886549195,
412
+ "eval_mean_token_accuracy": 0.8641166465226994,
413
  "eval_num_tokens": 3997560.0,
414
+ "eval_syn_loss": 0.5080230832099915,
415
+ "eval_syn_runtime": 98.944,
416
+ "eval_syn_samples_per_second": 13.897,
417
+ "eval_syn_steps_per_second": 1.738,
418
  "step": 1608
419
  },
420
  {
421
+ "entropy": 0.26439598014559407,
422
  "epoch": 4.104607721046078,
423
+ "grad_norm": 0.4967460632324219,
424
+ "learning_rate": 0.0001456691468223661,
425
+ "loss": 0.22732419967651368,
426
+ "mean_token_accuracy": 0.9187829334928532,
427
  "num_tokens": 4101042.0,
428
  "step": 1650
429
  },
430
  {
431
+ "entropy": 0.2497277507185936,
432
  "epoch": 4.229140722291407,
433
+ "grad_norm": 0.3764539659023285,
434
+ "learning_rate": 0.00014182345600586332,
435
+ "loss": 0.21029539108276368,
436
+ "mean_token_accuracy": 0.9221936762332916,
437
  "num_tokens": 4224060.0,
438
  "step": 1700
439
  },
440
  {
441
+ "entropy": 0.2486180279403925,
442
  "epoch": 4.353673723536737,
443
+ "grad_norm": 0.37183400988578796,
444
+ "learning_rate": 0.0001378973808619437,
445
+ "loss": 0.21009698867797852,
446
+ "mean_token_accuracy": 0.9225871834158897,
447
  "num_tokens": 4350859.0,
448
  "step": 1750
449
  },
450
  {
451
+ "entropy": 0.26017799742519854,
452
  "epoch": 4.478206724782067,
453
+ "grad_norm": 0.43216991424560547,
454
+ "learning_rate": 0.00013389832073116724,
455
+ "loss": 0.2177184295654297,
456
+ "mean_token_accuracy": 0.9205843013525009,
457
  "num_tokens": 4472466.0,
458
  "step": 1800
459
  },
460
  {
461
+ "entropy": 0.26708075165748596,
462
  "epoch": 4.602739726027397,
463
+ "grad_norm": 0.37948140501976013,
464
+ "learning_rate": 0.00012983381250642132,
465
+ "loss": 0.22203298568725585,
466
+ "mean_token_accuracy": 0.9184661367535591,
467
  "num_tokens": 4592096.0,
468
  "step": 1850
469
  },
470
  {
471
+ "entropy": 0.25188380032777785,
472
  "epoch": 4.7272727272727275,
473
+ "grad_norm": 0.4874045252799988,
474
+ "learning_rate": 0.00012571151642839446,
475
+ "loss": 0.21441835403442383,
476
+ "mean_token_accuracy": 0.9194883331656456,
477
  "num_tokens": 4719831.0,
478
  "step": 1900
479
  },
480
  {
481
+ "entropy": 0.2515877375751734,
482
  "epoch": 4.851805728518057,
483
+ "grad_norm": 0.43732911348342896,
484
+ "learning_rate": 0.00012153920164858083,
485
+ "loss": 0.21374931335449218,
486
+ "mean_token_accuracy": 0.921038504242897,
487
  "num_tokens": 4850857.0,
488
  "step": 1950
489
  },
490
  {
491
+ "entropy": 0.254078243970871,
492
  "epoch": 4.976338729763388,
493
+ "grad_norm": 0.41199925541877747,
494
+ "learning_rate": 0.00011732473158702397,
495
+ "loss": 0.21298355102539063,
496
+ "mean_token_accuracy": 0.9217021322250366,
497
  "num_tokens": 4973013.0,
498
  "step": 2000
499
  },
500
  {
501
  "epoch": 5.0,
502
+ "eval_entropy": 0.31937412129238596,
503
+ "eval_mean_token_accuracy": 0.8401600659586662,
504
+ "eval_not_syn_loss": 0.6070574522018433,
505
+ "eval_not_syn_runtime": 95.8354,
506
+ "eval_not_syn_samples_per_second": 14.348,
507
+ "eval_not_syn_steps_per_second": 1.795,
508
  "eval_num_tokens": 4996950.0,
509
  "step": 2010
510
  },
511
  {
512
  "epoch": 5.0,
513
+ "eval_entropy": 0.30796578786400863,
514
+ "eval_mean_token_accuracy": 0.8628802147022513,
515
  "eval_num_tokens": 4996950.0,
516
+ "eval_syn_loss": 0.5800920128822327,
517
+ "eval_syn_runtime": 98.9566,
518
+ "eval_syn_samples_per_second": 13.895,
519
+ "eval_syn_steps_per_second": 1.738,
520
  "step": 2010
521
  },
522
  {
523
+ "entropy": 0.19132825570425602,
524
  "epoch": 5.099626400996264,
525
+ "grad_norm": 0.45538511872291565,
526
+ "learning_rate": 0.0001130760491123961,
527
+ "loss": 0.153853759765625,
528
+ "mean_token_accuracy": 0.9434747304579224,
529
  "num_tokens": 5097696.0,
530
  "step": 2050
531
  },
532
  {
533
+ "entropy": 0.17422323439270257,
534
  "epoch": 5.224159402241594,
535
+ "grad_norm": 0.4580552279949188,
536
+ "learning_rate": 0.00010880116157234302,
537
+ "loss": 0.13612208366394044,
538
+ "mean_token_accuracy": 0.9487342464923859,
539
  "num_tokens": 5222620.0,
540
  "step": 2100
541
  },
542
  {
543
+ "entropy": 0.1799074411019683,
544
  "epoch": 5.348692403486924,
545
+ "grad_norm": 0.39040836691856384,
546
+ "learning_rate": 0.00010450812570230789,
547
+ "loss": 0.14250579833984375,
548
+ "mean_token_accuracy": 0.9467630323767662,
549
  "num_tokens": 5342227.0,
550
  "step": 2150
551
  },
552
  {
553
+ "entropy": 0.18116022080183028,
554
  "epoch": 5.473225404732254,
555
+ "grad_norm": 0.3805501163005829,
556
+ "learning_rate": 0.00010020503244127544,
557
+ "loss": 0.14014955520629882,
558
+ "mean_token_accuracy": 0.946874064207077,
559
  "num_tokens": 5467337.0,
560
  "step": 2200
561
  },
562
  {
563
+ "entropy": 0.1810251370444894,
564
  "epoch": 5.597758405977584,
565
+ "grad_norm": 0.36377736926078796,
566
+ "learning_rate": 9.589999168305372e-05,
567
+ "loss": 0.14044331550598144,
568
+ "mean_token_accuracy": 0.9467302888631821,
569
  "num_tokens": 5591523.0,
570
  "step": 2250
571
  },
572
  {
573
+ "entropy": 0.17202148117125035,
574
  "epoch": 5.722291407222914,
575
+ "grad_norm": 0.38708847761154175,
576
+ "learning_rate": 9.16011169918326e-05,
577
+ "loss": 0.13840054512023925,
578
+ "mean_token_accuracy": 0.9480020496249199,
579
  "num_tokens": 5716383.0,
580
  "step": 2300
581
  },
582
  {
583
+ "entropy": 0.1769936926662922,
584
  "epoch": 5.846824408468244,
585
+ "grad_norm": 0.4674988389015198,
586
+ "learning_rate": 8.73165103108249e-05,
587
+ "loss": 0.1404121208190918,
588
+ "mean_token_accuracy": 0.9461787036061287,
589
  "num_tokens": 5839380.0,
590
  "step": 2350
591
  },
592
  {
593
+ "entropy": 0.17118842527270317,
594
  "epoch": 5.971357409713574,
595
+ "grad_norm": 0.4454115629196167,
596
+ "learning_rate": 8.305424669280888e-05,
597
+ "loss": 0.1366124439239502,
598
+ "mean_token_accuracy": 0.9484315186738967,
599
  "num_tokens": 5968087.0,
600
  "step": 2400
601
  },
602
  {
603
  "epoch": 6.0,
604
+ "eval_entropy": 0.2494078171114589,
605
+ "eval_mean_token_accuracy": 0.851767166062843,
606
+ "eval_not_syn_loss": 0.7222095131874084,
607
+ "eval_not_syn_runtime": 95.8715,
608
+ "eval_not_syn_samples_per_second": 14.342,
609
+ "eval_not_syn_steps_per_second": 1.794,
610
  "eval_num_tokens": 5996340.0,
611
  "step": 2412
612
  },
613
  {
614
  "epoch": 6.0,
615
+ "eval_entropy": 0.239824180079754,
616
+ "eval_mean_token_accuracy": 0.8484749911829482,
617
  "eval_num_tokens": 5996340.0,
618
+ "eval_syn_loss": 0.7036991119384766,
619
+ "eval_syn_runtime": 98.923,
620
  "eval_syn_samples_per_second": 13.9,
621
  "eval_syn_steps_per_second": 1.739,
622
  "step": 2412
623
  },
624
  {
625
+ "entropy": 0.14340858902744572,
626
  "epoch": 6.094645080946451,
627
+ "grad_norm": 0.270256370306015,
628
+ "learning_rate": 7.88223590813502e-05,
629
+ "loss": 0.10398475646972656,
630
+ "mean_token_accuracy": 0.9604840600731397,
631
  "num_tokens": 6091666.0,
632
  "step": 2450
633
  },
634
  {
635
+ "entropy": 0.13108037687838078,
636
  "epoch": 6.219178082191781,
637
+ "grad_norm": 0.2870016396045685,
638
+ "learning_rate": 7.462882317138628e-05,
639
+ "loss": 0.09566926002502442,
640
+ "mean_token_accuracy": 0.9636739170551301,
641
  "num_tokens": 6214889.0,
642
  "step": 2500
643
  },
644
  {
645
+ "entropy": 0.1305130884796381,
646
  "epoch": 6.34371108343711,
647
+ "grad_norm": 0.34119686484336853,
648
+ "learning_rate": 7.048154237770433e-05,
649
+ "loss": 0.0943364143371582,
650
+ "mean_token_accuracy": 0.9643464788794518,
651
  "num_tokens": 6337795.0,
652
  "step": 2550
653
  },
654
  {
655
+ "entropy": 0.12516659261658789,
656
  "epoch": 6.468244084682441,
657
+ "grad_norm": 0.2851012051105499,
658
+ "learning_rate": 6.638833293964401e-05,
659
+ "loss": 0.09339779853820801,
660
+ "mean_token_accuracy": 0.9654107868671418,
661
  "num_tokens": 6466252.0,
662
  "step": 2600
663
  },
664
  {
665
+ "entropy": 0.12894487291574477,
666
  "epoch": 6.592777085927771,
667
+ "grad_norm": 0.22259071469306946,
668
+ "learning_rate": 6.235690919009636e-05,
669
+ "loss": 0.09672751426696777,
670
+ "mean_token_accuracy": 0.9641611188650131,
671
  "num_tokens": 6592128.0,
672
  "step": 2650
673
  },
674
  {
675
+ "entropy": 0.13079525250941515,
676
  "epoch": 6.717310087173101,
677
+ "grad_norm": 0.2873247265815735,
678
+ "learning_rate": 5.839486901656255e-05,
679
+ "loss": 0.09482893943786622,
680
+ "mean_token_accuracy": 0.963907478749752,
681
  "num_tokens": 6716885.0,
682
  "step": 2700
683
  },
684
  {
685
+ "entropy": 0.13382539574056865,
686
  "epoch": 6.841843088418431,
687
+ "grad_norm": 0.26538875699043274,
688
+ "learning_rate": 5.450967954167317e-05,
689
+ "loss": 0.09817559242248536,
690
+ "mean_token_accuracy": 0.9620411720871925,
691
  "num_tokens": 6836942.0,
692
  "step": 2750
693
  },
694
  {
695
+ "entropy": 0.12861237980425358,
696
  "epoch": 6.966376089663761,
697
+ "grad_norm": 0.3292505443096161,
698
+ "learning_rate": 5.070866305015545e-05,
699
+ "loss": 0.09485826492309571,
700
+ "mean_token_accuracy": 0.9636287876963615,
701
  "num_tokens": 6963394.0,
702
  "step": 2800
703
  },
704
  {
705
  "epoch": 7.0,
706
+ "eval_entropy": 0.2065339538073817,
707
+ "eval_mean_token_accuracy": 0.8418768654729045,
708
+ "eval_not_syn_loss": 0.8473101854324341,
709
+ "eval_not_syn_runtime": 95.8493,
710
+ "eval_not_syn_samples_per_second": 14.345,
711
+ "eval_not_syn_steps_per_second": 1.794,
712
  "eval_num_tokens": 6995730.0,
713
  "step": 2814
714
  },
715
  {
716
  "epoch": 7.0,
717
+ "eval_entropy": 0.19768535250494645,
718
+ "eval_mean_token_accuracy": 0.8554798219093057,
719
  "eval_num_tokens": 6995730.0,
720
+ "eval_syn_loss": 0.8169878125190735,
721
+ "eval_syn_runtime": 98.9508,
722
+ "eval_syn_samples_per_second": 13.896,
723
+ "eval_syn_steps_per_second": 1.738,
724
  "step": 2814
725
  },
726
  {
727
+ "entropy": 0.12029899715097865,
728
  "epoch": 7.089663760896638,
729
+ "grad_norm": 0.18435120582580566,
730
+ "learning_rate": 4.699898318877223e-05,
731
+ "loss": 0.08083279609680176,
732
+ "mean_token_accuracy": 0.9689394840688417,
733
  "num_tokens": 7081846.0,
734
  "step": 2850
735
  },
736
  {
737
+ "entropy": 0.10581521928310395,
738
  "epoch": 7.214196762141968,
739
+ "grad_norm": 0.2981043756008148,
740
+ "learning_rate": 4.338763146523955e-05,
741
+ "loss": 0.07383342266082764,
742
+ "mean_token_accuracy": 0.9726339945197106,
743
  "num_tokens": 7202092.0,
744
  "step": 2900
745
  },
746
  {
747
+ "entropy": 0.10466793559491634,
748
  "epoch": 7.338729763387297,
749
+ "grad_norm": 0.24411630630493164,
750
+ "learning_rate": 3.988141407156982e-05,
751
+ "loss": 0.07236807346343994,
752
+ "mean_token_accuracy": 0.9726200112700463,
753
  "num_tokens": 7328180.0,
754
  "step": 2950
755
  },
756
  {
757
+ "entropy": 0.10792888538911939,
758
  "epoch": 7.463262764632628,
759
+ "grad_norm": 0.15558083355426788,
760
+ "learning_rate": 3.6486939056672404e-05,
761
+ "loss": 0.0738653564453125,
762
+ "mean_token_accuracy": 0.9714727732539177,
763
  "num_tokens": 7449031.0,
764
  "step": 3000
765
  },
766
  {
767
+ "entropy": 0.10735130561515689,
768
  "epoch": 7.587795765877957,
769
+ "grad_norm": 0.1883469820022583,
770
+ "learning_rate": 3.321060387238841e-05,
771
+ "loss": 0.07414769649505615,
772
+ "mean_token_accuracy": 0.9723327186703682,
773
  "num_tokens": 7574561.0,
774
  "step": 3050
775
  },
776
  {
777
+ "entropy": 0.10272138025611639,
778
  "epoch": 7.712328767123288,
779
+ "grad_norm": 0.15312588214874268,
780
+ "learning_rate": 3.0058583316430158e-05,
781
+ "loss": 0.07302286624908447,
782
+ "mean_token_accuracy": 0.9726834264397621,
783
  "num_tokens": 7700071.0,
784
  "step": 3100
785
  },
786
  {
787
+ "entropy": 0.10020780436694622,
788
  "epoch": 7.8368617683686175,
789
+ "grad_norm": 0.15675754845142365,
790
+ "learning_rate": 2.7036817894949623e-05,
791
+ "loss": 0.06891141891479492,
792
+ "mean_token_accuracy": 0.9731867194175721,
793
  "num_tokens": 7831577.0,
794
  "step": 3150
795
  },
796
  {
797
+ "entropy": 0.10623522130772471,
798
  "epoch": 7.961394769613948,
799
+ "grad_norm": 0.28400713205337524,
800
+ "learning_rate": 2.415100262666817e-05,
801
+ "loss": 0.07267386436462403,
802
+ "mean_token_accuracy": 0.9713605433702469,
803
  "num_tokens": 7954099.0,
804
  "step": 3200
805
  },
806
  {
807
  "epoch": 8.0,
808
+ "eval_entropy": 0.18371209263974844,
809
+ "eval_mean_token_accuracy": 0.8438690238913824,
810
+ "eval_not_syn_loss": 0.9368809461593628,
811
+ "eval_not_syn_runtime": 95.8165,
812
+ "eval_not_syn_samples_per_second": 14.35,
813
+ "eval_not_syn_steps_per_second": 1.795,
814
  "eval_num_tokens": 7995120.0,
815
  "step": 3216
816
  },
817
  {
818
  "epoch": 8.0,
819
+ "eval_entropy": 0.17606536935754988,
820
+ "eval_mean_token_accuracy": 0.8541917884072592,
821
  "eval_num_tokens": 7995120.0,
822
+ "eval_syn_loss": 0.905842661857605,
823
+ "eval_syn_runtime": 98.8452,
824
+ "eval_syn_samples_per_second": 13.911,
825
+ "eval_syn_steps_per_second": 1.74,
826
  "step": 3216
827
  },
828
  {
829
+ "entropy": 0.09641267131600115,
830
  "epoch": 8.084682440846825,
831
+ "grad_norm": 0.10478409379720688,
832
+ "learning_rate": 2.1406576309667927e-05,
833
+ "loss": 0.06422062397003174,
834
+ "mean_token_accuracy": 0.974858273761441,
835
  "num_tokens": 8081323.0,
836
  "step": 3250
837
  },
838
  {
839
+ "entropy": 0.09412769414484501,
840
  "epoch": 8.209215442092155,
841
+ "grad_norm": 0.1294923573732376,
842
+ "learning_rate": 1.8808711271073835e-05,
843
+ "loss": 0.06173128604888916,
844
+ "mean_token_accuracy": 0.9750720876455307,
845
  "num_tokens": 8208468.0,
846
  "step": 3300
847
  },
848
  {
849
+ "entropy": 0.09561639416962862,
850
  "epoch": 8.333748443337484,
851
+ "grad_norm": 0.1887606382369995,
852
+ "learning_rate": 1.6362303618944128e-05,
853
+ "loss": 0.06482413768768311,
854
+ "mean_token_accuracy": 0.9742087376117706,
855
  "num_tokens": 8329436.0,
856
  "step": 3350
857
  },
858
  {
859
+ "entropy": 0.09427102731540798,
860
  "epoch": 8.458281444582815,
861
+ "grad_norm": 0.09633524715900421,
862
+ "learning_rate": 1.407196401474173e-05,
863
+ "loss": 0.06472210884094239,
864
+ "mean_token_accuracy": 0.974975728392601,
865
  "num_tokens": 8449737.0,
866
  "step": 3400
867
  },
868
  {
869
+ "entropy": 0.09517974983900786,
870
  "epoch": 8.582814445828145,
871
+ "grad_norm": 0.09141165018081665,
872
+ "learning_rate": 1.194200898377711e-05,
873
+ "loss": 0.06492462158203124,
874
+ "mean_token_accuracy": 0.9747303375601768,
875
  "num_tokens": 8572556.0,
876
  "step": 3450
877
  },
878
  {
879
+ "entropy": 0.09041798285208642,
880
  "epoch": 8.707347447073474,
881
+ "grad_norm": 0.22126418352127075,
882
+ "learning_rate": 9.976452779999523e-06,
883
+ "loss": 0.061339192390441895,
884
+ "mean_token_accuracy": 0.9752290603518486,
885
  "num_tokens": 8703112.0,
886
  "step": 3500
887
  },
888
  {
889
+ "entropy": 0.09546036465093494,
890
  "epoch": 8.831880448318804,
891
+ "grad_norm": 0.15855053067207336,
892
+ "learning_rate": 8.178999820468898e-06,
893
+ "loss": 0.06384926795959472,
894
+ "mean_token_accuracy": 0.9744252774119377,
895
  "num_tokens": 8829124.0,
896
  "step": 3550
897
  },
898
  {
899
+ "entropy": 0.10058791788294912,
900
  "epoch": 8.956413449564135,
901
+ "grad_norm": 0.10365262627601624,
902
+ "learning_rate": 6.553037703766629e-06,
903
+ "loss": 0.06539971351623536,
904
+ "mean_token_accuracy": 0.972833506166935,
905
  "num_tokens": 8951338.0,
906
  "step": 3600
907
  },
908
  {
909
  "epoch": 9.0,
910
+ "eval_entropy": 0.1707501579509225,
911
+ "eval_mean_token_accuracy": 0.8436840848867283,
912
+ "eval_not_syn_loss": 1.025063157081604,
913
+ "eval_not_syn_runtime": 95.8518,
914
+ "eval_not_syn_samples_per_second": 14.345,
915
+ "eval_not_syn_steps_per_second": 1.794,
916
  "eval_num_tokens": 8994510.0,
917
  "step": 3618
918
  },
919
  {
920
  "epoch": 9.0,
921
+ "eval_entropy": 0.163228454977967,
922
+ "eval_mean_token_accuracy": 0.8539366458737573,
923
  "eval_num_tokens": 8994510.0,
924
+ "eval_syn_loss": 0.985819399356842,
925
+ "eval_syn_runtime": 98.9324,
926
+ "eval_syn_samples_per_second": 13.898,
927
+ "eval_syn_steps_per_second": 1.739,
928
  "step": 3618
929
  },
930
  {
931
+ "entropy": 0.09734603306372659,
932
  "epoch": 9.079701120797012,
933
+ "grad_norm": 0.12082593142986298,
934
+ "learning_rate": 5.101630825503631e-06,
935
+ "loss": 0.06284617900848388,
936
+ "mean_token_accuracy": 0.9750350075538712,
937
  "num_tokens": 9069880.0,
938
  "step": 3650
939
  },
940
  {
941
+ "entropy": 0.08707131499424577,
942
  "epoch": 9.204234122042342,
943
+ "grad_norm": 0.11582205444574356,
944
+ "learning_rate": 3.827514602957916e-06,
945
+ "loss": 0.058627347946166995,
946
+ "mean_token_accuracy": 0.9766133311390877,
947
  "num_tokens": 9198937.0,
948
  "step": 3700
949
  },
950
  {
951
+ "entropy": 0.09363717755302786,
952
  "epoch": 9.32876712328767,
953
+ "grad_norm": 0.0942605510354042,
954
+ "learning_rate": 2.733090319726434e-06,
955
+ "loss": 0.061338043212890624,
956
+ "mean_token_accuracy": 0.9756769821047783,
957
  "num_tokens": 9320870.0,
958
  "step": 3750
959
  },
960
  {
961
+ "entropy": 0.08440944708883763,
962
  "epoch": 9.453300124533001,
963
+ "grad_norm": 0.09462074190378189,
964
+ "learning_rate": 1.820420600107491e-06,
965
+ "loss": 0.05633291721343994,
966
+ "mean_token_accuracy": 0.9779263830184937,
967
  "num_tokens": 9451044.0,
968
  "step": 3800
969
  },
970
  {
971
+ "entropy": 0.09060163488611579,
972
  "epoch": 9.577833125778332,
973
+ "grad_norm": 0.12719611823558807,
974
+ "learning_rate": 1.091225521742671e-06,
975
+ "loss": 0.06255305767059326,
976
+ "mean_token_accuracy": 0.9757561111450195,
977
  "num_tokens": 9569101.0,
978
  "step": 3850
979
  },
980
  {
981
+ "entropy": 0.09161491643637419,
982
  "epoch": 9.70236612702366,
983
+ "grad_norm": 0.13942484557628632,
984
+ "learning_rate": 5.468793738449273e-07,
985
+ "loss": 0.059427495002746585,
986
+ "mean_token_accuracy": 0.9759828591346741,
987
  "num_tokens": 9694849.0,
988
  "step": 3900
989
  },
990
  {
991
+ "entropy": 0.08649967277422548,
992
  "epoch": 9.826899128268991,
993
+ "grad_norm": 0.0871015340089798,
994
+ "learning_rate": 1.8840806712231278e-07,
995
+ "loss": 0.058426890373229984,
996
+ "mean_token_accuracy": 0.9768971011042595,
997
  "num_tokens": 9824230.0,
998
  "step": 3950
999
  },
1000
  {
1001
+ "entropy": 0.09529741924256087,
1002
  "epoch": 9.951432129514322,
1003
+ "grad_norm": 0.13722798228263855,
1004
+ "learning_rate": 1.648720027892707e-08,
1005
+ "loss": 0.061555180549621585,
1006
+ "mean_token_accuracy": 0.9747346398234368,
1007
  "num_tokens": 9945501.0,
1008
  "step": 4000
1009
  },
1010
  {
1011
  "epoch": 10.0,
1012
+ "eval_entropy": 0.16450819373130798,
1013
+ "eval_mean_token_accuracy": 0.8425282915664274,
1014
+ "eval_not_syn_loss": 1.067739725112915,
1015
+ "eval_not_syn_runtime": 95.886,
1016
+ "eval_not_syn_samples_per_second": 14.34,
1017
+ "eval_not_syn_steps_per_second": 1.794,
1018
  "eval_num_tokens": 9993900.0,
1019
  "step": 4020
1020
  },
1021
  {
1022
  "epoch": 10.0,
1023
+ "eval_entropy": 0.1571650807854048,
1024
+ "eval_mean_token_accuracy": 0.8542020715946375,
1025
  "eval_num_tokens": 9993900.0,
1026
+ "eval_syn_loss": 1.0270220041275024,
1027
+ "eval_syn_runtime": 99.001,
1028
+ "eval_syn_samples_per_second": 13.889,
1029
+ "eval_syn_steps_per_second": 1.737,
1030
  "step": 4020
1031
  }
1032
  ],
 
1047
  "attributes": {}
1048
  }
1049
  },
1050
+ "total_flos": 1.6386593884405248e+18,
1051
+ "train_batch_size": 4,
1052
  "trial_name": null,
1053
  "trial_params": null
1054
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-4020/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-804/adapter_config.json CHANGED
@@ -18,7 +18,7 @@
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
- "lora_dropout": 0.08281179805341743,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
@@ -30,12 +30,12 @@
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
33
- "up_proj",
34
- "down_proj",
35
  "gate_proj",
36
- "v_proj",
37
  "o_proj",
38
- "k_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
18
  "loftq_config": {},
19
  "lora_alpha": 256,
20
  "lora_bias": false,
21
+ "lora_dropout": 0.0006939672790126417,
22
  "megatron_config": null,
23
  "megatron_core": "megatron.core",
24
  "modules_to_save": null,
 
30
  "revision": null,
31
  "target_modules": [
32
  "q_proj",
 
 
33
  "gate_proj",
34
+ "down_proj",
35
  "o_proj",
36
+ "up_proj",
37
+ "k_proj",
38
+ "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-804/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1280238392858e52564a64a8548695fec8db5b3faad8b8749210a88ec62e32d
3
  size 2055285904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:695790e95b266cd193861415705ef49fdfe8c9854104b61cbd1bcea74824634d
3
  size 2055285904
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-804/trainer_state.json CHANGED
@@ -10,207 +10,207 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.2452277278900146,
14
  "epoch": 0.12453300124533001,
15
- "grad_norm": 0.6758179664611816,
16
- "learning_rate": 3.687014401013371e-05,
17
- "loss": 1.1777716064453125,
18
- "mean_token_accuracy": 0.7337397015094758,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 0.623466266989708,
24
  "epoch": 0.24906600249066002,
25
- "grad_norm": 0.4892372190952301,
26
- "learning_rate": 7.449273993884157e-05,
27
- "loss": 0.5864884948730469,
28
- "mean_token_accuracy": 0.826547600030899,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 0.5896120843291283,
34
  "epoch": 0.37359900373599003,
35
- "grad_norm": 0.6126167178153992,
36
- "learning_rate": 0.00011211533586754944,
37
- "loss": 0.5540548706054688,
38
- "mean_token_accuracy": 0.8330936986207962,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 0.5726784431934356,
44
  "epoch": 0.49813200498132004,
45
- "grad_norm": 0.4206934869289398,
46
- "learning_rate": 0.0001497379317962573,
47
- "loss": 0.5407680130004883,
48
- "mean_token_accuracy": 0.8367659395933151,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 0.5651785597205162,
54
  "epoch": 0.6226650062266501,
55
- "grad_norm": 0.3723851144313812,
56
- "learning_rate": 0.00018736052772496518,
57
- "loss": 0.5335340881347657,
58
- "mean_token_accuracy": 0.8389524459838867,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 0.5669016176462174,
64
  "epoch": 0.7471980074719801,
65
- "grad_norm": 0.7901780009269714,
66
- "learning_rate": 0.00022498312365367305,
67
- "loss": 0.5371434020996094,
68
- "mean_token_accuracy": 0.8372388821840286,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 0.5642251417040824,
74
  "epoch": 0.8717310087173101,
75
- "grad_norm": 0.46209225058555603,
76
- "learning_rate": 0.0002626057195823809,
77
- "loss": 0.5336666870117187,
78
- "mean_token_accuracy": 0.8379119431972504,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 0.5746454495191574,
84
  "epoch": 0.9962640099626401,
85
- "grad_norm": 0.6201167702674866,
86
- "learning_rate": 0.00030022831551108876,
87
- "loss": 0.5476604461669922,
88
- "mean_token_accuracy": 0.8346919983625412,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
- "eval_entropy": 0.6157312128086423,
95
- "eval_mean_token_accuracy": 0.8423293849756551,
96
- "eval_not_syn_loss": 0.5686389207839966,
97
- "eval_not_syn_runtime": 95.9634,
98
- "eval_not_syn_samples_per_second": 14.328,
99
- "eval_not_syn_steps_per_second": 1.792,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
- "eval_entropy": 0.5961947804966639,
106
- "eval_mean_token_accuracy": 0.8205934797608575,
107
  "eval_num_tokens": 999390.0,
108
- "eval_syn_loss": 0.5787935256958008,
109
- "eval_syn_runtime": 98.9136,
110
- "eval_syn_samples_per_second": 13.901,
111
- "eval_syn_steps_per_second": 1.739,
112
  "step": 402
113
  },
114
  {
115
- "entropy": 0.5431136073488178,
116
  "epoch": 1.1195516811955168,
117
- "grad_norm": 0.6364207863807678,
118
- "learning_rate": 0.0003023597373031377,
119
- "loss": 0.5128697204589844,
120
- "mean_token_accuracy": 0.8424755226482045,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
- "entropy": 0.5492669984698295,
126
  "epoch": 1.244084682440847,
127
- "grad_norm": 0.7887948155403137,
128
- "learning_rate": 0.00030194951160062724,
129
- "loss": 0.5205921554565429,
130
- "mean_token_accuracy": 0.8417876678705215,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
- "entropy": 0.8709675747156144,
136
  "epoch": 1.3686176836861768,
137
- "grad_norm": 0.734591007232666,
138
- "learning_rate": 0.00030125525414139597,
139
- "loss": 0.9426271057128907,
140
- "mean_token_accuracy": 0.7773911562561989,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
- "entropy": 0.5833489724993706,
146
  "epoch": 1.4931506849315068,
147
- "grad_norm": 1.1496635675430298,
148
- "learning_rate": 0.000300278273368912,
149
- "loss": 0.560246467590332,
150
- "mean_token_accuracy": 0.8333927792310715,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
- "entropy": 3.3383523294329644,
156
  "epoch": 1.6176836861768369,
157
- "grad_norm": 4.88291597366333,
158
- "learning_rate": 0.0002990204105656753,
159
- "loss": 3.895135803222656,
160
- "mean_token_accuracy": 0.4006860972382128,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
- "entropy": 4.406278321743011,
166
  "epoch": 1.7422166874221667,
167
- "grad_norm": 2.488743305206299,
168
- "learning_rate": 0.0002974840363830152,
169
- "loss": 4.424278259277344,
170
- "mean_token_accuracy": 0.2545871638506651,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
- "entropy": 3.0789780139923097,
176
  "epoch": 1.8667496886674968,
177
- "grad_norm": 2.5833027362823486,
178
- "learning_rate": 0.00029567204637320413,
179
- "loss": 3.105696716308594,
180
- "mean_token_accuracy": 0.4513031896948814,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
- "entropy": 2.616127333641052,
186
  "epoch": 1.9912826899128269,
187
- "grad_norm": 1.0389364957809448,
188
- "learning_rate": 0.00029358785553230884,
189
- "loss": 2.6003131103515624,
190
- "mean_token_accuracy": 0.5205484080314636,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
- "eval_entropy": 2.5684494819752004,
197
- "eval_mean_token_accuracy": 0.547617181095966,
198
- "eval_not_syn_loss": 2.458150863647461,
199
- "eval_not_syn_runtime": 95.9896,
200
- "eval_not_syn_samples_per_second": 14.324,
201
- "eval_not_syn_steps_per_second": 1.792,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
- "eval_entropy": 2.5337963104248047,
208
- "eval_mean_token_accuracy": 0.5359611507765082,
209
  "eval_num_tokens": 1998780.0,
210
- "eval_syn_loss": 2.417705535888672,
211
- "eval_syn_runtime": 99.0288,
212
- "eval_syn_samples_per_second": 13.885,
213
- "eval_syn_steps_per_second": 1.737,
214
  "step": 804
215
  }
216
  ],
@@ -231,8 +231,8 @@
231
  "attributes": {}
232
  }
233
  },
234
- "total_flos": 4.094726302377984e+17,
235
- "train_batch_size": 8,
236
  "trial_name": null,
237
  "trial_params": null
238
  }
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.3927546733617782,
14
  "epoch": 0.12453300124533001,
15
+ "grad_norm": 0.6564610600471497,
16
+ "learning_rate": 2.4176184199496093e-05,
17
+ "loss": 1.279773941040039,
18
+ "mean_token_accuracy": 0.7125072094798088,
19
  "num_tokens": 121654.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 0.6483061632514,
24
  "epoch": 0.24906600249066002,
25
+ "grad_norm": 0.5326434373855591,
26
+ "learning_rate": 4.884575991326762e-05,
27
+ "loss": 0.6029399490356445,
28
+ "mean_token_accuracy": 0.8237514534592628,
29
  "num_tokens": 249217.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 0.6041012638807297,
34
  "epoch": 0.37359900373599003,
35
+ "grad_norm": 0.4256225526332855,
36
+ "learning_rate": 7.351533562703914e-05,
37
+ "loss": 0.5617047500610352,
38
+ "mean_token_accuracy": 0.8304081869125366,
39
  "num_tokens": 373541.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 0.5842884615063667,
44
  "epoch": 0.49813200498132004,
45
+ "grad_norm": 0.4419579803943634,
46
+ "learning_rate": 9.818491134081067e-05,
47
+ "loss": 0.5414250946044922,
48
+ "mean_token_accuracy": 0.8352482566237449,
49
  "num_tokens": 502382.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 0.5728985281288623,
54
  "epoch": 0.6226650062266501,
55
+ "grad_norm": 0.3333401679992676,
56
+ "learning_rate": 0.0001228544870545822,
57
+ "loss": 0.5301421356201171,
58
+ "mean_token_accuracy": 0.8379305830597877,
59
  "num_tokens": 627269.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 0.5716245217621326,
64
  "epoch": 0.7471980074719801,
65
+ "grad_norm": 0.8607395887374878,
66
+ "learning_rate": 0.00014752406276835373,
67
+ "loss": 0.529751205444336,
68
+ "mean_token_accuracy": 0.8380016613006592,
69
  "num_tokens": 751192.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 0.5588358563184738,
74
  "epoch": 0.8717310087173101,
75
+ "grad_norm": 0.3857034146785736,
76
+ "learning_rate": 0.00017219363848212525,
77
+ "loss": 0.5205639266967773,
78
+ "mean_token_accuracy": 0.83995147138834,
79
  "num_tokens": 873624.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 0.5598713609576226,
84
  "epoch": 0.9962640099626401,
85
+ "grad_norm": 0.39146995544433594,
86
+ "learning_rate": 0.00019686321419589678,
87
+ "loss": 0.5207630920410157,
88
+ "mean_token_accuracy": 0.8398478266596794,
89
  "num_tokens": 996128.0,
90
  "step": 400
91
  },
92
  {
93
  "epoch": 1.0,
94
+ "eval_entropy": 0.5604686953647192,
95
+ "eval_mean_token_accuracy": 0.82587467099345,
96
+ "eval_not_syn_loss": 0.5411113500595093,
97
+ "eval_not_syn_runtime": 96.4062,
98
+ "eval_not_syn_samples_per_second": 14.263,
99
+ "eval_not_syn_steps_per_second": 1.784,
100
  "eval_num_tokens": 999390.0,
101
  "step": 402
102
  },
103
  {
104
  "epoch": 1.0,
105
+ "eval_entropy": 0.5405109611361526,
106
+ "eval_mean_token_accuracy": 0.8581878334976906,
107
  "eval_num_tokens": 999390.0,
108
+ "eval_syn_loss": 0.5110779404640198,
109
+ "eval_syn_runtime": 99.0382,
110
+ "eval_syn_samples_per_second": 13.884,
111
+ "eval_syn_steps_per_second": 1.737,
112
  "step": 402
113
  },
114
  {
115
+ "entropy": 0.5251132612577593,
116
  "epoch": 1.1195516811955168,
117
+ "grad_norm": 0.409588098526001,
118
+ "learning_rate": 0.00019826081236739378,
119
+ "loss": 0.48452903747558596,
120
+ "mean_token_accuracy": 0.8485486621808525,
121
  "num_tokens": 1121021.0,
122
  "step": 450
123
  },
124
  {
125
+ "entropy": 0.5255133463442325,
126
  "epoch": 1.244084682440847,
127
+ "grad_norm": 0.4035375416278839,
128
+ "learning_rate": 0.00019799182258138876,
129
+ "loss": 0.48237808227539064,
130
+ "mean_token_accuracy": 0.8493754121661187,
131
  "num_tokens": 1244321.0,
132
  "step": 500
133
  },
134
  {
135
+ "entropy": 0.5253566785156727,
136
  "epoch": 1.3686176836861768,
137
+ "grad_norm": 0.42013707756996155,
138
+ "learning_rate": 0.000197536589854019,
139
+ "loss": 0.4848367691040039,
140
+ "mean_token_accuracy": 0.8477037993073463,
141
  "num_tokens": 1365615.0,
142
  "step": 550
143
  },
144
  {
145
+ "entropy": 0.520346013456583,
146
  "epoch": 1.4931506849315068,
147
+ "grad_norm": 0.3776898980140686,
148
+ "learning_rate": 0.00019689597214695372,
149
+ "loss": 0.483323860168457,
150
+ "mean_token_accuracy": 0.8489549401402473,
151
  "num_tokens": 1489869.0,
152
  "step": 600
153
  },
154
  {
155
+ "entropy": 0.5263906873762607,
156
  "epoch": 1.6176836861768369,
157
+ "grad_norm": 0.7503569722175598,
158
+ "learning_rate": 0.00019607117681064075,
159
+ "loss": 0.4831574630737305,
160
+ "mean_token_accuracy": 0.8504172155261039,
161
  "num_tokens": 1611614.0,
162
  "step": 650
163
  },
164
  {
165
+ "entropy": 0.5092925234138965,
166
  "epoch": 1.7422166874221667,
167
+ "grad_norm": 0.3184664249420166,
168
+ "learning_rate": 0.00019506375830885428,
169
+ "loss": 0.4734436798095703,
170
+ "mean_token_accuracy": 0.8517335096001625,
171
  "num_tokens": 1741391.0,
172
  "step": 700
173
  },
174
  {
175
+ "entropy": 0.504551088064909,
176
  "epoch": 1.8667496886674968,
177
+ "grad_norm": 0.381900429725647,
178
+ "learning_rate": 0.00019387561528904946,
179
+ "loss": 0.4666786193847656,
180
+ "mean_token_accuracy": 0.8514578703045845,
181
  "num_tokens": 1865156.0,
182
  "step": 750
183
  },
184
  {
185
+ "entropy": 0.5108272171020508,
186
  "epoch": 1.9912826899128269,
187
+ "grad_norm": 0.3435378670692444,
188
+ "learning_rate": 0.00019250898700404634,
189
+ "loss": 0.4672324752807617,
190
+ "mean_token_accuracy": 0.8505285969376564,
191
  "num_tokens": 1989335.0,
192
  "step": 800
193
  },
194
  {
195
  "epoch": 2.0,
196
+ "eval_entropy": 0.515417086177094,
197
+ "eval_mean_token_accuracy": 0.8423887543206992,
198
+ "eval_not_syn_loss": 0.49809959530830383,
199
+ "eval_not_syn_runtime": 95.8874,
200
+ "eval_not_syn_samples_per_second": 14.34,
201
+ "eval_not_syn_steps_per_second": 1.794,
202
  "eval_num_tokens": 1998780.0,
203
  "step": 804
204
  },
205
  {
206
  "epoch": 2.0,
207
+ "eval_entropy": 0.4982072594900464,
208
+ "eval_mean_token_accuracy": 0.8627851702446161,
209
  "eval_num_tokens": 1998780.0,
210
+ "eval_syn_loss": 0.47842374444007874,
211
+ "eval_syn_runtime": 98.9429,
212
+ "eval_syn_samples_per_second": 13.897,
213
+ "eval_syn_steps_per_second": 1.738,
214
  "step": 804
215
  }
216
  ],
 
231
  "attributes": {}
232
  }
233
  },
234
+ "total_flos": 3.2783709113401344e+17,
235
+ "train_batch_size": 4,
236
  "trial_name": null,
237
  "trial_params": null
238
  }
substitutivity_original_Swedish/Qwen3-14B-Base_substitutivity_splits_original_features_train_substitutivity_splits_original_features_test1/checkpoint-804/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74142dc500724b654709fbf32f054b101be2d9c54ca0316d8943f678e5b71e45
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aeaf279dfd18b1ffce5396928b676844e293f17352f1ed8b10806362686fc25
3
  size 6033