RetrO21 committed on
Commit
8496b6d
·
verified ·
1 Parent(s): 7048260

Upload folder using huggingface_hub

Browse files
Files changed (40) hide show
  1. README.md +34 -4
  2. adapter_config.json +2 -2
  3. adapter_model.safetensors +1 -1
  4. checkpoint-1737/adapter_config.json +2 -2
  5. checkpoint-1737/adapter_model.safetensors +1 -1
  6. checkpoint-1737/optimizer.pt +1 -1
  7. checkpoint-1737/rng_state.pth +1 -1
  8. checkpoint-1737/scheduler.pt +1 -1
  9. checkpoint-1737/trainer_state.json +212 -212
  10. checkpoint-1737/training_args.bin +1 -1
  11. checkpoint-3474/adapter_config.json +2 -2
  12. checkpoint-3474/adapter_model.safetensors +1 -1
  13. checkpoint-3474/optimizer.pt +1 -1
  14. checkpoint-3474/rng_state.pth +1 -1
  15. checkpoint-3474/scheduler.pt +1 -1
  16. checkpoint-3474/trainer_state.json +430 -430
  17. checkpoint-3474/training_args.bin +1 -1
  18. checkpoint-5211/adapter_config.json +2 -2
  19. checkpoint-5211/adapter_model.safetensors +1 -1
  20. checkpoint-5211/optimizer.pt +1 -1
  21. checkpoint-5211/rng_state.pth +1 -1
  22. checkpoint-5211/scheduler.pt +1 -1
  23. checkpoint-5211/trainer_state.json +648 -648
  24. checkpoint-5211/training_args.bin +1 -1
  25. checkpoint-6948/adapter_config.json +2 -2
  26. checkpoint-6948/adapter_model.safetensors +1 -1
  27. checkpoint-6948/optimizer.pt +1 -1
  28. checkpoint-6948/rng_state.pth +1 -1
  29. checkpoint-6948/scheduler.pt +1 -1
  30. checkpoint-6948/trainer_state.json +861 -861
  31. checkpoint-6948/training_args.bin +1 -1
  32. checkpoint-8685/adapter_config.json +3 -3
  33. checkpoint-8685/adapter_model.safetensors +2 -2
  34. checkpoint-8685/optimizer.pt +2 -2
  35. checkpoint-8685/rng_state.pth +1 -1
  36. checkpoint-8685/scheduler.pt +1 -1
  37. checkpoint-8685/trainer_state.json +1079 -1079
  38. checkpoint-8685/training_args.bin +1 -1
  39. runs/Dec04_11-47-13_129-213-84-8/events.out.tfevents.1764848895.129-213-84-8.25442.0 +3 -0
  40. training_args.bin +1 -1
README.md CHANGED
@@ -1,9 +1,39 @@
1
  ---
2
  base_model: Qwen/Qwen2-VL-2B-Instruct
3
  library_name: peft
 
4
  tags:
5
- - lora
6
- - qwen2-vl
7
- - adapter
8
- - vision-language
 
 
 
9
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  base_model: Qwen/Qwen2-VL-2B-Instruct
3
  library_name: peft
4
+ model_name: output
5
  tags:
6
+ - adapter
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ license: apache-2.0
12
+ pipeline_tag: text-generation
13
  ---
14
+
15
+ # Model Card for output
16
+
17
+ This model is a fine-tuned version of [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct).
18
+ It has been trained using [TRL](https://github.com/huggingface/trl) and PEFT LoRA.
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from transformers import pipeline
24
+
25
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
26
+
27
+ generator = pipeline(
28
+ "text-generation",
29
+ model="RetrO21/agrofinetune", # replace with your repo
30
+ device="cuda"
31
+ )
32
+
33
+ output = generator(
34
+ [{"role": "user", "content": question}],
35
+ max_new_tokens=128,
36
+ return_full_text=False
37
+ )[0]
38
+
39
+ print(output["generated_text"])
adapter_config.json CHANGED
@@ -29,10 +29,10 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
33
  "k_proj",
34
  "v_proj",
35
- "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "o_proj",
33
  "k_proj",
34
  "v_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a42655e5c5bf5a17388c99c67741b81d97a904a649f92d5298361717c78abaac
3
  size 26182176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b4ecb107db701acdc04f96300149f10454a4f22cc800cab0b968eae74c3415
3
  size 26182176
checkpoint-1737/adapter_config.json CHANGED
@@ -29,10 +29,10 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
33
  "k_proj",
34
  "v_proj",
35
- "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "o_proj",
33
  "k_proj",
34
  "v_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-1737/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca36c29cabd2e8ea449e6eadcd7f7db9042e00cae52ef5b042c56b58c200775a
3
  size 26182176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f567a0aff94e611cf7b87a63d4290b0ee7314a1941c1ad3d0f416afc2fb1f7
3
  size 26182176
checkpoint-1737/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fab12e7cc07b84cfe33ab9be36e25b4dfa882f0ac9e6725dfb7608859ec3a87
3
  size 52486155
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b51eac51c311fe05ae0fb6f075e636ed719de49501d8a85daa67a1976ddb3f2
3
  size 52486155
checkpoint-1737/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac7868bb5d10a59d1042ca17d4fc89dc5beddcdf6df99c035480579667b84b19
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09c74ac7a46536c3808a08a6b9cc111c17592bccdc148cd2300d105708f4cc8b
3
  size 14645
checkpoint-1737/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c17d5ce4845692098064761cc4c713c4686c6a262dcb4177eea65f272ed234c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ce062a9f59b08a36604b136fb249f7d9f4c575b16c5cc4c39f6833a49683785
3
  size 1465
checkpoint-1737/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 1737,
3
- "best_metric": 5.861395835876465,
4
  "best_model_checkpoint": "./output/checkpoint-1737",
5
  "epoch": 1.0,
6
  "eval_steps": 500,
@@ -10,362 +10,362 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 3.6583470726013183,
14
  "epoch": 0.028785261945883708,
15
- "grad_norm": 3.3817152976989746,
16
  "learning_rate": 4.9e-07,
17
- "loss": 13.8754,
18
- "mean_token_accuracy": 0.15036460414528846,
19
- "num_tokens": 53093.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 3.669608063697815,
24
  "epoch": 0.057570523891767415,
25
- "grad_norm": 3.2541544437408447,
26
  "learning_rate": 9.9e-07,
27
- "loss": 14.2282,
28
- "mean_token_accuracy": 0.14137721598148345,
29
- "num_tokens": 108334.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 3.569736371040344,
34
  "epoch": 0.08635578583765112,
35
- "grad_norm": 3.6797454357147217,
36
  "learning_rate": 1.49e-06,
37
- "loss": 13.0735,
38
- "mean_token_accuracy": 0.17473630651831626,
39
- "num_tokens": 157491.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 3.7253233194351196,
44
  "epoch": 0.11514104778353483,
45
- "grad_norm": 4.297911643981934,
46
  "learning_rate": 1.99e-06,
47
- "loss": 13.7392,
48
- "mean_token_accuracy": 0.1473099772632122,
49
- "num_tokens": 211394.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 3.8280500602722167,
54
  "epoch": 0.14392630972941853,
55
- "grad_norm": 4.405268669128418,
56
- "learning_rate": 1.9854771784232364e-06,
57
- "loss": 13.0797,
58
- "mean_token_accuracy": 0.16704789966344832,
59
- "num_tokens": 263685.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 4.066333084106446,
64
  "epoch": 0.17271157167530224,
65
- "grad_norm": 4.757556438446045,
66
- "learning_rate": 1.9706579727326615e-06,
67
- "loss": 12.6321,
68
- "mean_token_accuracy": 0.1691790708899498,
69
- "num_tokens": 314059.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 4.257266030311585,
74
  "epoch": 0.20149683362118595,
75
- "grad_norm": 6.406249523162842,
76
- "learning_rate": 1.955838767042086e-06,
77
- "loss": 12.2253,
78
- "mean_token_accuracy": 0.17223650276660918,
79
- "num_tokens": 367038.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 4.694105777740479,
84
  "epoch": 0.23028209556706966,
85
- "grad_norm": 12.57987117767334,
86
- "learning_rate": 1.9410195613515113e-06,
87
- "loss": 11.9714,
88
- "mean_token_accuracy": 0.15997304677963256,
89
- "num_tokens": 420327.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 5.205010280609131,
94
  "epoch": 0.25906735751295334,
95
- "grad_norm": 15.570313453674316,
96
- "learning_rate": 1.9262003556609364e-06,
97
- "loss": 10.8173,
98
- "mean_token_accuracy": 0.16447648257017136,
99
- "num_tokens": 472429.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 5.917805089950561,
104
  "epoch": 0.28785261945883706,
105
- "grad_norm": 23.61503791809082,
106
- "learning_rate": 1.9113811499703615e-06,
107
- "loss": 9.3196,
108
- "mean_token_accuracy": 0.16179455041885377,
109
- "num_tokens": 526315.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 6.380368332862854,
114
  "epoch": 0.31663788140472077,
115
- "grad_norm": 13.846810340881348,
116
- "learning_rate": 1.8965619442797864e-06,
117
- "loss": 7.9636,
118
- "mean_token_accuracy": 0.16881170988082886,
119
- "num_tokens": 578511.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 6.507339992523193,
124
  "epoch": 0.3454231433506045,
125
- "grad_norm": 4.569090366363525,
126
- "learning_rate": 1.8817427385892115e-06,
127
- "loss": 7.4171,
128
- "mean_token_accuracy": 0.16941152423620223,
129
- "num_tokens": 630937.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 6.392864561080932,
134
  "epoch": 0.3742084052964882,
135
- "grad_norm": 4.594696521759033,
136
- "learning_rate": 1.8669235328986366e-06,
137
- "loss": 6.9389,
138
- "mean_token_accuracy": 0.1844496901333332,
139
- "num_tokens": 680501.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 6.6726202869415285,
144
  "epoch": 0.4029936672423719,
145
- "grad_norm": 4.768734931945801,
146
- "learning_rate": 1.8521043272080617e-06,
147
- "loss": 6.9818,
148
- "mean_token_accuracy": 0.16990411713719367,
149
- "num_tokens": 733231.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 6.592793455123902,
154
  "epoch": 0.4317789291882556,
155
- "grad_norm": 3.253056764602661,
156
- "learning_rate": 1.8372851215174864e-06,
157
- "loss": 6.7105,
158
- "mean_token_accuracy": 0.18250102579593658,
159
- "num_tokens": 785373.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 6.683582029342651,
164
  "epoch": 0.4605641911341393,
165
- "grad_norm": 2.1871063709259033,
166
- "learning_rate": 1.8224659158269115e-06,
167
- "loss": 6.6685,
168
- "mean_token_accuracy": 0.17129646152257919,
169
- "num_tokens": 838646.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 6.636875295639038,
174
  "epoch": 0.48934945308002303,
175
- "grad_norm": 3.2284677028656006,
176
- "learning_rate": 1.8076467101363366e-06,
177
- "loss": 6.53,
178
- "mean_token_accuracy": 0.18053789794445038,
179
- "num_tokens": 892380.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 6.610673260688782,
184
  "epoch": 0.5181347150259067,
185
- "grad_norm": 2.2088730335235596,
186
- "learning_rate": 1.7928275044457617e-06,
187
- "loss": 6.4429,
188
- "mean_token_accuracy": 0.18492739230394364,
189
- "num_tokens": 947971.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 6.242899022102356,
194
  "epoch": 0.5469199769717904,
195
- "grad_norm": 2.3000030517578125,
196
- "learning_rate": 1.7780082987551866e-06,
197
- "loss": 6.047,
198
- "mean_token_accuracy": 0.2291259828209877,
199
- "num_tokens": 998810.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 6.311488924026489,
204
  "epoch": 0.5757052389176741,
205
- "grad_norm": 2.1333675384521484,
206
- "learning_rate": 1.7631890930646115e-06,
207
- "loss": 6.0919,
208
- "mean_token_accuracy": 0.22644571751356124,
209
- "num_tokens": 1050860.0,
210
  "step": 1000
211
  },
212
  {
213
- "entropy": 6.3254336166381835,
214
  "epoch": 0.6044905008635578,
215
- "grad_norm": 2.0400779247283936,
216
- "learning_rate": 1.7483698873740366e-06,
217
- "loss": 6.094,
218
- "mean_token_accuracy": 0.2222653564810753,
219
- "num_tokens": 1104304.0,
220
  "step": 1050
221
  },
222
  {
223
- "entropy": 6.046922063827514,
224
  "epoch": 0.6332757628094415,
225
- "grad_norm": 2.8049051761627197,
226
- "learning_rate": 1.7335506816834617e-06,
227
- "loss": 5.8011,
228
- "mean_token_accuracy": 0.25127078920602797,
229
- "num_tokens": 1153605.0,
230
  "step": 1100
231
  },
232
  {
233
- "entropy": 5.943600912094116,
234
  "epoch": 0.6620610247553252,
235
- "grad_norm": 4.063963890075684,
236
- "learning_rate": 1.7187314759928866e-06,
237
- "loss": 5.6855,
238
- "mean_token_accuracy": 0.26265266716480257,
239
- "num_tokens": 1204328.0,
240
  "step": 1150
241
  },
242
  {
243
- "entropy": 6.12883231639862,
244
  "epoch": 0.690846286701209,
245
- "grad_norm": 3.9440460205078125,
246
- "learning_rate": 1.7039122703023117e-06,
247
- "loss": 5.8578,
248
- "mean_token_accuracy": 0.24439335912466048,
249
- "num_tokens": 1257415.0,
250
  "step": 1200
251
  },
252
  {
253
- "entropy": 6.164987115859986,
254
  "epoch": 0.7196315486470927,
255
- "grad_norm": 3.20070481300354,
256
- "learning_rate": 1.6890930646117368e-06,
257
- "loss": 5.8876,
258
- "mean_token_accuracy": 0.24275501281023026,
259
- "num_tokens": 1310049.0,
260
  "step": 1250
261
  },
262
  {
263
- "entropy": 6.080997190475464,
264
  "epoch": 0.7484168105929764,
265
- "grad_norm": 2.8067362308502197,
266
- "learning_rate": 1.6742738589211617e-06,
267
- "loss": 5.8058,
268
- "mean_token_accuracy": 0.25242207854986193,
269
- "num_tokens": 1361794.0,
270
  "step": 1300
271
  },
272
  {
273
- "entropy": 5.940848155021667,
274
  "epoch": 0.7772020725388601,
275
- "grad_norm": 2.6375925540924072,
276
- "learning_rate": 1.6594546532305868e-06,
277
- "loss": 5.6718,
278
- "mean_token_accuracy": 0.2665082859992981,
279
- "num_tokens": 1412773.0,
280
  "step": 1350
281
  },
282
  {
283
- "entropy": 6.071129274368286,
284
  "epoch": 0.8059873344847438,
285
- "grad_norm": 3.951350212097168,
286
- "learning_rate": 1.6446354475400117e-06,
287
- "loss": 5.8012,
288
- "mean_token_accuracy": 0.25434976994991304,
289
- "num_tokens": 1465620.0,
290
  "step": 1400
291
  },
292
  {
293
- "entropy": 6.069429359436035,
294
  "epoch": 0.8347725964306275,
295
- "grad_norm": 3.580608606338501,
296
- "learning_rate": 1.6298162418494368e-06,
297
- "loss": 5.8027,
298
- "mean_token_accuracy": 0.25208072274923327,
299
- "num_tokens": 1518899.0,
300
  "step": 1450
301
  },
302
  {
303
- "entropy": 6.005315380096436,
304
  "epoch": 0.8635578583765112,
305
- "grad_norm": 3.9580376148223877,
306
- "learning_rate": 1.614997036158862e-06,
307
- "loss": 5.7364,
308
- "mean_token_accuracy": 0.25940640360116957,
309
- "num_tokens": 1571304.0,
310
  "step": 1500
311
  },
312
  {
313
- "entropy": 6.0786464881896975,
314
  "epoch": 0.8923431203223949,
315
- "grad_norm": 4.55721378326416,
316
- "learning_rate": 1.6001778304682868e-06,
317
- "loss": 5.8092,
318
- "mean_token_accuracy": 0.2496869170665741,
319
- "num_tokens": 1627369.0,
320
  "step": 1550
321
  },
322
  {
323
- "entropy": 5.939382014274597,
324
  "epoch": 0.9211283822682786,
325
- "grad_norm": 2.330057144165039,
326
- "learning_rate": 1.5853586247777117e-06,
327
- "loss": 5.6604,
328
- "mean_token_accuracy": 0.2686630353331566,
329
- "num_tokens": 1680401.0,
330
  "step": 1600
331
  },
332
  {
333
- "entropy": 6.121775646209716,
334
  "epoch": 0.9499136442141624,
335
- "grad_norm": 2.9881200790405273,
336
- "learning_rate": 1.5705394190871368e-06,
337
- "loss": 5.8388,
338
- "mean_token_accuracy": 0.2503683388233185,
339
- "num_tokens": 1735745.0,
340
  "step": 1650
341
  },
342
  {
343
- "entropy": 5.840040788650513,
344
  "epoch": 0.9786989061600461,
345
- "grad_norm": 3.798994779586792,
346
- "learning_rate": 1.555720213396562e-06,
347
- "loss": 5.5635,
348
- "mean_token_accuracy": 0.278279125392437,
349
- "num_tokens": 1786896.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
- "eval_entropy": 6.139133475343203,
355
- "eval_loss": 5.861395835876465,
356
- "eval_mean_token_accuracy": 0.2402858340657801,
357
- "eval_model_preparation_time": 0.0047,
358
- "eval_num_tokens": 1825107.0,
359
- "eval_runtime": 79.3994,
360
- "eval_samples_per_second": 5.466,
361
- "eval_steps_per_second": 2.733,
362
  "step": 1737
363
  }
364
  ],
365
  "logging_steps": 50,
366
- "max_steps": 6948,
367
  "num_input_tokens_seen": 0,
368
- "num_train_epochs": 4,
369
  "save_steps": 500,
370
  "stateful_callbacks": {
371
  "TrainerControl": {
@@ -379,7 +379,7 @@
379
  "attributes": {}
380
  }
381
  },
382
- "total_flos": 2.5090142668416e+16,
383
  "train_batch_size": 2,
384
  "trial_name": null,
385
  "trial_params": null
 
1
  {
2
  "best_global_step": 1737,
3
+ "best_metric": 5.737204551696777,
4
  "best_model_checkpoint": "./output/checkpoint-1737",
5
  "epoch": 1.0,
6
  "eval_steps": 500,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.606692385673523,
14
  "epoch": 0.028785261945883708,
15
+ "grad_norm": 3.2999913692474365,
16
  "learning_rate": 4.9e-07,
17
+ "loss": 13.6598,
18
+ "mean_token_accuracy": 0.16028020828962325,
19
+ "num_tokens": 53993.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 3.618675880432129,
24
  "epoch": 0.057570523891767415,
25
+ "grad_norm": 3.101252555847168,
26
  "learning_rate": 9.9e-07,
27
+ "loss": 14.0188,
28
+ "mean_token_accuracy": 0.1508466500043869,
29
+ "num_tokens": 110134.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 3.5215235900878907,
34
  "epoch": 0.08635578583765112,
35
+ "grad_norm": 3.513662815093994,
36
  "learning_rate": 1.49e-06,
37
+ "loss": 12.8555,
38
+ "mean_token_accuracy": 0.18527640983462335,
39
+ "num_tokens": 160191.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 3.667909698486328,
44
  "epoch": 0.11514104778353483,
45
+ "grad_norm": 4.327610492706299,
46
  "learning_rate": 1.99e-06,
47
+ "loss": 13.5394,
48
+ "mean_token_accuracy": 0.157139780074358,
49
+ "num_tokens": 214993.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 3.768263258934021,
54
  "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.290107250213623,
56
+ "learning_rate": 1.988450206246317e-06,
57
+ "loss": 12.8912,
58
+ "mean_token_accuracy": 0.17374794125556947,
59
+ "num_tokens": 268184.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 3.990619196891785,
64
  "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.444278717041016,
66
+ "learning_rate": 1.976664702416028e-06,
67
+ "loss": 12.455,
68
+ "mean_token_accuracy": 0.17780130118131637,
69
+ "num_tokens": 319458.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 4.162646284103394,
74
  "epoch": 0.20149683362118595,
75
+ "grad_norm": 5.615262508392334,
76
+ "learning_rate": 1.9648791985857395e-06,
77
+ "loss": 12.0893,
78
+ "mean_token_accuracy": 0.18191319867968558,
79
+ "num_tokens": 373337.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 4.532100868225098,
84
  "epoch": 0.23028209556706966,
85
+ "grad_norm": 10.074016571044922,
86
+ "learning_rate": 1.9530936947554507e-06,
87
+ "loss": 11.9261,
88
+ "mean_token_accuracy": 0.169477596282959,
89
+ "num_tokens": 427526.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 4.923871030807495,
94
  "epoch": 0.25906735751295334,
95
+ "grad_norm": 16.220163345336914,
96
+ "learning_rate": 1.9413081909251622e-06,
97
+ "loss": 11.0048,
98
+ "mean_token_accuracy": 0.1704501649737358,
99
+ "num_tokens": 480528.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 5.521005854606629,
104
  "epoch": 0.28785261945883706,
105
+ "grad_norm": 29.904008865356445,
106
+ "learning_rate": 1.9295226870948733e-06,
107
+ "loss": 9.6524,
108
+ "mean_token_accuracy": 0.16450899541378022,
109
+ "num_tokens": 535314.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 6.092623329162597,
114
  "epoch": 0.31663788140472077,
115
+ "grad_norm": 17.821575164794922,
116
+ "learning_rate": 1.9177371832645845e-06,
117
+ "loss": 8.1054,
118
+ "mean_token_accuracy": 0.17205011785030366,
119
+ "num_tokens": 588410.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 6.385262680053711,
124
  "epoch": 0.3454231433506045,
125
+ "grad_norm": 5.502202987670898,
126
+ "learning_rate": 1.9059516794342958e-06,
127
+ "loss": 7.4313,
128
+ "mean_token_accuracy": 0.1734227080643177,
129
+ "num_tokens": 641736.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 6.278562617301941,
134
  "epoch": 0.3742084052964882,
135
+ "grad_norm": 5.4657697677612305,
136
+ "learning_rate": 1.8941661756040071e-06,
137
+ "loss": 6.9266,
138
+ "mean_token_accuracy": 0.18680249139666558,
139
+ "num_tokens": 692200.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 6.553266277313233,
144
  "epoch": 0.4029936672423719,
145
+ "grad_norm": 4.955812931060791,
146
+ "learning_rate": 1.8823806717737183e-06,
147
+ "loss": 6.9847,
148
+ "mean_token_accuracy": 0.16679802387952805,
149
+ "num_tokens": 745830.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 6.470935583114624,
154
  "epoch": 0.4317789291882556,
155
+ "grad_norm": 4.198381423950195,
156
+ "learning_rate": 1.8705951679434296e-06,
157
+ "loss": 6.7277,
158
+ "mean_token_accuracy": 0.17847734570503235,
159
+ "num_tokens": 798872.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 6.5620588779449465,
164
  "epoch": 0.4605641911341393,
165
+ "grad_norm": 3.1793746948242188,
166
+ "learning_rate": 1.8588096641131407e-06,
167
+ "loss": 6.7032,
168
+ "mean_token_accuracy": 0.17336134731769562,
169
+ "num_tokens": 853045.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 6.532204885482788,
174
  "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.824537515640259,
176
+ "learning_rate": 1.847024160282852e-06,
177
+ "loss": 6.5762,
178
+ "mean_token_accuracy": 0.1805124071240425,
179
+ "num_tokens": 907679.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 6.535988225936889,
184
  "epoch": 0.5181347150259067,
185
+ "grad_norm": 4.350001811981201,
186
+ "learning_rate": 1.8352386564525632e-06,
187
+ "loss": 6.505,
188
+ "mean_token_accuracy": 0.1842605724930763,
189
+ "num_tokens": 964170.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 6.204533562660218,
194
  "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.193660020828247,
196
+ "learning_rate": 1.8234531526222745e-06,
197
+ "loss": 6.1211,
198
+ "mean_token_accuracy": 0.21968430042266845,
199
+ "num_tokens": 1015909.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 6.308737449645996,
204
  "epoch": 0.5757052389176741,
205
+ "grad_norm": 2.325622320175171,
206
+ "learning_rate": 1.8116676487919857e-06,
207
+ "loss": 6.1653,
208
+ "mean_token_accuracy": 0.21636426240205764,
209
+ "num_tokens": 1068859.0,
210
  "step": 1000
211
  },
212
  {
213
+ "entropy": 6.332560749053955,
214
  "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.0439090728759766,
216
+ "learning_rate": 1.799882144961697e-06,
217
+ "loss": 6.1559,
218
+ "mean_token_accuracy": 0.21859725564718246,
219
+ "num_tokens": 1123202.0,
220
  "step": 1050
221
  },
222
  {
223
+ "entropy": 6.042124252319336,
224
  "epoch": 0.6332757628094415,
225
+ "grad_norm": 3.621903657913208,
226
+ "learning_rate": 1.7880966411314081e-06,
227
+ "loss": 5.8441,
228
+ "mean_token_accuracy": 0.24906315237283708,
229
+ "num_tokens": 1173403.0,
230
  "step": 1100
231
  },
232
  {
233
+ "entropy": 5.921343173980713,
234
  "epoch": 0.6620610247553252,
235
+ "grad_norm": 5.658033847808838,
236
+ "learning_rate": 1.7763111373011195e-06,
237
+ "loss": 5.7104,
238
+ "mean_token_accuracy": 0.2625067520141602,
239
+ "num_tokens": 1225026.0,
240
  "step": 1150
241
  },
242
  {
243
+ "entropy": 6.093586492538452,
244
  "epoch": 0.690846286701209,
245
+ "grad_norm": 2.4292995929718018,
246
+ "learning_rate": 1.7645256334708308e-06,
247
+ "loss": 5.8658,
248
+ "mean_token_accuracy": 0.24842385441064835,
249
+ "num_tokens": 1279013.0,
250
  "step": 1200
251
  },
252
  {
253
+ "entropy": 6.119112596511841,
254
  "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.369384288787842,
256
+ "learning_rate": 1.752740129640542e-06,
257
+ "loss": 5.8784,
258
+ "mean_token_accuracy": 0.24857850253582,
259
+ "num_tokens": 1332547.0,
260
  "step": 1250
261
  },
262
  {
263
+ "entropy": 6.025163550376892,
264
  "epoch": 0.7484168105929764,
265
+ "grad_norm": 2.5110116004943848,
266
+ "learning_rate": 1.7409546258102533e-06,
267
+ "loss": 5.7769,
268
+ "mean_token_accuracy": 0.25835376888513567,
269
+ "num_tokens": 1385192.0,
270
  "step": 1300
271
  },
272
  {
273
+ "entropy": 5.877259612083435,
274
  "epoch": 0.7772020725388601,
275
+ "grad_norm": 2.4179303646087646,
276
+ "learning_rate": 1.7291691219799646e-06,
277
+ "loss": 5.6284,
278
+ "mean_token_accuracy": 0.2756252554059029,
279
+ "num_tokens": 1437071.0,
280
  "step": 1350
281
  },
282
  {
283
+ "entropy": 6.002246947288513,
284
  "epoch": 0.8059873344847438,
285
+ "grad_norm": 3.494359016418457,
286
+ "learning_rate": 1.717383618149676e-06,
287
+ "loss": 5.747,
288
+ "mean_token_accuracy": 0.26462210685014725,
289
+ "num_tokens": 1490818.0,
290
  "step": 1400
291
  },
292
  {
293
+ "entropy": 5.991955623626709,
294
  "epoch": 0.8347725964306275,
295
+ "grad_norm": 2.340975761413574,
296
+ "learning_rate": 1.705598114319387e-06,
297
+ "loss": 5.7379,
298
+ "mean_token_accuracy": 0.26444981098175047,
299
+ "num_tokens": 1544997.0,
300
  "step": 1450
301
  },
302
  {
303
+ "entropy": 5.91768889427185,
304
  "epoch": 0.8635578583765112,
305
+ "grad_norm": 2.2394514083862305,
306
+ "learning_rate": 1.6938126104890984e-06,
307
+ "loss": 5.6564,
308
+ "mean_token_accuracy": 0.2730415526032448,
309
+ "num_tokens": 1598302.0,
310
  "step": 1500
311
  },
312
  {
313
+ "entropy": 5.982716989517212,
314
  "epoch": 0.8923431203223949,
315
+ "grad_norm": 1.876839518547058,
316
+ "learning_rate": 1.6820271066588098e-06,
317
+ "loss": 5.7215,
318
+ "mean_token_accuracy": 0.26642445534467696,
319
+ "num_tokens": 1655267.0,
320
  "step": 1550
321
  },
322
  {
323
+ "entropy": 5.820467872619629,
324
  "epoch": 0.9211283822682786,
325
+ "grad_norm": 2.219966173171997,
326
+ "learning_rate": 1.6702416028285209e-06,
327
+ "loss": 5.5555,
328
+ "mean_token_accuracy": 0.2856418335437775,
329
+ "num_tokens": 1709199.0,
330
  "step": 1600
331
  },
332
  {
333
+ "entropy": 5.996349005699158,
334
  "epoch": 0.9499136442141624,
335
+ "grad_norm": 2.247213840484619,
336
+ "learning_rate": 1.6584560989982322e-06,
337
+ "loss": 5.7283,
338
+ "mean_token_accuracy": 0.2696125540137291,
339
+ "num_tokens": 1765443.0,
340
  "step": 1650
341
  },
342
  {
343
+ "entropy": 5.696683068275451,
344
  "epoch": 0.9786989061600461,
345
+ "grad_norm": 2.8499979972839355,
346
+ "learning_rate": 1.6466705951679433e-06,
347
+ "loss": 5.4335,
348
+ "mean_token_accuracy": 0.29918427973985673,
349
+ "num_tokens": 1817494.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
+ "eval_entropy": 5.993559589034401,
355
+ "eval_loss": 5.737204551696777,
356
+ "eval_mean_token_accuracy": 0.2618687468739699,
357
+ "eval_model_preparation_time": 0.0045,
358
+ "eval_num_tokens": 1856362.0,
359
+ "eval_runtime": 50.5332,
360
+ "eval_samples_per_second": 8.588,
361
+ "eval_steps_per_second": 4.294,
362
  "step": 1737
363
  }
364
  ],
365
  "logging_steps": 50,
366
+ "max_steps": 8685,
367
  "num_input_tokens_seen": 0,
368
+ "num_train_epochs": 5,
369
  "save_steps": 500,
370
  "stateful_callbacks": {
371
  "TrainerControl": {
 
379
  "attributes": {}
380
  }
381
  },
382
+ "total_flos": 2.546183130710016e+16,
383
  "train_batch_size": 2,
384
  "trial_name": null,
385
  "trial_params": null
checkpoint-1737/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8db5c304963110404ebb6947b83ba95bd9b8aad1f9b8b578cc33c46d601e13dc
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a666397e6243ddba6f7279c90610ed552907ef4de0be511faece3826d13e618
3
  size 6225
checkpoint-3474/adapter_config.json CHANGED
@@ -29,10 +29,10 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
33
  "k_proj",
34
  "v_proj",
35
- "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "o_proj",
33
  "k_proj",
34
  "v_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-3474/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a64c44cbe26eb26de9c868554476ac772a1101223d4511df741d375932e915d3
3
  size 26182176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df1785d18603767be800e1f55b15fb6ca91bfb92d13b41c606ed125306990ad0
3
  size 26182176
checkpoint-3474/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:678993601594a7d04e501306f05a8d5de7ef3edaadbed87bc8a64e6f10f97582
3
  size 52486155
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:351f85f1592228a689e3684e2c19ad9a864153045233b07024283e9a19837ffc
3
  size 52486155
checkpoint-3474/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:388ebf23a81b449689f35e6de23bc7bbc9587bef795c318be18b9ce6620ad7a4
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62a6bc0e3b9744793642655fb166aa4f5a9fb6952bc69941d500e104fd082ebd
3
  size 14645
checkpoint-3474/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d0eb619e824095911c3281fa938e4204802f0a5951fcaf56996a5bc063db576
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c06cf7ae7d08ea484734082c3c87adfab434e5715a578ee4ab7e0ffcbea54c0b
3
  size 1465
checkpoint-3474/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 3474,
3
- "best_metric": 5.656307220458984,
4
  "best_model_checkpoint": "./output/checkpoint-3474",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
@@ -10,724 +10,724 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 3.6583470726013183,
14
  "epoch": 0.028785261945883708,
15
- "grad_norm": 3.3817152976989746,
16
  "learning_rate": 4.9e-07,
17
- "loss": 13.8754,
18
- "mean_token_accuracy": 0.15036460414528846,
19
- "num_tokens": 53093.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 3.669608063697815,
24
  "epoch": 0.057570523891767415,
25
- "grad_norm": 3.2541544437408447,
26
  "learning_rate": 9.9e-07,
27
- "loss": 14.2282,
28
- "mean_token_accuracy": 0.14137721598148345,
29
- "num_tokens": 108334.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 3.569736371040344,
34
  "epoch": 0.08635578583765112,
35
- "grad_norm": 3.6797454357147217,
36
  "learning_rate": 1.49e-06,
37
- "loss": 13.0735,
38
- "mean_token_accuracy": 0.17473630651831626,
39
- "num_tokens": 157491.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 3.7253233194351196,
44
  "epoch": 0.11514104778353483,
45
- "grad_norm": 4.297911643981934,
46
  "learning_rate": 1.99e-06,
47
- "loss": 13.7392,
48
- "mean_token_accuracy": 0.1473099772632122,
49
- "num_tokens": 211394.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 3.8280500602722167,
54
  "epoch": 0.14392630972941853,
55
- "grad_norm": 4.405268669128418,
56
- "learning_rate": 1.9854771784232364e-06,
57
- "loss": 13.0797,
58
- "mean_token_accuracy": 0.16704789966344832,
59
- "num_tokens": 263685.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 4.066333084106446,
64
  "epoch": 0.17271157167530224,
65
- "grad_norm": 4.757556438446045,
66
- "learning_rate": 1.9706579727326615e-06,
67
- "loss": 12.6321,
68
- "mean_token_accuracy": 0.1691790708899498,
69
- "num_tokens": 314059.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 4.257266030311585,
74
  "epoch": 0.20149683362118595,
75
- "grad_norm": 6.406249523162842,
76
- "learning_rate": 1.955838767042086e-06,
77
- "loss": 12.2253,
78
- "mean_token_accuracy": 0.17223650276660918,
79
- "num_tokens": 367038.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 4.694105777740479,
84
  "epoch": 0.23028209556706966,
85
- "grad_norm": 12.57987117767334,
86
- "learning_rate": 1.9410195613515113e-06,
87
- "loss": 11.9714,
88
- "mean_token_accuracy": 0.15997304677963256,
89
- "num_tokens": 420327.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 5.205010280609131,
94
  "epoch": 0.25906735751295334,
95
- "grad_norm": 15.570313453674316,
96
- "learning_rate": 1.9262003556609364e-06,
97
- "loss": 10.8173,
98
- "mean_token_accuracy": 0.16447648257017136,
99
- "num_tokens": 472429.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 5.917805089950561,
104
  "epoch": 0.28785261945883706,
105
- "grad_norm": 23.61503791809082,
106
- "learning_rate": 1.9113811499703615e-06,
107
- "loss": 9.3196,
108
- "mean_token_accuracy": 0.16179455041885377,
109
- "num_tokens": 526315.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 6.380368332862854,
114
  "epoch": 0.31663788140472077,
115
- "grad_norm": 13.846810340881348,
116
- "learning_rate": 1.8965619442797864e-06,
117
- "loss": 7.9636,
118
- "mean_token_accuracy": 0.16881170988082886,
119
- "num_tokens": 578511.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 6.507339992523193,
124
  "epoch": 0.3454231433506045,
125
- "grad_norm": 4.569090366363525,
126
- "learning_rate": 1.8817427385892115e-06,
127
- "loss": 7.4171,
128
- "mean_token_accuracy": 0.16941152423620223,
129
- "num_tokens": 630937.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 6.392864561080932,
134
  "epoch": 0.3742084052964882,
135
- "grad_norm": 4.594696521759033,
136
- "learning_rate": 1.8669235328986366e-06,
137
- "loss": 6.9389,
138
- "mean_token_accuracy": 0.1844496901333332,
139
- "num_tokens": 680501.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 6.6726202869415285,
144
  "epoch": 0.4029936672423719,
145
- "grad_norm": 4.768734931945801,
146
- "learning_rate": 1.8521043272080617e-06,
147
- "loss": 6.9818,
148
- "mean_token_accuracy": 0.16990411713719367,
149
- "num_tokens": 733231.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 6.592793455123902,
154
  "epoch": 0.4317789291882556,
155
- "grad_norm": 3.253056764602661,
156
- "learning_rate": 1.8372851215174864e-06,
157
- "loss": 6.7105,
158
- "mean_token_accuracy": 0.18250102579593658,
159
- "num_tokens": 785373.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 6.683582029342651,
164
  "epoch": 0.4605641911341393,
165
- "grad_norm": 2.1871063709259033,
166
- "learning_rate": 1.8224659158269115e-06,
167
- "loss": 6.6685,
168
- "mean_token_accuracy": 0.17129646152257919,
169
- "num_tokens": 838646.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 6.636875295639038,
174
  "epoch": 0.48934945308002303,
175
- "grad_norm": 3.2284677028656006,
176
- "learning_rate": 1.8076467101363366e-06,
177
- "loss": 6.53,
178
- "mean_token_accuracy": 0.18053789794445038,
179
- "num_tokens": 892380.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 6.610673260688782,
184
  "epoch": 0.5181347150259067,
185
- "grad_norm": 2.2088730335235596,
186
- "learning_rate": 1.7928275044457617e-06,
187
- "loss": 6.4429,
188
- "mean_token_accuracy": 0.18492739230394364,
189
- "num_tokens": 947971.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 6.242899022102356,
194
  "epoch": 0.5469199769717904,
195
- "grad_norm": 2.3000030517578125,
196
- "learning_rate": 1.7780082987551866e-06,
197
- "loss": 6.047,
198
- "mean_token_accuracy": 0.2291259828209877,
199
- "num_tokens": 998810.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 6.311488924026489,
204
  "epoch": 0.5757052389176741,
205
- "grad_norm": 2.1333675384521484,
206
- "learning_rate": 1.7631890930646115e-06,
207
- "loss": 6.0919,
208
- "mean_token_accuracy": 0.22644571751356124,
209
- "num_tokens": 1050860.0,
210
  "step": 1000
211
  },
212
  {
213
- "entropy": 6.3254336166381835,
214
  "epoch": 0.6044905008635578,
215
- "grad_norm": 2.0400779247283936,
216
- "learning_rate": 1.7483698873740366e-06,
217
- "loss": 6.094,
218
- "mean_token_accuracy": 0.2222653564810753,
219
- "num_tokens": 1104304.0,
220
  "step": 1050
221
  },
222
  {
223
- "entropy": 6.046922063827514,
224
  "epoch": 0.6332757628094415,
225
- "grad_norm": 2.8049051761627197,
226
- "learning_rate": 1.7335506816834617e-06,
227
- "loss": 5.8011,
228
- "mean_token_accuracy": 0.25127078920602797,
229
- "num_tokens": 1153605.0,
230
  "step": 1100
231
  },
232
  {
233
- "entropy": 5.943600912094116,
234
  "epoch": 0.6620610247553252,
235
- "grad_norm": 4.063963890075684,
236
- "learning_rate": 1.7187314759928866e-06,
237
- "loss": 5.6855,
238
- "mean_token_accuracy": 0.26265266716480257,
239
- "num_tokens": 1204328.0,
240
  "step": 1150
241
  },
242
  {
243
- "entropy": 6.12883231639862,
244
  "epoch": 0.690846286701209,
245
- "grad_norm": 3.9440460205078125,
246
- "learning_rate": 1.7039122703023117e-06,
247
- "loss": 5.8578,
248
- "mean_token_accuracy": 0.24439335912466048,
249
- "num_tokens": 1257415.0,
250
  "step": 1200
251
  },
252
  {
253
- "entropy": 6.164987115859986,
254
  "epoch": 0.7196315486470927,
255
- "grad_norm": 3.20070481300354,
256
- "learning_rate": 1.6890930646117368e-06,
257
- "loss": 5.8876,
258
- "mean_token_accuracy": 0.24275501281023026,
259
- "num_tokens": 1310049.0,
260
  "step": 1250
261
  },
262
  {
263
- "entropy": 6.080997190475464,
264
  "epoch": 0.7484168105929764,
265
- "grad_norm": 2.8067362308502197,
266
- "learning_rate": 1.6742738589211617e-06,
267
- "loss": 5.8058,
268
- "mean_token_accuracy": 0.25242207854986193,
269
- "num_tokens": 1361794.0,
270
  "step": 1300
271
  },
272
  {
273
- "entropy": 5.940848155021667,
274
  "epoch": 0.7772020725388601,
275
- "grad_norm": 2.6375925540924072,
276
- "learning_rate": 1.6594546532305868e-06,
277
- "loss": 5.6718,
278
- "mean_token_accuracy": 0.2665082859992981,
279
- "num_tokens": 1412773.0,
280
  "step": 1350
281
  },
282
  {
283
- "entropy": 6.071129274368286,
284
  "epoch": 0.8059873344847438,
285
- "grad_norm": 3.951350212097168,
286
- "learning_rate": 1.6446354475400117e-06,
287
- "loss": 5.8012,
288
- "mean_token_accuracy": 0.25434976994991304,
289
- "num_tokens": 1465620.0,
290
  "step": 1400
291
  },
292
  {
293
- "entropy": 6.069429359436035,
294
  "epoch": 0.8347725964306275,
295
- "grad_norm": 3.580608606338501,
296
- "learning_rate": 1.6298162418494368e-06,
297
- "loss": 5.8027,
298
- "mean_token_accuracy": 0.25208072274923327,
299
- "num_tokens": 1518899.0,
300
  "step": 1450
301
  },
302
  {
303
- "entropy": 6.005315380096436,
304
  "epoch": 0.8635578583765112,
305
- "grad_norm": 3.9580376148223877,
306
- "learning_rate": 1.614997036158862e-06,
307
- "loss": 5.7364,
308
- "mean_token_accuracy": 0.25940640360116957,
309
- "num_tokens": 1571304.0,
310
  "step": 1500
311
  },
312
  {
313
- "entropy": 6.0786464881896975,
314
  "epoch": 0.8923431203223949,
315
- "grad_norm": 4.55721378326416,
316
- "learning_rate": 1.6001778304682868e-06,
317
- "loss": 5.8092,
318
- "mean_token_accuracy": 0.2496869170665741,
319
- "num_tokens": 1627369.0,
320
  "step": 1550
321
  },
322
  {
323
- "entropy": 5.939382014274597,
324
  "epoch": 0.9211283822682786,
325
- "grad_norm": 2.330057144165039,
326
- "learning_rate": 1.5853586247777117e-06,
327
- "loss": 5.6604,
328
- "mean_token_accuracy": 0.2686630353331566,
329
- "num_tokens": 1680401.0,
330
  "step": 1600
331
  },
332
  {
333
- "entropy": 6.121775646209716,
334
  "epoch": 0.9499136442141624,
335
- "grad_norm": 2.9881200790405273,
336
- "learning_rate": 1.5705394190871368e-06,
337
- "loss": 5.8388,
338
- "mean_token_accuracy": 0.2503683388233185,
339
- "num_tokens": 1735745.0,
340
  "step": 1650
341
  },
342
  {
343
- "entropy": 5.840040788650513,
344
  "epoch": 0.9786989061600461,
345
- "grad_norm": 3.798994779586792,
346
- "learning_rate": 1.555720213396562e-06,
347
- "loss": 5.5635,
348
- "mean_token_accuracy": 0.278279125392437,
349
- "num_tokens": 1786896.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
- "eval_entropy": 6.139133475343203,
355
- "eval_loss": 5.861395835876465,
356
- "eval_mean_token_accuracy": 0.2402858340657801,
357
- "eval_model_preparation_time": 0.0047,
358
- "eval_num_tokens": 1825107.0,
359
- "eval_runtime": 79.3994,
360
- "eval_samples_per_second": 5.466,
361
- "eval_steps_per_second": 2.733,
362
  "step": 1737
363
  },
364
  {
365
- "entropy": 5.8970259666442875,
366
  "epoch": 1.0074841681059297,
367
- "grad_norm": 2.6411802768707275,
368
- "learning_rate": 1.540901007705987e-06,
369
- "loss": 5.614,
370
- "mean_token_accuracy": 0.273006406724453,
371
- "num_tokens": 1838864.0,
372
  "step": 1750
373
  },
374
  {
375
- "entropy": 6.0111794090271,
376
  "epoch": 1.0362694300518134,
377
- "grad_norm": 3.6491827964782715,
378
- "learning_rate": 1.526081802015412e-06,
379
- "loss": 5.7323,
380
- "mean_token_accuracy": 0.26104256987571717,
381
- "num_tokens": 1893816.0,
382
  "step": 1800
383
  },
384
  {
385
- "entropy": 5.902219276428223,
386
  "epoch": 1.065054691997697,
387
- "grad_norm": 2.593249559402466,
388
- "learning_rate": 1.5112625963248368e-06,
389
- "loss": 5.6187,
390
- "mean_token_accuracy": 0.2746362566947937,
391
- "num_tokens": 1946532.0,
392
  "step": 1850
393
  },
394
  {
395
- "entropy": 5.874705944061279,
396
  "epoch": 1.0938399539435808,
397
- "grad_norm": 2.554327964782715,
398
- "learning_rate": 1.496443390634262e-06,
399
- "loss": 5.6021,
400
- "mean_token_accuracy": 0.2795292744040489,
401
- "num_tokens": 2000184.0,
402
  "step": 1900
403
  },
404
  {
405
- "entropy": 5.850096368789673,
406
  "epoch": 1.1226252158894645,
407
- "grad_norm": 3.6060993671417236,
408
- "learning_rate": 1.481624184943687e-06,
409
- "loss": 5.576,
410
- "mean_token_accuracy": 0.28532547056674956,
411
- "num_tokens": 2052250.0,
412
  "step": 1950
413
  },
414
  {
415
- "entropy": 5.802229671478272,
416
  "epoch": 1.1514104778353482,
417
- "grad_norm": 3.0913314819335938,
418
- "learning_rate": 1.466804979253112e-06,
419
- "loss": 5.53,
420
- "mean_token_accuracy": 0.2916027933359146,
421
- "num_tokens": 2103531.0,
422
  "step": 2000
423
  },
424
  {
425
- "entropy": 5.875646467208862,
426
  "epoch": 1.180195739781232,
427
- "grad_norm": 4.777045726776123,
428
- "learning_rate": 1.451985773562537e-06,
429
- "loss": 5.6146,
430
- "mean_token_accuracy": 0.28063644528388976,
431
- "num_tokens": 2157098.0,
432
  "step": 2050
433
  },
434
  {
435
- "entropy": 5.786596937179565,
436
  "epoch": 1.2089810017271156,
437
- "grad_norm": 4.207762718200684,
438
- "learning_rate": 1.437166567871962e-06,
439
- "loss": 5.5417,
440
- "mean_token_accuracy": 0.2870470091700554,
441
- "num_tokens": 2211827.0,
442
  "step": 2100
443
  },
444
  {
445
- "entropy": 5.672234449386597,
446
  "epoch": 1.2377662636729994,
447
- "grad_norm": 2.2771811485290527,
448
- "learning_rate": 1.422347362181387e-06,
449
- "loss": 5.4285,
450
- "mean_token_accuracy": 0.30194485366344453,
451
- "num_tokens": 2262174.0,
452
  "step": 2150
453
  },
454
  {
455
- "entropy": 5.862573285102844,
456
  "epoch": 1.266551525618883,
457
- "grad_norm": 3.3273422718048096,
458
- "learning_rate": 1.4075281564908121e-06,
459
- "loss": 5.6169,
460
- "mean_token_accuracy": 0.278145115673542,
461
- "num_tokens": 2316440.0,
462
  "step": 2200
463
  },
464
  {
465
- "entropy": 5.734760231971741,
466
  "epoch": 1.2953367875647668,
467
- "grad_norm": 3.7049715518951416,
468
- "learning_rate": 1.392708950800237e-06,
469
- "loss": 5.493,
470
- "mean_token_accuracy": 0.2941485676169395,
471
- "num_tokens": 2368468.0,
472
  "step": 2250
473
  },
474
  {
475
- "entropy": 5.665819988250733,
476
  "epoch": 1.3241220495106505,
477
- "grad_norm": 3.572636604309082,
478
- "learning_rate": 1.3778897451096621e-06,
479
- "loss": 5.4352,
480
- "mean_token_accuracy": 0.3003745040297508,
481
- "num_tokens": 2421180.0,
482
  "step": 2300
483
  },
484
  {
485
- "entropy": 5.890115032196045,
486
  "epoch": 1.3529073114565342,
487
- "grad_norm": 2.738203525543213,
488
- "learning_rate": 1.3630705394190872e-06,
489
- "loss": 5.6555,
490
- "mean_token_accuracy": 0.2737997192144394,
491
- "num_tokens": 2476255.0,
492
  "step": 2350
493
  },
494
  {
495
- "entropy": 5.66056040763855,
496
  "epoch": 1.381692573402418,
497
- "grad_norm": 3.1416995525360107,
498
- "learning_rate": 1.3482513337285121e-06,
499
- "loss": 5.4302,
500
- "mean_token_accuracy": 0.3000989046692848,
501
- "num_tokens": 2527674.0,
502
  "step": 2400
503
  },
504
  {
505
- "entropy": 5.861240615844727,
506
  "epoch": 1.4104778353483016,
507
- "grad_norm": 2.7569284439086914,
508
- "learning_rate": 1.333432128037937e-06,
509
- "loss": 5.6304,
510
- "mean_token_accuracy": 0.27707513481378554,
511
- "num_tokens": 2582909.0,
512
  "step": 2450
513
  },
514
  {
515
- "entropy": 5.627686910629272,
516
  "epoch": 1.4392630972941853,
517
- "grad_norm": 1.7750262022018433,
518
- "learning_rate": 1.3186129223473621e-06,
519
- "loss": 5.4058,
520
- "mean_token_accuracy": 0.3019809901714325,
521
- "num_tokens": 2636579.0,
522
  "step": 2500
523
  },
524
  {
525
- "entropy": 5.607026796340943,
526
  "epoch": 1.468048359240069,
527
- "grad_norm": 3.1005160808563232,
528
- "learning_rate": 1.3037937166567872e-06,
529
- "loss": 5.3836,
530
- "mean_token_accuracy": 0.30584611505270004,
531
- "num_tokens": 2687698.0,
532
  "step": 2550
533
  },
534
  {
535
- "entropy": 5.6909641885757445,
536
  "epoch": 1.4968336211859528,
537
- "grad_norm": 1.6848654747009277,
538
- "learning_rate": 1.2889745109662123e-06,
539
- "loss": 5.4653,
540
- "mean_token_accuracy": 0.296178964972496,
541
- "num_tokens": 2740214.0,
542
  "step": 2600
543
  },
544
  {
545
- "entropy": 5.619450302124023,
546
  "epoch": 1.5256188831318365,
547
- "grad_norm": 2.469539165496826,
548
- "learning_rate": 1.274155305275637e-06,
549
- "loss": 5.4022,
550
- "mean_token_accuracy": 0.3039679077267647,
551
- "num_tokens": 2792574.0,
552
  "step": 2650
553
  },
554
  {
555
- "entropy": 5.61073097705841,
556
  "epoch": 1.5544041450777202,
557
- "grad_norm": 2.367810010910034,
558
- "learning_rate": 1.259336099585062e-06,
559
- "loss": 5.3956,
560
- "mean_token_accuracy": 0.3051413372159004,
561
- "num_tokens": 2845597.0,
562
  "step": 2700
563
  },
564
  {
565
- "entropy": 5.5791136837005615,
566
  "epoch": 1.583189407023604,
567
- "grad_norm": 2.3874764442443848,
568
- "learning_rate": 1.2445168938944872e-06,
569
- "loss": 5.3676,
570
- "mean_token_accuracy": 0.3068238252401352,
571
- "num_tokens": 2898683.0,
572
  "step": 2750
573
  },
574
  {
575
- "entropy": 5.735381307601929,
576
  "epoch": 1.6119746689694876,
577
- "grad_norm": 2.2097349166870117,
578
- "learning_rate": 1.2296976882039123e-06,
579
- "loss": 5.5239,
580
- "mean_token_accuracy": 0.28974882304668426,
581
- "num_tokens": 2952290.0,
582
  "step": 2800
583
  },
584
  {
585
- "entropy": 5.55252691745758,
586
  "epoch": 1.6407599309153713,
587
- "grad_norm": 1.694831132888794,
588
- "learning_rate": 1.2148784825133372e-06,
589
- "loss": 5.351,
590
- "mean_token_accuracy": 0.3091904193162918,
591
- "num_tokens": 3004556.0,
592
  "step": 2850
593
  },
594
  {
595
- "entropy": 5.508773093223572,
596
  "epoch": 1.669545192861255,
597
- "grad_norm": 1.8229279518127441,
598
- "learning_rate": 1.200059276822762e-06,
599
- "loss": 5.3164,
600
- "mean_token_accuracy": 0.31158645361661913,
601
- "num_tokens": 3056448.0,
602
  "step": 2900
603
  },
604
  {
605
- "entropy": 5.676794271469117,
606
  "epoch": 1.6983304548071387,
607
- "grad_norm": 1.7196234464645386,
608
- "learning_rate": 1.1852400711321872e-06,
609
- "loss": 5.4776,
610
- "mean_token_accuracy": 0.2929128894209862,
611
- "num_tokens": 3109539.0,
612
  "step": 2950
613
  },
614
  {
615
- "entropy": 5.551529383659362,
616
  "epoch": 1.7271157167530224,
617
- "grad_norm": 3.117525577545166,
618
- "learning_rate": 1.1704208654416123e-06,
619
- "loss": 5.3561,
620
- "mean_token_accuracy": 0.30634030640125276,
621
- "num_tokens": 3162421.0,
622
  "step": 3000
623
  },
624
  {
625
- "entropy": 5.379635264873505,
626
  "epoch": 1.7559009786989062,
627
- "grad_norm": 1.876755714416504,
628
- "learning_rate": 1.1556016597510372e-06,
629
- "loss": 5.1868,
630
- "mean_token_accuracy": 0.32913618892431257,
631
- "num_tokens": 3212079.0,
632
  "step": 3050
633
  },
634
  {
635
- "entropy": 5.538804936408996,
636
  "epoch": 1.7846862406447899,
637
- "grad_norm": 1.8670976161956787,
638
- "learning_rate": 1.1407824540604623e-06,
639
- "loss": 5.3494,
640
- "mean_token_accuracy": 0.30661171555519107,
641
- "num_tokens": 3264089.0,
642
  "step": 3100
643
  },
644
  {
645
- "entropy": 5.258263626098633,
646
  "epoch": 1.8134715025906736,
647
- "grad_norm": 2.748718023300171,
648
- "learning_rate": 1.1259632483698874e-06,
649
- "loss": 5.08,
650
- "mean_token_accuracy": 0.3413010013103485,
651
- "num_tokens": 3311881.0,
652
  "step": 3150
653
  },
654
  {
655
- "entropy": 5.54539008140564,
656
  "epoch": 1.8422567645365573,
657
- "grad_norm": 1.8556406497955322,
658
- "learning_rate": 1.1111440426793123e-06,
659
- "loss": 5.3614,
660
- "mean_token_accuracy": 0.30550685405731204,
661
- "num_tokens": 3364861.0,
662
  "step": 3200
663
  },
664
  {
665
- "entropy": 5.5433073282241825,
666
  "epoch": 1.871042026482441,
667
- "grad_norm": 1.8386749029159546,
668
- "learning_rate": 1.0963248369887374e-06,
669
- "loss": 5.3543,
670
- "mean_token_accuracy": 0.30875524014234546,
671
- "num_tokens": 3415911.0,
672
  "step": 3250
673
  },
674
  {
675
- "entropy": 5.5769769477844235,
676
  "epoch": 1.8998272884283247,
677
- "grad_norm": 1.922486662864685,
678
- "learning_rate": 1.0815056312981623e-06,
679
- "loss": 5.3834,
680
- "mean_token_accuracy": 0.3035113242268562,
681
- "num_tokens": 3468338.0,
682
  "step": 3300
683
  },
684
  {
685
- "entropy": 5.640013842582703,
686
  "epoch": 1.9286125503742084,
687
- "grad_norm": 2.179500102996826,
688
- "learning_rate": 1.0666864256075874e-06,
689
- "loss": 5.4574,
690
- "mean_token_accuracy": 0.2947095710039139,
691
- "num_tokens": 3521693.0,
692
  "step": 3350
693
  },
694
  {
695
- "entropy": 5.506910061836242,
696
  "epoch": 1.9573978123200921,
697
- "grad_norm": 1.4014379978179932,
698
- "learning_rate": 1.0518672199170125e-06,
699
- "loss": 5.3234,
700
- "mean_token_accuracy": 0.3096472260355949,
701
- "num_tokens": 3574206.0,
702
  "step": 3400
703
  },
704
  {
705
- "entropy": 5.607311015129089,
706
  "epoch": 1.9861830742659758,
707
- "grad_norm": 1.41231107711792,
708
- "learning_rate": 1.0370480142264374e-06,
709
- "loss": 5.4226,
710
- "mean_token_accuracy": 0.2979922544956207,
711
- "num_tokens": 3627807.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
- "eval_entropy": 5.831721861790951,
717
- "eval_loss": 5.656307220458984,
718
- "eval_mean_token_accuracy": 0.2641724460685308,
719
- "eval_model_preparation_time": 0.0047,
720
- "eval_num_tokens": 3650214.0,
721
- "eval_runtime": 79.7324,
722
- "eval_samples_per_second": 5.443,
723
- "eval_steps_per_second": 2.722,
724
  "step": 3474
725
  }
726
  ],
727
  "logging_steps": 50,
728
- "max_steps": 6948,
729
  "num_input_tokens_seen": 0,
730
- "num_train_epochs": 4,
731
  "save_steps": 500,
732
  "stateful_callbacks": {
733
  "TrainerControl": {
@@ -741,7 +741,7 @@
741
  "attributes": {}
742
  }
743
  },
744
- "total_flos": 5.014260864635904e+16,
745
  "train_batch_size": 2,
746
  "trial_name": null,
747
  "trial_params": null
 
1
  {
2
  "best_global_step": 3474,
3
+ "best_metric": 5.55628776550293,
4
  "best_model_checkpoint": "./output/checkpoint-3474",
5
  "epoch": 2.0,
6
  "eval_steps": 500,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.606692385673523,
14
  "epoch": 0.028785261945883708,
15
+ "grad_norm": 3.2999913692474365,
16
  "learning_rate": 4.9e-07,
17
+ "loss": 13.6598,
18
+ "mean_token_accuracy": 0.16028020828962325,
19
+ "num_tokens": 53993.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 3.618675880432129,
24
  "epoch": 0.057570523891767415,
25
+ "grad_norm": 3.101252555847168,
26
  "learning_rate": 9.9e-07,
27
+ "loss": 14.0188,
28
+ "mean_token_accuracy": 0.1508466500043869,
29
+ "num_tokens": 110134.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 3.5215235900878907,
34
  "epoch": 0.08635578583765112,
35
+ "grad_norm": 3.513662815093994,
36
  "learning_rate": 1.49e-06,
37
+ "loss": 12.8555,
38
+ "mean_token_accuracy": 0.18527640983462335,
39
+ "num_tokens": 160191.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 3.667909698486328,
44
  "epoch": 0.11514104778353483,
45
+ "grad_norm": 4.327610492706299,
46
  "learning_rate": 1.99e-06,
47
+ "loss": 13.5394,
48
+ "mean_token_accuracy": 0.157139780074358,
49
+ "num_tokens": 214993.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 3.768263258934021,
54
  "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.290107250213623,
56
+ "learning_rate": 1.988450206246317e-06,
57
+ "loss": 12.8912,
58
+ "mean_token_accuracy": 0.17374794125556947,
59
+ "num_tokens": 268184.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 3.990619196891785,
64
  "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.444278717041016,
66
+ "learning_rate": 1.976664702416028e-06,
67
+ "loss": 12.455,
68
+ "mean_token_accuracy": 0.17780130118131637,
69
+ "num_tokens": 319458.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 4.162646284103394,
74
  "epoch": 0.20149683362118595,
75
+ "grad_norm": 5.615262508392334,
76
+ "learning_rate": 1.9648791985857395e-06,
77
+ "loss": 12.0893,
78
+ "mean_token_accuracy": 0.18191319867968558,
79
+ "num_tokens": 373337.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 4.532100868225098,
84
  "epoch": 0.23028209556706966,
85
+ "grad_norm": 10.074016571044922,
86
+ "learning_rate": 1.9530936947554507e-06,
87
+ "loss": 11.9261,
88
+ "mean_token_accuracy": 0.169477596282959,
89
+ "num_tokens": 427526.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 4.923871030807495,
94
  "epoch": 0.25906735751295334,
95
+ "grad_norm": 16.220163345336914,
96
+ "learning_rate": 1.9413081909251622e-06,
97
+ "loss": 11.0048,
98
+ "mean_token_accuracy": 0.1704501649737358,
99
+ "num_tokens": 480528.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 5.521005854606629,
104
  "epoch": 0.28785261945883706,
105
+ "grad_norm": 29.904008865356445,
106
+ "learning_rate": 1.9295226870948733e-06,
107
+ "loss": 9.6524,
108
+ "mean_token_accuracy": 0.16450899541378022,
109
+ "num_tokens": 535314.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 6.092623329162597,
114
  "epoch": 0.31663788140472077,
115
+ "grad_norm": 17.821575164794922,
116
+ "learning_rate": 1.9177371832645845e-06,
117
+ "loss": 8.1054,
118
+ "mean_token_accuracy": 0.17205011785030366,
119
+ "num_tokens": 588410.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 6.385262680053711,
124
  "epoch": 0.3454231433506045,
125
+ "grad_norm": 5.502202987670898,
126
+ "learning_rate": 1.9059516794342958e-06,
127
+ "loss": 7.4313,
128
+ "mean_token_accuracy": 0.1734227080643177,
129
+ "num_tokens": 641736.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 6.278562617301941,
134
  "epoch": 0.3742084052964882,
135
+ "grad_norm": 5.4657697677612305,
136
+ "learning_rate": 1.8941661756040071e-06,
137
+ "loss": 6.9266,
138
+ "mean_token_accuracy": 0.18680249139666558,
139
+ "num_tokens": 692200.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 6.553266277313233,
144
  "epoch": 0.4029936672423719,
145
+ "grad_norm": 4.955812931060791,
146
+ "learning_rate": 1.8823806717737183e-06,
147
+ "loss": 6.9847,
148
+ "mean_token_accuracy": 0.16679802387952805,
149
+ "num_tokens": 745830.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 6.470935583114624,
154
  "epoch": 0.4317789291882556,
155
+ "grad_norm": 4.198381423950195,
156
+ "learning_rate": 1.8705951679434296e-06,
157
+ "loss": 6.7277,
158
+ "mean_token_accuracy": 0.17847734570503235,
159
+ "num_tokens": 798872.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 6.5620588779449465,
164
  "epoch": 0.4605641911341393,
165
+ "grad_norm": 3.1793746948242188,
166
+ "learning_rate": 1.8588096641131407e-06,
167
+ "loss": 6.7032,
168
+ "mean_token_accuracy": 0.17336134731769562,
169
+ "num_tokens": 853045.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 6.532204885482788,
174
  "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.824537515640259,
176
+ "learning_rate": 1.847024160282852e-06,
177
+ "loss": 6.5762,
178
+ "mean_token_accuracy": 0.1805124071240425,
179
+ "num_tokens": 907679.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 6.535988225936889,
184
  "epoch": 0.5181347150259067,
185
+ "grad_norm": 4.350001811981201,
186
+ "learning_rate": 1.8352386564525632e-06,
187
+ "loss": 6.505,
188
+ "mean_token_accuracy": 0.1842605724930763,
189
+ "num_tokens": 964170.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 6.204533562660218,
194
  "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.193660020828247,
196
+ "learning_rate": 1.8234531526222745e-06,
197
+ "loss": 6.1211,
198
+ "mean_token_accuracy": 0.21968430042266845,
199
+ "num_tokens": 1015909.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 6.308737449645996,
204
  "epoch": 0.5757052389176741,
205
+ "grad_norm": 2.325622320175171,
206
+ "learning_rate": 1.8116676487919857e-06,
207
+ "loss": 6.1653,
208
+ "mean_token_accuracy": 0.21636426240205764,
209
+ "num_tokens": 1068859.0,
210
  "step": 1000
211
  },
212
  {
213
+ "entropy": 6.332560749053955,
214
  "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.0439090728759766,
216
+ "learning_rate": 1.799882144961697e-06,
217
+ "loss": 6.1559,
218
+ "mean_token_accuracy": 0.21859725564718246,
219
+ "num_tokens": 1123202.0,
220
  "step": 1050
221
  },
222
  {
223
+ "entropy": 6.042124252319336,
224
  "epoch": 0.6332757628094415,
225
+ "grad_norm": 3.621903657913208,
226
+ "learning_rate": 1.7880966411314081e-06,
227
+ "loss": 5.8441,
228
+ "mean_token_accuracy": 0.24906315237283708,
229
+ "num_tokens": 1173403.0,
230
  "step": 1100
231
  },
232
  {
233
+ "entropy": 5.921343173980713,
234
  "epoch": 0.6620610247553252,
235
+ "grad_norm": 5.658033847808838,
236
+ "learning_rate": 1.7763111373011195e-06,
237
+ "loss": 5.7104,
238
+ "mean_token_accuracy": 0.2625067520141602,
239
+ "num_tokens": 1225026.0,
240
  "step": 1150
241
  },
242
  {
243
+ "entropy": 6.093586492538452,
244
  "epoch": 0.690846286701209,
245
+ "grad_norm": 2.4292995929718018,
246
+ "learning_rate": 1.7645256334708308e-06,
247
+ "loss": 5.8658,
248
+ "mean_token_accuracy": 0.24842385441064835,
249
+ "num_tokens": 1279013.0,
250
  "step": 1200
251
  },
252
  {
253
+ "entropy": 6.119112596511841,
254
  "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.369384288787842,
256
+ "learning_rate": 1.752740129640542e-06,
257
+ "loss": 5.8784,
258
+ "mean_token_accuracy": 0.24857850253582,
259
+ "num_tokens": 1332547.0,
260
  "step": 1250
261
  },
262
  {
263
+ "entropy": 6.025163550376892,
264
  "epoch": 0.7484168105929764,
265
+ "grad_norm": 2.5110116004943848,
266
+ "learning_rate": 1.7409546258102533e-06,
267
+ "loss": 5.7769,
268
+ "mean_token_accuracy": 0.25835376888513567,
269
+ "num_tokens": 1385192.0,
270
  "step": 1300
271
  },
272
  {
273
+ "entropy": 5.877259612083435,
274
  "epoch": 0.7772020725388601,
275
+ "grad_norm": 2.4179303646087646,
276
+ "learning_rate": 1.7291691219799646e-06,
277
+ "loss": 5.6284,
278
+ "mean_token_accuracy": 0.2756252554059029,
279
+ "num_tokens": 1437071.0,
280
  "step": 1350
281
  },
282
  {
283
+ "entropy": 6.002246947288513,
284
  "epoch": 0.8059873344847438,
285
+ "grad_norm": 3.494359016418457,
286
+ "learning_rate": 1.717383618149676e-06,
287
+ "loss": 5.747,
288
+ "mean_token_accuracy": 0.26462210685014725,
289
+ "num_tokens": 1490818.0,
290
  "step": 1400
291
  },
292
  {
293
+ "entropy": 5.991955623626709,
294
  "epoch": 0.8347725964306275,
295
+ "grad_norm": 2.340975761413574,
296
+ "learning_rate": 1.705598114319387e-06,
297
+ "loss": 5.7379,
298
+ "mean_token_accuracy": 0.26444981098175047,
299
+ "num_tokens": 1544997.0,
300
  "step": 1450
301
  },
302
  {
303
+ "entropy": 5.91768889427185,
304
  "epoch": 0.8635578583765112,
305
+ "grad_norm": 2.2394514083862305,
306
+ "learning_rate": 1.6938126104890984e-06,
307
+ "loss": 5.6564,
308
+ "mean_token_accuracy": 0.2730415526032448,
309
+ "num_tokens": 1598302.0,
310
  "step": 1500
311
  },
312
  {
313
+ "entropy": 5.982716989517212,
314
  "epoch": 0.8923431203223949,
315
+ "grad_norm": 1.876839518547058,
316
+ "learning_rate": 1.6820271066588098e-06,
317
+ "loss": 5.7215,
318
+ "mean_token_accuracy": 0.26642445534467696,
319
+ "num_tokens": 1655267.0,
320
  "step": 1550
321
  },
322
  {
323
+ "entropy": 5.820467872619629,
324
  "epoch": 0.9211283822682786,
325
+ "grad_norm": 2.219966173171997,
326
+ "learning_rate": 1.6702416028285209e-06,
327
+ "loss": 5.5555,
328
+ "mean_token_accuracy": 0.2856418335437775,
329
+ "num_tokens": 1709199.0,
330
  "step": 1600
331
  },
332
  {
333
+ "entropy": 5.996349005699158,
334
  "epoch": 0.9499136442141624,
335
+ "grad_norm": 2.247213840484619,
336
+ "learning_rate": 1.6584560989982322e-06,
337
+ "loss": 5.7283,
338
+ "mean_token_accuracy": 0.2696125540137291,
339
+ "num_tokens": 1765443.0,
340
  "step": 1650
341
  },
342
  {
343
+ "entropy": 5.696683068275451,
344
  "epoch": 0.9786989061600461,
345
+ "grad_norm": 2.8499979972839355,
346
+ "learning_rate": 1.6466705951679433e-06,
347
+ "loss": 5.4335,
348
+ "mean_token_accuracy": 0.29918427973985673,
349
+ "num_tokens": 1817494.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
+ "eval_entropy": 5.993559589034401,
355
+ "eval_loss": 5.737204551696777,
356
+ "eval_mean_token_accuracy": 0.2618687468739699,
357
+ "eval_model_preparation_time": 0.0045,
358
+ "eval_num_tokens": 1856362.0,
359
+ "eval_runtime": 50.5332,
360
+ "eval_samples_per_second": 8.588,
361
+ "eval_steps_per_second": 4.294,
362
  "step": 1737
363
  },
364
  {
365
+ "entropy": 5.746842083930969,
366
  "epoch": 1.0074841681059297,
367
+ "grad_norm": 2.33052921295166,
368
+ "learning_rate": 1.6348850913376547e-06,
369
+ "loss": 5.4796,
370
+ "mean_token_accuracy": 0.2966849410533905,
371
+ "num_tokens": 1870353.0,
372
  "step": 1750
373
  },
374
  {
375
+ "entropy": 5.859029049873352,
376
  "epoch": 1.0362694300518134,
377
+ "grad_norm": 1.6248886585235596,
378
+ "learning_rate": 1.6230995875073658e-06,
379
+ "loss": 5.5975,
380
+ "mean_token_accuracy": 0.2838129925727844,
381
+ "num_tokens": 1926205.0,
382
  "step": 1800
383
  },
384
  {
385
+ "entropy": 5.731445336341858,
386
  "epoch": 1.065054691997697,
387
+ "grad_norm": 1.6941566467285156,
388
+ "learning_rate": 1.6113140836770771e-06,
389
+ "loss": 5.476,
390
+ "mean_token_accuracy": 0.2992346465587616,
391
+ "num_tokens": 1979821.0,
392
  "step": 1850
393
  },
394
  {
395
+ "entropy": 5.6993954515457155,
396
  "epoch": 1.0938399539435808,
397
+ "grad_norm": 1.1746597290039062,
398
+ "learning_rate": 1.5995285798467883e-06,
399
+ "loss": 5.4608,
400
+ "mean_token_accuracy": 0.3000726142525673,
401
+ "num_tokens": 2034373.0,
402
  "step": 1900
403
  },
404
  {
405
+ "entropy": 5.668873124122619,
406
  "epoch": 1.1226252158894645,
407
+ "grad_norm": 1.728211760520935,
408
+ "learning_rate": 1.5877430760164996e-06,
409
+ "loss": 5.4347,
410
+ "mean_token_accuracy": 0.3033922725915909,
411
+ "num_tokens": 2087339.0,
412
  "step": 1950
413
  },
414
  {
415
+ "entropy": 5.624621086120605,
416
  "epoch": 1.1514104778353482,
417
+ "grad_norm": 1.4078539609909058,
418
+ "learning_rate": 1.5759575721862107e-06,
419
+ "loss": 5.3954,
420
+ "mean_token_accuracy": 0.30784171640872954,
421
+ "num_tokens": 2139520.0,
422
  "step": 2000
423
  },
424
  {
425
+ "entropy": 5.7141213130950925,
426
  "epoch": 1.180195739781232,
427
+ "grad_norm": 2.186459541320801,
428
+ "learning_rate": 1.564172068355922e-06,
429
+ "loss": 5.4847,
430
+ "mean_token_accuracy": 0.29594049394130706,
431
+ "num_tokens": 2193987.0,
432
  "step": 2050
433
  },
434
  {
435
+ "entropy": 5.632415266036987,
436
  "epoch": 1.2089810017271156,
437
+ "grad_norm": 1.3601349592208862,
438
+ "learning_rate": 1.5523865645256334e-06,
439
+ "loss": 5.4135,
440
+ "mean_token_accuracy": 0.30366597563028336,
441
+ "num_tokens": 2249616.0,
442
  "step": 2100
443
  },
444
  {
445
+ "entropy": 5.510904269218445,
446
  "epoch": 1.2377662636729994,
447
+ "grad_norm": 2.065760612487793,
448
+ "learning_rate": 1.5406010606953445e-06,
449
+ "loss": 5.2904,
450
+ "mean_token_accuracy": 0.3211754837632179,
451
+ "num_tokens": 2300863.0,
452
  "step": 2150
453
  },
454
  {
455
+ "entropy": 5.703383626937867,
456
  "epoch": 1.266551525618883,
457
+ "grad_norm": 1.1172698736190796,
458
+ "learning_rate": 1.5288155568650559e-06,
459
+ "loss": 5.4802,
460
+ "mean_token_accuracy": 0.29713701367378237,
461
+ "num_tokens": 2356029.0,
462
  "step": 2200
463
  },
464
  {
465
+ "entropy": 5.565930342674255,
466
  "epoch": 1.2953367875647668,
467
+ "grad_norm": 1.7528513669967651,
468
+ "learning_rate": 1.5170300530347672e-06,
469
+ "loss": 5.3518,
470
+ "mean_token_accuracy": 0.31301232606172563,
471
+ "num_tokens": 2408957.0,
472
  "step": 2250
473
  },
474
  {
475
+ "entropy": 5.496430187225342,
476
  "epoch": 1.3241220495106505,
477
+ "grad_norm": 1.892640233039856,
478
+ "learning_rate": 1.5052445492044786e-06,
479
+ "loss": 5.2967,
480
+ "mean_token_accuracy": 0.3181899458169937,
481
+ "num_tokens": 2462569.0,
482
  "step": 2300
483
  },
484
  {
485
+ "entropy": 5.725150098800659,
486
  "epoch": 1.3529073114565342,
487
+ "grad_norm": 1.774940848350525,
488
+ "learning_rate": 1.4934590453741897e-06,
489
+ "loss": 5.5215,
490
+ "mean_token_accuracy": 0.29055028676986694,
491
+ "num_tokens": 2518544.0,
492
  "step": 2350
493
  },
494
  {
495
+ "entropy": 5.4884827613830565,
496
  "epoch": 1.381692573402418,
497
+ "grad_norm": 2.2167599201202393,
498
+ "learning_rate": 1.481673541543901e-06,
499
+ "loss": 5.2917,
500
+ "mean_token_accuracy": 0.31803421139717103,
501
+ "num_tokens": 2570863.0,
502
  "step": 2400
503
  },
504
  {
505
+ "entropy": 5.697079472541809,
506
  "epoch": 1.4104778353483016,
507
+ "grad_norm": 1.6489030122756958,
508
+ "learning_rate": 1.4698880377136124e-06,
509
+ "loss": 5.4982,
510
+ "mean_token_accuracy": 0.2925163987278938,
511
+ "num_tokens": 2626998.0,
512
  "step": 2450
513
  },
514
  {
515
+ "entropy": 5.46209939956665,
516
  "epoch": 1.4392630972941853,
517
+ "grad_norm": 1.153914451599121,
518
+ "learning_rate": 1.4581025338833235e-06,
519
+ "loss": 5.2736,
520
+ "mean_token_accuracy": 0.3182168474793434,
521
+ "num_tokens": 2681568.0,
522
  "step": 2500
523
  },
524
  {
525
+ "entropy": 5.4405768728256225,
526
  "epoch": 1.468048359240069,
527
+ "grad_norm": 3.6614978313446045,
528
+ "learning_rate": 1.4463170300530348e-06,
529
+ "loss": 5.2515,
530
+ "mean_token_accuracy": 0.3218736210465431,
531
+ "num_tokens": 2733587.0,
532
  "step": 2550
533
  },
534
  {
535
+ "entropy": 5.528175053596496,
536
  "epoch": 1.4968336211859528,
537
+ "grad_norm": 1.0849746465682983,
538
+ "learning_rate": 1.434531526222746e-06,
539
+ "loss": 5.3378,
540
+ "mean_token_accuracy": 0.31061659604310987,
541
+ "num_tokens": 2787003.0,
542
  "step": 2600
543
  },
544
  {
545
+ "entropy": 5.46110897064209,
546
  "epoch": 1.5256188831318365,
547
+ "grad_norm": 1.8315683603286743,
548
+ "learning_rate": 1.4227460223924573e-06,
549
+ "loss": 5.2782,
550
+ "mean_token_accuracy": 0.31781029611825945,
551
+ "num_tokens": 2840263.0,
552
  "step": 2650
553
  },
554
  {
555
+ "entropy": 5.455560960769653,
556
  "epoch": 1.5544041450777202,
557
+ "grad_norm": 1.1859091520309448,
558
+ "learning_rate": 1.4109605185621684e-06,
559
+ "loss": 5.2735,
560
+ "mean_token_accuracy": 0.3194814011454582,
561
+ "num_tokens": 2894186.0,
562
  "step": 2700
563
  },
564
  {
565
+ "entropy": 5.430496115684509,
566
  "epoch": 1.583189407023604,
567
+ "grad_norm": 2.3500001430511475,
568
+ "learning_rate": 1.3991750147318797e-06,
569
+ "loss": 5.2464,
570
+ "mean_token_accuracy": 0.32140792965888976,
571
+ "num_tokens": 2948171.0,
572
  "step": 2750
573
  },
574
  {
575
+ "entropy": 5.588023023605347,
576
  "epoch": 1.6119746689694876,
577
+ "grad_norm": 1.727825403213501,
578
+ "learning_rate": 1.3873895109015909e-06,
579
+ "loss": 5.4028,
580
+ "mean_token_accuracy": 0.3039530631899834,
581
+ "num_tokens": 3002678.0,
582
  "step": 2800
583
  },
584
  {
585
+ "entropy": 5.410525422096253,
586
  "epoch": 1.6407599309153713,
587
+ "grad_norm": 1.3401474952697754,
588
+ "learning_rate": 1.3756040070713022e-06,
589
+ "loss": 5.2298,
590
+ "mean_token_accuracy": 0.324065263569355,
591
+ "num_tokens": 3055844.0,
592
  "step": 2850
593
  },
594
  {
595
+ "entropy": 5.36959942817688,
596
  "epoch": 1.669545192861255,
597
+ "grad_norm": 1.1892589330673218,
598
+ "learning_rate": 1.3638185032410133e-06,
599
+ "loss": 5.1956,
600
+ "mean_token_accuracy": 0.32639502108097074,
601
+ "num_tokens": 3108636.0,
602
  "step": 2900
603
  },
604
  {
605
+ "entropy": 5.53826907157898,
606
  "epoch": 1.6983304548071387,
607
+ "grad_norm": 1.2652360200881958,
608
+ "learning_rate": 1.3520329994107247e-06,
609
+ "loss": 5.3583,
610
+ "mean_token_accuracy": 0.3074926760792732,
611
+ "num_tokens": 3162627.0,
612
  "step": 2950
613
  },
614
  {
615
+ "entropy": 5.417449145317078,
616
  "epoch": 1.7271157167530224,
617
+ "grad_norm": 1.584312915802002,
618
+ "learning_rate": 1.340247495580436e-06,
619
+ "loss": 5.2388,
620
+ "mean_token_accuracy": 0.32019727885723115,
621
+ "num_tokens": 3216409.0,
622
  "step": 3000
623
  },
624
  {
625
+ "entropy": 5.241390740871429,
626
  "epoch": 1.7559009786989062,
627
+ "grad_norm": 1.5219439268112183,
628
+ "learning_rate": 1.3284619917501471e-06,
629
+ "loss": 5.0645,
630
+ "mean_token_accuracy": 0.3445430138707161,
631
+ "num_tokens": 3266967.0,
632
  "step": 3050
633
  },
634
  {
635
+ "entropy": 5.405424036979675,
636
  "epoch": 1.7846862406447899,
637
+ "grad_norm": 2.1165153980255127,
638
+ "learning_rate": 1.3166764879198585e-06,
639
+ "loss": 5.232,
640
+ "mean_token_accuracy": 0.32085000157356264,
641
+ "num_tokens": 3319877.0,
642
  "step": 3100
643
  },
644
  {
645
+ "entropy": 5.123006024360657,
646
  "epoch": 1.8134715025906736,
647
+ "grad_norm": 1.2189785242080688,
648
+ "learning_rate": 1.3048909840895698e-06,
649
+ "loss": 4.9582,
650
+ "mean_token_accuracy": 0.356108532845974,
651
+ "num_tokens": 3368569.0,
652
  "step": 3150
653
  },
654
  {
655
+ "entropy": 5.417610831260681,
656
  "epoch": 1.8422567645365573,
657
+ "grad_norm": 1.5157604217529297,
658
+ "learning_rate": 1.2931054802592812e-06,
659
+ "loss": 5.2454,
660
+ "mean_token_accuracy": 0.31976755023002623,
661
+ "num_tokens": 3422449.0,
662
  "step": 3200
663
  },
664
  {
665
+ "entropy": 5.409690895080566,
666
  "epoch": 1.871042026482441,
667
+ "grad_norm": 1.3088161945343018,
668
+ "learning_rate": 1.2813199764289923e-06,
669
+ "loss": 5.2348,
670
+ "mean_token_accuracy": 0.32325415283441544,
671
+ "num_tokens": 3474399.0,
672
  "step": 3250
673
  },
674
  {
675
+ "entropy": 5.44662567615509,
676
  "epoch": 1.8998272884283247,
677
+ "grad_norm": 2.178372621536255,
678
+ "learning_rate": 1.2695344725987036e-06,
679
+ "loss": 5.2661,
680
+ "mean_token_accuracy": 0.3182847076654434,
681
+ "num_tokens": 3527726.0,
682
  "step": 3300
683
  },
684
  {
685
+ "entropy": 5.512614865303039,
686
  "epoch": 1.9286125503742084,
687
+ "grad_norm": 1.3050425052642822,
688
+ "learning_rate": 1.2577489687684147e-06,
689
+ "loss": 5.3416,
690
+ "mean_token_accuracy": 0.3084403133392334,
691
+ "num_tokens": 3581980.0,
692
  "step": 3350
693
  },
694
  {
695
+ "entropy": 5.379772834777832,
696
  "epoch": 1.9573978123200921,
697
+ "grad_norm": 1.4584404230117798,
698
+ "learning_rate": 1.245963464938126e-06,
699
+ "loss": 5.2087,
700
+ "mean_token_accuracy": 0.32388432770967485,
701
+ "num_tokens": 3635393.0,
702
  "step": 3400
703
  },
704
  {
705
+ "entropy": 5.483665924072266,
706
  "epoch": 1.9861830742659758,
707
+ "grad_norm": 1.2157734632492065,
708
+ "learning_rate": 1.2341779611078374e-06,
709
+ "loss": 5.3101,
710
+ "mean_token_accuracy": 0.3121953472495079,
711
+ "num_tokens": 3689894.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
+ "eval_entropy": 5.711394641805904,
717
+ "eval_loss": 5.55628776550293,
718
+ "eval_mean_token_accuracy": 0.2764948787777105,
719
+ "eval_model_preparation_time": 0.0045,
720
+ "eval_num_tokens": 3712724.0,
721
+ "eval_runtime": 50.187,
722
+ "eval_samples_per_second": 8.648,
723
+ "eval_steps_per_second": 4.324,
724
  "step": 3474
725
  }
726
  ],
727
  "logging_steps": 50,
728
+ "max_steps": 8685,
729
  "num_input_tokens_seen": 0,
730
+ "num_train_epochs": 5,
731
  "save_steps": 500,
732
  "stateful_callbacks": {
733
  "TrainerControl": {
 
741
  "attributes": {}
742
  }
743
  },
744
+ "total_flos": 5.088598592372736e+16,
745
  "train_batch_size": 2,
746
  "trial_name": null,
747
  "trial_params": null
checkpoint-3474/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8db5c304963110404ebb6947b83ba95bd9b8aad1f9b8b578cc33c46d601e13dc
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a666397e6243ddba6f7279c90610ed552907ef4de0be511faece3826d13e618
3
  size 6225
checkpoint-5211/adapter_config.json CHANGED
@@ -29,10 +29,10 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
33
  "k_proj",
34
  "v_proj",
35
- "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "o_proj",
33
  "k_proj",
34
  "v_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-5211/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96bed2a64089d15ba0d03e873c6ba43e222e9615622cb08853696f1bb3f72ed3
3
  size 26182176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4025b4926f5afc8ae1b9483e03aa404961dda3a1814cb46ca7aeea065c3fe0b
3
  size 26182176
checkpoint-5211/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc97f69c9bd94b7be821d35593073dc08cf44ccce0203ce520c9a25dfcbc93d7
3
  size 52486155
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1c6ba0e1a0fa5232307b4180a95f626f91ecd955d6b67f2be41b26598f1360f
3
  size 52486155
checkpoint-5211/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bfc3867136ea1392d43912e26c993ff7e9d2c829e3cc938d41df7399c31116c
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39b9f29ddfbd9f77ba8789dff06c3a159107fa04d3db99c9007a614a55b3b852
3
  size 14645
checkpoint-5211/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a76316bf35b4ab1e089615992ceff4951bb9d24d95bfa6731e79f937bd9a30c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e174b3c503a209f4286065ce76cb198c717027077a7d229160809df776f0167
3
  size 1465
checkpoint-5211/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 5211,
3
- "best_metric": 5.628758430480957,
4
  "best_model_checkpoint": "./output/checkpoint-5211",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
@@ -10,1086 +10,1086 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 3.6583470726013183,
14
  "epoch": 0.028785261945883708,
15
- "grad_norm": 3.3817152976989746,
16
  "learning_rate": 4.9e-07,
17
- "loss": 13.8754,
18
- "mean_token_accuracy": 0.15036460414528846,
19
- "num_tokens": 53093.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 3.669608063697815,
24
  "epoch": 0.057570523891767415,
25
- "grad_norm": 3.2541544437408447,
26
  "learning_rate": 9.9e-07,
27
- "loss": 14.2282,
28
- "mean_token_accuracy": 0.14137721598148345,
29
- "num_tokens": 108334.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 3.569736371040344,
34
  "epoch": 0.08635578583765112,
35
- "grad_norm": 3.6797454357147217,
36
  "learning_rate": 1.49e-06,
37
- "loss": 13.0735,
38
- "mean_token_accuracy": 0.17473630651831626,
39
- "num_tokens": 157491.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 3.7253233194351196,
44
  "epoch": 0.11514104778353483,
45
- "grad_norm": 4.297911643981934,
46
  "learning_rate": 1.99e-06,
47
- "loss": 13.7392,
48
- "mean_token_accuracy": 0.1473099772632122,
49
- "num_tokens": 211394.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 3.8280500602722167,
54
  "epoch": 0.14392630972941853,
55
- "grad_norm": 4.405268669128418,
56
- "learning_rate": 1.9854771784232364e-06,
57
- "loss": 13.0797,
58
- "mean_token_accuracy": 0.16704789966344832,
59
- "num_tokens": 263685.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 4.066333084106446,
64
  "epoch": 0.17271157167530224,
65
- "grad_norm": 4.757556438446045,
66
- "learning_rate": 1.9706579727326615e-06,
67
- "loss": 12.6321,
68
- "mean_token_accuracy": 0.1691790708899498,
69
- "num_tokens": 314059.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 4.257266030311585,
74
  "epoch": 0.20149683362118595,
75
- "grad_norm": 6.406249523162842,
76
- "learning_rate": 1.955838767042086e-06,
77
- "loss": 12.2253,
78
- "mean_token_accuracy": 0.17223650276660918,
79
- "num_tokens": 367038.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 4.694105777740479,
84
  "epoch": 0.23028209556706966,
85
- "grad_norm": 12.57987117767334,
86
- "learning_rate": 1.9410195613515113e-06,
87
- "loss": 11.9714,
88
- "mean_token_accuracy": 0.15997304677963256,
89
- "num_tokens": 420327.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 5.205010280609131,
94
  "epoch": 0.25906735751295334,
95
- "grad_norm": 15.570313453674316,
96
- "learning_rate": 1.9262003556609364e-06,
97
- "loss": 10.8173,
98
- "mean_token_accuracy": 0.16447648257017136,
99
- "num_tokens": 472429.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 5.917805089950561,
104
  "epoch": 0.28785261945883706,
105
- "grad_norm": 23.61503791809082,
106
- "learning_rate": 1.9113811499703615e-06,
107
- "loss": 9.3196,
108
- "mean_token_accuracy": 0.16179455041885377,
109
- "num_tokens": 526315.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 6.380368332862854,
114
  "epoch": 0.31663788140472077,
115
- "grad_norm": 13.846810340881348,
116
- "learning_rate": 1.8965619442797864e-06,
117
- "loss": 7.9636,
118
- "mean_token_accuracy": 0.16881170988082886,
119
- "num_tokens": 578511.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 6.507339992523193,
124
  "epoch": 0.3454231433506045,
125
- "grad_norm": 4.569090366363525,
126
- "learning_rate": 1.8817427385892115e-06,
127
- "loss": 7.4171,
128
- "mean_token_accuracy": 0.16941152423620223,
129
- "num_tokens": 630937.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 6.392864561080932,
134
  "epoch": 0.3742084052964882,
135
- "grad_norm": 4.594696521759033,
136
- "learning_rate": 1.8669235328986366e-06,
137
- "loss": 6.9389,
138
- "mean_token_accuracy": 0.1844496901333332,
139
- "num_tokens": 680501.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 6.6726202869415285,
144
  "epoch": 0.4029936672423719,
145
- "grad_norm": 4.768734931945801,
146
- "learning_rate": 1.8521043272080617e-06,
147
- "loss": 6.9818,
148
- "mean_token_accuracy": 0.16990411713719367,
149
- "num_tokens": 733231.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 6.592793455123902,
154
  "epoch": 0.4317789291882556,
155
- "grad_norm": 3.253056764602661,
156
- "learning_rate": 1.8372851215174864e-06,
157
- "loss": 6.7105,
158
- "mean_token_accuracy": 0.18250102579593658,
159
- "num_tokens": 785373.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 6.683582029342651,
164
  "epoch": 0.4605641911341393,
165
- "grad_norm": 2.1871063709259033,
166
- "learning_rate": 1.8224659158269115e-06,
167
- "loss": 6.6685,
168
- "mean_token_accuracy": 0.17129646152257919,
169
- "num_tokens": 838646.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 6.636875295639038,
174
  "epoch": 0.48934945308002303,
175
- "grad_norm": 3.2284677028656006,
176
- "learning_rate": 1.8076467101363366e-06,
177
- "loss": 6.53,
178
- "mean_token_accuracy": 0.18053789794445038,
179
- "num_tokens": 892380.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 6.610673260688782,
184
  "epoch": 0.5181347150259067,
185
- "grad_norm": 2.2088730335235596,
186
- "learning_rate": 1.7928275044457617e-06,
187
- "loss": 6.4429,
188
- "mean_token_accuracy": 0.18492739230394364,
189
- "num_tokens": 947971.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 6.242899022102356,
194
  "epoch": 0.5469199769717904,
195
- "grad_norm": 2.3000030517578125,
196
- "learning_rate": 1.7780082987551866e-06,
197
- "loss": 6.047,
198
- "mean_token_accuracy": 0.2291259828209877,
199
- "num_tokens": 998810.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 6.311488924026489,
204
  "epoch": 0.5757052389176741,
205
- "grad_norm": 2.1333675384521484,
206
- "learning_rate": 1.7631890930646115e-06,
207
- "loss": 6.0919,
208
- "mean_token_accuracy": 0.22644571751356124,
209
- "num_tokens": 1050860.0,
210
  "step": 1000
211
  },
212
  {
213
- "entropy": 6.3254336166381835,
214
  "epoch": 0.6044905008635578,
215
- "grad_norm": 2.0400779247283936,
216
- "learning_rate": 1.7483698873740366e-06,
217
- "loss": 6.094,
218
- "mean_token_accuracy": 0.2222653564810753,
219
- "num_tokens": 1104304.0,
220
  "step": 1050
221
  },
222
  {
223
- "entropy": 6.046922063827514,
224
  "epoch": 0.6332757628094415,
225
- "grad_norm": 2.8049051761627197,
226
- "learning_rate": 1.7335506816834617e-06,
227
- "loss": 5.8011,
228
- "mean_token_accuracy": 0.25127078920602797,
229
- "num_tokens": 1153605.0,
230
  "step": 1100
231
  },
232
  {
233
- "entropy": 5.943600912094116,
234
  "epoch": 0.6620610247553252,
235
- "grad_norm": 4.063963890075684,
236
- "learning_rate": 1.7187314759928866e-06,
237
- "loss": 5.6855,
238
- "mean_token_accuracy": 0.26265266716480257,
239
- "num_tokens": 1204328.0,
240
  "step": 1150
241
  },
242
  {
243
- "entropy": 6.12883231639862,
244
  "epoch": 0.690846286701209,
245
- "grad_norm": 3.9440460205078125,
246
- "learning_rate": 1.7039122703023117e-06,
247
- "loss": 5.8578,
248
- "mean_token_accuracy": 0.24439335912466048,
249
- "num_tokens": 1257415.0,
250
  "step": 1200
251
  },
252
  {
253
- "entropy": 6.164987115859986,
254
  "epoch": 0.7196315486470927,
255
- "grad_norm": 3.20070481300354,
256
- "learning_rate": 1.6890930646117368e-06,
257
- "loss": 5.8876,
258
- "mean_token_accuracy": 0.24275501281023026,
259
- "num_tokens": 1310049.0,
260
  "step": 1250
261
  },
262
  {
263
- "entropy": 6.080997190475464,
264
  "epoch": 0.7484168105929764,
265
- "grad_norm": 2.8067362308502197,
266
- "learning_rate": 1.6742738589211617e-06,
267
- "loss": 5.8058,
268
- "mean_token_accuracy": 0.25242207854986193,
269
- "num_tokens": 1361794.0,
270
  "step": 1300
271
  },
272
  {
273
- "entropy": 5.940848155021667,
274
  "epoch": 0.7772020725388601,
275
- "grad_norm": 2.6375925540924072,
276
- "learning_rate": 1.6594546532305868e-06,
277
- "loss": 5.6718,
278
- "mean_token_accuracy": 0.2665082859992981,
279
- "num_tokens": 1412773.0,
280
  "step": 1350
281
  },
282
  {
283
- "entropy": 6.071129274368286,
284
  "epoch": 0.8059873344847438,
285
- "grad_norm": 3.951350212097168,
286
- "learning_rate": 1.6446354475400117e-06,
287
- "loss": 5.8012,
288
- "mean_token_accuracy": 0.25434976994991304,
289
- "num_tokens": 1465620.0,
290
  "step": 1400
291
  },
292
  {
293
- "entropy": 6.069429359436035,
294
  "epoch": 0.8347725964306275,
295
- "grad_norm": 3.580608606338501,
296
- "learning_rate": 1.6298162418494368e-06,
297
- "loss": 5.8027,
298
- "mean_token_accuracy": 0.25208072274923327,
299
- "num_tokens": 1518899.0,
300
  "step": 1450
301
  },
302
  {
303
- "entropy": 6.005315380096436,
304
  "epoch": 0.8635578583765112,
305
- "grad_norm": 3.9580376148223877,
306
- "learning_rate": 1.614997036158862e-06,
307
- "loss": 5.7364,
308
- "mean_token_accuracy": 0.25940640360116957,
309
- "num_tokens": 1571304.0,
310
  "step": 1500
311
  },
312
  {
313
- "entropy": 6.0786464881896975,
314
  "epoch": 0.8923431203223949,
315
- "grad_norm": 4.55721378326416,
316
- "learning_rate": 1.6001778304682868e-06,
317
- "loss": 5.8092,
318
- "mean_token_accuracy": 0.2496869170665741,
319
- "num_tokens": 1627369.0,
320
  "step": 1550
321
  },
322
  {
323
- "entropy": 5.939382014274597,
324
  "epoch": 0.9211283822682786,
325
- "grad_norm": 2.330057144165039,
326
- "learning_rate": 1.5853586247777117e-06,
327
- "loss": 5.6604,
328
- "mean_token_accuracy": 0.2686630353331566,
329
- "num_tokens": 1680401.0,
330
  "step": 1600
331
  },
332
  {
333
- "entropy": 6.121775646209716,
334
  "epoch": 0.9499136442141624,
335
- "grad_norm": 2.9881200790405273,
336
- "learning_rate": 1.5705394190871368e-06,
337
- "loss": 5.8388,
338
- "mean_token_accuracy": 0.2503683388233185,
339
- "num_tokens": 1735745.0,
340
  "step": 1650
341
  },
342
  {
343
- "entropy": 5.840040788650513,
344
  "epoch": 0.9786989061600461,
345
- "grad_norm": 3.798994779586792,
346
- "learning_rate": 1.555720213396562e-06,
347
- "loss": 5.5635,
348
- "mean_token_accuracy": 0.278279125392437,
349
- "num_tokens": 1786896.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
- "eval_entropy": 6.139133475343203,
355
- "eval_loss": 5.861395835876465,
356
- "eval_mean_token_accuracy": 0.2402858340657801,
357
- "eval_model_preparation_time": 0.0047,
358
- "eval_num_tokens": 1825107.0,
359
- "eval_runtime": 79.3994,
360
- "eval_samples_per_second": 5.466,
361
- "eval_steps_per_second": 2.733,
362
  "step": 1737
363
  },
364
  {
365
- "entropy": 5.8970259666442875,
366
  "epoch": 1.0074841681059297,
367
- "grad_norm": 2.6411802768707275,
368
- "learning_rate": 1.540901007705987e-06,
369
- "loss": 5.614,
370
- "mean_token_accuracy": 0.273006406724453,
371
- "num_tokens": 1838864.0,
372
  "step": 1750
373
  },
374
  {
375
- "entropy": 6.0111794090271,
376
  "epoch": 1.0362694300518134,
377
- "grad_norm": 3.6491827964782715,
378
- "learning_rate": 1.526081802015412e-06,
379
- "loss": 5.7323,
380
- "mean_token_accuracy": 0.26104256987571717,
381
- "num_tokens": 1893816.0,
382
  "step": 1800
383
  },
384
  {
385
- "entropy": 5.902219276428223,
386
  "epoch": 1.065054691997697,
387
- "grad_norm": 2.593249559402466,
388
- "learning_rate": 1.5112625963248368e-06,
389
- "loss": 5.6187,
390
- "mean_token_accuracy": 0.2746362566947937,
391
- "num_tokens": 1946532.0,
392
  "step": 1850
393
  },
394
  {
395
- "entropy": 5.874705944061279,
396
  "epoch": 1.0938399539435808,
397
- "grad_norm": 2.554327964782715,
398
- "learning_rate": 1.496443390634262e-06,
399
- "loss": 5.6021,
400
- "mean_token_accuracy": 0.2795292744040489,
401
- "num_tokens": 2000184.0,
402
  "step": 1900
403
  },
404
  {
405
- "entropy": 5.850096368789673,
406
  "epoch": 1.1226252158894645,
407
- "grad_norm": 3.6060993671417236,
408
- "learning_rate": 1.481624184943687e-06,
409
- "loss": 5.576,
410
- "mean_token_accuracy": 0.28532547056674956,
411
- "num_tokens": 2052250.0,
412
  "step": 1950
413
  },
414
  {
415
- "entropy": 5.802229671478272,
416
  "epoch": 1.1514104778353482,
417
- "grad_norm": 3.0913314819335938,
418
- "learning_rate": 1.466804979253112e-06,
419
- "loss": 5.53,
420
- "mean_token_accuracy": 0.2916027933359146,
421
- "num_tokens": 2103531.0,
422
  "step": 2000
423
  },
424
  {
425
- "entropy": 5.875646467208862,
426
  "epoch": 1.180195739781232,
427
- "grad_norm": 4.777045726776123,
428
- "learning_rate": 1.451985773562537e-06,
429
- "loss": 5.6146,
430
- "mean_token_accuracy": 0.28063644528388976,
431
- "num_tokens": 2157098.0,
432
  "step": 2050
433
  },
434
  {
435
- "entropy": 5.786596937179565,
436
  "epoch": 1.2089810017271156,
437
- "grad_norm": 4.207762718200684,
438
- "learning_rate": 1.437166567871962e-06,
439
- "loss": 5.5417,
440
- "mean_token_accuracy": 0.2870470091700554,
441
- "num_tokens": 2211827.0,
442
  "step": 2100
443
  },
444
  {
445
- "entropy": 5.672234449386597,
446
  "epoch": 1.2377662636729994,
447
- "grad_norm": 2.2771811485290527,
448
- "learning_rate": 1.422347362181387e-06,
449
- "loss": 5.4285,
450
- "mean_token_accuracy": 0.30194485366344453,
451
- "num_tokens": 2262174.0,
452
  "step": 2150
453
  },
454
  {
455
- "entropy": 5.862573285102844,
456
  "epoch": 1.266551525618883,
457
- "grad_norm": 3.3273422718048096,
458
- "learning_rate": 1.4075281564908121e-06,
459
- "loss": 5.6169,
460
- "mean_token_accuracy": 0.278145115673542,
461
- "num_tokens": 2316440.0,
462
  "step": 2200
463
  },
464
  {
465
- "entropy": 5.734760231971741,
466
  "epoch": 1.2953367875647668,
467
- "grad_norm": 3.7049715518951416,
468
- "learning_rate": 1.392708950800237e-06,
469
- "loss": 5.493,
470
- "mean_token_accuracy": 0.2941485676169395,
471
- "num_tokens": 2368468.0,
472
  "step": 2250
473
  },
474
  {
475
- "entropy": 5.665819988250733,
476
  "epoch": 1.3241220495106505,
477
- "grad_norm": 3.572636604309082,
478
- "learning_rate": 1.3778897451096621e-06,
479
- "loss": 5.4352,
480
- "mean_token_accuracy": 0.3003745040297508,
481
- "num_tokens": 2421180.0,
482
  "step": 2300
483
  },
484
  {
485
- "entropy": 5.890115032196045,
486
  "epoch": 1.3529073114565342,
487
- "grad_norm": 2.738203525543213,
488
- "learning_rate": 1.3630705394190872e-06,
489
- "loss": 5.6555,
490
- "mean_token_accuracy": 0.2737997192144394,
491
- "num_tokens": 2476255.0,
492
  "step": 2350
493
  },
494
  {
495
- "entropy": 5.66056040763855,
496
  "epoch": 1.381692573402418,
497
- "grad_norm": 3.1416995525360107,
498
- "learning_rate": 1.3482513337285121e-06,
499
- "loss": 5.4302,
500
- "mean_token_accuracy": 0.3000989046692848,
501
- "num_tokens": 2527674.0,
502
  "step": 2400
503
  },
504
  {
505
- "entropy": 5.861240615844727,
506
  "epoch": 1.4104778353483016,
507
- "grad_norm": 2.7569284439086914,
508
- "learning_rate": 1.333432128037937e-06,
509
- "loss": 5.6304,
510
- "mean_token_accuracy": 0.27707513481378554,
511
- "num_tokens": 2582909.0,
512
  "step": 2450
513
  },
514
  {
515
- "entropy": 5.627686910629272,
516
  "epoch": 1.4392630972941853,
517
- "grad_norm": 1.7750262022018433,
518
- "learning_rate": 1.3186129223473621e-06,
519
- "loss": 5.4058,
520
- "mean_token_accuracy": 0.3019809901714325,
521
- "num_tokens": 2636579.0,
522
  "step": 2500
523
  },
524
  {
525
- "entropy": 5.607026796340943,
526
  "epoch": 1.468048359240069,
527
- "grad_norm": 3.1005160808563232,
528
- "learning_rate": 1.3037937166567872e-06,
529
- "loss": 5.3836,
530
- "mean_token_accuracy": 0.30584611505270004,
531
- "num_tokens": 2687698.0,
532
  "step": 2550
533
  },
534
  {
535
- "entropy": 5.6909641885757445,
536
  "epoch": 1.4968336211859528,
537
- "grad_norm": 1.6848654747009277,
538
- "learning_rate": 1.2889745109662123e-06,
539
- "loss": 5.4653,
540
- "mean_token_accuracy": 0.296178964972496,
541
- "num_tokens": 2740214.0,
542
  "step": 2600
543
  },
544
  {
545
- "entropy": 5.619450302124023,
546
  "epoch": 1.5256188831318365,
547
- "grad_norm": 2.469539165496826,
548
- "learning_rate": 1.274155305275637e-06,
549
- "loss": 5.4022,
550
- "mean_token_accuracy": 0.3039679077267647,
551
- "num_tokens": 2792574.0,
552
  "step": 2650
553
  },
554
  {
555
- "entropy": 5.61073097705841,
556
  "epoch": 1.5544041450777202,
557
- "grad_norm": 2.367810010910034,
558
- "learning_rate": 1.259336099585062e-06,
559
- "loss": 5.3956,
560
- "mean_token_accuracy": 0.3051413372159004,
561
- "num_tokens": 2845597.0,
562
  "step": 2700
563
  },
564
  {
565
- "entropy": 5.5791136837005615,
566
  "epoch": 1.583189407023604,
567
- "grad_norm": 2.3874764442443848,
568
- "learning_rate": 1.2445168938944872e-06,
569
- "loss": 5.3676,
570
- "mean_token_accuracy": 0.3068238252401352,
571
- "num_tokens": 2898683.0,
572
  "step": 2750
573
  },
574
  {
575
- "entropy": 5.735381307601929,
576
  "epoch": 1.6119746689694876,
577
- "grad_norm": 2.2097349166870117,
578
- "learning_rate": 1.2296976882039123e-06,
579
- "loss": 5.5239,
580
- "mean_token_accuracy": 0.28974882304668426,
581
- "num_tokens": 2952290.0,
582
  "step": 2800
583
  },
584
  {
585
- "entropy": 5.55252691745758,
586
  "epoch": 1.6407599309153713,
587
- "grad_norm": 1.694831132888794,
588
- "learning_rate": 1.2148784825133372e-06,
589
- "loss": 5.351,
590
- "mean_token_accuracy": 0.3091904193162918,
591
- "num_tokens": 3004556.0,
592
  "step": 2850
593
  },
594
  {
595
- "entropy": 5.508773093223572,
596
  "epoch": 1.669545192861255,
597
- "grad_norm": 1.8229279518127441,
598
- "learning_rate": 1.200059276822762e-06,
599
- "loss": 5.3164,
600
- "mean_token_accuracy": 0.31158645361661913,
601
- "num_tokens": 3056448.0,
602
  "step": 2900
603
  },
604
  {
605
- "entropy": 5.676794271469117,
606
  "epoch": 1.6983304548071387,
607
- "grad_norm": 1.7196234464645386,
608
- "learning_rate": 1.1852400711321872e-06,
609
- "loss": 5.4776,
610
- "mean_token_accuracy": 0.2929128894209862,
611
- "num_tokens": 3109539.0,
612
  "step": 2950
613
  },
614
  {
615
- "entropy": 5.551529383659362,
616
  "epoch": 1.7271157167530224,
617
- "grad_norm": 3.117525577545166,
618
- "learning_rate": 1.1704208654416123e-06,
619
- "loss": 5.3561,
620
- "mean_token_accuracy": 0.30634030640125276,
621
- "num_tokens": 3162421.0,
622
  "step": 3000
623
  },
624
  {
625
- "entropy": 5.379635264873505,
626
  "epoch": 1.7559009786989062,
627
- "grad_norm": 1.876755714416504,
628
- "learning_rate": 1.1556016597510372e-06,
629
- "loss": 5.1868,
630
- "mean_token_accuracy": 0.32913618892431257,
631
- "num_tokens": 3212079.0,
632
  "step": 3050
633
  },
634
  {
635
- "entropy": 5.538804936408996,
636
  "epoch": 1.7846862406447899,
637
- "grad_norm": 1.8670976161956787,
638
- "learning_rate": 1.1407824540604623e-06,
639
- "loss": 5.3494,
640
- "mean_token_accuracy": 0.30661171555519107,
641
- "num_tokens": 3264089.0,
642
  "step": 3100
643
  },
644
  {
645
- "entropy": 5.258263626098633,
646
  "epoch": 1.8134715025906736,
647
- "grad_norm": 2.748718023300171,
648
- "learning_rate": 1.1259632483698874e-06,
649
- "loss": 5.08,
650
- "mean_token_accuracy": 0.3413010013103485,
651
- "num_tokens": 3311881.0,
652
  "step": 3150
653
  },
654
  {
655
- "entropy": 5.54539008140564,
656
  "epoch": 1.8422567645365573,
657
- "grad_norm": 1.8556406497955322,
658
- "learning_rate": 1.1111440426793123e-06,
659
- "loss": 5.3614,
660
- "mean_token_accuracy": 0.30550685405731204,
661
- "num_tokens": 3364861.0,
662
  "step": 3200
663
  },
664
  {
665
- "entropy": 5.5433073282241825,
666
  "epoch": 1.871042026482441,
667
- "grad_norm": 1.8386749029159546,
668
- "learning_rate": 1.0963248369887374e-06,
669
- "loss": 5.3543,
670
- "mean_token_accuracy": 0.30875524014234546,
671
- "num_tokens": 3415911.0,
672
  "step": 3250
673
  },
674
  {
675
- "entropy": 5.5769769477844235,
676
  "epoch": 1.8998272884283247,
677
- "grad_norm": 1.922486662864685,
678
- "learning_rate": 1.0815056312981623e-06,
679
- "loss": 5.3834,
680
- "mean_token_accuracy": 0.3035113242268562,
681
- "num_tokens": 3468338.0,
682
  "step": 3300
683
  },
684
  {
685
- "entropy": 5.640013842582703,
686
  "epoch": 1.9286125503742084,
687
- "grad_norm": 2.179500102996826,
688
- "learning_rate": 1.0666864256075874e-06,
689
- "loss": 5.4574,
690
- "mean_token_accuracy": 0.2947095710039139,
691
- "num_tokens": 3521693.0,
692
  "step": 3350
693
  },
694
  {
695
- "entropy": 5.506910061836242,
696
  "epoch": 1.9573978123200921,
697
- "grad_norm": 1.4014379978179932,
698
- "learning_rate": 1.0518672199170125e-06,
699
- "loss": 5.3234,
700
- "mean_token_accuracy": 0.3096472260355949,
701
- "num_tokens": 3574206.0,
702
  "step": 3400
703
  },
704
  {
705
- "entropy": 5.607311015129089,
706
  "epoch": 1.9861830742659758,
707
- "grad_norm": 1.41231107711792,
708
- "learning_rate": 1.0370480142264374e-06,
709
- "loss": 5.4226,
710
- "mean_token_accuracy": 0.2979922544956207,
711
- "num_tokens": 3627807.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
- "eval_entropy": 5.831721861790951,
717
- "eval_loss": 5.656307220458984,
718
- "eval_mean_token_accuracy": 0.2641724460685308,
719
- "eval_model_preparation_time": 0.0047,
720
- "eval_num_tokens": 3650214.0,
721
- "eval_runtime": 79.7324,
722
- "eval_samples_per_second": 5.443,
723
- "eval_steps_per_second": 2.722,
724
  "step": 3474
725
  },
726
  {
727
- "entropy": 5.477711625099182,
728
  "epoch": 2.0149683362118593,
729
- "grad_norm": 3.0133137702941895,
730
- "learning_rate": 1.0222288085358623e-06,
731
- "loss": 5.2957,
732
- "mean_token_accuracy": 0.31543311327695844,
733
- "num_tokens": 3677883.0,
734
  "step": 3500
735
  },
736
  {
737
- "entropy": 5.599187393188476,
738
  "epoch": 2.043753598157743,
739
- "grad_norm": 1.885867714881897,
740
- "learning_rate": 1.0074096028452874e-06,
741
- "loss": 5.4142,
742
- "mean_token_accuracy": 0.3004470923542976,
743
- "num_tokens": 3730991.0,
744
  "step": 3550
745
  },
746
  {
747
- "entropy": 5.526448183059692,
748
  "epoch": 2.0725388601036268,
749
- "grad_norm": 4.50788688659668,
750
- "learning_rate": 9.925903971547125e-07,
751
- "loss": 5.3517,
752
- "mean_token_accuracy": 0.3069574165344238,
753
- "num_tokens": 3783795.0,
754
  "step": 3600
755
  },
756
  {
757
- "entropy": 5.560557870864868,
758
  "epoch": 2.1013241220495105,
759
- "grad_norm": 1.927862524986267,
760
- "learning_rate": 9.777711914641374e-07,
761
- "loss": 5.3815,
762
- "mean_token_accuracy": 0.3045575937628746,
763
- "num_tokens": 3835526.0,
764
  "step": 3650
765
  },
766
  {
767
- "entropy": 5.528058257102966,
768
  "epoch": 2.130109383995394,
769
- "grad_norm": 2.164687156677246,
770
- "learning_rate": 9.629519857735625e-07,
771
- "loss": 5.3501,
772
- "mean_token_accuracy": 0.3071546205878258,
773
- "num_tokens": 3887175.0,
774
  "step": 3700
775
  },
776
  {
777
- "entropy": 5.397617678642273,
778
  "epoch": 2.158894645941278,
779
- "grad_norm": 2.3098385334014893,
780
- "learning_rate": 9.481327800829875e-07,
781
- "loss": 5.2244,
782
- "mean_token_accuracy": 0.3226669803261757,
783
- "num_tokens": 3938003.0,
784
  "step": 3750
785
  },
786
  {
787
- "entropy": 5.529960298538208,
788
  "epoch": 2.1876799078871616,
789
- "grad_norm": 1.8144755363464355,
790
- "learning_rate": 9.333135743924125e-07,
791
- "loss": 5.3572,
792
- "mean_token_accuracy": 0.306032218337059,
793
- "num_tokens": 3990451.0,
794
  "step": 3800
795
  },
796
  {
797
- "entropy": 5.597109637260437,
798
  "epoch": 2.2164651698330453,
799
- "grad_norm": 2.7306935787200928,
800
- "learning_rate": 9.184943687018375e-07,
801
- "loss": 5.4162,
802
- "mean_token_accuracy": 0.2985941395163536,
803
- "num_tokens": 4044048.0,
804
  "step": 3850
805
  },
806
  {
807
- "entropy": 5.448684883117676,
808
  "epoch": 2.245250431778929,
809
- "grad_norm": 1.8199880123138428,
810
- "learning_rate": 9.036751630112626e-07,
811
- "loss": 5.2775,
812
- "mean_token_accuracy": 0.31548845052719116,
813
- "num_tokens": 4095276.0,
814
  "step": 3900
815
  },
816
  {
817
- "entropy": 5.5008597612380985,
818
  "epoch": 2.2740356937248127,
819
- "grad_norm": 1.755323052406311,
820
- "learning_rate": 8.888559573206875e-07,
821
- "loss": 5.3274,
822
- "mean_token_accuracy": 0.309090721309185,
823
- "num_tokens": 4148172.0,
824
  "step": 3950
825
  },
826
  {
827
- "entropy": 5.7040300464630125,
828
  "epoch": 2.3028209556706964,
829
- "grad_norm": 2.3154356479644775,
830
- "learning_rate": 8.740367516301126e-07,
831
- "loss": 5.5239,
832
- "mean_token_accuracy": 0.28589318484067916,
833
- "num_tokens": 4202733.0,
834
  "step": 4000
835
  },
836
  {
837
- "entropy": 5.549855670928955,
838
  "epoch": 2.33160621761658,
839
- "grad_norm": 1.9549669027328491,
840
- "learning_rate": 8.592175459395375e-07,
841
- "loss": 5.3755,
842
- "mean_token_accuracy": 0.3029727828502655,
843
- "num_tokens": 4255738.0,
844
  "step": 4050
845
  },
846
  {
847
- "entropy": 5.579690465927124,
848
  "epoch": 2.360391479562464,
849
- "grad_norm": 1.7018866539001465,
850
- "learning_rate": 8.443983402489626e-07,
851
- "loss": 5.4036,
852
- "mean_token_accuracy": 0.3001995691657066,
853
- "num_tokens": 4308638.0,
854
  "step": 4100
855
  },
856
  {
857
- "entropy": 5.646504878997803,
858
  "epoch": 2.3891767415083476,
859
- "grad_norm": 1.4139262437820435,
860
- "learning_rate": 8.295791345583877e-07,
861
- "loss": 5.4733,
862
- "mean_token_accuracy": 0.2912476986646652,
863
- "num_tokens": 4363170.0,
864
  "step": 4150
865
  },
866
  {
867
- "entropy": 5.554990992546082,
868
  "epoch": 2.4179620034542313,
869
- "grad_norm": 1.6886577606201172,
870
- "learning_rate": 8.147599288678126e-07,
871
- "loss": 5.3842,
872
- "mean_token_accuracy": 0.302762059867382,
873
- "num_tokens": 4415607.0,
874
  "step": 4200
875
  },
876
  {
877
- "entropy": 5.513420124053955,
878
  "epoch": 2.446747265400115,
879
- "grad_norm": 1.3537819385528564,
880
- "learning_rate": 7.999407231772377e-07,
881
- "loss": 5.3408,
882
- "mean_token_accuracy": 0.30764526218175886,
883
- "num_tokens": 4467608.0,
884
  "step": 4250
885
  },
886
  {
887
- "entropy": 5.561378569602966,
888
  "epoch": 2.4755325273459987,
889
- "grad_norm": 1.8514106273651123,
890
- "learning_rate": 7.851215174866627e-07,
891
- "loss": 5.3891,
892
- "mean_token_accuracy": 0.301382859647274,
893
- "num_tokens": 4520299.0,
894
  "step": 4300
895
  },
896
  {
897
- "entropy": 5.536689953804016,
898
  "epoch": 2.5043177892918824,
899
- "grad_norm": 2.1830835342407227,
900
- "learning_rate": 7.703023117960877e-07,
901
- "loss": 5.3672,
902
- "mean_token_accuracy": 0.3047756373882294,
903
- "num_tokens": 4573065.0,
904
  "step": 4350
905
  },
906
  {
907
- "entropy": 5.69776873588562,
908
  "epoch": 2.533103051237766,
909
- "grad_norm": 1.999536156654358,
910
- "learning_rate": 7.554831061055127e-07,
911
- "loss": 5.5236,
912
- "mean_token_accuracy": 0.2868007507920265,
913
- "num_tokens": 4626807.0,
914
  "step": 4400
915
  },
916
  {
917
- "entropy": 5.3977436876297,
918
  "epoch": 2.56188831318365,
919
- "grad_norm": 1.9608020782470703,
920
- "learning_rate": 7.406639004149378e-07,
921
- "loss": 5.2335,
922
- "mean_token_accuracy": 0.3199601462483406,
923
- "num_tokens": 4677663.0,
924
  "step": 4450
925
  },
926
  {
927
- "entropy": 5.6681678771972654,
928
  "epoch": 2.5906735751295336,
929
- "grad_norm": 1.829047441482544,
930
- "learning_rate": 7.258446947243627e-07,
931
- "loss": 5.491,
932
- "mean_token_accuracy": 0.2894612854719162,
933
- "num_tokens": 4731830.0,
934
  "step": 4500
935
  },
936
  {
937
- "entropy": 5.49174174785614,
938
  "epoch": 2.6194588370754173,
939
- "grad_norm": 1.3158719539642334,
940
- "learning_rate": 7.110254890337878e-07,
941
- "loss": 5.3225,
942
- "mean_token_accuracy": 0.3084965732693672,
943
- "num_tokens": 4784694.0,
944
  "step": 4550
945
  },
946
  {
947
- "entropy": 5.573234438896179,
948
  "epoch": 2.648244099021301,
949
- "grad_norm": 1.562915325164795,
950
- "learning_rate": 6.962062833432127e-07,
951
- "loss": 5.4028,
952
- "mean_token_accuracy": 0.2989520016312599,
953
- "num_tokens": 4838534.0,
954
  "step": 4600
955
  },
956
  {
957
- "entropy": 5.550469598770142,
958
  "epoch": 2.6770293609671847,
959
- "grad_norm": 2.114727735519409,
960
- "learning_rate": 6.813870776526378e-07,
961
- "loss": 5.3804,
962
- "mean_token_accuracy": 0.30373542964458466,
963
- "num_tokens": 4890611.0,
964
  "step": 4650
965
  },
966
  {
967
- "entropy": 5.523049550056458,
968
  "epoch": 2.7058146229130684,
969
- "grad_norm": 2.5036823749542236,
970
- "learning_rate": 6.665678719620628e-07,
971
- "loss": 5.3542,
972
- "mean_token_accuracy": 0.30681024432182313,
973
- "num_tokens": 4943571.0,
974
  "step": 4700
975
  },
976
  {
977
- "entropy": 5.323453049659729,
978
  "epoch": 2.734599884858952,
979
- "grad_norm": 1.8069168329238892,
980
- "learning_rate": 6.517486662714878e-07,
981
- "loss": 5.1583,
982
- "mean_token_accuracy": 0.32906652927398683,
983
- "num_tokens": 4993871.0,
984
  "step": 4750
985
  },
986
  {
987
- "entropy": 5.504038324356079,
988
  "epoch": 2.763385146804836,
989
- "grad_norm": 4.750283718109131,
990
- "learning_rate": 6.369294605809128e-07,
991
- "loss": 5.3366,
992
- "mean_token_accuracy": 0.3087608867883682,
993
- "num_tokens": 5046187.0,
994
  "step": 4800
995
  },
996
  {
997
- "entropy": 5.487624549865723,
998
  "epoch": 2.7921704087507195,
999
- "grad_norm": 1.4186172485351562,
1000
- "learning_rate": 6.221102548903379e-07,
1001
- "loss": 5.3237,
1002
- "mean_token_accuracy": 0.3088638699054718,
1003
- "num_tokens": 5098644.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
- "entropy": 5.346905107498169,
1008
  "epoch": 2.8209556706966032,
1009
- "grad_norm": 1.5670177936553955,
1010
- "learning_rate": 6.072910491997628e-07,
1011
- "loss": 5.1849,
1012
- "mean_token_accuracy": 0.3265886321663857,
1013
- "num_tokens": 5149345.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
- "entropy": 5.510410032272339,
1018
  "epoch": 2.849740932642487,
1019
- "grad_norm": 7.489855766296387,
1020
- "learning_rate": 5.924718435091879e-07,
1021
- "loss": 5.3424,
1022
- "mean_token_accuracy": 0.30768151730299,
1023
- "num_tokens": 5202028.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
- "entropy": 5.525181493759155,
1028
  "epoch": 2.8785261945883707,
1029
- "grad_norm": 1.8829196691513062,
1030
- "learning_rate": 5.776526378186128e-07,
1031
- "loss": 5.3654,
1032
- "mean_token_accuracy": 0.30342737555503846,
1033
- "num_tokens": 5255082.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
- "entropy": 5.374098634719848,
1038
  "epoch": 2.9073114565342544,
1039
- "grad_norm": 1.3901060819625854,
1040
- "learning_rate": 5.628334321280379e-07,
1041
- "loss": 5.2103,
1042
- "mean_token_accuracy": 0.3233291879296303,
1043
- "num_tokens": 5305042.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
- "entropy": 5.374619431495667,
1048
  "epoch": 2.936096718480138,
1049
- "grad_norm": 1.6586560010910034,
1050
- "learning_rate": 5.48014226437463e-07,
1051
- "loss": 5.2125,
1052
- "mean_token_accuracy": 0.322759662270546,
1053
- "num_tokens": 5356310.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
- "entropy": 5.527479724884033,
1058
  "epoch": 2.964881980426022,
1059
- "grad_norm": 1.6678485870361328,
1060
- "learning_rate": 5.331950207468879e-07,
1061
- "loss": 5.3627,
1062
- "mean_token_accuracy": 0.30430852621793747,
1063
- "num_tokens": 5409283.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
- "entropy": 5.6171248292922975,
1068
  "epoch": 2.9936672423719055,
1069
- "grad_norm": 1.50790274143219,
1070
- "learning_rate": 5.18375815056313e-07,
1071
- "loss": 5.4484,
1072
- "mean_token_accuracy": 0.29375598043203355,
1073
- "num_tokens": 5464332.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
- "eval_entropy": 5.78779639186947,
1079
- "eval_loss": 5.628758430480957,
1080
- "eval_mean_token_accuracy": 0.2653660801698535,
1081
- "eval_model_preparation_time": 0.0047,
1082
- "eval_num_tokens": 5475321.0,
1083
- "eval_runtime": 80.3676,
1084
- "eval_samples_per_second": 5.4,
1085
- "eval_steps_per_second": 2.7,
1086
  "step": 5211
1087
  }
1088
  ],
1089
  "logging_steps": 50,
1090
- "max_steps": 6948,
1091
  "num_input_tokens_seen": 0,
1092
- "num_train_epochs": 4,
1093
  "save_steps": 500,
1094
  "stateful_callbacks": {
1095
  "TrainerControl": {
@@ -1103,7 +1103,7 @@
1103
  "attributes": {}
1104
  }
1105
  },
1106
- "total_flos": 7.520890606086144e+16,
1107
  "train_batch_size": 2,
1108
  "trial_name": null,
1109
  "trial_params": null
 
1
  {
2
  "best_global_step": 5211,
3
+ "best_metric": 5.53223991394043,
4
  "best_model_checkpoint": "./output/checkpoint-5211",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.606692385673523,
14
  "epoch": 0.028785261945883708,
15
+ "grad_norm": 3.2999913692474365,
16
  "learning_rate": 4.9e-07,
17
+ "loss": 13.6598,
18
+ "mean_token_accuracy": 0.16028020828962325,
19
+ "num_tokens": 53993.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 3.618675880432129,
24
  "epoch": 0.057570523891767415,
25
+ "grad_norm": 3.101252555847168,
26
  "learning_rate": 9.9e-07,
27
+ "loss": 14.0188,
28
+ "mean_token_accuracy": 0.1508466500043869,
29
+ "num_tokens": 110134.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 3.5215235900878907,
34
  "epoch": 0.08635578583765112,
35
+ "grad_norm": 3.513662815093994,
36
  "learning_rate": 1.49e-06,
37
+ "loss": 12.8555,
38
+ "mean_token_accuracy": 0.18527640983462335,
39
+ "num_tokens": 160191.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 3.667909698486328,
44
  "epoch": 0.11514104778353483,
45
+ "grad_norm": 4.327610492706299,
46
  "learning_rate": 1.99e-06,
47
+ "loss": 13.5394,
48
+ "mean_token_accuracy": 0.157139780074358,
49
+ "num_tokens": 214993.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 3.768263258934021,
54
  "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.290107250213623,
56
+ "learning_rate": 1.988450206246317e-06,
57
+ "loss": 12.8912,
58
+ "mean_token_accuracy": 0.17374794125556947,
59
+ "num_tokens": 268184.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 3.990619196891785,
64
  "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.444278717041016,
66
+ "learning_rate": 1.976664702416028e-06,
67
+ "loss": 12.455,
68
+ "mean_token_accuracy": 0.17780130118131637,
69
+ "num_tokens": 319458.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 4.162646284103394,
74
  "epoch": 0.20149683362118595,
75
+ "grad_norm": 5.615262508392334,
76
+ "learning_rate": 1.9648791985857395e-06,
77
+ "loss": 12.0893,
78
+ "mean_token_accuracy": 0.18191319867968558,
79
+ "num_tokens": 373337.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 4.532100868225098,
84
  "epoch": 0.23028209556706966,
85
+ "grad_norm": 10.074016571044922,
86
+ "learning_rate": 1.9530936947554507e-06,
87
+ "loss": 11.9261,
88
+ "mean_token_accuracy": 0.169477596282959,
89
+ "num_tokens": 427526.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 4.923871030807495,
94
  "epoch": 0.25906735751295334,
95
+ "grad_norm": 16.220163345336914,
96
+ "learning_rate": 1.9413081909251622e-06,
97
+ "loss": 11.0048,
98
+ "mean_token_accuracy": 0.1704501649737358,
99
+ "num_tokens": 480528.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 5.521005854606629,
104
  "epoch": 0.28785261945883706,
105
+ "grad_norm": 29.904008865356445,
106
+ "learning_rate": 1.9295226870948733e-06,
107
+ "loss": 9.6524,
108
+ "mean_token_accuracy": 0.16450899541378022,
109
+ "num_tokens": 535314.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 6.092623329162597,
114
  "epoch": 0.31663788140472077,
115
+ "grad_norm": 17.821575164794922,
116
+ "learning_rate": 1.9177371832645845e-06,
117
+ "loss": 8.1054,
118
+ "mean_token_accuracy": 0.17205011785030366,
119
+ "num_tokens": 588410.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 6.385262680053711,
124
  "epoch": 0.3454231433506045,
125
+ "grad_norm": 5.502202987670898,
126
+ "learning_rate": 1.9059516794342958e-06,
127
+ "loss": 7.4313,
128
+ "mean_token_accuracy": 0.1734227080643177,
129
+ "num_tokens": 641736.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 6.278562617301941,
134
  "epoch": 0.3742084052964882,
135
+ "grad_norm": 5.4657697677612305,
136
+ "learning_rate": 1.8941661756040071e-06,
137
+ "loss": 6.9266,
138
+ "mean_token_accuracy": 0.18680249139666558,
139
+ "num_tokens": 692200.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 6.553266277313233,
144
  "epoch": 0.4029936672423719,
145
+ "grad_norm": 4.955812931060791,
146
+ "learning_rate": 1.8823806717737183e-06,
147
+ "loss": 6.9847,
148
+ "mean_token_accuracy": 0.16679802387952805,
149
+ "num_tokens": 745830.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 6.470935583114624,
154
  "epoch": 0.4317789291882556,
155
+ "grad_norm": 4.198381423950195,
156
+ "learning_rate": 1.8705951679434296e-06,
157
+ "loss": 6.7277,
158
+ "mean_token_accuracy": 0.17847734570503235,
159
+ "num_tokens": 798872.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 6.5620588779449465,
164
  "epoch": 0.4605641911341393,
165
+ "grad_norm": 3.1793746948242188,
166
+ "learning_rate": 1.8588096641131407e-06,
167
+ "loss": 6.7032,
168
+ "mean_token_accuracy": 0.17336134731769562,
169
+ "num_tokens": 853045.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 6.532204885482788,
174
  "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.824537515640259,
176
+ "learning_rate": 1.847024160282852e-06,
177
+ "loss": 6.5762,
178
+ "mean_token_accuracy": 0.1805124071240425,
179
+ "num_tokens": 907679.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 6.535988225936889,
184
  "epoch": 0.5181347150259067,
185
+ "grad_norm": 4.350001811981201,
186
+ "learning_rate": 1.8352386564525632e-06,
187
+ "loss": 6.505,
188
+ "mean_token_accuracy": 0.1842605724930763,
189
+ "num_tokens": 964170.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 6.204533562660218,
194
  "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.193660020828247,
196
+ "learning_rate": 1.8234531526222745e-06,
197
+ "loss": 6.1211,
198
+ "mean_token_accuracy": 0.21968430042266845,
199
+ "num_tokens": 1015909.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 6.308737449645996,
204
  "epoch": 0.5757052389176741,
205
+ "grad_norm": 2.325622320175171,
206
+ "learning_rate": 1.8116676487919857e-06,
207
+ "loss": 6.1653,
208
+ "mean_token_accuracy": 0.21636426240205764,
209
+ "num_tokens": 1068859.0,
210
  "step": 1000
211
  },
212
  {
213
+ "entropy": 6.332560749053955,
214
  "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.0439090728759766,
216
+ "learning_rate": 1.799882144961697e-06,
217
+ "loss": 6.1559,
218
+ "mean_token_accuracy": 0.21859725564718246,
219
+ "num_tokens": 1123202.0,
220
  "step": 1050
221
  },
222
  {
223
+ "entropy": 6.042124252319336,
224
  "epoch": 0.6332757628094415,
225
+ "grad_norm": 3.621903657913208,
226
+ "learning_rate": 1.7880966411314081e-06,
227
+ "loss": 5.8441,
228
+ "mean_token_accuracy": 0.24906315237283708,
229
+ "num_tokens": 1173403.0,
230
  "step": 1100
231
  },
232
  {
233
+ "entropy": 5.921343173980713,
234
  "epoch": 0.6620610247553252,
235
+ "grad_norm": 5.658033847808838,
236
+ "learning_rate": 1.7763111373011195e-06,
237
+ "loss": 5.7104,
238
+ "mean_token_accuracy": 0.2625067520141602,
239
+ "num_tokens": 1225026.0,
240
  "step": 1150
241
  },
242
  {
243
+ "entropy": 6.093586492538452,
244
  "epoch": 0.690846286701209,
245
+ "grad_norm": 2.4292995929718018,
246
+ "learning_rate": 1.7645256334708308e-06,
247
+ "loss": 5.8658,
248
+ "mean_token_accuracy": 0.24842385441064835,
249
+ "num_tokens": 1279013.0,
250
  "step": 1200
251
  },
252
  {
253
+ "entropy": 6.119112596511841,
254
  "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.369384288787842,
256
+ "learning_rate": 1.752740129640542e-06,
257
+ "loss": 5.8784,
258
+ "mean_token_accuracy": 0.24857850253582,
259
+ "num_tokens": 1332547.0,
260
  "step": 1250
261
  },
262
  {
263
+ "entropy": 6.025163550376892,
264
  "epoch": 0.7484168105929764,
265
+ "grad_norm": 2.5110116004943848,
266
+ "learning_rate": 1.7409546258102533e-06,
267
+ "loss": 5.7769,
268
+ "mean_token_accuracy": 0.25835376888513567,
269
+ "num_tokens": 1385192.0,
270
  "step": 1300
271
  },
272
  {
273
+ "entropy": 5.877259612083435,
274
  "epoch": 0.7772020725388601,
275
+ "grad_norm": 2.4179303646087646,
276
+ "learning_rate": 1.7291691219799646e-06,
277
+ "loss": 5.6284,
278
+ "mean_token_accuracy": 0.2756252554059029,
279
+ "num_tokens": 1437071.0,
280
  "step": 1350
281
  },
282
  {
283
+ "entropy": 6.002246947288513,
284
  "epoch": 0.8059873344847438,
285
+ "grad_norm": 3.494359016418457,
286
+ "learning_rate": 1.717383618149676e-06,
287
+ "loss": 5.747,
288
+ "mean_token_accuracy": 0.26462210685014725,
289
+ "num_tokens": 1490818.0,
290
  "step": 1400
291
  },
292
  {
293
+ "entropy": 5.991955623626709,
294
  "epoch": 0.8347725964306275,
295
+ "grad_norm": 2.340975761413574,
296
+ "learning_rate": 1.705598114319387e-06,
297
+ "loss": 5.7379,
298
+ "mean_token_accuracy": 0.26444981098175047,
299
+ "num_tokens": 1544997.0,
300
  "step": 1450
301
  },
302
  {
303
+ "entropy": 5.91768889427185,
304
  "epoch": 0.8635578583765112,
305
+ "grad_norm": 2.2394514083862305,
306
+ "learning_rate": 1.6938126104890984e-06,
307
+ "loss": 5.6564,
308
+ "mean_token_accuracy": 0.2730415526032448,
309
+ "num_tokens": 1598302.0,
310
  "step": 1500
311
  },
312
  {
313
+ "entropy": 5.982716989517212,
314
  "epoch": 0.8923431203223949,
315
+ "grad_norm": 1.876839518547058,
316
+ "learning_rate": 1.6820271066588098e-06,
317
+ "loss": 5.7215,
318
+ "mean_token_accuracy": 0.26642445534467696,
319
+ "num_tokens": 1655267.0,
320
  "step": 1550
321
  },
322
  {
323
+ "entropy": 5.820467872619629,
324
  "epoch": 0.9211283822682786,
325
+ "grad_norm": 2.219966173171997,
326
+ "learning_rate": 1.6702416028285209e-06,
327
+ "loss": 5.5555,
328
+ "mean_token_accuracy": 0.2856418335437775,
329
+ "num_tokens": 1709199.0,
330
  "step": 1600
331
  },
332
  {
333
+ "entropy": 5.996349005699158,
334
  "epoch": 0.9499136442141624,
335
+ "grad_norm": 2.247213840484619,
336
+ "learning_rate": 1.6584560989982322e-06,
337
+ "loss": 5.7283,
338
+ "mean_token_accuracy": 0.2696125540137291,
339
+ "num_tokens": 1765443.0,
340
  "step": 1650
341
  },
342
  {
343
+ "entropy": 5.696683068275451,
344
  "epoch": 0.9786989061600461,
345
+ "grad_norm": 2.8499979972839355,
346
+ "learning_rate": 1.6466705951679433e-06,
347
+ "loss": 5.4335,
348
+ "mean_token_accuracy": 0.29918427973985673,
349
+ "num_tokens": 1817494.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
+ "eval_entropy": 5.993559589034401,
355
+ "eval_loss": 5.737204551696777,
356
+ "eval_mean_token_accuracy": 0.2618687468739699,
357
+ "eval_model_preparation_time": 0.0045,
358
+ "eval_num_tokens": 1856362.0,
359
+ "eval_runtime": 50.5332,
360
+ "eval_samples_per_second": 8.588,
361
+ "eval_steps_per_second": 4.294,
362
  "step": 1737
363
  },
364
  {
365
+ "entropy": 5.746842083930969,
366
  "epoch": 1.0074841681059297,
367
+ "grad_norm": 2.33052921295166,
368
+ "learning_rate": 1.6348850913376547e-06,
369
+ "loss": 5.4796,
370
+ "mean_token_accuracy": 0.2966849410533905,
371
+ "num_tokens": 1870353.0,
372
  "step": 1750
373
  },
374
  {
375
+ "entropy": 5.859029049873352,
376
  "epoch": 1.0362694300518134,
377
+ "grad_norm": 1.6248886585235596,
378
+ "learning_rate": 1.6230995875073658e-06,
379
+ "loss": 5.5975,
380
+ "mean_token_accuracy": 0.2838129925727844,
381
+ "num_tokens": 1926205.0,
382
  "step": 1800
383
  },
384
  {
385
+ "entropy": 5.731445336341858,
386
  "epoch": 1.065054691997697,
387
+ "grad_norm": 1.6941566467285156,
388
+ "learning_rate": 1.6113140836770771e-06,
389
+ "loss": 5.476,
390
+ "mean_token_accuracy": 0.2992346465587616,
391
+ "num_tokens": 1979821.0,
392
  "step": 1850
393
  },
394
  {
395
+ "entropy": 5.6993954515457155,
396
  "epoch": 1.0938399539435808,
397
+ "grad_norm": 1.1746597290039062,
398
+ "learning_rate": 1.5995285798467883e-06,
399
+ "loss": 5.4608,
400
+ "mean_token_accuracy": 0.3000726142525673,
401
+ "num_tokens": 2034373.0,
402
  "step": 1900
403
  },
404
  {
405
+ "entropy": 5.668873124122619,
406
  "epoch": 1.1226252158894645,
407
+ "grad_norm": 1.728211760520935,
408
+ "learning_rate": 1.5877430760164996e-06,
409
+ "loss": 5.4347,
410
+ "mean_token_accuracy": 0.3033922725915909,
411
+ "num_tokens": 2087339.0,
412
  "step": 1950
413
  },
414
  {
415
+ "entropy": 5.624621086120605,
416
  "epoch": 1.1514104778353482,
417
+ "grad_norm": 1.4078539609909058,
418
+ "learning_rate": 1.5759575721862107e-06,
419
+ "loss": 5.3954,
420
+ "mean_token_accuracy": 0.30784171640872954,
421
+ "num_tokens": 2139520.0,
422
  "step": 2000
423
  },
424
  {
425
+ "entropy": 5.7141213130950925,
426
  "epoch": 1.180195739781232,
427
+ "grad_norm": 2.186459541320801,
428
+ "learning_rate": 1.564172068355922e-06,
429
+ "loss": 5.4847,
430
+ "mean_token_accuracy": 0.29594049394130706,
431
+ "num_tokens": 2193987.0,
432
  "step": 2050
433
  },
434
  {
435
+ "entropy": 5.632415266036987,
436
  "epoch": 1.2089810017271156,
437
+ "grad_norm": 1.3601349592208862,
438
+ "learning_rate": 1.5523865645256334e-06,
439
+ "loss": 5.4135,
440
+ "mean_token_accuracy": 0.30366597563028336,
441
+ "num_tokens": 2249616.0,
442
  "step": 2100
443
  },
444
  {
445
+ "entropy": 5.510904269218445,
446
  "epoch": 1.2377662636729994,
447
+ "grad_norm": 2.065760612487793,
448
+ "learning_rate": 1.5406010606953445e-06,
449
+ "loss": 5.2904,
450
+ "mean_token_accuracy": 0.3211754837632179,
451
+ "num_tokens": 2300863.0,
452
  "step": 2150
453
  },
454
  {
455
+ "entropy": 5.703383626937867,
456
  "epoch": 1.266551525618883,
457
+ "grad_norm": 1.1172698736190796,
458
+ "learning_rate": 1.5288155568650559e-06,
459
+ "loss": 5.4802,
460
+ "mean_token_accuracy": 0.29713701367378237,
461
+ "num_tokens": 2356029.0,
462
  "step": 2200
463
  },
464
  {
465
+ "entropy": 5.565930342674255,
466
  "epoch": 1.2953367875647668,
467
+ "grad_norm": 1.7528513669967651,
468
+ "learning_rate": 1.5170300530347672e-06,
469
+ "loss": 5.3518,
470
+ "mean_token_accuracy": 0.31301232606172563,
471
+ "num_tokens": 2408957.0,
472
  "step": 2250
473
  },
474
  {
475
+ "entropy": 5.496430187225342,
476
  "epoch": 1.3241220495106505,
477
+ "grad_norm": 1.892640233039856,
478
+ "learning_rate": 1.5052445492044786e-06,
479
+ "loss": 5.2967,
480
+ "mean_token_accuracy": 0.3181899458169937,
481
+ "num_tokens": 2462569.0,
482
  "step": 2300
483
  },
484
  {
485
+ "entropy": 5.725150098800659,
486
  "epoch": 1.3529073114565342,
487
+ "grad_norm": 1.774940848350525,
488
+ "learning_rate": 1.4934590453741897e-06,
489
+ "loss": 5.5215,
490
+ "mean_token_accuracy": 0.29055028676986694,
491
+ "num_tokens": 2518544.0,
492
  "step": 2350
493
  },
494
  {
495
+ "entropy": 5.4884827613830565,
496
  "epoch": 1.381692573402418,
497
+ "grad_norm": 2.2167599201202393,
498
+ "learning_rate": 1.481673541543901e-06,
499
+ "loss": 5.2917,
500
+ "mean_token_accuracy": 0.31803421139717103,
501
+ "num_tokens": 2570863.0,
502
  "step": 2400
503
  },
504
  {
505
+ "entropy": 5.697079472541809,
506
  "epoch": 1.4104778353483016,
507
+ "grad_norm": 1.6489030122756958,
508
+ "learning_rate": 1.4698880377136124e-06,
509
+ "loss": 5.4982,
510
+ "mean_token_accuracy": 0.2925163987278938,
511
+ "num_tokens": 2626998.0,
512
  "step": 2450
513
  },
514
  {
515
+ "entropy": 5.46209939956665,
516
  "epoch": 1.4392630972941853,
517
+ "grad_norm": 1.153914451599121,
518
+ "learning_rate": 1.4581025338833235e-06,
519
+ "loss": 5.2736,
520
+ "mean_token_accuracy": 0.3182168474793434,
521
+ "num_tokens": 2681568.0,
522
  "step": 2500
523
  },
524
  {
525
+ "entropy": 5.4405768728256225,
526
  "epoch": 1.468048359240069,
527
+ "grad_norm": 3.6614978313446045,
528
+ "learning_rate": 1.4463170300530348e-06,
529
+ "loss": 5.2515,
530
+ "mean_token_accuracy": 0.3218736210465431,
531
+ "num_tokens": 2733587.0,
532
  "step": 2550
533
  },
534
  {
535
+ "entropy": 5.528175053596496,
536
  "epoch": 1.4968336211859528,
537
+ "grad_norm": 1.0849746465682983,
538
+ "learning_rate": 1.434531526222746e-06,
539
+ "loss": 5.3378,
540
+ "mean_token_accuracy": 0.31061659604310987,
541
+ "num_tokens": 2787003.0,
542
  "step": 2600
543
  },
544
  {
545
+ "entropy": 5.46110897064209,
546
  "epoch": 1.5256188831318365,
547
+ "grad_norm": 1.8315683603286743,
548
+ "learning_rate": 1.4227460223924573e-06,
549
+ "loss": 5.2782,
550
+ "mean_token_accuracy": 0.31781029611825945,
551
+ "num_tokens": 2840263.0,
552
  "step": 2650
553
  },
554
  {
555
+ "entropy": 5.455560960769653,
556
  "epoch": 1.5544041450777202,
557
+ "grad_norm": 1.1859091520309448,
558
+ "learning_rate": 1.4109605185621684e-06,
559
+ "loss": 5.2735,
560
+ "mean_token_accuracy": 0.3194814011454582,
561
+ "num_tokens": 2894186.0,
562
  "step": 2700
563
  },
564
  {
565
+ "entropy": 5.430496115684509,
566
  "epoch": 1.583189407023604,
567
+ "grad_norm": 2.3500001430511475,
568
+ "learning_rate": 1.3991750147318797e-06,
569
+ "loss": 5.2464,
570
+ "mean_token_accuracy": 0.32140792965888976,
571
+ "num_tokens": 2948171.0,
572
  "step": 2750
573
  },
574
  {
575
+ "entropy": 5.588023023605347,
576
  "epoch": 1.6119746689694876,
577
+ "grad_norm": 1.727825403213501,
578
+ "learning_rate": 1.3873895109015909e-06,
579
+ "loss": 5.4028,
580
+ "mean_token_accuracy": 0.3039530631899834,
581
+ "num_tokens": 3002678.0,
582
  "step": 2800
583
  },
584
  {
585
+ "entropy": 5.410525422096253,
586
  "epoch": 1.6407599309153713,
587
+ "grad_norm": 1.3401474952697754,
588
+ "learning_rate": 1.3756040070713022e-06,
589
+ "loss": 5.2298,
590
+ "mean_token_accuracy": 0.324065263569355,
591
+ "num_tokens": 3055844.0,
592
  "step": 2850
593
  },
594
  {
595
+ "entropy": 5.36959942817688,
596
  "epoch": 1.669545192861255,
597
+ "grad_norm": 1.1892589330673218,
598
+ "learning_rate": 1.3638185032410133e-06,
599
+ "loss": 5.1956,
600
+ "mean_token_accuracy": 0.32639502108097074,
601
+ "num_tokens": 3108636.0,
602
  "step": 2900
603
  },
604
  {
605
+ "entropy": 5.53826907157898,
606
  "epoch": 1.6983304548071387,
607
+ "grad_norm": 1.2652360200881958,
608
+ "learning_rate": 1.3520329994107247e-06,
609
+ "loss": 5.3583,
610
+ "mean_token_accuracy": 0.3074926760792732,
611
+ "num_tokens": 3162627.0,
612
  "step": 2950
613
  },
614
  {
615
+ "entropy": 5.417449145317078,
616
  "epoch": 1.7271157167530224,
617
+ "grad_norm": 1.584312915802002,
618
+ "learning_rate": 1.340247495580436e-06,
619
+ "loss": 5.2388,
620
+ "mean_token_accuracy": 0.32019727885723115,
621
+ "num_tokens": 3216409.0,
622
  "step": 3000
623
  },
624
  {
625
+ "entropy": 5.241390740871429,
626
  "epoch": 1.7559009786989062,
627
+ "grad_norm": 1.5219439268112183,
628
+ "learning_rate": 1.3284619917501471e-06,
629
+ "loss": 5.0645,
630
+ "mean_token_accuracy": 0.3445430138707161,
631
+ "num_tokens": 3266967.0,
632
  "step": 3050
633
  },
634
  {
635
+ "entropy": 5.405424036979675,
636
  "epoch": 1.7846862406447899,
637
+ "grad_norm": 2.1165153980255127,
638
+ "learning_rate": 1.3166764879198585e-06,
639
+ "loss": 5.232,
640
+ "mean_token_accuracy": 0.32085000157356264,
641
+ "num_tokens": 3319877.0,
642
  "step": 3100
643
  },
644
  {
645
+ "entropy": 5.123006024360657,
646
  "epoch": 1.8134715025906736,
647
+ "grad_norm": 1.2189785242080688,
648
+ "learning_rate": 1.3048909840895698e-06,
649
+ "loss": 4.9582,
650
+ "mean_token_accuracy": 0.356108532845974,
651
+ "num_tokens": 3368569.0,
652
  "step": 3150
653
  },
654
  {
655
+ "entropy": 5.417610831260681,
656
  "epoch": 1.8422567645365573,
657
+ "grad_norm": 1.5157604217529297,
658
+ "learning_rate": 1.2931054802592812e-06,
659
+ "loss": 5.2454,
660
+ "mean_token_accuracy": 0.31976755023002623,
661
+ "num_tokens": 3422449.0,
662
  "step": 3200
663
  },
664
  {
665
+ "entropy": 5.409690895080566,
666
  "epoch": 1.871042026482441,
667
+ "grad_norm": 1.3088161945343018,
668
+ "learning_rate": 1.2813199764289923e-06,
669
+ "loss": 5.2348,
670
+ "mean_token_accuracy": 0.32325415283441544,
671
+ "num_tokens": 3474399.0,
672
  "step": 3250
673
  },
674
  {
675
+ "entropy": 5.44662567615509,
676
  "epoch": 1.8998272884283247,
677
+ "grad_norm": 2.178372621536255,
678
+ "learning_rate": 1.2695344725987036e-06,
679
+ "loss": 5.2661,
680
+ "mean_token_accuracy": 0.3182847076654434,
681
+ "num_tokens": 3527726.0,
682
  "step": 3300
683
  },
684
  {
685
+ "entropy": 5.512614865303039,
686
  "epoch": 1.9286125503742084,
687
+ "grad_norm": 1.3050425052642822,
688
+ "learning_rate": 1.2577489687684147e-06,
689
+ "loss": 5.3416,
690
+ "mean_token_accuracy": 0.3084403133392334,
691
+ "num_tokens": 3581980.0,
692
  "step": 3350
693
  },
694
  {
695
+ "entropy": 5.379772834777832,
696
  "epoch": 1.9573978123200921,
697
+ "grad_norm": 1.4584404230117798,
698
+ "learning_rate": 1.245963464938126e-06,
699
+ "loss": 5.2087,
700
+ "mean_token_accuracy": 0.32388432770967485,
701
+ "num_tokens": 3635393.0,
702
  "step": 3400
703
  },
704
  {
705
+ "entropy": 5.483665924072266,
706
  "epoch": 1.9861830742659758,
707
+ "grad_norm": 1.2157734632492065,
708
+ "learning_rate": 1.2341779611078374e-06,
709
+ "loss": 5.3101,
710
+ "mean_token_accuracy": 0.3121953472495079,
711
+ "num_tokens": 3689894.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
+ "eval_entropy": 5.711394641805904,
717
+ "eval_loss": 5.55628776550293,
718
+ "eval_mean_token_accuracy": 0.2764948787777105,
719
+ "eval_model_preparation_time": 0.0045,
720
+ "eval_num_tokens": 3712724.0,
721
+ "eval_runtime": 50.187,
722
+ "eval_samples_per_second": 8.648,
723
+ "eval_steps_per_second": 4.324,
724
  "step": 3474
725
  },
726
  {
727
+ "entropy": 5.349283556938172,
728
  "epoch": 2.0149683362118593,
729
+ "grad_norm": 1.1696771383285522,
730
+ "learning_rate": 1.2223924572775486e-06,
731
+ "loss": 5.1782,
732
+ "mean_token_accuracy": 0.33028870791196824,
733
+ "num_tokens": 3740861.0,
734
  "step": 3500
735
  },
736
  {
737
+ "entropy": 5.4721107006073,
738
  "epoch": 2.043753598157743,
739
+ "grad_norm": 1.8449370861053467,
740
+ "learning_rate": 1.2106069534472599e-06,
741
+ "loss": 5.2978,
742
+ "mean_token_accuracy": 0.31511022299528124,
743
+ "num_tokens": 3794869.0,
744
  "step": 3550
745
  },
746
  {
747
+ "entropy": 5.404226851463318,
748
  "epoch": 2.0725388601036268,
749
+ "grad_norm": 3.789496660232544,
750
+ "learning_rate": 1.198821449616971e-06,
751
+ "loss": 5.2371,
752
+ "mean_token_accuracy": 0.32092176616191864,
753
+ "num_tokens": 3848573.0,
754
  "step": 3600
755
  },
756
  {
757
+ "entropy": 5.435445628166199,
758
  "epoch": 2.1013241220495105,
759
+ "grad_norm": 2.2847959995269775,
760
+ "learning_rate": 1.1870359457866824e-06,
761
+ "loss": 5.2662,
762
+ "mean_token_accuracy": 0.3186633634567261,
763
+ "num_tokens": 3901204.0,
764
  "step": 3650
765
  },
766
  {
767
+ "entropy": 5.4066293334960935,
768
  "epoch": 2.130109383995394,
769
+ "grad_norm": 1.0950902700424194,
770
+ "learning_rate": 1.1752504419563935e-06,
771
+ "loss": 5.2345,
772
+ "mean_token_accuracy": 0.32156052827835085,
773
+ "num_tokens": 3953753.0,
774
  "step": 3700
775
  },
776
  {
777
+ "entropy": 5.272332944869995,
778
  "epoch": 2.158894645941278,
779
+ "grad_norm": 2.1477339267730713,
780
+ "learning_rate": 1.1634649381261048e-06,
781
+ "loss": 5.1091,
782
+ "mean_token_accuracy": 0.3380983591079712,
783
+ "num_tokens": 4005481.0,
784
  "step": 3750
785
  },
786
  {
787
+ "entropy": 5.4118804311752315,
788
  "epoch": 2.1876799078871616,
789
+ "grad_norm": 1.4509484767913818,
790
+ "learning_rate": 1.151679434295816e-06,
791
+ "loss": 5.2448,
792
+ "mean_token_accuracy": 0.3208243528008461,
793
+ "num_tokens": 4058829.0,
794
  "step": 3800
795
  },
796
  {
797
+ "entropy": 5.4763900089263915,
798
  "epoch": 2.2164651698330453,
799
+ "grad_norm": 1.0856804847717285,
800
+ "learning_rate": 1.1398939304655273e-06,
801
+ "loss": 5.3042,
802
+ "mean_token_accuracy": 0.31338351368904116,
803
+ "num_tokens": 4113326.0,
804
  "step": 3850
805
  },
806
  {
807
+ "entropy": 5.328452725410461,
808
  "epoch": 2.245250431778929,
809
+ "grad_norm": 3.2843880653381348,
810
+ "learning_rate": 1.1281084266352386e-06,
811
+ "loss": 5.1624,
812
+ "mean_token_accuracy": 0.3305218696594238,
813
+ "num_tokens": 4165454.0,
814
  "step": 3900
815
  },
816
  {
817
+ "entropy": 5.383157343864441,
818
  "epoch": 2.2740356937248127,
819
+ "grad_norm": 2.207082748413086,
820
+ "learning_rate": 1.1163229228049497e-06,
821
+ "loss": 5.2163,
822
+ "mean_token_accuracy": 0.32331310987472534,
823
+ "num_tokens": 4219250.0,
824
  "step": 3950
825
  },
826
  {
827
+ "entropy": 5.585261764526368,
828
  "epoch": 2.3028209556706964,
829
+ "grad_norm": 2.7102835178375244,
830
+ "learning_rate": 1.104537418974661e-06,
831
+ "loss": 5.4137,
832
+ "mean_token_accuracy": 0.29959124475717547,
833
+ "num_tokens": 4274711.0,
834
  "step": 4000
835
  },
836
  {
837
+ "entropy": 5.434073266983032,
838
  "epoch": 2.33160621761658,
839
+ "grad_norm": 1.3775779008865356,
840
+ "learning_rate": 1.0927519151443724e-06,
841
+ "loss": 5.2644,
842
+ "mean_token_accuracy": 0.3175011593103409,
843
+ "num_tokens": 4328616.0,
844
  "step": 4050
845
  },
846
  {
847
+ "entropy": 5.462391858100891,
848
  "epoch": 2.360391479562464,
849
+ "grad_norm": 1.4101024866104126,
850
+ "learning_rate": 1.0809664113140838e-06,
851
+ "loss": 5.2924,
852
+ "mean_token_accuracy": 0.3137941011786461,
853
+ "num_tokens": 4382416.0,
854
  "step": 4100
855
  },
856
  {
857
+ "entropy": 5.529892563819885,
858
  "epoch": 2.3891767415083476,
859
+ "grad_norm": 1.2311837673187256,
860
+ "learning_rate": 1.0691809074837949e-06,
861
+ "loss": 5.364,
862
+ "mean_token_accuracy": 0.3046491605043411,
863
+ "num_tokens": 4437848.0,
864
  "step": 4150
865
  },
866
  {
867
+ "entropy": 5.4370484542846675,
868
  "epoch": 2.4179620034542313,
869
+ "grad_norm": 1.0929864645004272,
870
+ "learning_rate": 1.0573954036535062e-06,
871
+ "loss": 5.2734,
872
+ "mean_token_accuracy": 0.3169013774394989,
873
+ "num_tokens": 4491185.0,
874
  "step": 4200
875
  },
876
  {
877
+ "entropy": 5.395377616882325,
878
  "epoch": 2.446747265400115,
879
+ "grad_norm": 1.5457273721694946,
880
+ "learning_rate": 1.0456098998232174e-06,
881
+ "loss": 5.2276,
882
+ "mean_token_accuracy": 0.32221508473157884,
883
+ "num_tokens": 4544086.0,
884
  "step": 4250
885
  },
886
  {
887
+ "entropy": 5.443737335205078,
888
  "epoch": 2.4755325273459987,
889
+ "grad_norm": 1.4844346046447754,
890
+ "learning_rate": 1.0338243959929287e-06,
891
+ "loss": 5.2786,
892
+ "mean_token_accuracy": 0.3157751387357712,
893
+ "num_tokens": 4597677.0,
894
  "step": 4300
895
  },
896
  {
897
+ "entropy": 5.419876251220703,
898
  "epoch": 2.5043177892918824,
899
+ "grad_norm": 1.2481963634490967,
900
+ "learning_rate": 1.02203889216264e-06,
901
+ "loss": 5.2564,
902
+ "mean_token_accuracy": 0.31889803290367125,
903
+ "num_tokens": 4651343.0,
904
  "step": 4350
905
  },
906
  {
907
+ "entropy": 5.578677978515625,
908
  "epoch": 2.533103051237766,
909
+ "grad_norm": 2.0005414485931396,
910
+ "learning_rate": 1.0102533883323512e-06,
911
+ "loss": 5.4145,
912
+ "mean_token_accuracy": 0.30037090003490446,
913
+ "num_tokens": 4705985.0,
914
  "step": 4400
915
  },
916
  {
917
+ "entropy": 5.279946126937866,
918
  "epoch": 2.56188831318365,
919
+ "grad_norm": 1.080521821975708,
920
+ "learning_rate": 9.984678845020625e-07,
921
+ "loss": 5.1226,
922
+ "mean_token_accuracy": 0.3341303279995918,
923
+ "num_tokens": 4757741.0,
924
  "step": 4450
925
  },
926
  {
927
+ "entropy": 5.551463279724121,
928
  "epoch": 2.5906735751295336,
929
+ "grad_norm": 1.28898024559021,
930
+ "learning_rate": 9.866823806717736e-07,
931
+ "loss": 5.3832,
932
+ "mean_token_accuracy": 0.3028248634934425,
933
+ "num_tokens": 4812808.0,
934
  "step": 4500
935
  },
936
  {
937
+ "entropy": 5.3787487554550175,
938
  "epoch": 2.6194588370754173,
939
+ "grad_norm": 1.5697983503341675,
940
+ "learning_rate": 9.74896876841485e-07,
941
+ "loss": 5.2141,
942
+ "mean_token_accuracy": 0.3227942296862602,
943
+ "num_tokens": 4866572.0,
944
  "step": 4550
945
  },
946
  {
947
+ "entropy": 5.460358958244324,
948
  "epoch": 2.648244099021301,
949
+ "grad_norm": 1.3180441856384277,
950
+ "learning_rate": 9.63111373011196e-07,
951
+ "loss": 5.2954,
952
+ "mean_token_accuracy": 0.31269474506378175,
953
+ "num_tokens": 4921312.0,
954
  "step": 4600
955
  },
956
  {
957
+ "entropy": 5.434084935188293,
958
  "epoch": 2.6770293609671847,
959
+ "grad_norm": 1.2409590482711792,
960
+ "learning_rate": 9.513258691809074e-07,
961
+ "loss": 5.271,
962
+ "mean_token_accuracy": 0.3172155100107193,
963
+ "num_tokens": 4974289.0,
964
  "step": 4650
965
  },
966
  {
967
+ "entropy": 5.406955418586731,
968
  "epoch": 2.7058146229130684,
969
+ "grad_norm": 1.4782609939575195,
970
+ "learning_rate": 9.395403653506187e-07,
971
+ "loss": 5.2473,
972
+ "mean_token_accuracy": 0.32031788885593415,
973
+ "num_tokens": 5028149.0,
974
  "step": 4700
975
  },
976
  {
977
+ "entropy": 5.206603040695191,
978
  "epoch": 2.734599884858952,
979
+ "grad_norm": 2.351633071899414,
980
+ "learning_rate": 9.2775486152033e-07,
981
+ "loss": 5.0478,
982
+ "mean_token_accuracy": 0.3428420132398605,
983
+ "num_tokens": 5079349.0,
984
  "step": 4750
985
  },
986
  {
987
+ "entropy": 5.388812799453735,
988
  "epoch": 2.763385146804836,
989
+ "grad_norm": 7.564618110656738,
990
+ "learning_rate": 9.159693576900412e-07,
991
+ "loss": 5.2281,
992
+ "mean_token_accuracy": 0.3222071170806885,
993
+ "num_tokens": 5132564.0,
994
  "step": 4800
995
  },
996
  {
997
+ "entropy": 5.374106278419495,
998
  "epoch": 2.7921704087507195,
999
+ "grad_norm": 1.4734679460525513,
1000
+ "learning_rate": 9.041838538597525e-07,
1001
+ "loss": 5.2161,
1002
+ "mean_token_accuracy": 0.3219477406144142,
1003
+ "num_tokens": 5185921.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
+ "entropy": 5.232998585700988,
1008
  "epoch": 2.8209556706966032,
1009
+ "grad_norm": 1.4175471067428589,
1010
+ "learning_rate": 8.923983500294637e-07,
1011
+ "loss": 5.0769,
1012
+ "mean_token_accuracy": 0.3403926733136177,
1013
+ "num_tokens": 5237521.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
+ "entropy": 5.394891719818116,
1018
  "epoch": 2.849740932642487,
1019
+ "grad_norm": 4.951873779296875,
1020
+ "learning_rate": 8.806128461991749e-07,
1021
+ "loss": 5.2344,
1022
+ "mean_token_accuracy": 0.3213117456436157,
1023
+ "num_tokens": 5291104.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
+ "entropy": 5.413805012702942,
1028
  "epoch": 2.8785261945883707,
1029
+ "grad_norm": 1.679518461227417,
1030
+ "learning_rate": 8.688273423688863e-07,
1031
+ "loss": 5.2597,
1032
+ "mean_token_accuracy": 0.3165634173154831,
1033
+ "num_tokens": 5345058.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
+ "entropy": 5.256177935600281,
1038
  "epoch": 2.9073114565342544,
1039
+ "grad_norm": 1.8892916440963745,
1040
+ "learning_rate": 8.570418385385975e-07,
1041
+ "loss": 5.1004,
1042
+ "mean_token_accuracy": 0.3369427987933159,
1043
+ "num_tokens": 5395918.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
+ "entropy": 5.259814453125,
1048
  "epoch": 2.936096718480138,
1049
+ "grad_norm": 1.3802675008773804,
1050
+ "learning_rate": 8.452563347083087e-07,
1051
+ "loss": 5.1057,
1052
+ "mean_token_accuracy": 0.3362414276599884,
1053
+ "num_tokens": 5448086.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
+ "entropy": 5.416206178665161,
1058
  "epoch": 2.964881980426022,
1059
+ "grad_norm": 1.7677236795425415,
1060
+ "learning_rate": 8.3347083087802e-07,
1061
+ "loss": 5.2562,
1062
+ "mean_token_accuracy": 0.31725785195827483,
1063
+ "num_tokens": 5501959.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
+ "entropy": 5.507337794303894,
1068
  "epoch": 2.9936672423719055,
1069
+ "grad_norm": 1.021727442741394,
1070
+ "learning_rate": 8.216853270477313e-07,
1071
+ "loss": 5.344,
1072
+ "mean_token_accuracy": 0.30679062128067014,
1073
+ "num_tokens": 5557908.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
+ "eval_entropy": 5.682707933786278,
1079
+ "eval_loss": 5.53223991394043,
1080
+ "eval_mean_token_accuracy": 0.27747743456594404,
1081
+ "eval_model_preparation_time": 0.0045,
1082
+ "eval_num_tokens": 5569086.0,
1083
+ "eval_runtime": 49.9944,
1084
+ "eval_samples_per_second": 8.681,
1085
+ "eval_steps_per_second": 4.34,
1086
  "step": 5211
1087
  }
1088
  ],
1089
  "logging_steps": 50,
1090
+ "max_steps": 8685,
1091
  "num_input_tokens_seen": 0,
1092
+ "num_train_epochs": 5,
1093
  "save_steps": 500,
1094
  "stateful_callbacks": {
1095
  "TrainerControl": {
 
1103
  "attributes": {}
1104
  }
1105
  },
1106
+ "total_flos": 7.632397197691392e+16,
1107
  "train_batch_size": 2,
1108
  "trial_name": null,
1109
  "trial_params": null
checkpoint-5211/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8db5c304963110404ebb6947b83ba95bd9b8aad1f9b8b578cc33c46d601e13dc
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a666397e6243ddba6f7279c90610ed552907ef4de0be511faece3826d13e618
3
  size 6225
checkpoint-6948/adapter_config.json CHANGED
@@ -29,10 +29,10 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
33
  "k_proj",
34
  "v_proj",
35
- "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "o_proj",
33
  "k_proj",
34
  "v_proj",
35
+ "q_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-6948/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a42655e5c5bf5a17388c99c67741b81d97a904a649f92d5298361717c78abaac
3
  size 26182176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad17a7c29bcc91941e8e904522c1e1408f363f45d397b0f1a5a0c57829008c18
3
  size 26182176
checkpoint-6948/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f4dc67fd123c4a9f8eb45bc8894cccfeeb5a7766daf44f4ca97786db172fd5f
3
  size 52486155
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad7e1547dc98180a197d1e09ff5462cc7435be478f2eadfbb7a35f3fe318cbac
3
  size 52486155
checkpoint-6948/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de23a91d8efb3b92e132a49e237b78926ed9acaded7b594b358633abace10591
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8368c41a01c88b53eacde3119bcf65f0f4d5b3c36a14adcf04f08f24495c404
3
  size 14645
checkpoint-6948/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ced24601208e373e591e4ce71c0d860f568ef5205374f58c5db9ee9e78232103
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fc55ff863a5856e69cbaf9eb3a96203d5bcf04c7648e579610743cc43b484f9
3
  size 1465
checkpoint-6948/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 6948,
3
- "best_metric": 5.622366428375244,
4
  "best_model_checkpoint": "./output/checkpoint-6948",
5
  "epoch": 4.0,
6
  "eval_steps": 500,
@@ -10,1438 +10,1438 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 3.6583470726013183,
14
  "epoch": 0.028785261945883708,
15
- "grad_norm": 3.3817152976989746,
16
  "learning_rate": 4.9e-07,
17
- "loss": 13.8754,
18
- "mean_token_accuracy": 0.15036460414528846,
19
- "num_tokens": 53093.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 3.669608063697815,
24
  "epoch": 0.057570523891767415,
25
- "grad_norm": 3.2541544437408447,
26
  "learning_rate": 9.9e-07,
27
- "loss": 14.2282,
28
- "mean_token_accuracy": 0.14137721598148345,
29
- "num_tokens": 108334.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 3.569736371040344,
34
  "epoch": 0.08635578583765112,
35
- "grad_norm": 3.6797454357147217,
36
  "learning_rate": 1.49e-06,
37
- "loss": 13.0735,
38
- "mean_token_accuracy": 0.17473630651831626,
39
- "num_tokens": 157491.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 3.7253233194351196,
44
  "epoch": 0.11514104778353483,
45
- "grad_norm": 4.297911643981934,
46
  "learning_rate": 1.99e-06,
47
- "loss": 13.7392,
48
- "mean_token_accuracy": 0.1473099772632122,
49
- "num_tokens": 211394.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 3.8280500602722167,
54
  "epoch": 0.14392630972941853,
55
- "grad_norm": 4.405268669128418,
56
- "learning_rate": 1.9854771784232364e-06,
57
- "loss": 13.0797,
58
- "mean_token_accuracy": 0.16704789966344832,
59
- "num_tokens": 263685.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 4.066333084106446,
64
  "epoch": 0.17271157167530224,
65
- "grad_norm": 4.757556438446045,
66
- "learning_rate": 1.9706579727326615e-06,
67
- "loss": 12.6321,
68
- "mean_token_accuracy": 0.1691790708899498,
69
- "num_tokens": 314059.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 4.257266030311585,
74
  "epoch": 0.20149683362118595,
75
- "grad_norm": 6.406249523162842,
76
- "learning_rate": 1.955838767042086e-06,
77
- "loss": 12.2253,
78
- "mean_token_accuracy": 0.17223650276660918,
79
- "num_tokens": 367038.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 4.694105777740479,
84
  "epoch": 0.23028209556706966,
85
- "grad_norm": 12.57987117767334,
86
- "learning_rate": 1.9410195613515113e-06,
87
- "loss": 11.9714,
88
- "mean_token_accuracy": 0.15997304677963256,
89
- "num_tokens": 420327.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 5.205010280609131,
94
  "epoch": 0.25906735751295334,
95
- "grad_norm": 15.570313453674316,
96
- "learning_rate": 1.9262003556609364e-06,
97
- "loss": 10.8173,
98
- "mean_token_accuracy": 0.16447648257017136,
99
- "num_tokens": 472429.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 5.917805089950561,
104
  "epoch": 0.28785261945883706,
105
- "grad_norm": 23.61503791809082,
106
- "learning_rate": 1.9113811499703615e-06,
107
- "loss": 9.3196,
108
- "mean_token_accuracy": 0.16179455041885377,
109
- "num_tokens": 526315.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 6.380368332862854,
114
  "epoch": 0.31663788140472077,
115
- "grad_norm": 13.846810340881348,
116
- "learning_rate": 1.8965619442797864e-06,
117
- "loss": 7.9636,
118
- "mean_token_accuracy": 0.16881170988082886,
119
- "num_tokens": 578511.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 6.507339992523193,
124
  "epoch": 0.3454231433506045,
125
- "grad_norm": 4.569090366363525,
126
- "learning_rate": 1.8817427385892115e-06,
127
- "loss": 7.4171,
128
- "mean_token_accuracy": 0.16941152423620223,
129
- "num_tokens": 630937.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 6.392864561080932,
134
  "epoch": 0.3742084052964882,
135
- "grad_norm": 4.594696521759033,
136
- "learning_rate": 1.8669235328986366e-06,
137
- "loss": 6.9389,
138
- "mean_token_accuracy": 0.1844496901333332,
139
- "num_tokens": 680501.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 6.6726202869415285,
144
  "epoch": 0.4029936672423719,
145
- "grad_norm": 4.768734931945801,
146
- "learning_rate": 1.8521043272080617e-06,
147
- "loss": 6.9818,
148
- "mean_token_accuracy": 0.16990411713719367,
149
- "num_tokens": 733231.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 6.592793455123902,
154
  "epoch": 0.4317789291882556,
155
- "grad_norm": 3.253056764602661,
156
- "learning_rate": 1.8372851215174864e-06,
157
- "loss": 6.7105,
158
- "mean_token_accuracy": 0.18250102579593658,
159
- "num_tokens": 785373.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 6.683582029342651,
164
  "epoch": 0.4605641911341393,
165
- "grad_norm": 2.1871063709259033,
166
- "learning_rate": 1.8224659158269115e-06,
167
- "loss": 6.6685,
168
- "mean_token_accuracy": 0.17129646152257919,
169
- "num_tokens": 838646.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 6.636875295639038,
174
  "epoch": 0.48934945308002303,
175
- "grad_norm": 3.2284677028656006,
176
- "learning_rate": 1.8076467101363366e-06,
177
- "loss": 6.53,
178
- "mean_token_accuracy": 0.18053789794445038,
179
- "num_tokens": 892380.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 6.610673260688782,
184
  "epoch": 0.5181347150259067,
185
- "grad_norm": 2.2088730335235596,
186
- "learning_rate": 1.7928275044457617e-06,
187
- "loss": 6.4429,
188
- "mean_token_accuracy": 0.18492739230394364,
189
- "num_tokens": 947971.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 6.242899022102356,
194
  "epoch": 0.5469199769717904,
195
- "grad_norm": 2.3000030517578125,
196
- "learning_rate": 1.7780082987551866e-06,
197
- "loss": 6.047,
198
- "mean_token_accuracy": 0.2291259828209877,
199
- "num_tokens": 998810.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 6.311488924026489,
204
  "epoch": 0.5757052389176741,
205
- "grad_norm": 2.1333675384521484,
206
- "learning_rate": 1.7631890930646115e-06,
207
- "loss": 6.0919,
208
- "mean_token_accuracy": 0.22644571751356124,
209
- "num_tokens": 1050860.0,
210
  "step": 1000
211
  },
212
  {
213
- "entropy": 6.3254336166381835,
214
  "epoch": 0.6044905008635578,
215
- "grad_norm": 2.0400779247283936,
216
- "learning_rate": 1.7483698873740366e-06,
217
- "loss": 6.094,
218
- "mean_token_accuracy": 0.2222653564810753,
219
- "num_tokens": 1104304.0,
220
  "step": 1050
221
  },
222
  {
223
- "entropy": 6.046922063827514,
224
  "epoch": 0.6332757628094415,
225
- "grad_norm": 2.8049051761627197,
226
- "learning_rate": 1.7335506816834617e-06,
227
- "loss": 5.8011,
228
- "mean_token_accuracy": 0.25127078920602797,
229
- "num_tokens": 1153605.0,
230
  "step": 1100
231
  },
232
  {
233
- "entropy": 5.943600912094116,
234
  "epoch": 0.6620610247553252,
235
- "grad_norm": 4.063963890075684,
236
- "learning_rate": 1.7187314759928866e-06,
237
- "loss": 5.6855,
238
- "mean_token_accuracy": 0.26265266716480257,
239
- "num_tokens": 1204328.0,
240
  "step": 1150
241
  },
242
  {
243
- "entropy": 6.12883231639862,
244
  "epoch": 0.690846286701209,
245
- "grad_norm": 3.9440460205078125,
246
- "learning_rate": 1.7039122703023117e-06,
247
- "loss": 5.8578,
248
- "mean_token_accuracy": 0.24439335912466048,
249
- "num_tokens": 1257415.0,
250
  "step": 1200
251
  },
252
  {
253
- "entropy": 6.164987115859986,
254
  "epoch": 0.7196315486470927,
255
- "grad_norm": 3.20070481300354,
256
- "learning_rate": 1.6890930646117368e-06,
257
- "loss": 5.8876,
258
- "mean_token_accuracy": 0.24275501281023026,
259
- "num_tokens": 1310049.0,
260
  "step": 1250
261
  },
262
  {
263
- "entropy": 6.080997190475464,
264
  "epoch": 0.7484168105929764,
265
- "grad_norm": 2.8067362308502197,
266
- "learning_rate": 1.6742738589211617e-06,
267
- "loss": 5.8058,
268
- "mean_token_accuracy": 0.25242207854986193,
269
- "num_tokens": 1361794.0,
270
  "step": 1300
271
  },
272
  {
273
- "entropy": 5.940848155021667,
274
  "epoch": 0.7772020725388601,
275
- "grad_norm": 2.6375925540924072,
276
- "learning_rate": 1.6594546532305868e-06,
277
- "loss": 5.6718,
278
- "mean_token_accuracy": 0.2665082859992981,
279
- "num_tokens": 1412773.0,
280
  "step": 1350
281
  },
282
  {
283
- "entropy": 6.071129274368286,
284
  "epoch": 0.8059873344847438,
285
- "grad_norm": 3.951350212097168,
286
- "learning_rate": 1.6446354475400117e-06,
287
- "loss": 5.8012,
288
- "mean_token_accuracy": 0.25434976994991304,
289
- "num_tokens": 1465620.0,
290
  "step": 1400
291
  },
292
  {
293
- "entropy": 6.069429359436035,
294
  "epoch": 0.8347725964306275,
295
- "grad_norm": 3.580608606338501,
296
- "learning_rate": 1.6298162418494368e-06,
297
- "loss": 5.8027,
298
- "mean_token_accuracy": 0.25208072274923327,
299
- "num_tokens": 1518899.0,
300
  "step": 1450
301
  },
302
  {
303
- "entropy": 6.005315380096436,
304
  "epoch": 0.8635578583765112,
305
- "grad_norm": 3.9580376148223877,
306
- "learning_rate": 1.614997036158862e-06,
307
- "loss": 5.7364,
308
- "mean_token_accuracy": 0.25940640360116957,
309
- "num_tokens": 1571304.0,
310
  "step": 1500
311
  },
312
  {
313
- "entropy": 6.0786464881896975,
314
  "epoch": 0.8923431203223949,
315
- "grad_norm": 4.55721378326416,
316
- "learning_rate": 1.6001778304682868e-06,
317
- "loss": 5.8092,
318
- "mean_token_accuracy": 0.2496869170665741,
319
- "num_tokens": 1627369.0,
320
  "step": 1550
321
  },
322
  {
323
- "entropy": 5.939382014274597,
324
  "epoch": 0.9211283822682786,
325
- "grad_norm": 2.330057144165039,
326
- "learning_rate": 1.5853586247777117e-06,
327
- "loss": 5.6604,
328
- "mean_token_accuracy": 0.2686630353331566,
329
- "num_tokens": 1680401.0,
330
  "step": 1600
331
  },
332
  {
333
- "entropy": 6.121775646209716,
334
  "epoch": 0.9499136442141624,
335
- "grad_norm": 2.9881200790405273,
336
- "learning_rate": 1.5705394190871368e-06,
337
- "loss": 5.8388,
338
- "mean_token_accuracy": 0.2503683388233185,
339
- "num_tokens": 1735745.0,
340
  "step": 1650
341
  },
342
  {
343
- "entropy": 5.840040788650513,
344
  "epoch": 0.9786989061600461,
345
- "grad_norm": 3.798994779586792,
346
- "learning_rate": 1.555720213396562e-06,
347
- "loss": 5.5635,
348
- "mean_token_accuracy": 0.278279125392437,
349
- "num_tokens": 1786896.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
- "eval_entropy": 6.139133475343203,
355
- "eval_loss": 5.861395835876465,
356
- "eval_mean_token_accuracy": 0.2402858340657801,
357
- "eval_model_preparation_time": 0.0047,
358
- "eval_num_tokens": 1825107.0,
359
- "eval_runtime": 79.3994,
360
- "eval_samples_per_second": 5.466,
361
- "eval_steps_per_second": 2.733,
362
  "step": 1737
363
  },
364
  {
365
- "entropy": 5.8970259666442875,
366
  "epoch": 1.0074841681059297,
367
- "grad_norm": 2.6411802768707275,
368
- "learning_rate": 1.540901007705987e-06,
369
- "loss": 5.614,
370
- "mean_token_accuracy": 0.273006406724453,
371
- "num_tokens": 1838864.0,
372
  "step": 1750
373
  },
374
  {
375
- "entropy": 6.0111794090271,
376
  "epoch": 1.0362694300518134,
377
- "grad_norm": 3.6491827964782715,
378
- "learning_rate": 1.526081802015412e-06,
379
- "loss": 5.7323,
380
- "mean_token_accuracy": 0.26104256987571717,
381
- "num_tokens": 1893816.0,
382
  "step": 1800
383
  },
384
  {
385
- "entropy": 5.902219276428223,
386
  "epoch": 1.065054691997697,
387
- "grad_norm": 2.593249559402466,
388
- "learning_rate": 1.5112625963248368e-06,
389
- "loss": 5.6187,
390
- "mean_token_accuracy": 0.2746362566947937,
391
- "num_tokens": 1946532.0,
392
  "step": 1850
393
  },
394
  {
395
- "entropy": 5.874705944061279,
396
  "epoch": 1.0938399539435808,
397
- "grad_norm": 2.554327964782715,
398
- "learning_rate": 1.496443390634262e-06,
399
- "loss": 5.6021,
400
- "mean_token_accuracy": 0.2795292744040489,
401
- "num_tokens": 2000184.0,
402
  "step": 1900
403
  },
404
  {
405
- "entropy": 5.850096368789673,
406
  "epoch": 1.1226252158894645,
407
- "grad_norm": 3.6060993671417236,
408
- "learning_rate": 1.481624184943687e-06,
409
- "loss": 5.576,
410
- "mean_token_accuracy": 0.28532547056674956,
411
- "num_tokens": 2052250.0,
412
  "step": 1950
413
  },
414
  {
415
- "entropy": 5.802229671478272,
416
  "epoch": 1.1514104778353482,
417
- "grad_norm": 3.0913314819335938,
418
- "learning_rate": 1.466804979253112e-06,
419
- "loss": 5.53,
420
- "mean_token_accuracy": 0.2916027933359146,
421
- "num_tokens": 2103531.0,
422
  "step": 2000
423
  },
424
  {
425
- "entropy": 5.875646467208862,
426
  "epoch": 1.180195739781232,
427
- "grad_norm": 4.777045726776123,
428
- "learning_rate": 1.451985773562537e-06,
429
- "loss": 5.6146,
430
- "mean_token_accuracy": 0.28063644528388976,
431
- "num_tokens": 2157098.0,
432
  "step": 2050
433
  },
434
  {
435
- "entropy": 5.786596937179565,
436
  "epoch": 1.2089810017271156,
437
- "grad_norm": 4.207762718200684,
438
- "learning_rate": 1.437166567871962e-06,
439
- "loss": 5.5417,
440
- "mean_token_accuracy": 0.2870470091700554,
441
- "num_tokens": 2211827.0,
442
  "step": 2100
443
  },
444
  {
445
- "entropy": 5.672234449386597,
446
  "epoch": 1.2377662636729994,
447
- "grad_norm": 2.2771811485290527,
448
- "learning_rate": 1.422347362181387e-06,
449
- "loss": 5.4285,
450
- "mean_token_accuracy": 0.30194485366344453,
451
- "num_tokens": 2262174.0,
452
  "step": 2150
453
  },
454
  {
455
- "entropy": 5.862573285102844,
456
  "epoch": 1.266551525618883,
457
- "grad_norm": 3.3273422718048096,
458
- "learning_rate": 1.4075281564908121e-06,
459
- "loss": 5.6169,
460
- "mean_token_accuracy": 0.278145115673542,
461
- "num_tokens": 2316440.0,
462
  "step": 2200
463
  },
464
  {
465
- "entropy": 5.734760231971741,
466
  "epoch": 1.2953367875647668,
467
- "grad_norm": 3.7049715518951416,
468
- "learning_rate": 1.392708950800237e-06,
469
- "loss": 5.493,
470
- "mean_token_accuracy": 0.2941485676169395,
471
- "num_tokens": 2368468.0,
472
  "step": 2250
473
  },
474
  {
475
- "entropy": 5.665819988250733,
476
  "epoch": 1.3241220495106505,
477
- "grad_norm": 3.572636604309082,
478
- "learning_rate": 1.3778897451096621e-06,
479
- "loss": 5.4352,
480
- "mean_token_accuracy": 0.3003745040297508,
481
- "num_tokens": 2421180.0,
482
  "step": 2300
483
  },
484
  {
485
- "entropy": 5.890115032196045,
486
  "epoch": 1.3529073114565342,
487
- "grad_norm": 2.738203525543213,
488
- "learning_rate": 1.3630705394190872e-06,
489
- "loss": 5.6555,
490
- "mean_token_accuracy": 0.2737997192144394,
491
- "num_tokens": 2476255.0,
492
  "step": 2350
493
  },
494
  {
495
- "entropy": 5.66056040763855,
496
  "epoch": 1.381692573402418,
497
- "grad_norm": 3.1416995525360107,
498
- "learning_rate": 1.3482513337285121e-06,
499
- "loss": 5.4302,
500
- "mean_token_accuracy": 0.3000989046692848,
501
- "num_tokens": 2527674.0,
502
  "step": 2400
503
  },
504
  {
505
- "entropy": 5.861240615844727,
506
  "epoch": 1.4104778353483016,
507
- "grad_norm": 2.7569284439086914,
508
- "learning_rate": 1.333432128037937e-06,
509
- "loss": 5.6304,
510
- "mean_token_accuracy": 0.27707513481378554,
511
- "num_tokens": 2582909.0,
512
  "step": 2450
513
  },
514
  {
515
- "entropy": 5.627686910629272,
516
  "epoch": 1.4392630972941853,
517
- "grad_norm": 1.7750262022018433,
518
- "learning_rate": 1.3186129223473621e-06,
519
- "loss": 5.4058,
520
- "mean_token_accuracy": 0.3019809901714325,
521
- "num_tokens": 2636579.0,
522
  "step": 2500
523
  },
524
  {
525
- "entropy": 5.607026796340943,
526
  "epoch": 1.468048359240069,
527
- "grad_norm": 3.1005160808563232,
528
- "learning_rate": 1.3037937166567872e-06,
529
- "loss": 5.3836,
530
- "mean_token_accuracy": 0.30584611505270004,
531
- "num_tokens": 2687698.0,
532
  "step": 2550
533
  },
534
  {
535
- "entropy": 5.6909641885757445,
536
  "epoch": 1.4968336211859528,
537
- "grad_norm": 1.6848654747009277,
538
- "learning_rate": 1.2889745109662123e-06,
539
- "loss": 5.4653,
540
- "mean_token_accuracy": 0.296178964972496,
541
- "num_tokens": 2740214.0,
542
  "step": 2600
543
  },
544
  {
545
- "entropy": 5.619450302124023,
546
  "epoch": 1.5256188831318365,
547
- "grad_norm": 2.469539165496826,
548
- "learning_rate": 1.274155305275637e-06,
549
- "loss": 5.4022,
550
- "mean_token_accuracy": 0.3039679077267647,
551
- "num_tokens": 2792574.0,
552
  "step": 2650
553
  },
554
  {
555
- "entropy": 5.61073097705841,
556
  "epoch": 1.5544041450777202,
557
- "grad_norm": 2.367810010910034,
558
- "learning_rate": 1.259336099585062e-06,
559
- "loss": 5.3956,
560
- "mean_token_accuracy": 0.3051413372159004,
561
- "num_tokens": 2845597.0,
562
  "step": 2700
563
  },
564
  {
565
- "entropy": 5.5791136837005615,
566
  "epoch": 1.583189407023604,
567
- "grad_norm": 2.3874764442443848,
568
- "learning_rate": 1.2445168938944872e-06,
569
- "loss": 5.3676,
570
- "mean_token_accuracy": 0.3068238252401352,
571
- "num_tokens": 2898683.0,
572
  "step": 2750
573
  },
574
  {
575
- "entropy": 5.735381307601929,
576
  "epoch": 1.6119746689694876,
577
- "grad_norm": 2.2097349166870117,
578
- "learning_rate": 1.2296976882039123e-06,
579
- "loss": 5.5239,
580
- "mean_token_accuracy": 0.28974882304668426,
581
- "num_tokens": 2952290.0,
582
  "step": 2800
583
  },
584
  {
585
- "entropy": 5.55252691745758,
586
  "epoch": 1.6407599309153713,
587
- "grad_norm": 1.694831132888794,
588
- "learning_rate": 1.2148784825133372e-06,
589
- "loss": 5.351,
590
- "mean_token_accuracy": 0.3091904193162918,
591
- "num_tokens": 3004556.0,
592
  "step": 2850
593
  },
594
  {
595
- "entropy": 5.508773093223572,
596
  "epoch": 1.669545192861255,
597
- "grad_norm": 1.8229279518127441,
598
- "learning_rate": 1.200059276822762e-06,
599
- "loss": 5.3164,
600
- "mean_token_accuracy": 0.31158645361661913,
601
- "num_tokens": 3056448.0,
602
  "step": 2900
603
  },
604
  {
605
- "entropy": 5.676794271469117,
606
  "epoch": 1.6983304548071387,
607
- "grad_norm": 1.7196234464645386,
608
- "learning_rate": 1.1852400711321872e-06,
609
- "loss": 5.4776,
610
- "mean_token_accuracy": 0.2929128894209862,
611
- "num_tokens": 3109539.0,
612
  "step": 2950
613
  },
614
  {
615
- "entropy": 5.551529383659362,
616
  "epoch": 1.7271157167530224,
617
- "grad_norm": 3.117525577545166,
618
- "learning_rate": 1.1704208654416123e-06,
619
- "loss": 5.3561,
620
- "mean_token_accuracy": 0.30634030640125276,
621
- "num_tokens": 3162421.0,
622
  "step": 3000
623
  },
624
  {
625
- "entropy": 5.379635264873505,
626
  "epoch": 1.7559009786989062,
627
- "grad_norm": 1.876755714416504,
628
- "learning_rate": 1.1556016597510372e-06,
629
- "loss": 5.1868,
630
- "mean_token_accuracy": 0.32913618892431257,
631
- "num_tokens": 3212079.0,
632
  "step": 3050
633
  },
634
  {
635
- "entropy": 5.538804936408996,
636
  "epoch": 1.7846862406447899,
637
- "grad_norm": 1.8670976161956787,
638
- "learning_rate": 1.1407824540604623e-06,
639
- "loss": 5.3494,
640
- "mean_token_accuracy": 0.30661171555519107,
641
- "num_tokens": 3264089.0,
642
  "step": 3100
643
  },
644
  {
645
- "entropy": 5.258263626098633,
646
  "epoch": 1.8134715025906736,
647
- "grad_norm": 2.748718023300171,
648
- "learning_rate": 1.1259632483698874e-06,
649
- "loss": 5.08,
650
- "mean_token_accuracy": 0.3413010013103485,
651
- "num_tokens": 3311881.0,
652
  "step": 3150
653
  },
654
  {
655
- "entropy": 5.54539008140564,
656
  "epoch": 1.8422567645365573,
657
- "grad_norm": 1.8556406497955322,
658
- "learning_rate": 1.1111440426793123e-06,
659
- "loss": 5.3614,
660
- "mean_token_accuracy": 0.30550685405731204,
661
- "num_tokens": 3364861.0,
662
  "step": 3200
663
  },
664
  {
665
- "entropy": 5.5433073282241825,
666
  "epoch": 1.871042026482441,
667
- "grad_norm": 1.8386749029159546,
668
- "learning_rate": 1.0963248369887374e-06,
669
- "loss": 5.3543,
670
- "mean_token_accuracy": 0.30875524014234546,
671
- "num_tokens": 3415911.0,
672
  "step": 3250
673
  },
674
  {
675
- "entropy": 5.5769769477844235,
676
  "epoch": 1.8998272884283247,
677
- "grad_norm": 1.922486662864685,
678
- "learning_rate": 1.0815056312981623e-06,
679
- "loss": 5.3834,
680
- "mean_token_accuracy": 0.3035113242268562,
681
- "num_tokens": 3468338.0,
682
  "step": 3300
683
  },
684
  {
685
- "entropy": 5.640013842582703,
686
  "epoch": 1.9286125503742084,
687
- "grad_norm": 2.179500102996826,
688
- "learning_rate": 1.0666864256075874e-06,
689
- "loss": 5.4574,
690
- "mean_token_accuracy": 0.2947095710039139,
691
- "num_tokens": 3521693.0,
692
  "step": 3350
693
  },
694
  {
695
- "entropy": 5.506910061836242,
696
  "epoch": 1.9573978123200921,
697
- "grad_norm": 1.4014379978179932,
698
- "learning_rate": 1.0518672199170125e-06,
699
- "loss": 5.3234,
700
- "mean_token_accuracy": 0.3096472260355949,
701
- "num_tokens": 3574206.0,
702
  "step": 3400
703
  },
704
  {
705
- "entropy": 5.607311015129089,
706
  "epoch": 1.9861830742659758,
707
- "grad_norm": 1.41231107711792,
708
- "learning_rate": 1.0370480142264374e-06,
709
- "loss": 5.4226,
710
- "mean_token_accuracy": 0.2979922544956207,
711
- "num_tokens": 3627807.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
- "eval_entropy": 5.831721861790951,
717
- "eval_loss": 5.656307220458984,
718
- "eval_mean_token_accuracy": 0.2641724460685308,
719
- "eval_model_preparation_time": 0.0047,
720
- "eval_num_tokens": 3650214.0,
721
- "eval_runtime": 79.7324,
722
- "eval_samples_per_second": 5.443,
723
- "eval_steps_per_second": 2.722,
724
  "step": 3474
725
  },
726
  {
727
- "entropy": 5.477711625099182,
728
  "epoch": 2.0149683362118593,
729
- "grad_norm": 3.0133137702941895,
730
- "learning_rate": 1.0222288085358623e-06,
731
- "loss": 5.2957,
732
- "mean_token_accuracy": 0.31543311327695844,
733
- "num_tokens": 3677883.0,
734
  "step": 3500
735
  },
736
  {
737
- "entropy": 5.599187393188476,
738
  "epoch": 2.043753598157743,
739
- "grad_norm": 1.885867714881897,
740
- "learning_rate": 1.0074096028452874e-06,
741
- "loss": 5.4142,
742
- "mean_token_accuracy": 0.3004470923542976,
743
- "num_tokens": 3730991.0,
744
  "step": 3550
745
  },
746
  {
747
- "entropy": 5.526448183059692,
748
  "epoch": 2.0725388601036268,
749
- "grad_norm": 4.50788688659668,
750
- "learning_rate": 9.925903971547125e-07,
751
- "loss": 5.3517,
752
- "mean_token_accuracy": 0.3069574165344238,
753
- "num_tokens": 3783795.0,
754
  "step": 3600
755
  },
756
  {
757
- "entropy": 5.560557870864868,
758
  "epoch": 2.1013241220495105,
759
- "grad_norm": 1.927862524986267,
760
- "learning_rate": 9.777711914641374e-07,
761
- "loss": 5.3815,
762
- "mean_token_accuracy": 0.3045575937628746,
763
- "num_tokens": 3835526.0,
764
  "step": 3650
765
  },
766
  {
767
- "entropy": 5.528058257102966,
768
  "epoch": 2.130109383995394,
769
- "grad_norm": 2.164687156677246,
770
- "learning_rate": 9.629519857735625e-07,
771
- "loss": 5.3501,
772
- "mean_token_accuracy": 0.3071546205878258,
773
- "num_tokens": 3887175.0,
774
  "step": 3700
775
  },
776
  {
777
- "entropy": 5.397617678642273,
778
  "epoch": 2.158894645941278,
779
- "grad_norm": 2.3098385334014893,
780
- "learning_rate": 9.481327800829875e-07,
781
- "loss": 5.2244,
782
- "mean_token_accuracy": 0.3226669803261757,
783
- "num_tokens": 3938003.0,
784
  "step": 3750
785
  },
786
  {
787
- "entropy": 5.529960298538208,
788
  "epoch": 2.1876799078871616,
789
- "grad_norm": 1.8144755363464355,
790
- "learning_rate": 9.333135743924125e-07,
791
- "loss": 5.3572,
792
- "mean_token_accuracy": 0.306032218337059,
793
- "num_tokens": 3990451.0,
794
  "step": 3800
795
  },
796
  {
797
- "entropy": 5.597109637260437,
798
  "epoch": 2.2164651698330453,
799
- "grad_norm": 2.7306935787200928,
800
- "learning_rate": 9.184943687018375e-07,
801
- "loss": 5.4162,
802
- "mean_token_accuracy": 0.2985941395163536,
803
- "num_tokens": 4044048.0,
804
  "step": 3850
805
  },
806
  {
807
- "entropy": 5.448684883117676,
808
  "epoch": 2.245250431778929,
809
- "grad_norm": 1.8199880123138428,
810
- "learning_rate": 9.036751630112626e-07,
811
- "loss": 5.2775,
812
- "mean_token_accuracy": 0.31548845052719116,
813
- "num_tokens": 4095276.0,
814
  "step": 3900
815
  },
816
  {
817
- "entropy": 5.5008597612380985,
818
  "epoch": 2.2740356937248127,
819
- "grad_norm": 1.755323052406311,
820
- "learning_rate": 8.888559573206875e-07,
821
- "loss": 5.3274,
822
- "mean_token_accuracy": 0.309090721309185,
823
- "num_tokens": 4148172.0,
824
  "step": 3950
825
  },
826
  {
827
- "entropy": 5.7040300464630125,
828
  "epoch": 2.3028209556706964,
829
- "grad_norm": 2.3154356479644775,
830
- "learning_rate": 8.740367516301126e-07,
831
- "loss": 5.5239,
832
- "mean_token_accuracy": 0.28589318484067916,
833
- "num_tokens": 4202733.0,
834
  "step": 4000
835
  },
836
  {
837
- "entropy": 5.549855670928955,
838
  "epoch": 2.33160621761658,
839
- "grad_norm": 1.9549669027328491,
840
- "learning_rate": 8.592175459395375e-07,
841
- "loss": 5.3755,
842
- "mean_token_accuracy": 0.3029727828502655,
843
- "num_tokens": 4255738.0,
844
  "step": 4050
845
  },
846
  {
847
- "entropy": 5.579690465927124,
848
  "epoch": 2.360391479562464,
849
- "grad_norm": 1.7018866539001465,
850
- "learning_rate": 8.443983402489626e-07,
851
- "loss": 5.4036,
852
- "mean_token_accuracy": 0.3001995691657066,
853
- "num_tokens": 4308638.0,
854
  "step": 4100
855
  },
856
  {
857
- "entropy": 5.646504878997803,
858
  "epoch": 2.3891767415083476,
859
- "grad_norm": 1.4139262437820435,
860
- "learning_rate": 8.295791345583877e-07,
861
- "loss": 5.4733,
862
- "mean_token_accuracy": 0.2912476986646652,
863
- "num_tokens": 4363170.0,
864
  "step": 4150
865
  },
866
  {
867
- "entropy": 5.554990992546082,
868
  "epoch": 2.4179620034542313,
869
- "grad_norm": 1.6886577606201172,
870
- "learning_rate": 8.147599288678126e-07,
871
- "loss": 5.3842,
872
- "mean_token_accuracy": 0.302762059867382,
873
- "num_tokens": 4415607.0,
874
  "step": 4200
875
  },
876
  {
877
- "entropy": 5.513420124053955,
878
  "epoch": 2.446747265400115,
879
- "grad_norm": 1.3537819385528564,
880
- "learning_rate": 7.999407231772377e-07,
881
- "loss": 5.3408,
882
- "mean_token_accuracy": 0.30764526218175886,
883
- "num_tokens": 4467608.0,
884
  "step": 4250
885
  },
886
  {
887
- "entropy": 5.561378569602966,
888
  "epoch": 2.4755325273459987,
889
- "grad_norm": 1.8514106273651123,
890
- "learning_rate": 7.851215174866627e-07,
891
- "loss": 5.3891,
892
- "mean_token_accuracy": 0.301382859647274,
893
- "num_tokens": 4520299.0,
894
  "step": 4300
895
  },
896
  {
897
- "entropy": 5.536689953804016,
898
  "epoch": 2.5043177892918824,
899
- "grad_norm": 2.1830835342407227,
900
- "learning_rate": 7.703023117960877e-07,
901
- "loss": 5.3672,
902
- "mean_token_accuracy": 0.3047756373882294,
903
- "num_tokens": 4573065.0,
904
  "step": 4350
905
  },
906
  {
907
- "entropy": 5.69776873588562,
908
  "epoch": 2.533103051237766,
909
- "grad_norm": 1.999536156654358,
910
- "learning_rate": 7.554831061055127e-07,
911
- "loss": 5.5236,
912
- "mean_token_accuracy": 0.2868007507920265,
913
- "num_tokens": 4626807.0,
914
  "step": 4400
915
  },
916
  {
917
- "entropy": 5.3977436876297,
918
  "epoch": 2.56188831318365,
919
- "grad_norm": 1.9608020782470703,
920
- "learning_rate": 7.406639004149378e-07,
921
- "loss": 5.2335,
922
- "mean_token_accuracy": 0.3199601462483406,
923
- "num_tokens": 4677663.0,
924
  "step": 4450
925
  },
926
  {
927
- "entropy": 5.6681678771972654,
928
  "epoch": 2.5906735751295336,
929
- "grad_norm": 1.829047441482544,
930
- "learning_rate": 7.258446947243627e-07,
931
- "loss": 5.491,
932
- "mean_token_accuracy": 0.2894612854719162,
933
- "num_tokens": 4731830.0,
934
  "step": 4500
935
  },
936
  {
937
- "entropy": 5.49174174785614,
938
  "epoch": 2.6194588370754173,
939
- "grad_norm": 1.3158719539642334,
940
- "learning_rate": 7.110254890337878e-07,
941
- "loss": 5.3225,
942
- "mean_token_accuracy": 0.3084965732693672,
943
- "num_tokens": 4784694.0,
944
  "step": 4550
945
  },
946
  {
947
- "entropy": 5.573234438896179,
948
  "epoch": 2.648244099021301,
949
- "grad_norm": 1.562915325164795,
950
- "learning_rate": 6.962062833432127e-07,
951
- "loss": 5.4028,
952
- "mean_token_accuracy": 0.2989520016312599,
953
- "num_tokens": 4838534.0,
954
  "step": 4600
955
  },
956
  {
957
- "entropy": 5.550469598770142,
958
  "epoch": 2.6770293609671847,
959
- "grad_norm": 2.114727735519409,
960
- "learning_rate": 6.813870776526378e-07,
961
- "loss": 5.3804,
962
- "mean_token_accuracy": 0.30373542964458466,
963
- "num_tokens": 4890611.0,
964
  "step": 4650
965
  },
966
  {
967
- "entropy": 5.523049550056458,
968
  "epoch": 2.7058146229130684,
969
- "grad_norm": 2.5036823749542236,
970
- "learning_rate": 6.665678719620628e-07,
971
- "loss": 5.3542,
972
- "mean_token_accuracy": 0.30681024432182313,
973
- "num_tokens": 4943571.0,
974
  "step": 4700
975
  },
976
  {
977
- "entropy": 5.323453049659729,
978
  "epoch": 2.734599884858952,
979
- "grad_norm": 1.8069168329238892,
980
- "learning_rate": 6.517486662714878e-07,
981
- "loss": 5.1583,
982
- "mean_token_accuracy": 0.32906652927398683,
983
- "num_tokens": 4993871.0,
984
  "step": 4750
985
  },
986
  {
987
- "entropy": 5.504038324356079,
988
  "epoch": 2.763385146804836,
989
- "grad_norm": 4.750283718109131,
990
- "learning_rate": 6.369294605809128e-07,
991
- "loss": 5.3366,
992
- "mean_token_accuracy": 0.3087608867883682,
993
- "num_tokens": 5046187.0,
994
  "step": 4800
995
  },
996
  {
997
- "entropy": 5.487624549865723,
998
  "epoch": 2.7921704087507195,
999
- "grad_norm": 1.4186172485351562,
1000
- "learning_rate": 6.221102548903379e-07,
1001
- "loss": 5.3237,
1002
- "mean_token_accuracy": 0.3088638699054718,
1003
- "num_tokens": 5098644.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
- "entropy": 5.346905107498169,
1008
  "epoch": 2.8209556706966032,
1009
- "grad_norm": 1.5670177936553955,
1010
- "learning_rate": 6.072910491997628e-07,
1011
- "loss": 5.1849,
1012
- "mean_token_accuracy": 0.3265886321663857,
1013
- "num_tokens": 5149345.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
- "entropy": 5.510410032272339,
1018
  "epoch": 2.849740932642487,
1019
- "grad_norm": 7.489855766296387,
1020
- "learning_rate": 5.924718435091879e-07,
1021
- "loss": 5.3424,
1022
- "mean_token_accuracy": 0.30768151730299,
1023
- "num_tokens": 5202028.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
- "entropy": 5.525181493759155,
1028
  "epoch": 2.8785261945883707,
1029
- "grad_norm": 1.8829196691513062,
1030
- "learning_rate": 5.776526378186128e-07,
1031
- "loss": 5.3654,
1032
- "mean_token_accuracy": 0.30342737555503846,
1033
- "num_tokens": 5255082.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
- "entropy": 5.374098634719848,
1038
  "epoch": 2.9073114565342544,
1039
- "grad_norm": 1.3901060819625854,
1040
- "learning_rate": 5.628334321280379e-07,
1041
- "loss": 5.2103,
1042
- "mean_token_accuracy": 0.3233291879296303,
1043
- "num_tokens": 5305042.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
- "entropy": 5.374619431495667,
1048
  "epoch": 2.936096718480138,
1049
- "grad_norm": 1.6586560010910034,
1050
- "learning_rate": 5.48014226437463e-07,
1051
- "loss": 5.2125,
1052
- "mean_token_accuracy": 0.322759662270546,
1053
- "num_tokens": 5356310.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
- "entropy": 5.527479724884033,
1058
  "epoch": 2.964881980426022,
1059
- "grad_norm": 1.6678485870361328,
1060
- "learning_rate": 5.331950207468879e-07,
1061
- "loss": 5.3627,
1062
- "mean_token_accuracy": 0.30430852621793747,
1063
- "num_tokens": 5409283.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
- "entropy": 5.6171248292922975,
1068
  "epoch": 2.9936672423719055,
1069
- "grad_norm": 1.50790274143219,
1070
- "learning_rate": 5.18375815056313e-07,
1071
- "loss": 5.4484,
1072
- "mean_token_accuracy": 0.29375598043203355,
1073
- "num_tokens": 5464332.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
- "eval_entropy": 5.78779639186947,
1079
- "eval_loss": 5.628758430480957,
1080
- "eval_mean_token_accuracy": 0.2653660801698535,
1081
- "eval_model_preparation_time": 0.0047,
1082
- "eval_num_tokens": 5475321.0,
1083
- "eval_runtime": 80.3676,
1084
- "eval_samples_per_second": 5.4,
1085
- "eval_steps_per_second": 2.7,
1086
  "step": 5211
1087
  },
1088
  {
1089
- "entropy": 5.323350539207459,
1090
  "epoch": 3.0224525043177892,
1091
- "grad_norm": 2.033228635787964,
1092
- "learning_rate": 5.03556609365738e-07,
1093
- "loss": 5.1623,
1094
- "mean_token_accuracy": 0.32844111531972886,
1095
- "num_tokens": 5514450.0,
1096
  "step": 5250
1097
  },
1098
  {
1099
- "entropy": 5.509175033569336,
1100
  "epoch": 3.051237766263673,
1101
- "grad_norm": 1.4281281232833862,
1102
- "learning_rate": 4.88737403675163e-07,
1103
- "loss": 5.3403,
1104
- "mean_token_accuracy": 0.30768867909908293,
1105
- "num_tokens": 5567345.0,
1106
  "step": 5300
1107
  },
1108
  {
1109
- "entropy": 5.4536163854599,
1110
  "epoch": 3.0800230282095566,
1111
- "grad_norm": 2.0320699214935303,
1112
- "learning_rate": 4.73918197984588e-07,
1113
- "loss": 5.2898,
1114
- "mean_token_accuracy": 0.31407355904579165,
1115
- "num_tokens": 5619654.0,
1116
  "step": 5350
1117
  },
1118
  {
1119
- "entropy": 5.487306084632873,
1120
  "epoch": 3.1088082901554404,
1121
- "grad_norm": 1.2829618453979492,
1122
- "learning_rate": 4.59098992294013e-07,
1123
- "loss": 5.3204,
1124
- "mean_token_accuracy": 0.30913869380950926,
1125
- "num_tokens": 5672269.0,
1126
  "step": 5400
1127
  },
1128
  {
1129
- "entropy": 5.569495844841003,
1130
  "epoch": 3.137593552101324,
1131
- "grad_norm": 2.231628656387329,
1132
- "learning_rate": 4.44279786603438e-07,
1133
- "loss": 5.4045,
1134
- "mean_token_accuracy": 0.30076681196689603,
1135
- "num_tokens": 5725059.0,
1136
  "step": 5450
1137
  },
1138
  {
1139
- "entropy": 5.499957413673401,
1140
  "epoch": 3.166378814047208,
1141
- "grad_norm": 1.549865484237671,
1142
- "learning_rate": 4.2946058091286305e-07,
1143
- "loss": 5.3415,
1144
- "mean_token_accuracy": 0.30755339056253433,
1145
- "num_tokens": 5776784.0,
1146
  "step": 5500
1147
  },
1148
  {
1149
- "entropy": 5.664071002006531,
1150
  "epoch": 3.1951640759930915,
1151
- "grad_norm": 1.2153443098068237,
1152
- "learning_rate": 4.146413752222881e-07,
1153
- "loss": 5.4948,
1154
- "mean_token_accuracy": 0.28785294711589815,
1155
- "num_tokens": 5832296.0,
1156
  "step": 5550
1157
  },
1158
  {
1159
- "entropy": 5.516234860420227,
1160
  "epoch": 3.223949337938975,
1161
- "grad_norm": 1.0542709827423096,
1162
- "learning_rate": 3.998221695317131e-07,
1163
- "loss": 5.3465,
1164
- "mean_token_accuracy": 0.3083792108297348,
1165
- "num_tokens": 5885122.0,
1166
  "step": 5600
1167
  },
1168
  {
1169
- "entropy": 5.500826091766357,
1170
  "epoch": 3.252734599884859,
1171
- "grad_norm": 2.2477681636810303,
1172
- "learning_rate": 3.850029638411381e-07,
1173
- "loss": 5.3385,
1174
- "mean_token_accuracy": 0.30737883657217024,
1175
- "num_tokens": 5938386.0,
1176
  "step": 5650
1177
  },
1178
  {
1179
- "entropy": 5.517533864974975,
1180
  "epoch": 3.2815198618307426,
1181
- "grad_norm": 1.03904128074646,
1182
- "learning_rate": 3.7018375815056315e-07,
1183
- "loss": 5.3533,
1184
- "mean_token_accuracy": 0.3064529225230217,
1185
- "num_tokens": 5989784.0,
1186
  "step": 5700
1187
  },
1188
  {
1189
- "entropy": 5.543709697723389,
1190
  "epoch": 3.3103051237766263,
1191
- "grad_norm": 1.562757134437561,
1192
- "learning_rate": 3.5536455245998815e-07,
1193
- "loss": 5.3766,
1194
- "mean_token_accuracy": 0.3036728450655937,
1195
- "num_tokens": 6042646.0,
1196
  "step": 5750
1197
  },
1198
  {
1199
- "entropy": 5.389412899017334,
1200
  "epoch": 3.33909038572251,
1201
- "grad_norm": 2.2124178409576416,
1202
- "learning_rate": 3.4054534676941315e-07,
1203
- "loss": 5.2287,
1204
- "mean_token_accuracy": 0.32173423111438754,
1205
- "num_tokens": 6093550.0,
1206
  "step": 5800
1207
  },
1208
  {
1209
- "entropy": 5.236968355178833,
1210
  "epoch": 3.3678756476683938,
1211
- "grad_norm": 2.146965503692627,
1212
- "learning_rate": 3.2572614107883814e-07,
1213
- "loss": 5.0793,
1214
- "mean_token_accuracy": 0.3410212889313698,
1215
- "num_tokens": 6142299.0,
1216
  "step": 5850
1217
  },
1218
  {
1219
- "entropy": 5.459367966651916,
1220
  "epoch": 3.3966609096142775,
1221
- "grad_norm": 1.0992231369018555,
1222
- "learning_rate": 3.109069353882632e-07,
1223
- "loss": 5.2978,
1224
- "mean_token_accuracy": 0.31258249312639236,
1225
- "num_tokens": 6194315.0,
1226
  "step": 5900
1227
  },
1228
  {
1229
- "entropy": 5.526850900650024,
1230
  "epoch": 3.425446171560161,
1231
- "grad_norm": 2.137270212173462,
1232
- "learning_rate": 2.960877296976882e-07,
1233
- "loss": 5.3598,
1234
- "mean_token_accuracy": 0.3052875977754593,
1235
- "num_tokens": 6246032.0,
1236
  "step": 5950
1237
  },
1238
  {
1239
- "entropy": 5.573816101551056,
1240
  "epoch": 3.454231433506045,
1241
- "grad_norm": 1.5624985694885254,
1242
- "learning_rate": 2.812685240071132e-07,
1243
- "loss": 5.4081,
1244
- "mean_token_accuracy": 0.2992635017633438,
1245
- "num_tokens": 6300018.0,
1246
  "step": 6000
1247
  },
1248
  {
1249
- "entropy": 5.514087476730347,
1250
  "epoch": 3.4830166954519286,
1251
- "grad_norm": 1.2660338878631592,
1252
- "learning_rate": 2.664493183165382e-07,
1253
- "loss": 5.3472,
1254
- "mean_token_accuracy": 0.3070674228668213,
1255
- "num_tokens": 6352988.0,
1256
  "step": 6050
1257
  },
1258
  {
1259
- "entropy": 5.430188207626343,
1260
  "epoch": 3.5118019573978123,
1261
- "grad_norm": 1.2666460275650024,
1262
- "learning_rate": 2.5163011262596324e-07,
1263
- "loss": 5.2645,
1264
- "mean_token_accuracy": 0.31776045858860014,
1265
- "num_tokens": 6405116.0,
1266
  "step": 6100
1267
  },
1268
  {
1269
- "entropy": 5.5897090005874634,
1270
  "epoch": 3.540587219343696,
1271
- "grad_norm": 1.275363802909851,
1272
- "learning_rate": 2.3681090693538824e-07,
1273
- "loss": 5.4265,
1274
- "mean_token_accuracy": 0.297469447851181,
1275
- "num_tokens": 6458789.0,
1276
  "step": 6150
1277
  },
1278
  {
1279
- "entropy": 5.422791337966919,
1280
  "epoch": 3.5693724812895797,
1281
- "grad_norm": 2.2392683029174805,
1282
- "learning_rate": 2.2199170124481327e-07,
1283
- "loss": 5.2608,
1284
- "mean_token_accuracy": 0.3180572906136513,
1285
- "num_tokens": 6510168.0,
1286
  "step": 6200
1287
  },
1288
  {
1289
- "entropy": 5.408909387588501,
1290
  "epoch": 3.5981577432354634,
1291
- "grad_norm": 2.821279525756836,
1292
- "learning_rate": 2.071724955542383e-07,
1293
- "loss": 5.2455,
1294
- "mean_token_accuracy": 0.316647432744503,
1295
- "num_tokens": 6562528.0,
1296
  "step": 6250
1297
  },
1298
  {
1299
- "entropy": 5.657666215896606,
1300
  "epoch": 3.626943005181347,
1301
- "grad_norm": 3.261878490447998,
1302
- "learning_rate": 1.9235328986366332e-07,
1303
- "loss": 5.4941,
1304
- "mean_token_accuracy": 0.28845800429582596,
1305
- "num_tokens": 6617308.0,
1306
  "step": 6300
1307
  },
1308
  {
1309
- "entropy": 5.446933870315552,
1310
  "epoch": 3.655728267127231,
1311
- "grad_norm": 1.1171406507492065,
1312
- "learning_rate": 1.7753408417308832e-07,
1313
- "loss": 5.2848,
1314
- "mean_token_accuracy": 0.31402444154024123,
1315
- "num_tokens": 6669969.0,
1316
  "step": 6350
1317
  },
1318
  {
1319
- "entropy": 5.605754513740539,
1320
  "epoch": 3.6845135290731146,
1321
- "grad_norm": 2.066650152206421,
1322
- "learning_rate": 1.6271487848251334e-07,
1323
- "loss": 5.4447,
1324
- "mean_token_accuracy": 0.2945487481355667,
1325
- "num_tokens": 6724425.0,
1326
  "step": 6400
1327
  },
1328
  {
1329
- "entropy": 5.39195601940155,
1330
  "epoch": 3.7132987910189983,
1331
- "grad_norm": 1.6908842325210571,
1332
- "learning_rate": 1.4789567279193834e-07,
1333
- "loss": 5.2298,
1334
- "mean_token_accuracy": 0.3206364804506302,
1335
- "num_tokens": 6775236.0,
1336
  "step": 6450
1337
  },
1338
  {
1339
- "entropy": 5.514347395896912,
1340
  "epoch": 3.742084052964882,
1341
- "grad_norm": 1.166090726852417,
1342
- "learning_rate": 1.3307646710136337e-07,
1343
- "loss": 5.3517,
1344
- "mean_token_accuracy": 0.30615471601486205,
1345
- "num_tokens": 6828545.0,
1346
  "step": 6500
1347
  },
1348
  {
1349
- "entropy": 5.6728374910354615,
1350
  "epoch": 3.7708693149107657,
1351
- "grad_norm": 2.3615996837615967,
1352
- "learning_rate": 1.1825726141078837e-07,
1353
- "loss": 5.5058,
1354
- "mean_token_accuracy": 0.28638383001089096,
1355
- "num_tokens": 6884005.0,
1356
  "step": 6550
1357
  },
1358
  {
1359
- "entropy": 5.4262278175354,
1360
  "epoch": 3.7996545768566494,
1361
- "grad_norm": 1.7658995389938354,
1362
- "learning_rate": 1.0343805572021339e-07,
1363
- "loss": 5.2617,
1364
- "mean_token_accuracy": 0.31743784427642824,
1365
- "num_tokens": 6935209.0,
1366
  "step": 6600
1367
  },
1368
  {
1369
- "entropy": 5.436288638114929,
1370
  "epoch": 3.828439838802533,
1371
- "grad_norm": 3.455641269683838,
1372
- "learning_rate": 8.861885002963842e-08,
1373
- "loss": 5.2706,
1374
- "mean_token_accuracy": 0.31677050977945326,
1375
- "num_tokens": 6987396.0,
1376
  "step": 6650
1377
  },
1378
  {
1379
- "entropy": 5.586358890533448,
1380
  "epoch": 3.857225100748417,
1381
- "grad_norm": 1.981423020362854,
1382
- "learning_rate": 7.379964433906343e-08,
1383
- "loss": 5.4191,
1384
- "mean_token_accuracy": 0.2982942935824394,
1385
- "num_tokens": 7041132.0,
1386
  "step": 6700
1387
  },
1388
  {
1389
- "entropy": 5.494750590324402,
1390
  "epoch": 3.8860103626943006,
1391
- "grad_norm": 1.7962652444839478,
1392
- "learning_rate": 5.8980438648488434e-08,
1393
- "loss": 5.3306,
1394
- "mean_token_accuracy": 0.3082431614398956,
1395
- "num_tokens": 7094059.0,
1396
  "step": 6750
1397
  },
1398
  {
1399
- "entropy": 5.393875141143798,
1400
  "epoch": 3.9147956246401843,
1401
- "grad_norm": 1.8328484296798706,
1402
- "learning_rate": 4.416123295791346e-08,
1403
- "loss": 5.2351,
1404
- "mean_token_accuracy": 0.3187332367897034,
1405
- "num_tokens": 7144964.0,
1406
  "step": 6800
1407
  },
1408
  {
1409
- "entropy": 5.660646886825561,
1410
  "epoch": 3.943580886586068,
1411
- "grad_norm": 0.8133105039596558,
1412
- "learning_rate": 2.934202726733847e-08,
1413
- "loss": 5.4946,
1414
- "mean_token_accuracy": 0.2876924830675125,
1415
- "num_tokens": 7200805.0,
1416
  "step": 6850
1417
  },
1418
  {
1419
- "entropy": 5.239456839561463,
1420
  "epoch": 3.9723661485319517,
1421
- "grad_norm": 7.838026523590088,
1422
- "learning_rate": 1.4522821576763486e-08,
1423
- "loss": 5.0866,
1424
- "mean_token_accuracy": 0.33811178654432295,
1425
- "num_tokens": 7250918.0,
1426
  "step": 6900
1427
  },
1428
  {
1429
  "epoch": 4.0,
1430
- "eval_entropy": 5.780879339314826,
1431
- "eval_loss": 5.622366428375244,
1432
- "eval_mean_token_accuracy": 0.26563407995733795,
1433
- "eval_model_preparation_time": 0.0047,
1434
- "eval_num_tokens": 7300428.0,
1435
- "eval_runtime": 80.4424,
1436
- "eval_samples_per_second": 5.395,
1437
- "eval_steps_per_second": 2.698,
1438
  "step": 6948
1439
  }
1440
  ],
1441
  "logging_steps": 50,
1442
- "max_steps": 6948,
1443
  "num_input_tokens_seen": 0,
1444
- "num_train_epochs": 4,
1445
  "save_steps": 500,
1446
  "stateful_callbacks": {
1447
  "TrainerControl": {
@@ -1450,12 +1450,12 @@
1450
  "should_evaluate": false,
1451
  "should_log": false,
1452
  "should_save": true,
1453
- "should_training_stop": true
1454
  },
1455
  "attributes": {}
1456
  }
1457
  },
1458
- "total_flos": 1.0021019691282432e+17,
1459
  "train_batch_size": 2,
1460
  "trial_name": null,
1461
  "trial_params": null
 
1
  {
2
  "best_global_step": 6948,
3
+ "best_metric": 5.525067329406738,
4
  "best_model_checkpoint": "./output/checkpoint-6948",
5
  "epoch": 4.0,
6
  "eval_steps": 500,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.606692385673523,
14
  "epoch": 0.028785261945883708,
15
+ "grad_norm": 3.2999913692474365,
16
  "learning_rate": 4.9e-07,
17
+ "loss": 13.6598,
18
+ "mean_token_accuracy": 0.16028020828962325,
19
+ "num_tokens": 53993.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 3.618675880432129,
24
  "epoch": 0.057570523891767415,
25
+ "grad_norm": 3.101252555847168,
26
  "learning_rate": 9.9e-07,
27
+ "loss": 14.0188,
28
+ "mean_token_accuracy": 0.1508466500043869,
29
+ "num_tokens": 110134.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 3.5215235900878907,
34
  "epoch": 0.08635578583765112,
35
+ "grad_norm": 3.513662815093994,
36
  "learning_rate": 1.49e-06,
37
+ "loss": 12.8555,
38
+ "mean_token_accuracy": 0.18527640983462335,
39
+ "num_tokens": 160191.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 3.667909698486328,
44
  "epoch": 0.11514104778353483,
45
+ "grad_norm": 4.327610492706299,
46
  "learning_rate": 1.99e-06,
47
+ "loss": 13.5394,
48
+ "mean_token_accuracy": 0.157139780074358,
49
+ "num_tokens": 214993.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 3.768263258934021,
54
  "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.290107250213623,
56
+ "learning_rate": 1.988450206246317e-06,
57
+ "loss": 12.8912,
58
+ "mean_token_accuracy": 0.17374794125556947,
59
+ "num_tokens": 268184.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 3.990619196891785,
64
  "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.444278717041016,
66
+ "learning_rate": 1.976664702416028e-06,
67
+ "loss": 12.455,
68
+ "mean_token_accuracy": 0.17780130118131637,
69
+ "num_tokens": 319458.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 4.162646284103394,
74
  "epoch": 0.20149683362118595,
75
+ "grad_norm": 5.615262508392334,
76
+ "learning_rate": 1.9648791985857395e-06,
77
+ "loss": 12.0893,
78
+ "mean_token_accuracy": 0.18191319867968558,
79
+ "num_tokens": 373337.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 4.532100868225098,
84
  "epoch": 0.23028209556706966,
85
+ "grad_norm": 10.074016571044922,
86
+ "learning_rate": 1.9530936947554507e-06,
87
+ "loss": 11.9261,
88
+ "mean_token_accuracy": 0.169477596282959,
89
+ "num_tokens": 427526.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 4.923871030807495,
94
  "epoch": 0.25906735751295334,
95
+ "grad_norm": 16.220163345336914,
96
+ "learning_rate": 1.9413081909251622e-06,
97
+ "loss": 11.0048,
98
+ "mean_token_accuracy": 0.1704501649737358,
99
+ "num_tokens": 480528.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 5.521005854606629,
104
  "epoch": 0.28785261945883706,
105
+ "grad_norm": 29.904008865356445,
106
+ "learning_rate": 1.9295226870948733e-06,
107
+ "loss": 9.6524,
108
+ "mean_token_accuracy": 0.16450899541378022,
109
+ "num_tokens": 535314.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 6.092623329162597,
114
  "epoch": 0.31663788140472077,
115
+ "grad_norm": 17.821575164794922,
116
+ "learning_rate": 1.9177371832645845e-06,
117
+ "loss": 8.1054,
118
+ "mean_token_accuracy": 0.17205011785030366,
119
+ "num_tokens": 588410.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 6.385262680053711,
124
  "epoch": 0.3454231433506045,
125
+ "grad_norm": 5.502202987670898,
126
+ "learning_rate": 1.9059516794342958e-06,
127
+ "loss": 7.4313,
128
+ "mean_token_accuracy": 0.1734227080643177,
129
+ "num_tokens": 641736.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 6.278562617301941,
134
  "epoch": 0.3742084052964882,
135
+ "grad_norm": 5.4657697677612305,
136
+ "learning_rate": 1.8941661756040071e-06,
137
+ "loss": 6.9266,
138
+ "mean_token_accuracy": 0.18680249139666558,
139
+ "num_tokens": 692200.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 6.553266277313233,
144
  "epoch": 0.4029936672423719,
145
+ "grad_norm": 4.955812931060791,
146
+ "learning_rate": 1.8823806717737183e-06,
147
+ "loss": 6.9847,
148
+ "mean_token_accuracy": 0.16679802387952805,
149
+ "num_tokens": 745830.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 6.470935583114624,
154
  "epoch": 0.4317789291882556,
155
+ "grad_norm": 4.198381423950195,
156
+ "learning_rate": 1.8705951679434296e-06,
157
+ "loss": 6.7277,
158
+ "mean_token_accuracy": 0.17847734570503235,
159
+ "num_tokens": 798872.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 6.5620588779449465,
164
  "epoch": 0.4605641911341393,
165
+ "grad_norm": 3.1793746948242188,
166
+ "learning_rate": 1.8588096641131407e-06,
167
+ "loss": 6.7032,
168
+ "mean_token_accuracy": 0.17336134731769562,
169
+ "num_tokens": 853045.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 6.532204885482788,
174
  "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.824537515640259,
176
+ "learning_rate": 1.847024160282852e-06,
177
+ "loss": 6.5762,
178
+ "mean_token_accuracy": 0.1805124071240425,
179
+ "num_tokens": 907679.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 6.535988225936889,
184
  "epoch": 0.5181347150259067,
185
+ "grad_norm": 4.350001811981201,
186
+ "learning_rate": 1.8352386564525632e-06,
187
+ "loss": 6.505,
188
+ "mean_token_accuracy": 0.1842605724930763,
189
+ "num_tokens": 964170.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 6.204533562660218,
194
  "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.193660020828247,
196
+ "learning_rate": 1.8234531526222745e-06,
197
+ "loss": 6.1211,
198
+ "mean_token_accuracy": 0.21968430042266845,
199
+ "num_tokens": 1015909.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 6.308737449645996,
204
  "epoch": 0.5757052389176741,
205
+ "grad_norm": 2.325622320175171,
206
+ "learning_rate": 1.8116676487919857e-06,
207
+ "loss": 6.1653,
208
+ "mean_token_accuracy": 0.21636426240205764,
209
+ "num_tokens": 1068859.0,
210
  "step": 1000
211
  },
212
  {
213
+ "entropy": 6.332560749053955,
214
  "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.0439090728759766,
216
+ "learning_rate": 1.799882144961697e-06,
217
+ "loss": 6.1559,
218
+ "mean_token_accuracy": 0.21859725564718246,
219
+ "num_tokens": 1123202.0,
220
  "step": 1050
221
  },
222
  {
223
+ "entropy": 6.042124252319336,
224
  "epoch": 0.6332757628094415,
225
+ "grad_norm": 3.621903657913208,
226
+ "learning_rate": 1.7880966411314081e-06,
227
+ "loss": 5.8441,
228
+ "mean_token_accuracy": 0.24906315237283708,
229
+ "num_tokens": 1173403.0,
230
  "step": 1100
231
  },
232
  {
233
+ "entropy": 5.921343173980713,
234
  "epoch": 0.6620610247553252,
235
+ "grad_norm": 5.658033847808838,
236
+ "learning_rate": 1.7763111373011195e-06,
237
+ "loss": 5.7104,
238
+ "mean_token_accuracy": 0.2625067520141602,
239
+ "num_tokens": 1225026.0,
240
  "step": 1150
241
  },
242
  {
243
+ "entropy": 6.093586492538452,
244
  "epoch": 0.690846286701209,
245
+ "grad_norm": 2.4292995929718018,
246
+ "learning_rate": 1.7645256334708308e-06,
247
+ "loss": 5.8658,
248
+ "mean_token_accuracy": 0.24842385441064835,
249
+ "num_tokens": 1279013.0,
250
  "step": 1200
251
  },
252
  {
253
+ "entropy": 6.119112596511841,
254
  "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.369384288787842,
256
+ "learning_rate": 1.752740129640542e-06,
257
+ "loss": 5.8784,
258
+ "mean_token_accuracy": 0.24857850253582,
259
+ "num_tokens": 1332547.0,
260
  "step": 1250
261
  },
262
  {
263
+ "entropy": 6.025163550376892,
264
  "epoch": 0.7484168105929764,
265
+ "grad_norm": 2.5110116004943848,
266
+ "learning_rate": 1.7409546258102533e-06,
267
+ "loss": 5.7769,
268
+ "mean_token_accuracy": 0.25835376888513567,
269
+ "num_tokens": 1385192.0,
270
  "step": 1300
271
  },
272
  {
273
+ "entropy": 5.877259612083435,
274
  "epoch": 0.7772020725388601,
275
+ "grad_norm": 2.4179303646087646,
276
+ "learning_rate": 1.7291691219799646e-06,
277
+ "loss": 5.6284,
278
+ "mean_token_accuracy": 0.2756252554059029,
279
+ "num_tokens": 1437071.0,
280
  "step": 1350
281
  },
282
  {
283
+ "entropy": 6.002246947288513,
284
  "epoch": 0.8059873344847438,
285
+ "grad_norm": 3.494359016418457,
286
+ "learning_rate": 1.717383618149676e-06,
287
+ "loss": 5.747,
288
+ "mean_token_accuracy": 0.26462210685014725,
289
+ "num_tokens": 1490818.0,
290
  "step": 1400
291
  },
292
  {
293
+ "entropy": 5.991955623626709,
294
  "epoch": 0.8347725964306275,
295
+ "grad_norm": 2.340975761413574,
296
+ "learning_rate": 1.705598114319387e-06,
297
+ "loss": 5.7379,
298
+ "mean_token_accuracy": 0.26444981098175047,
299
+ "num_tokens": 1544997.0,
300
  "step": 1450
301
  },
302
  {
303
+ "entropy": 5.91768889427185,
304
  "epoch": 0.8635578583765112,
305
+ "grad_norm": 2.2394514083862305,
306
+ "learning_rate": 1.6938126104890984e-06,
307
+ "loss": 5.6564,
308
+ "mean_token_accuracy": 0.2730415526032448,
309
+ "num_tokens": 1598302.0,
310
  "step": 1500
311
  },
312
  {
313
+ "entropy": 5.982716989517212,
314
  "epoch": 0.8923431203223949,
315
+ "grad_norm": 1.876839518547058,
316
+ "learning_rate": 1.6820271066588098e-06,
317
+ "loss": 5.7215,
318
+ "mean_token_accuracy": 0.26642445534467696,
319
+ "num_tokens": 1655267.0,
320
  "step": 1550
321
  },
322
  {
323
+ "entropy": 5.820467872619629,
324
  "epoch": 0.9211283822682786,
325
+ "grad_norm": 2.219966173171997,
326
+ "learning_rate": 1.6702416028285209e-06,
327
+ "loss": 5.5555,
328
+ "mean_token_accuracy": 0.2856418335437775,
329
+ "num_tokens": 1709199.0,
330
  "step": 1600
331
  },
332
  {
333
+ "entropy": 5.996349005699158,
334
  "epoch": 0.9499136442141624,
335
+ "grad_norm": 2.247213840484619,
336
+ "learning_rate": 1.6584560989982322e-06,
337
+ "loss": 5.7283,
338
+ "mean_token_accuracy": 0.2696125540137291,
339
+ "num_tokens": 1765443.0,
340
  "step": 1650
341
  },
342
  {
343
+ "entropy": 5.696683068275451,
344
  "epoch": 0.9786989061600461,
345
+ "grad_norm": 2.8499979972839355,
346
+ "learning_rate": 1.6466705951679433e-06,
347
+ "loss": 5.4335,
348
+ "mean_token_accuracy": 0.29918427973985673,
349
+ "num_tokens": 1817494.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
+ "eval_entropy": 5.993559589034401,
355
+ "eval_loss": 5.737204551696777,
356
+ "eval_mean_token_accuracy": 0.2618687468739699,
357
+ "eval_model_preparation_time": 0.0045,
358
+ "eval_num_tokens": 1856362.0,
359
+ "eval_runtime": 50.5332,
360
+ "eval_samples_per_second": 8.588,
361
+ "eval_steps_per_second": 4.294,
362
  "step": 1737
363
  },
364
  {
365
+ "entropy": 5.746842083930969,
366
  "epoch": 1.0074841681059297,
367
+ "grad_norm": 2.33052921295166,
368
+ "learning_rate": 1.6348850913376547e-06,
369
+ "loss": 5.4796,
370
+ "mean_token_accuracy": 0.2966849410533905,
371
+ "num_tokens": 1870353.0,
372
  "step": 1750
373
  },
374
  {
375
+ "entropy": 5.859029049873352,
376
  "epoch": 1.0362694300518134,
377
+ "grad_norm": 1.6248886585235596,
378
+ "learning_rate": 1.6230995875073658e-06,
379
+ "loss": 5.5975,
380
+ "mean_token_accuracy": 0.2838129925727844,
381
+ "num_tokens": 1926205.0,
382
  "step": 1800
383
  },
384
  {
385
+ "entropy": 5.731445336341858,
386
  "epoch": 1.065054691997697,
387
+ "grad_norm": 1.6941566467285156,
388
+ "learning_rate": 1.6113140836770771e-06,
389
+ "loss": 5.476,
390
+ "mean_token_accuracy": 0.2992346465587616,
391
+ "num_tokens": 1979821.0,
392
  "step": 1850
393
  },
394
  {
395
+ "entropy": 5.6993954515457155,
396
  "epoch": 1.0938399539435808,
397
+ "grad_norm": 1.1746597290039062,
398
+ "learning_rate": 1.5995285798467883e-06,
399
+ "loss": 5.4608,
400
+ "mean_token_accuracy": 0.3000726142525673,
401
+ "num_tokens": 2034373.0,
402
  "step": 1900
403
  },
404
  {
405
+ "entropy": 5.668873124122619,
406
  "epoch": 1.1226252158894645,
407
+ "grad_norm": 1.728211760520935,
408
+ "learning_rate": 1.5877430760164996e-06,
409
+ "loss": 5.4347,
410
+ "mean_token_accuracy": 0.3033922725915909,
411
+ "num_tokens": 2087339.0,
412
  "step": 1950
413
  },
414
  {
415
+ "entropy": 5.624621086120605,
416
  "epoch": 1.1514104778353482,
417
+ "grad_norm": 1.4078539609909058,
418
+ "learning_rate": 1.5759575721862107e-06,
419
+ "loss": 5.3954,
420
+ "mean_token_accuracy": 0.30784171640872954,
421
+ "num_tokens": 2139520.0,
422
  "step": 2000
423
  },
424
  {
425
+ "entropy": 5.7141213130950925,
426
  "epoch": 1.180195739781232,
427
+ "grad_norm": 2.186459541320801,
428
+ "learning_rate": 1.564172068355922e-06,
429
+ "loss": 5.4847,
430
+ "mean_token_accuracy": 0.29594049394130706,
431
+ "num_tokens": 2193987.0,
432
  "step": 2050
433
  },
434
  {
435
+ "entropy": 5.632415266036987,
436
  "epoch": 1.2089810017271156,
437
+ "grad_norm": 1.3601349592208862,
438
+ "learning_rate": 1.5523865645256334e-06,
439
+ "loss": 5.4135,
440
+ "mean_token_accuracy": 0.30366597563028336,
441
+ "num_tokens": 2249616.0,
442
  "step": 2100
443
  },
444
  {
445
+ "entropy": 5.510904269218445,
446
  "epoch": 1.2377662636729994,
447
+ "grad_norm": 2.065760612487793,
448
+ "learning_rate": 1.5406010606953445e-06,
449
+ "loss": 5.2904,
450
+ "mean_token_accuracy": 0.3211754837632179,
451
+ "num_tokens": 2300863.0,
452
  "step": 2150
453
  },
454
  {
455
+ "entropy": 5.703383626937867,
456
  "epoch": 1.266551525618883,
457
+ "grad_norm": 1.1172698736190796,
458
+ "learning_rate": 1.5288155568650559e-06,
459
+ "loss": 5.4802,
460
+ "mean_token_accuracy": 0.29713701367378237,
461
+ "num_tokens": 2356029.0,
462
  "step": 2200
463
  },
464
  {
465
+ "entropy": 5.565930342674255,
466
  "epoch": 1.2953367875647668,
467
+ "grad_norm": 1.7528513669967651,
468
+ "learning_rate": 1.5170300530347672e-06,
469
+ "loss": 5.3518,
470
+ "mean_token_accuracy": 0.31301232606172563,
471
+ "num_tokens": 2408957.0,
472
  "step": 2250
473
  },
474
  {
475
+ "entropy": 5.496430187225342,
476
  "epoch": 1.3241220495106505,
477
+ "grad_norm": 1.892640233039856,
478
+ "learning_rate": 1.5052445492044786e-06,
479
+ "loss": 5.2967,
480
+ "mean_token_accuracy": 0.3181899458169937,
481
+ "num_tokens": 2462569.0,
482
  "step": 2300
483
  },
484
  {
485
+ "entropy": 5.725150098800659,
486
  "epoch": 1.3529073114565342,
487
+ "grad_norm": 1.774940848350525,
488
+ "learning_rate": 1.4934590453741897e-06,
489
+ "loss": 5.5215,
490
+ "mean_token_accuracy": 0.29055028676986694,
491
+ "num_tokens": 2518544.0,
492
  "step": 2350
493
  },
494
  {
495
+ "entropy": 5.4884827613830565,
496
  "epoch": 1.381692573402418,
497
+ "grad_norm": 2.2167599201202393,
498
+ "learning_rate": 1.481673541543901e-06,
499
+ "loss": 5.2917,
500
+ "mean_token_accuracy": 0.31803421139717103,
501
+ "num_tokens": 2570863.0,
502
  "step": 2400
503
  },
504
  {
505
+ "entropy": 5.697079472541809,
506
  "epoch": 1.4104778353483016,
507
+ "grad_norm": 1.6489030122756958,
508
+ "learning_rate": 1.4698880377136124e-06,
509
+ "loss": 5.4982,
510
+ "mean_token_accuracy": 0.2925163987278938,
511
+ "num_tokens": 2626998.0,
512
  "step": 2450
513
  },
514
  {
515
+ "entropy": 5.46209939956665,
516
  "epoch": 1.4392630972941853,
517
+ "grad_norm": 1.153914451599121,
518
+ "learning_rate": 1.4581025338833235e-06,
519
+ "loss": 5.2736,
520
+ "mean_token_accuracy": 0.3182168474793434,
521
+ "num_tokens": 2681568.0,
522
  "step": 2500
523
  },
524
  {
525
+ "entropy": 5.4405768728256225,
526
  "epoch": 1.468048359240069,
527
+ "grad_norm": 3.6614978313446045,
528
+ "learning_rate": 1.4463170300530348e-06,
529
+ "loss": 5.2515,
530
+ "mean_token_accuracy": 0.3218736210465431,
531
+ "num_tokens": 2733587.0,
532
  "step": 2550
533
  },
534
  {
535
+ "entropy": 5.528175053596496,
536
  "epoch": 1.4968336211859528,
537
+ "grad_norm": 1.0849746465682983,
538
+ "learning_rate": 1.434531526222746e-06,
539
+ "loss": 5.3378,
540
+ "mean_token_accuracy": 0.31061659604310987,
541
+ "num_tokens": 2787003.0,
542
  "step": 2600
543
  },
544
  {
545
+ "entropy": 5.46110897064209,
546
  "epoch": 1.5256188831318365,
547
+ "grad_norm": 1.8315683603286743,
548
+ "learning_rate": 1.4227460223924573e-06,
549
+ "loss": 5.2782,
550
+ "mean_token_accuracy": 0.31781029611825945,
551
+ "num_tokens": 2840263.0,
552
  "step": 2650
553
  },
554
  {
555
+ "entropy": 5.455560960769653,
556
  "epoch": 1.5544041450777202,
557
+ "grad_norm": 1.1859091520309448,
558
+ "learning_rate": 1.4109605185621684e-06,
559
+ "loss": 5.2735,
560
+ "mean_token_accuracy": 0.3194814011454582,
561
+ "num_tokens": 2894186.0,
562
  "step": 2700
563
  },
564
  {
565
+ "entropy": 5.430496115684509,
566
  "epoch": 1.583189407023604,
567
+ "grad_norm": 2.3500001430511475,
568
+ "learning_rate": 1.3991750147318797e-06,
569
+ "loss": 5.2464,
570
+ "mean_token_accuracy": 0.32140792965888976,
571
+ "num_tokens": 2948171.0,
572
  "step": 2750
573
  },
574
  {
575
+ "entropy": 5.588023023605347,
576
  "epoch": 1.6119746689694876,
577
+ "grad_norm": 1.727825403213501,
578
+ "learning_rate": 1.3873895109015909e-06,
579
+ "loss": 5.4028,
580
+ "mean_token_accuracy": 0.3039530631899834,
581
+ "num_tokens": 3002678.0,
582
  "step": 2800
583
  },
584
  {
585
+ "entropy": 5.410525422096253,
586
  "epoch": 1.6407599309153713,
587
+ "grad_norm": 1.3401474952697754,
588
+ "learning_rate": 1.3756040070713022e-06,
589
+ "loss": 5.2298,
590
+ "mean_token_accuracy": 0.324065263569355,
591
+ "num_tokens": 3055844.0,
592
  "step": 2850
593
  },
594
  {
595
+ "entropy": 5.36959942817688,
596
  "epoch": 1.669545192861255,
597
+ "grad_norm": 1.1892589330673218,
598
+ "learning_rate": 1.3638185032410133e-06,
599
+ "loss": 5.1956,
600
+ "mean_token_accuracy": 0.32639502108097074,
601
+ "num_tokens": 3108636.0,
602
  "step": 2900
603
  },
604
  {
605
+ "entropy": 5.53826907157898,
606
  "epoch": 1.6983304548071387,
607
+ "grad_norm": 1.2652360200881958,
608
+ "learning_rate": 1.3520329994107247e-06,
609
+ "loss": 5.3583,
610
+ "mean_token_accuracy": 0.3074926760792732,
611
+ "num_tokens": 3162627.0,
612
  "step": 2950
613
  },
614
  {
615
+ "entropy": 5.417449145317078,
616
  "epoch": 1.7271157167530224,
617
+ "grad_norm": 1.584312915802002,
618
+ "learning_rate": 1.340247495580436e-06,
619
+ "loss": 5.2388,
620
+ "mean_token_accuracy": 0.32019727885723115,
621
+ "num_tokens": 3216409.0,
622
  "step": 3000
623
  },
624
  {
625
+ "entropy": 5.241390740871429,
626
  "epoch": 1.7559009786989062,
627
+ "grad_norm": 1.5219439268112183,
628
+ "learning_rate": 1.3284619917501471e-06,
629
+ "loss": 5.0645,
630
+ "mean_token_accuracy": 0.3445430138707161,
631
+ "num_tokens": 3266967.0,
632
  "step": 3050
633
  },
634
  {
635
+ "entropy": 5.405424036979675,
636
  "epoch": 1.7846862406447899,
637
+ "grad_norm": 2.1165153980255127,
638
+ "learning_rate": 1.3166764879198585e-06,
639
+ "loss": 5.232,
640
+ "mean_token_accuracy": 0.32085000157356264,
641
+ "num_tokens": 3319877.0,
642
  "step": 3100
643
  },
644
  {
645
+ "entropy": 5.123006024360657,
646
  "epoch": 1.8134715025906736,
647
+ "grad_norm": 1.2189785242080688,
648
+ "learning_rate": 1.3048909840895698e-06,
649
+ "loss": 4.9582,
650
+ "mean_token_accuracy": 0.356108532845974,
651
+ "num_tokens": 3368569.0,
652
  "step": 3150
653
  },
654
  {
655
+ "entropy": 5.417610831260681,
656
  "epoch": 1.8422567645365573,
657
+ "grad_norm": 1.5157604217529297,
658
+ "learning_rate": 1.2931054802592812e-06,
659
+ "loss": 5.2454,
660
+ "mean_token_accuracy": 0.31976755023002623,
661
+ "num_tokens": 3422449.0,
662
  "step": 3200
663
  },
664
  {
665
+ "entropy": 5.409690895080566,
666
  "epoch": 1.871042026482441,
667
+ "grad_norm": 1.3088161945343018,
668
+ "learning_rate": 1.2813199764289923e-06,
669
+ "loss": 5.2348,
670
+ "mean_token_accuracy": 0.32325415283441544,
671
+ "num_tokens": 3474399.0,
672
  "step": 3250
673
  },
674
  {
675
+ "entropy": 5.44662567615509,
676
  "epoch": 1.8998272884283247,
677
+ "grad_norm": 2.178372621536255,
678
+ "learning_rate": 1.2695344725987036e-06,
679
+ "loss": 5.2661,
680
+ "mean_token_accuracy": 0.3182847076654434,
681
+ "num_tokens": 3527726.0,
682
  "step": 3300
683
  },
684
  {
685
+ "entropy": 5.512614865303039,
686
  "epoch": 1.9286125503742084,
687
+ "grad_norm": 1.3050425052642822,
688
+ "learning_rate": 1.2577489687684147e-06,
689
+ "loss": 5.3416,
690
+ "mean_token_accuracy": 0.3084403133392334,
691
+ "num_tokens": 3581980.0,
692
  "step": 3350
693
  },
694
  {
695
+ "entropy": 5.379772834777832,
696
  "epoch": 1.9573978123200921,
697
+ "grad_norm": 1.4584404230117798,
698
+ "learning_rate": 1.245963464938126e-06,
699
+ "loss": 5.2087,
700
+ "mean_token_accuracy": 0.32388432770967485,
701
+ "num_tokens": 3635393.0,
702
  "step": 3400
703
  },
704
  {
705
+ "entropy": 5.483665924072266,
706
  "epoch": 1.9861830742659758,
707
+ "grad_norm": 1.2157734632492065,
708
+ "learning_rate": 1.2341779611078374e-06,
709
+ "loss": 5.3101,
710
+ "mean_token_accuracy": 0.3121953472495079,
711
+ "num_tokens": 3689894.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
+ "eval_entropy": 5.711394641805904,
717
+ "eval_loss": 5.55628776550293,
718
+ "eval_mean_token_accuracy": 0.2764948787777105,
719
+ "eval_model_preparation_time": 0.0045,
720
+ "eval_num_tokens": 3712724.0,
721
+ "eval_runtime": 50.187,
722
+ "eval_samples_per_second": 8.648,
723
+ "eval_steps_per_second": 4.324,
724
  "step": 3474
725
  },
726
  {
727
+ "entropy": 5.349283556938172,
728
  "epoch": 2.0149683362118593,
729
+ "grad_norm": 1.1696771383285522,
730
+ "learning_rate": 1.2223924572775486e-06,
731
+ "loss": 5.1782,
732
+ "mean_token_accuracy": 0.33028870791196824,
733
+ "num_tokens": 3740861.0,
734
  "step": 3500
735
  },
736
  {
737
+ "entropy": 5.4721107006073,
738
  "epoch": 2.043753598157743,
739
+ "grad_norm": 1.8449370861053467,
740
+ "learning_rate": 1.2106069534472599e-06,
741
+ "loss": 5.2978,
742
+ "mean_token_accuracy": 0.31511022299528124,
743
+ "num_tokens": 3794869.0,
744
  "step": 3550
745
  },
746
  {
747
+ "entropy": 5.404226851463318,
748
  "epoch": 2.0725388601036268,
749
+ "grad_norm": 3.789496660232544,
750
+ "learning_rate": 1.198821449616971e-06,
751
+ "loss": 5.2371,
752
+ "mean_token_accuracy": 0.32092176616191864,
753
+ "num_tokens": 3848573.0,
754
  "step": 3600
755
  },
756
  {
757
+ "entropy": 5.435445628166199,
758
  "epoch": 2.1013241220495105,
759
+ "grad_norm": 2.2847959995269775,
760
+ "learning_rate": 1.1870359457866824e-06,
761
+ "loss": 5.2662,
762
+ "mean_token_accuracy": 0.3186633634567261,
763
+ "num_tokens": 3901204.0,
764
  "step": 3650
765
  },
766
  {
767
+ "entropy": 5.4066293334960935,
768
  "epoch": 2.130109383995394,
769
+ "grad_norm": 1.0950902700424194,
770
+ "learning_rate": 1.1752504419563935e-06,
771
+ "loss": 5.2345,
772
+ "mean_token_accuracy": 0.32156052827835085,
773
+ "num_tokens": 3953753.0,
774
  "step": 3700
775
  },
776
  {
777
+ "entropy": 5.272332944869995,
778
  "epoch": 2.158894645941278,
779
+ "grad_norm": 2.1477339267730713,
780
+ "learning_rate": 1.1634649381261048e-06,
781
+ "loss": 5.1091,
782
+ "mean_token_accuracy": 0.3380983591079712,
783
+ "num_tokens": 4005481.0,
784
  "step": 3750
785
  },
786
  {
787
+ "entropy": 5.4118804311752315,
788
  "epoch": 2.1876799078871616,
789
+ "grad_norm": 1.4509484767913818,
790
+ "learning_rate": 1.151679434295816e-06,
791
+ "loss": 5.2448,
792
+ "mean_token_accuracy": 0.3208243528008461,
793
+ "num_tokens": 4058829.0,
794
  "step": 3800
795
  },
796
  {
797
+ "entropy": 5.4763900089263915,
798
  "epoch": 2.2164651698330453,
799
+ "grad_norm": 1.0856804847717285,
800
+ "learning_rate": 1.1398939304655273e-06,
801
+ "loss": 5.3042,
802
+ "mean_token_accuracy": 0.31338351368904116,
803
+ "num_tokens": 4113326.0,
804
  "step": 3850
805
  },
806
  {
807
+ "entropy": 5.328452725410461,
808
  "epoch": 2.245250431778929,
809
+ "grad_norm": 3.2843880653381348,
810
+ "learning_rate": 1.1281084266352386e-06,
811
+ "loss": 5.1624,
812
+ "mean_token_accuracy": 0.3305218696594238,
813
+ "num_tokens": 4165454.0,
814
  "step": 3900
815
  },
816
  {
817
+ "entropy": 5.383157343864441,
818
  "epoch": 2.2740356937248127,
819
+ "grad_norm": 2.207082748413086,
820
+ "learning_rate": 1.1163229228049497e-06,
821
+ "loss": 5.2163,
822
+ "mean_token_accuracy": 0.32331310987472534,
823
+ "num_tokens": 4219250.0,
824
  "step": 3950
825
  },
826
  {
827
+ "entropy": 5.585261764526368,
828
  "epoch": 2.3028209556706964,
829
+ "grad_norm": 2.7102835178375244,
830
+ "learning_rate": 1.104537418974661e-06,
831
+ "loss": 5.4137,
832
+ "mean_token_accuracy": 0.29959124475717547,
833
+ "num_tokens": 4274711.0,
834
  "step": 4000
835
  },
836
  {
837
+ "entropy": 5.434073266983032,
838
  "epoch": 2.33160621761658,
839
+ "grad_norm": 1.3775779008865356,
840
+ "learning_rate": 1.0927519151443724e-06,
841
+ "loss": 5.2644,
842
+ "mean_token_accuracy": 0.3175011593103409,
843
+ "num_tokens": 4328616.0,
844
  "step": 4050
845
  },
846
  {
847
+ "entropy": 5.462391858100891,
848
  "epoch": 2.360391479562464,
849
+ "grad_norm": 1.4101024866104126,
850
+ "learning_rate": 1.0809664113140838e-06,
851
+ "loss": 5.2924,
852
+ "mean_token_accuracy": 0.3137941011786461,
853
+ "num_tokens": 4382416.0,
854
  "step": 4100
855
  },
856
  {
857
+ "entropy": 5.529892563819885,
858
  "epoch": 2.3891767415083476,
859
+ "grad_norm": 1.2311837673187256,
860
+ "learning_rate": 1.0691809074837949e-06,
861
+ "loss": 5.364,
862
+ "mean_token_accuracy": 0.3046491605043411,
863
+ "num_tokens": 4437848.0,
864
  "step": 4150
865
  },
866
  {
867
+ "entropy": 5.4370484542846675,
868
  "epoch": 2.4179620034542313,
869
+ "grad_norm": 1.0929864645004272,
870
+ "learning_rate": 1.0573954036535062e-06,
871
+ "loss": 5.2734,
872
+ "mean_token_accuracy": 0.3169013774394989,
873
+ "num_tokens": 4491185.0,
874
  "step": 4200
875
  },
876
  {
877
+ "entropy": 5.395377616882325,
878
  "epoch": 2.446747265400115,
879
+ "grad_norm": 1.5457273721694946,
880
+ "learning_rate": 1.0456098998232174e-06,
881
+ "loss": 5.2276,
882
+ "mean_token_accuracy": 0.32221508473157884,
883
+ "num_tokens": 4544086.0,
884
  "step": 4250
885
  },
886
  {
887
+ "entropy": 5.443737335205078,
888
  "epoch": 2.4755325273459987,
889
+ "grad_norm": 1.4844346046447754,
890
+ "learning_rate": 1.0338243959929287e-06,
891
+ "loss": 5.2786,
892
+ "mean_token_accuracy": 0.3157751387357712,
893
+ "num_tokens": 4597677.0,
894
  "step": 4300
895
  },
896
  {
897
+ "entropy": 5.419876251220703,
898
  "epoch": 2.5043177892918824,
899
+ "grad_norm": 1.2481963634490967,
900
+ "learning_rate": 1.02203889216264e-06,
901
+ "loss": 5.2564,
902
+ "mean_token_accuracy": 0.31889803290367125,
903
+ "num_tokens": 4651343.0,
904
  "step": 4350
905
  },
906
  {
907
+ "entropy": 5.578677978515625,
908
  "epoch": 2.533103051237766,
909
+ "grad_norm": 2.0005414485931396,
910
+ "learning_rate": 1.0102533883323512e-06,
911
+ "loss": 5.4145,
912
+ "mean_token_accuracy": 0.30037090003490446,
913
+ "num_tokens": 4705985.0,
914
  "step": 4400
915
  },
916
  {
917
+ "entropy": 5.279946126937866,
918
  "epoch": 2.56188831318365,
919
+ "grad_norm": 1.080521821975708,
920
+ "learning_rate": 9.984678845020625e-07,
921
+ "loss": 5.1226,
922
+ "mean_token_accuracy": 0.3341303279995918,
923
+ "num_tokens": 4757741.0,
924
  "step": 4450
925
  },
926
  {
927
+ "entropy": 5.551463279724121,
928
  "epoch": 2.5906735751295336,
929
+ "grad_norm": 1.28898024559021,
930
+ "learning_rate": 9.866823806717736e-07,
931
+ "loss": 5.3832,
932
+ "mean_token_accuracy": 0.3028248634934425,
933
+ "num_tokens": 4812808.0,
934
  "step": 4500
935
  },
936
  {
937
+ "entropy": 5.3787487554550175,
938
  "epoch": 2.6194588370754173,
939
+ "grad_norm": 1.5697983503341675,
940
+ "learning_rate": 9.74896876841485e-07,
941
+ "loss": 5.2141,
942
+ "mean_token_accuracy": 0.3227942296862602,
943
+ "num_tokens": 4866572.0,
944
  "step": 4550
945
  },
946
  {
947
+ "entropy": 5.460358958244324,
948
  "epoch": 2.648244099021301,
949
+ "grad_norm": 1.3180441856384277,
950
+ "learning_rate": 9.63111373011196e-07,
951
+ "loss": 5.2954,
952
+ "mean_token_accuracy": 0.31269474506378175,
953
+ "num_tokens": 4921312.0,
954
  "step": 4600
955
  },
956
  {
957
+ "entropy": 5.434084935188293,
958
  "epoch": 2.6770293609671847,
959
+ "grad_norm": 1.2409590482711792,
960
+ "learning_rate": 9.513258691809074e-07,
961
+ "loss": 5.271,
962
+ "mean_token_accuracy": 0.3172155100107193,
963
+ "num_tokens": 4974289.0,
964
  "step": 4650
965
  },
966
  {
967
+ "entropy": 5.406955418586731,
968
  "epoch": 2.7058146229130684,
969
+ "grad_norm": 1.4782609939575195,
970
+ "learning_rate": 9.395403653506187e-07,
971
+ "loss": 5.2473,
972
+ "mean_token_accuracy": 0.32031788885593415,
973
+ "num_tokens": 5028149.0,
974
  "step": 4700
975
  },
976
  {
977
+ "entropy": 5.206603040695191,
978
  "epoch": 2.734599884858952,
979
+ "grad_norm": 2.351633071899414,
980
+ "learning_rate": 9.2775486152033e-07,
981
+ "loss": 5.0478,
982
+ "mean_token_accuracy": 0.3428420132398605,
983
+ "num_tokens": 5079349.0,
984
  "step": 4750
985
  },
986
  {
987
+ "entropy": 5.388812799453735,
988
  "epoch": 2.763385146804836,
989
+ "grad_norm": 7.564618110656738,
990
+ "learning_rate": 9.159693576900412e-07,
991
+ "loss": 5.2281,
992
+ "mean_token_accuracy": 0.3222071170806885,
993
+ "num_tokens": 5132564.0,
994
  "step": 4800
995
  },
996
  {
997
+ "entropy": 5.374106278419495,
998
  "epoch": 2.7921704087507195,
999
+ "grad_norm": 1.4734679460525513,
1000
+ "learning_rate": 9.041838538597525e-07,
1001
+ "loss": 5.2161,
1002
+ "mean_token_accuracy": 0.3219477406144142,
1003
+ "num_tokens": 5185921.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
+ "entropy": 5.232998585700988,
1008
  "epoch": 2.8209556706966032,
1009
+ "grad_norm": 1.4175471067428589,
1010
+ "learning_rate": 8.923983500294637e-07,
1011
+ "loss": 5.0769,
1012
+ "mean_token_accuracy": 0.3403926733136177,
1013
+ "num_tokens": 5237521.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
+ "entropy": 5.394891719818116,
1018
  "epoch": 2.849740932642487,
1019
+ "grad_norm": 4.951873779296875,
1020
+ "learning_rate": 8.806128461991749e-07,
1021
+ "loss": 5.2344,
1022
+ "mean_token_accuracy": 0.3213117456436157,
1023
+ "num_tokens": 5291104.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
+ "entropy": 5.413805012702942,
1028
  "epoch": 2.8785261945883707,
1029
+ "grad_norm": 1.679518461227417,
1030
+ "learning_rate": 8.688273423688863e-07,
1031
+ "loss": 5.2597,
1032
+ "mean_token_accuracy": 0.3165634173154831,
1033
+ "num_tokens": 5345058.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
+ "entropy": 5.256177935600281,
1038
  "epoch": 2.9073114565342544,
1039
+ "grad_norm": 1.8892916440963745,
1040
+ "learning_rate": 8.570418385385975e-07,
1041
+ "loss": 5.1004,
1042
+ "mean_token_accuracy": 0.3369427987933159,
1043
+ "num_tokens": 5395918.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
+ "entropy": 5.259814453125,
1048
  "epoch": 2.936096718480138,
1049
+ "grad_norm": 1.3802675008773804,
1050
+ "learning_rate": 8.452563347083087e-07,
1051
+ "loss": 5.1057,
1052
+ "mean_token_accuracy": 0.3362414276599884,
1053
+ "num_tokens": 5448086.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
+ "entropy": 5.416206178665161,
1058
  "epoch": 2.964881980426022,
1059
+ "grad_norm": 1.7677236795425415,
1060
+ "learning_rate": 8.3347083087802e-07,
1061
+ "loss": 5.2562,
1062
+ "mean_token_accuracy": 0.31725785195827483,
1063
+ "num_tokens": 5501959.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
+ "entropy": 5.507337794303894,
1068
  "epoch": 2.9936672423719055,
1069
+ "grad_norm": 1.021727442741394,
1070
+ "learning_rate": 8.216853270477313e-07,
1071
+ "loss": 5.344,
1072
+ "mean_token_accuracy": 0.30679062128067014,
1073
+ "num_tokens": 5557908.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
+ "eval_entropy": 5.682707933786278,
1079
+ "eval_loss": 5.53223991394043,
1080
+ "eval_mean_token_accuracy": 0.27747743456594404,
1081
+ "eval_model_preparation_time": 0.0045,
1082
+ "eval_num_tokens": 5569086.0,
1083
+ "eval_runtime": 49.9944,
1084
+ "eval_samples_per_second": 8.681,
1085
+ "eval_steps_per_second": 4.34,
1086
  "step": 5211
1087
  },
1088
  {
1089
+ "entropy": 5.209756035804748,
1090
  "epoch": 3.0224525043177892,
1091
+ "grad_norm": 1.725786566734314,
1092
+ "learning_rate": 8.098998232174425e-07,
1093
+ "loss": 5.0541,
1094
+ "mean_token_accuracy": 0.34166110813617706,
1095
+ "num_tokens": 5608917.0,
1096
  "step": 5250
1097
  },
1098
  {
1099
+ "entropy": 5.396296281814575,
1100
  "epoch": 3.051237766263673,
1101
+ "grad_norm": 0.7720207571983337,
1102
+ "learning_rate": 7.981143193871538e-07,
1103
+ "loss": 5.2337,
1104
+ "mean_token_accuracy": 0.32116260558366777,
1105
+ "num_tokens": 5662712.0,
1106
  "step": 5300
1107
  },
1108
  {
1109
+ "entropy": 5.341518473625183,
1110
  "epoch": 3.0800230282095566,
1111
+ "grad_norm": 2.2686808109283447,
1112
+ "learning_rate": 7.86328815556865e-07,
1113
+ "loss": 5.1824,
1114
+ "mean_token_accuracy": 0.32726580530405047,
1115
+ "num_tokens": 5715921.0,
1116
  "step": 5350
1117
  },
1118
  {
1119
+ "entropy": 5.376176896095276,
1120
  "epoch": 3.1088082901554404,
1121
+ "grad_norm": 1.2420796155929565,
1122
+ "learning_rate": 7.745433117265762e-07,
1123
+ "loss": 5.2162,
1124
+ "mean_token_accuracy": 0.32142678707838057,
1125
+ "num_tokens": 5769436.0,
1126
  "step": 5400
1127
  },
1128
  {
1129
+ "entropy": 5.4553061914443965,
1130
  "epoch": 3.137593552101324,
1131
+ "grad_norm": 1.2402859926223755,
1132
+ "learning_rate": 7.627578078962876e-07,
1133
+ "loss": 5.2971,
1134
+ "mean_token_accuracy": 0.31396267503499986,
1135
+ "num_tokens": 5823126.0,
1136
  "step": 5450
1137
  },
1138
  {
1139
+ "entropy": 5.385247969627381,
1140
  "epoch": 3.166378814047208,
1141
+ "grad_norm": 1.112062931060791,
1142
+ "learning_rate": 7.509723040659988e-07,
1143
+ "loss": 5.2324,
1144
+ "mean_token_accuracy": 0.3207343602180481,
1145
+ "num_tokens": 5875751.0,
1146
  "step": 5500
1147
  },
1148
  {
1149
+ "entropy": 5.55422221660614,
1150
  "epoch": 3.1951640759930915,
1151
+ "grad_norm": 1.5440446138381958,
1152
+ "learning_rate": 7.3918680023571e-07,
1153
+ "loss": 5.3902,
1154
+ "mean_token_accuracy": 0.3006985321640968,
1155
+ "num_tokens": 5932163.0,
1156
  "step": 5550
1157
  },
1158
  {
1159
+ "entropy": 5.403217372894287,
1160
  "epoch": 3.223949337938975,
1161
+ "grad_norm": 0.8481096625328064,
1162
+ "learning_rate": 7.274012964054213e-07,
1163
+ "loss": 5.2417,
1164
+ "mean_token_accuracy": 0.3210747820138931,
1165
+ "num_tokens": 5985889.0,
1166
  "step": 5600
1167
  },
1168
  {
1169
+ "entropy": 5.388293180465698,
1170
  "epoch": 3.252734599884859,
1171
+ "grad_norm": 0.9305989146232605,
1172
+ "learning_rate": 7.156157925751326e-07,
1173
+ "loss": 5.2319,
1174
+ "mean_token_accuracy": 0.3206030324101448,
1175
+ "num_tokens": 6040052.0,
1176
  "step": 5650
1177
  },
1178
  {
1179
+ "entropy": 5.401709322929382,
1180
  "epoch": 3.2815198618307426,
1181
+ "grad_norm": 0.8080459237098694,
1182
+ "learning_rate": 7.038302887448438e-07,
1183
+ "loss": 5.2438,
1184
+ "mean_token_accuracy": 0.3199671137332916,
1185
+ "num_tokens": 6092350.0,
1186
  "step": 5700
1187
  },
1188
  {
1189
+ "entropy": 5.4320423412323,
1190
  "epoch": 3.3103051237766263,
1191
+ "grad_norm": 1.9186089038848877,
1192
+ "learning_rate": 6.920447849145551e-07,
1193
+ "loss": 5.2696,
1194
+ "mean_token_accuracy": 0.31657984614372253,
1195
+ "num_tokens": 6146112.0,
1196
  "step": 5750
1197
  },
1198
  {
1199
+ "entropy": 5.276471285820008,
1200
  "epoch": 3.33909038572251,
1201
+ "grad_norm": 1.032879114151001,
1202
+ "learning_rate": 6.802592810842663e-07,
1203
+ "loss": 5.1224,
1204
+ "mean_token_accuracy": 0.3347566506266594,
1205
+ "num_tokens": 6197916.0,
1206
  "step": 5800
1207
  },
1208
  {
1209
+ "entropy": 5.122317051887512,
1210
  "epoch": 3.3678756476683938,
1211
+ "grad_norm": 3.156858444213867,
1212
+ "learning_rate": 6.684737772539775e-07,
1213
+ "loss": 4.9706,
1214
+ "mean_token_accuracy": 0.35455317378044127,
1215
+ "num_tokens": 6247565.0,
1216
  "step": 5850
1217
  },
1218
  {
1219
+ "entropy": 5.346597375869751,
1220
  "epoch": 3.3966609096142775,
1221
+ "grad_norm": 1.2619549036026,
1222
+ "learning_rate": 6.566882734236889e-07,
1223
+ "loss": 5.1902,
1224
+ "mean_token_accuracy": 0.3258721518516541,
1225
+ "num_tokens": 6300481.0,
1226
  "step": 5900
1227
  },
1228
  {
1229
+ "entropy": 5.413151068687439,
1230
  "epoch": 3.425446171560161,
1231
+ "grad_norm": 1.801740050315857,
1232
+ "learning_rate": 6.449027695934001e-07,
1233
+ "loss": 5.2513,
1234
+ "mean_token_accuracy": 0.3187857499718666,
1235
+ "num_tokens": 6353098.0,
1236
  "step": 5950
1237
  },
1238
  {
1239
+ "entropy": 5.464186942577362,
1240
  "epoch": 3.454231433506045,
1241
+ "grad_norm": 1.6306997537612915,
1242
+ "learning_rate": 6.331172657631113e-07,
1243
+ "loss": 5.3043,
1244
+ "mean_token_accuracy": 0.31154109388589857,
1245
+ "num_tokens": 6407984.0,
1246
  "step": 6000
1247
  },
1248
  {
1249
+ "entropy": 5.401795778274536,
1250
  "epoch": 3.4830166954519286,
1251
+ "grad_norm": 1.1694583892822266,
1252
+ "learning_rate": 6.213317619328226e-07,
1253
+ "loss": 5.2427,
1254
+ "mean_token_accuracy": 0.31954523265361784,
1255
+ "num_tokens": 6461854.0,
1256
  "step": 6050
1257
  },
1258
  {
1259
+ "entropy": 5.317689285278321,
1260
  "epoch": 3.5118019573978123,
1261
+ "grad_norm": 0.9361855387687683,
1262
+ "learning_rate": 6.095462581025339e-07,
1263
+ "loss": 5.1588,
1264
+ "mean_token_accuracy": 0.330586878657341,
1265
+ "num_tokens": 6514882.0,
1266
  "step": 6100
1267
  },
1268
  {
1269
+ "entropy": 5.478708257675171,
1270
  "epoch": 3.540587219343696,
1271
+ "grad_norm": 1.05711030960083,
1272
+ "learning_rate": 5.977607542722451e-07,
1273
+ "loss": 5.321,
1274
+ "mean_token_accuracy": 0.3104448106884956,
1275
+ "num_tokens": 6569455.0,
1276
  "step": 6150
1277
  },
1278
  {
1279
+ "entropy": 5.309361801147461,
1280
  "epoch": 3.5693724812895797,
1281
+ "grad_norm": 1.3499550819396973,
1282
+ "learning_rate": 5.859752504419564e-07,
1283
+ "loss": 5.153,
1284
+ "mean_token_accuracy": 0.331512533724308,
1285
+ "num_tokens": 6621734.0,
1286
  "step": 6200
1287
  },
1288
  {
1289
+ "entropy": 5.296572666168213,
1290
  "epoch": 3.5981577432354634,
1291
+ "grad_norm": 1.940708875656128,
1292
+ "learning_rate": 5.741897466116676e-07,
1293
+ "loss": 5.14,
1294
+ "mean_token_accuracy": 0.3299832499027252,
1295
+ "num_tokens": 6674994.0,
1296
  "step": 6250
1297
  },
1298
  {
1299
+ "entropy": 5.544284400939941,
1300
  "epoch": 3.626943005181347,
1301
+ "grad_norm": 1.8903827667236328,
1302
+ "learning_rate": 5.624042427813788e-07,
1303
+ "loss": 5.3885,
1304
+ "mean_token_accuracy": 0.3016947290301323,
1305
+ "num_tokens": 6730674.0,
1306
  "step": 6300
1307
  },
1308
  {
1309
+ "entropy": 5.333053431510925,
1310
  "epoch": 3.655728267127231,
1311
+ "grad_norm": 1.1618578433990479,
1312
+ "learning_rate": 5.506187389510902e-07,
1313
+ "loss": 5.1781,
1314
+ "mean_token_accuracy": 0.3275001719594002,
1315
+ "num_tokens": 6784235.0,
1316
  "step": 6350
1317
  },
1318
  {
1319
+ "entropy": 5.4938449716568,
1320
  "epoch": 3.6845135290731146,
1321
+ "grad_norm": 1.384329080581665,
1322
+ "learning_rate": 5.388332351208014e-07,
1323
+ "loss": 5.3399,
1324
+ "mean_token_accuracy": 0.3068840709328651,
1325
+ "num_tokens": 6839590.0,
1326
  "step": 6400
1327
  },
1328
  {
1329
+ "entropy": 5.277545223236084,
1330
  "epoch": 3.7132987910189983,
1331
+ "grad_norm": 1.8918265104293823,
1332
+ "learning_rate": 5.270477312905126e-07,
1333
+ "loss": 5.1221,
1334
+ "mean_token_accuracy": 0.33364981949329375,
1335
+ "num_tokens": 6891301.0,
1336
  "step": 6450
1337
  },
1338
  {
1339
+ "entropy": 5.40100293636322,
1340
  "epoch": 3.742084052964882,
1341
+ "grad_norm": 1.6968809366226196,
1342
+ "learning_rate": 5.152622274602239e-07,
1343
+ "loss": 5.2471,
1344
+ "mean_token_accuracy": 0.31912936180830004,
1345
+ "num_tokens": 6945510.0,
1346
  "step": 6500
1347
  },
1348
  {
1349
+ "entropy": 5.561220169067383,
1350
  "epoch": 3.7708693149107657,
1351
+ "grad_norm": 2.066960573196411,
1352
+ "learning_rate": 5.034767236299352e-07,
1353
+ "loss": 5.4026,
1354
+ "mean_token_accuracy": 0.2984810543060303,
1355
+ "num_tokens": 7001870.0,
1356
  "step": 6550
1357
  },
1358
  {
1359
+ "entropy": 5.3108087682724,
1360
  "epoch": 3.7996545768566494,
1361
+ "grad_norm": 1.6065007448196411,
1362
+ "learning_rate": 4.916912197996464e-07,
1363
+ "loss": 5.155,
1364
+ "mean_token_accuracy": 0.3304683968424797,
1365
+ "num_tokens": 7053974.0,
1366
  "step": 6600
1367
  },
1368
  {
1369
+ "entropy": 5.323807754516602,
1370
  "epoch": 3.828439838802533,
1371
+ "grad_norm": 2.6806318759918213,
1372
+ "learning_rate": 4.799057159693577e-07,
1373
+ "loss": 5.1653,
1374
+ "mean_token_accuracy": 0.3294159671664238,
1375
+ "num_tokens": 7107061.0,
1376
  "step": 6650
1377
  },
1378
  {
1379
+ "entropy": 5.4716163873672485,
1380
  "epoch": 3.857225100748417,
1381
+ "grad_norm": 1.8264856338500977,
1382
+ "learning_rate": 4.6812021213906895e-07,
1383
+ "loss": 5.3124,
1384
+ "mean_token_accuracy": 0.3109353107213974,
1385
+ "num_tokens": 7161697.0,
1386
  "step": 6700
1387
  },
1388
  {
1389
+ "entropy": 5.382365622520447,
1390
  "epoch": 3.8860103626943006,
1391
+ "grad_norm": 0.9954923987388611,
1392
+ "learning_rate": 4.563347083087802e-07,
1393
+ "loss": 5.2237,
1394
+ "mean_token_accuracy": 0.32161149621009827,
1395
+ "num_tokens": 7215524.0,
1396
  "step": 6750
1397
  },
1398
  {
1399
+ "entropy": 5.277496585845947,
1400
  "epoch": 3.9147956246401843,
1401
+ "grad_norm": 1.267786979675293,
1402
+ "learning_rate": 4.445492044784914e-07,
1403
+ "loss": 5.1265,
1404
+ "mean_token_accuracy": 0.3319795566797257,
1405
+ "num_tokens": 7267329.0,
1406
  "step": 6800
1407
  },
1408
  {
1409
+ "entropy": 5.550942025184631,
1410
  "epoch": 3.943580886586068,
1411
+ "grad_norm": 0.9425063133239746,
1412
+ "learning_rate": 4.3276370064820265e-07,
1413
+ "loss": 5.3898,
1414
+ "mean_token_accuracy": 0.30050904959440233,
1415
+ "num_tokens": 7324070.0,
1416
  "step": 6850
1417
  },
1418
  {
1419
+ "entropy": 5.125799627304077,
1420
  "epoch": 3.9723661485319517,
1421
+ "grad_norm": 5.447021007537842,
1422
+ "learning_rate": 4.20978196817914e-07,
1423
+ "loss": 4.9781,
1424
+ "mean_token_accuracy": 0.3520450854301453,
1425
+ "num_tokens": 7375083.0,
1426
  "step": 6900
1427
  },
1428
  {
1429
  "epoch": 4.0,
1430
+ "eval_entropy": 5.6681923492712905,
1431
+ "eval_loss": 5.525067329406738,
1432
+ "eval_mean_token_accuracy": 0.2779707208893816,
1433
+ "eval_model_preparation_time": 0.0045,
1434
+ "eval_num_tokens": 7425448.0,
1435
+ "eval_runtime": 49.7944,
1436
+ "eval_samples_per_second": 8.716,
1437
+ "eval_steps_per_second": 4.358,
1438
  "step": 6948
1439
  }
1440
  ],
1441
  "logging_steps": 50,
1442
+ "max_steps": 8685,
1443
  "num_input_tokens_seen": 0,
1444
+ "num_train_epochs": 5,
1445
  "save_steps": 500,
1446
  "stateful_callbacks": {
1447
  "TrainerControl": {
 
1450
  "should_evaluate": false,
1451
  "should_log": false,
1452
  "should_save": true,
1453
+ "should_training_stop": false
1454
  },
1455
  "attributes": {}
1456
  }
1457
  },
1458
+ "total_flos": 1.016969752533504e+17,
1459
  "train_batch_size": 2,
1460
  "trial_name": null,
1461
  "trial_params": null
checkpoint-6948/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8db5c304963110404ebb6947b83ba95bd9b8aad1f9b8b578cc33c46d601e13dc
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a666397e6243ddba6f7279c90610ed552907ef4de0be511faece3826d13e618
3
  size 6225
checkpoint-8685/adapter_config.json CHANGED
@@ -25,14 +25,14 @@
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
- "r": 8,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "k_proj",
33
  "v_proj",
34
- "q_proj",
35
- "o_proj"
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
 
25
  "peft_type": "LORA",
26
  "peft_version": "0.18.0",
27
  "qalora_group_size": 16,
28
+ "r": 24,
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "o_proj",
33
  "k_proj",
34
  "v_proj",
35
+ "q_proj"
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
checkpoint-8685/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2381d61542c1032294bdfd8d93b87c507ec0307a2bd423dfa1c90ac19f153434
3
- size 8749064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b4ecb107db701acdc04f96300149f10454a4f22cc800cab0b968eae74c3415
3
+ size 26182176
checkpoint-8685/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36cfb0e2c01a3583f649b7157010998f7cfe60c81f2d8dd9f8a236e6ac0ea717
3
- size 17621003
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1299200e098e830dd07921adbf8a60a6476e8cf36d7b7f707c1a922d4319d4d
3
+ size 52486155
checkpoint-8685/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9e2210ef2bde81f3b01f24b5d3b56f7929de6f0dc6c10e40739165ab0cf536d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:400a0ef098a3a7945e367fce95960239dc97197e53382a77b353016a149f9f93
3
  size 14645
checkpoint-8685/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa0ca13eecc178cf19160562d582bc8e65df34a81c00371e4700177518add503
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22e56af9f486c216b1657390be7da71b3945baf2e0925cfac7cc23d69d3cd231
3
  size 1465
checkpoint-8685/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 8685,
3
- "best_metric": 5.603951930999756,
4
  "best_model_checkpoint": "./output/checkpoint-8685",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
@@ -10,1800 +10,1800 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 3.657764935493469,
14
  "epoch": 0.028785261945883708,
15
- "grad_norm": 5.520808696746826,
16
  "learning_rate": 4.9e-07,
17
- "loss": 13.8756,
18
- "mean_token_accuracy": 0.15039000600576402,
19
- "num_tokens": 53093.0,
20
  "step": 50
21
  },
22
  {
23
- "entropy": 3.6701580238342286,
24
  "epoch": 0.057570523891767415,
25
- "grad_norm": 5.364443302154541,
26
  "learning_rate": 9.9e-07,
27
- "loss": 14.2271,
28
- "mean_token_accuracy": 0.14123578995466232,
29
- "num_tokens": 108334.0,
30
  "step": 100
31
  },
32
  {
33
- "entropy": 3.571350417137146,
34
  "epoch": 0.08635578583765112,
35
- "grad_norm": 6.120348930358887,
36
  "learning_rate": 1.49e-06,
37
- "loss": 13.0719,
38
- "mean_token_accuracy": 0.17464223861694336,
39
- "num_tokens": 157491.0,
40
  "step": 150
41
  },
42
  {
43
- "entropy": 3.7263419818878174,
44
  "epoch": 0.11514104778353483,
45
- "grad_norm": 7.090941429138184,
46
  "learning_rate": 1.99e-06,
47
- "loss": 13.7425,
48
- "mean_token_accuracy": 0.14771999716758727,
49
- "num_tokens": 211394.0,
50
  "step": 200
51
  },
52
  {
53
- "entropy": 3.8246818876266477,
54
  "epoch": 0.14392630972941853,
55
- "grad_norm": 7.317005157470703,
56
- "learning_rate": 1.9904128350616315e-06,
57
- "loss": 13.0931,
58
- "mean_token_accuracy": 0.16831266060471534,
59
- "num_tokens": 263685.0,
60
  "step": 250
61
  },
62
  {
63
- "entropy": 4.055451736450196,
64
  "epoch": 0.17271157167530224,
65
- "grad_norm": 7.5889105796813965,
66
- "learning_rate": 1.98063001369595e-06,
67
- "loss": 12.6562,
68
- "mean_token_accuracy": 0.17028855353593828,
69
- "num_tokens": 314059.0,
70
  "step": 300
71
  },
72
  {
73
- "entropy": 4.237709469795227,
74
  "epoch": 0.20149683362118595,
75
- "grad_norm": 9.854535102844238,
76
- "learning_rate": 1.970847192330268e-06,
77
- "loss": 12.2612,
78
- "mean_token_accuracy": 0.1734047804772854,
79
- "num_tokens": 367038.0,
80
  "step": 350
81
  },
82
  {
83
- "entropy": 4.658133435249328,
84
  "epoch": 0.23028209556706966,
85
- "grad_norm": 18.193012237548828,
86
- "learning_rate": 1.961064370964586e-06,
87
- "loss": 12.0381,
88
- "mean_token_accuracy": 0.16058035269379617,
89
- "num_tokens": 420327.0,
90
  "step": 400
91
  },
92
  {
93
- "entropy": 5.148408350944519,
94
  "epoch": 0.25906735751295334,
95
- "grad_norm": 23.47320556640625,
96
- "learning_rate": 1.9512815495989045e-06,
97
- "loss": 10.9474,
98
- "mean_token_accuracy": 0.16484014570713043,
99
- "num_tokens": 472429.0,
100
  "step": 450
101
  },
102
  {
103
- "entropy": 5.857514944076538,
104
  "epoch": 0.28785261945883706,
105
- "grad_norm": 38.577083587646484,
106
- "learning_rate": 1.9414987282332225e-06,
107
- "loss": 9.4627,
108
- "mean_token_accuracy": 0.16253757804632188,
109
- "num_tokens": 526315.0,
110
  "step": 500
111
  },
112
  {
113
- "entropy": 6.357027115821839,
114
  "epoch": 0.31663788140472077,
115
- "grad_norm": 21.497377395629883,
116
- "learning_rate": 1.9317159068675404e-06,
117
- "loss": 8.0126,
118
- "mean_token_accuracy": 0.16963028475642206,
119
- "num_tokens": 578511.0,
120
  "step": 550
121
  },
122
  {
123
- "entropy": 6.513781118392944,
124
  "epoch": 0.3454231433506045,
125
- "grad_norm": 7.595526218414307,
126
- "learning_rate": 1.921933085501859e-06,
127
- "loss": 7.4114,
128
- "mean_token_accuracy": 0.16999967724084855,
129
- "num_tokens": 630937.0,
130
  "step": 600
131
  },
132
  {
133
- "entropy": 6.396916694641114,
134
  "epoch": 0.3742084052964882,
135
- "grad_norm": 7.277398109436035,
136
- "learning_rate": 1.9121502641361767e-06,
137
- "loss": 6.915,
138
- "mean_token_accuracy": 0.18526431500911714,
139
- "num_tokens": 680501.0,
140
  "step": 650
141
  },
142
  {
143
- "entropy": 6.685241918563843,
144
  "epoch": 0.4029936672423719,
145
- "grad_norm": 6.1464009284973145,
146
- "learning_rate": 1.902367442770495e-06,
147
- "loss": 6.939,
148
- "mean_token_accuracy": 0.17030285254120828,
149
- "num_tokens": 733231.0,
150
  "step": 700
151
  },
152
  {
153
- "entropy": 6.59221194267273,
154
  "epoch": 0.4317789291882556,
155
- "grad_norm": 4.717687129974365,
156
- "learning_rate": 1.892584621404813e-06,
157
- "loss": 6.6518,
158
- "mean_token_accuracy": 0.18416573852300644,
159
- "num_tokens": 785373.0,
160
  "step": 750
161
  },
162
  {
163
- "entropy": 6.6572853946685795,
164
  "epoch": 0.4605641911341393,
165
- "grad_norm": 3.5309388637542725,
166
- "learning_rate": 1.8828018000391312e-06,
167
- "loss": 6.6038,
168
- "mean_token_accuracy": 0.17593510583043098,
169
- "num_tokens": 838646.0,
170
  "step": 800
171
  },
172
  {
173
- "entropy": 6.598165597915649,
174
  "epoch": 0.48934945308002303,
175
- "grad_norm": 4.042945384979248,
176
- "learning_rate": 1.8730189786734493e-06,
177
- "loss": 6.4659,
178
- "mean_token_accuracy": 0.184499751329422,
179
- "num_tokens": 892380.0,
180
  "step": 850
181
  },
182
  {
183
- "entropy": 6.5754234790802,
184
  "epoch": 0.5181347150259067,
185
- "grad_norm": 3.1833558082580566,
186
- "learning_rate": 1.8632361573077675e-06,
187
- "loss": 6.3822,
188
- "mean_token_accuracy": 0.19283706933259964,
189
- "num_tokens": 947971.0,
190
  "step": 900
191
  },
192
  {
193
- "entropy": 6.213122038841248,
194
  "epoch": 0.5469199769717904,
195
- "grad_norm": 4.177810192108154,
196
- "learning_rate": 1.8534533359420857e-06,
197
- "loss": 5.9971,
198
- "mean_token_accuracy": 0.23489593595266342,
199
- "num_tokens": 998810.0,
200
  "step": 950
201
  },
202
  {
203
- "entropy": 6.298044099807739,
204
  "epoch": 0.5757052389176741,
205
- "grad_norm": 3.098109722137451,
206
- "learning_rate": 1.8436705145764038e-06,
207
- "loss": 6.0561,
208
- "mean_token_accuracy": 0.22645144850015642,
209
- "num_tokens": 1050860.0,
210
  "step": 1000
211
  },
212
  {
213
- "entropy": 6.316655559539795,
214
  "epoch": 0.6044905008635578,
215
- "grad_norm": 4.703200817108154,
216
- "learning_rate": 1.833887693210722e-06,
217
- "loss": 6.0663,
218
- "mean_token_accuracy": 0.22275402665138244,
219
- "num_tokens": 1104304.0,
220
  "step": 1050
221
  },
222
  {
223
- "entropy": 6.038654108047485,
224
  "epoch": 0.6332757628094415,
225
- "grad_norm": 4.759252548217773,
226
- "learning_rate": 1.8241048718450401e-06,
227
- "loss": 5.7784,
228
- "mean_token_accuracy": 0.25002532452344894,
229
- "num_tokens": 1153605.0,
230
  "step": 1100
231
  },
232
  {
233
- "entropy": 5.933659896850586,
234
  "epoch": 0.6620610247553252,
235
- "grad_norm": 8.207566261291504,
236
- "learning_rate": 1.814322050479358e-06,
237
- "loss": 5.6655,
238
- "mean_token_accuracy": 0.2621264266967773,
239
- "num_tokens": 1204328.0,
240
  "step": 1150
241
  },
242
  {
243
- "entropy": 6.116096878051758,
244
  "epoch": 0.690846286701209,
245
- "grad_norm": 6.12317419052124,
246
- "learning_rate": 1.8045392291136762e-06,
247
- "loss": 5.839,
248
- "mean_token_accuracy": 0.24515481561422348,
249
- "num_tokens": 1257415.0,
250
  "step": 1200
251
  },
252
  {
253
- "entropy": 6.150517730712891,
254
  "epoch": 0.7196315486470927,
255
- "grad_norm": 5.22706937789917,
256
- "learning_rate": 1.7947564077479944e-06,
257
- "loss": 5.8714,
258
- "mean_token_accuracy": 0.24524727016687392,
259
- "num_tokens": 1310049.0,
260
  "step": 1250
261
  },
262
  {
263
- "entropy": 6.066065754890442,
264
  "epoch": 0.7484168105929764,
265
- "grad_norm": 5.889018535614014,
266
- "learning_rate": 1.7849735863823125e-06,
267
- "loss": 5.7909,
268
- "mean_token_accuracy": 0.2534559938311577,
269
- "num_tokens": 1361794.0,
270
  "step": 1300
271
  },
272
  {
273
- "entropy": 5.9255893468856815,
274
  "epoch": 0.7772020725388601,
275
- "grad_norm": 6.514400005340576,
276
- "learning_rate": 1.7751907650166307e-06,
277
- "loss": 5.6586,
278
- "mean_token_accuracy": 0.26638238221406935,
279
- "num_tokens": 1412773.0,
280
  "step": 1350
281
  },
282
  {
283
- "entropy": 6.057256698608398,
284
  "epoch": 0.8059873344847438,
285
- "grad_norm": 8.18265438079834,
286
- "learning_rate": 1.7654079436509488e-06,
287
- "loss": 5.7887,
288
- "mean_token_accuracy": 0.2534638229012489,
289
- "num_tokens": 1465620.0,
290
  "step": 1400
291
  },
292
  {
293
- "entropy": 6.05293836593628,
294
  "epoch": 0.8347725964306275,
295
- "grad_norm": 5.678530216217041,
296
- "learning_rate": 1.755625122285267e-06,
297
- "loss": 5.7898,
298
- "mean_token_accuracy": 0.2528185424208641,
299
- "num_tokens": 1518899.0,
300
  "step": 1450
301
  },
302
  {
303
- "entropy": 5.990337147712707,
304
  "epoch": 0.8635578583765112,
305
- "grad_norm": 6.633603096008301,
306
- "learning_rate": 1.7458423009195851e-06,
307
- "loss": 5.7232,
308
- "mean_token_accuracy": 0.26182772636413576,
309
- "num_tokens": 1571304.0,
310
  "step": 1500
311
  },
312
  {
313
- "entropy": 6.063954038619995,
314
  "epoch": 0.8923431203223949,
315
- "grad_norm": 5.510496139526367,
316
- "learning_rate": 1.7360594795539033e-06,
317
- "loss": 5.7971,
318
- "mean_token_accuracy": 0.2533514684438705,
319
- "num_tokens": 1627369.0,
320
  "step": 1550
321
  },
322
  {
323
- "entropy": 5.924159088134766,
324
  "epoch": 0.9211283822682786,
325
- "grad_norm": 4.613114356994629,
326
- "learning_rate": 1.7262766581882212e-06,
327
- "loss": 5.6507,
328
- "mean_token_accuracy": 0.27155053317546846,
329
- "num_tokens": 1680401.0,
330
  "step": 1600
331
  },
332
  {
333
- "entropy": 6.103739204406739,
334
  "epoch": 0.9499136442141624,
335
- "grad_norm": 5.6744842529296875,
336
- "learning_rate": 1.7164938368225394e-06,
337
- "loss": 5.8274,
338
- "mean_token_accuracy": 0.25028307527303695,
339
- "num_tokens": 1735745.0,
340
  "step": 1650
341
  },
342
  {
343
- "entropy": 5.8188560962677,
344
  "epoch": 0.9786989061600461,
345
- "grad_norm": 5.090628147125244,
346
- "learning_rate": 1.7067110154568575e-06,
347
- "loss": 5.5508,
348
- "mean_token_accuracy": 0.27921974420547485,
349
- "num_tokens": 1786896.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
- "eval_entropy": 6.118273651544949,
355
- "eval_loss": 5.848378658294678,
356
- "eval_mean_token_accuracy": 0.23943741564651788,
357
- "eval_model_preparation_time": 0.0048,
358
- "eval_num_tokens": 1825107.0,
359
- "eval_runtime": 79.9471,
360
- "eval_samples_per_second": 5.429,
361
- "eval_steps_per_second": 2.714,
362
  "step": 1737
363
  },
364
  {
365
- "entropy": 5.8781821727752686,
366
  "epoch": 1.0074841681059297,
367
- "grad_norm": 4.617892742156982,
368
- "learning_rate": 1.6969281940911757e-06,
369
- "loss": 5.5999,
370
- "mean_token_accuracy": 0.272807405591011,
371
- "num_tokens": 1838864.0,
372
  "step": 1750
373
  },
374
  {
375
- "entropy": 5.990532498359681,
376
  "epoch": 1.0362694300518134,
377
- "grad_norm": 4.281843185424805,
378
- "learning_rate": 1.687145372725494e-06,
379
- "loss": 5.7122,
380
- "mean_token_accuracy": 0.2623971113562584,
381
- "num_tokens": 1893816.0,
382
  "step": 1800
383
  },
384
  {
385
- "entropy": 5.881164779663086,
386
  "epoch": 1.065054691997697,
387
- "grad_norm": 5.956342697143555,
388
- "learning_rate": 1.6773625513598122e-06,
389
- "loss": 5.5935,
390
- "mean_token_accuracy": 0.281170434653759,
391
- "num_tokens": 1946532.0,
392
  "step": 1850
393
  },
394
  {
395
- "entropy": 5.846065778732299,
396
  "epoch": 1.0938399539435808,
397
- "grad_norm": 2.9725143909454346,
398
- "learning_rate": 1.6675797299941304e-06,
399
- "loss": 5.5704,
400
- "mean_token_accuracy": 0.28434582442045214,
401
- "num_tokens": 2000184.0,
402
  "step": 1900
403
  },
404
  {
405
- "entropy": 5.80983151435852,
406
  "epoch": 1.1226252158894645,
407
- "grad_norm": 7.230545520782471,
408
- "learning_rate": 1.6577969086284485e-06,
409
- "loss": 5.5435,
410
- "mean_token_accuracy": 0.28852490842342376,
411
- "num_tokens": 2052250.0,
412
  "step": 1950
413
  },
414
  {
415
- "entropy": 5.764914684295654,
416
  "epoch": 1.1514104778353482,
417
- "grad_norm": 5.969006538391113,
418
- "learning_rate": 1.6480140872627667e-06,
419
- "loss": 5.505,
420
- "mean_token_accuracy": 0.291971475481987,
421
- "num_tokens": 2103531.0,
422
  "step": 2000
423
  },
424
  {
425
- "entropy": 5.847496213912964,
426
  "epoch": 1.180195739781232,
427
- "grad_norm": 5.924343109130859,
428
- "learning_rate": 1.6382312658970846e-06,
429
- "loss": 5.5929,
430
- "mean_token_accuracy": 0.28092912048101426,
431
- "num_tokens": 2157098.0,
432
  "step": 2050
433
  },
434
  {
435
- "entropy": 5.761440043449402,
436
  "epoch": 1.2089810017271156,
437
- "grad_norm": 3.5072126388549805,
438
- "learning_rate": 1.6284484445314028e-06,
439
- "loss": 5.5199,
440
- "mean_token_accuracy": 0.2879746726155281,
441
- "num_tokens": 2211827.0,
442
  "step": 2100
443
  },
444
  {
445
- "entropy": 5.64789267539978,
446
  "epoch": 1.2377662636729994,
447
- "grad_norm": 3.549797534942627,
448
- "learning_rate": 1.618665623165721e-06,
449
- "loss": 5.4051,
450
- "mean_token_accuracy": 0.30424477279186246,
451
- "num_tokens": 2262174.0,
452
  "step": 2150
453
  },
454
  {
455
- "entropy": 5.837811284065246,
456
  "epoch": 1.266551525618883,
457
- "grad_norm": 3.551928758621216,
458
- "learning_rate": 1.608882801800039e-06,
459
- "loss": 5.5924,
460
- "mean_token_accuracy": 0.2800884509086609,
461
- "num_tokens": 2316440.0,
462
  "step": 2200
463
  },
464
  {
465
- "entropy": 5.710742998123169,
466
  "epoch": 1.2953367875647668,
467
- "grad_norm": 5.189817905426025,
468
- "learning_rate": 1.5990999804343572e-06,
469
- "loss": 5.4659,
470
- "mean_token_accuracy": 0.2982784253358841,
471
- "num_tokens": 2368468.0,
472
  "step": 2250
473
  },
474
  {
475
- "entropy": 5.639391016960144,
476
  "epoch": 1.3241220495106505,
477
- "grad_norm": 5.6753153800964355,
478
- "learning_rate": 1.5893171590686754e-06,
479
- "loss": 5.4055,
480
- "mean_token_accuracy": 0.30424224823713303,
481
- "num_tokens": 2421180.0,
482
  "step": 2300
483
  },
484
  {
485
- "entropy": 5.862650499343872,
486
  "epoch": 1.3529073114565342,
487
- "grad_norm": 3.0865261554718018,
488
- "learning_rate": 1.5795343377029935e-06,
489
- "loss": 5.6251,
490
- "mean_token_accuracy": 0.2777055302262306,
491
- "num_tokens": 2476255.0,
492
  "step": 2350
493
  },
494
  {
495
- "entropy": 5.623318548202515,
496
  "epoch": 1.381692573402418,
497
- "grad_norm": 3.22993803024292,
498
- "learning_rate": 1.5697515163373117e-06,
499
- "loss": 5.3962,
500
- "mean_token_accuracy": 0.30466238647699356,
501
- "num_tokens": 2527674.0,
502
  "step": 2400
503
  },
504
  {
505
- "entropy": 5.821551780700684,
506
  "epoch": 1.4104778353483016,
507
- "grad_norm": 4.010779857635498,
508
- "learning_rate": 1.5599686949716298e-06,
509
- "loss": 5.5974,
510
- "mean_token_accuracy": 0.2798157992959023,
511
- "num_tokens": 2582909.0,
512
  "step": 2450
513
  },
514
  {
515
- "entropy": 5.585216546058655,
516
  "epoch": 1.4392630972941853,
517
- "grad_norm": 5.638322353363037,
518
- "learning_rate": 1.5501858736059478e-06,
519
- "loss": 5.3738,
520
- "mean_token_accuracy": 0.30478625535964965,
521
- "num_tokens": 2636579.0,
522
  "step": 2500
523
  },
524
  {
525
- "entropy": 5.561697783470154,
526
  "epoch": 1.468048359240069,
527
- "grad_norm": 5.703922748565674,
528
- "learning_rate": 1.540403052240266e-06,
529
- "loss": 5.3534,
530
- "mean_token_accuracy": 0.30862479507923124,
531
- "num_tokens": 2687698.0,
532
  "step": 2550
533
  },
534
  {
535
- "entropy": 5.647481231689453,
536
  "epoch": 1.4968336211859528,
537
- "grad_norm": 2.2600433826446533,
538
- "learning_rate": 1.530620230874584e-06,
539
- "loss": 5.4388,
540
- "mean_token_accuracy": 0.2973997402191162,
541
- "num_tokens": 2740214.0,
542
  "step": 2600
543
  },
544
  {
545
- "entropy": 5.578100996017456,
546
  "epoch": 1.5256188831318365,
547
- "grad_norm": 4.077702522277832,
548
- "learning_rate": 1.5208374095089022e-06,
549
- "loss": 5.3769,
550
- "mean_token_accuracy": 0.30530194252729415,
551
- "num_tokens": 2792574.0,
552
  "step": 2650
553
  },
554
  {
555
- "entropy": 5.573838739395142,
556
  "epoch": 1.5544041450777202,
557
- "grad_norm": 3.5293424129486084,
558
- "learning_rate": 1.5110545881432204e-06,
559
- "loss": 5.3726,
560
- "mean_token_accuracy": 0.30599710553884507,
561
- "num_tokens": 2845597.0,
562
  "step": 2700
563
  },
564
  {
565
- "entropy": 5.5452189445495605,
566
  "epoch": 1.583189407023604,
567
- "grad_norm": 4.356649398803711,
568
- "learning_rate": 1.5012717667775385e-06,
569
- "loss": 5.3457,
570
- "mean_token_accuracy": 0.3081423792243004,
571
- "num_tokens": 2898683.0,
572
  "step": 2750
573
  },
574
  {
575
- "entropy": 5.7015859985351565,
576
  "epoch": 1.6119746689694876,
577
- "grad_norm": 3.822186231613159,
578
- "learning_rate": 1.4914889454118567e-06,
579
- "loss": 5.5027,
580
- "mean_token_accuracy": 0.29116119146347047,
581
- "num_tokens": 2952290.0,
582
  "step": 2800
583
  },
584
  {
585
- "entropy": 5.521328859329223,
586
  "epoch": 1.6407599309153713,
587
- "grad_norm": 3.176685094833374,
588
- "learning_rate": 1.4817061240461749e-06,
589
- "loss": 5.3289,
590
- "mean_token_accuracy": 0.31105489522218704,
591
- "num_tokens": 3004556.0,
592
  "step": 2850
593
  },
594
  {
595
- "entropy": 5.479469141960144,
596
  "epoch": 1.669545192861255,
597
- "grad_norm": 2.152296781539917,
598
- "learning_rate": 1.471923302680493e-06,
599
- "loss": 5.296,
600
- "mean_token_accuracy": 0.3132500395178795,
601
- "num_tokens": 3056448.0,
602
  "step": 2900
603
  },
604
  {
605
- "entropy": 5.647905979156494,
606
  "epoch": 1.6983304548071387,
607
- "grad_norm": 2.7002599239349365,
608
- "learning_rate": 1.462140481314811e-06,
609
- "loss": 5.4571,
610
- "mean_token_accuracy": 0.2947456142306328,
611
- "num_tokens": 3109539.0,
612
  "step": 2950
613
  },
614
  {
615
- "entropy": 5.522715563774109,
616
  "epoch": 1.7271157167530224,
617
- "grad_norm": 5.419680595397949,
618
- "learning_rate": 1.452357659949129e-06,
619
- "loss": 5.3362,
620
- "mean_token_accuracy": 0.30760616779327393,
621
- "num_tokens": 3162421.0,
622
  "step": 3000
623
  },
624
  {
625
- "entropy": 5.352699360847473,
626
  "epoch": 1.7559009786989062,
627
- "grad_norm": 4.145458698272705,
628
- "learning_rate": 1.4425748385834473e-06,
629
- "loss": 5.1654,
630
- "mean_token_accuracy": 0.3313426411151886,
631
- "num_tokens": 3212079.0,
632
  "step": 3050
633
  },
634
  {
635
- "entropy": 5.51159740447998,
636
  "epoch": 1.7846862406447899,
637
- "grad_norm": 2.685253858566284,
638
- "learning_rate": 1.4327920172177654e-06,
639
- "loss": 5.33,
640
- "mean_token_accuracy": 0.3081912398338318,
641
- "num_tokens": 3264089.0,
642
  "step": 3100
643
  },
644
  {
645
- "entropy": 5.230629982948304,
646
  "epoch": 1.8134715025906736,
647
- "grad_norm": 3.12331223487854,
648
- "learning_rate": 1.4230091958520836e-06,
649
- "loss": 5.0596,
650
- "mean_token_accuracy": 0.34315449446439744,
651
- "num_tokens": 3311881.0,
652
  "step": 3150
653
  },
654
  {
655
- "entropy": 5.51932089805603,
656
  "epoch": 1.8422567645365573,
657
- "grad_norm": 2.483748435974121,
658
- "learning_rate": 1.413226374486402e-06,
659
- "loss": 5.3417,
660
- "mean_token_accuracy": 0.3069167789816856,
661
- "num_tokens": 3364861.0,
662
  "step": 3200
663
  },
664
  {
665
- "entropy": 5.516515297889709,
666
  "epoch": 1.871042026482441,
667
- "grad_norm": 2.8922979831695557,
668
- "learning_rate": 1.40344355312072e-06,
669
- "loss": 5.3344,
670
- "mean_token_accuracy": 0.31019389897584915,
671
- "num_tokens": 3415911.0,
672
  "step": 3250
673
  },
674
  {
675
- "entropy": 5.548952431678772,
676
  "epoch": 1.8998272884283247,
677
- "grad_norm": 2.2430121898651123,
678
- "learning_rate": 1.3936607317550382e-06,
679
- "loss": 5.3644,
680
- "mean_token_accuracy": 0.30517252802848815,
681
- "num_tokens": 3468338.0,
682
  "step": 3300
683
  },
684
  {
685
- "entropy": 5.6118639993667605,
686
  "epoch": 1.9286125503742084,
687
- "grad_norm": 2.498114824295044,
688
- "learning_rate": 1.3838779103893564e-06,
689
- "loss": 5.4373,
690
- "mean_token_accuracy": 0.29593590170145034,
691
- "num_tokens": 3521693.0,
692
  "step": 3350
693
  },
694
  {
695
- "entropy": 5.479673957824707,
696
  "epoch": 1.9573978123200921,
697
- "grad_norm": 4.210599899291992,
698
- "learning_rate": 1.3740950890236743e-06,
699
- "loss": 5.3043,
700
- "mean_token_accuracy": 0.31091805547475815,
701
- "num_tokens": 3574206.0,
702
  "step": 3400
703
  },
704
  {
705
- "entropy": 5.581016225814819,
706
  "epoch": 1.9861830742659758,
707
- "grad_norm": 2.762880325317383,
708
- "learning_rate": 1.3643122676579925e-06,
709
- "loss": 5.4041,
710
- "mean_token_accuracy": 0.29961841195821765,
711
- "num_tokens": 3627807.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
- "eval_entropy": 5.80436185977426,
717
- "eval_loss": 5.64275598526001,
718
- "eval_mean_token_accuracy": 0.26474372877777996,
719
- "eval_model_preparation_time": 0.0048,
720
- "eval_num_tokens": 3650214.0,
721
- "eval_runtime": 80.7759,
722
- "eval_samples_per_second": 5.373,
723
- "eval_steps_per_second": 2.686,
724
  "step": 3474
725
  },
726
  {
727
- "entropy": 5.4502338743209835,
728
  "epoch": 2.0149683362118593,
729
- "grad_norm": 3.085745334625244,
730
- "learning_rate": 1.3545294462923106e-06,
731
- "loss": 5.2776,
732
- "mean_token_accuracy": 0.3168214797973633,
733
- "num_tokens": 3677883.0,
734
  "step": 3500
735
  },
736
  {
737
- "entropy": 5.572550778388977,
738
  "epoch": 2.043753598157743,
739
- "grad_norm": 4.035412788391113,
740
- "learning_rate": 1.3447466249266288e-06,
741
- "loss": 5.3954,
742
- "mean_token_accuracy": 0.30198297649621964,
743
- "num_tokens": 3730991.0,
744
  "step": 3550
745
  },
746
  {
747
- "entropy": 5.501015219688416,
748
  "epoch": 2.0725388601036268,
749
- "grad_norm": 7.273624897003174,
750
- "learning_rate": 1.334963803560947e-06,
751
- "loss": 5.3335,
752
- "mean_token_accuracy": 0.3079583531618118,
753
- "num_tokens": 3783795.0,
754
  "step": 3600
755
  },
756
  {
757
- "entropy": 5.535852227210999,
758
  "epoch": 2.1013241220495105,
759
- "grad_norm": 2.6426734924316406,
760
- "learning_rate": 1.325180982195265e-06,
761
- "loss": 5.3644,
762
- "mean_token_accuracy": 0.30542428642511366,
763
- "num_tokens": 3835526.0,
764
  "step": 3650
765
  },
766
  {
767
- "entropy": 5.504179673194885,
768
  "epoch": 2.130109383995394,
769
- "grad_norm": 2.1749041080474854,
770
- "learning_rate": 1.3153981608295833e-06,
771
- "loss": 5.3314,
772
- "mean_token_accuracy": 0.30851521909236906,
773
- "num_tokens": 3887175.0,
774
  "step": 3700
775
  },
776
  {
777
- "entropy": 5.3724824857711795,
778
  "epoch": 2.158894645941278,
779
- "grad_norm": 2.3251688480377197,
780
- "learning_rate": 1.3056153394639014e-06,
781
- "loss": 5.2056,
782
- "mean_token_accuracy": 0.3245506736636162,
783
- "num_tokens": 3938003.0,
784
  "step": 3750
785
  },
786
  {
787
- "entropy": 5.505883145332336,
788
  "epoch": 2.1876799078871616,
789
- "grad_norm": 2.406859874725342,
790
- "learning_rate": 1.2958325180982196e-06,
791
- "loss": 5.3383,
792
- "mean_token_accuracy": 0.3079656678438187,
793
- "num_tokens": 3990451.0,
794
  "step": 3800
795
  },
796
  {
797
- "entropy": 5.571912684440613,
798
  "epoch": 2.2164651698330453,
799
- "grad_norm": 2.8970186710357666,
800
- "learning_rate": 1.2860496967325375e-06,
801
- "loss": 5.3978,
802
- "mean_token_accuracy": 0.3008614909648895,
803
- "num_tokens": 4044048.0,
804
  "step": 3850
805
  },
806
  {
807
- "entropy": 5.423692960739135,
808
  "epoch": 2.245250431778929,
809
- "grad_norm": 5.843964576721191,
810
- "learning_rate": 1.2762668753668557e-06,
811
- "loss": 5.2595,
812
- "mean_token_accuracy": 0.31735698133707047,
813
- "num_tokens": 4095276.0,
814
  "step": 3900
815
  },
816
  {
817
- "entropy": 5.477164916992187,
818
  "epoch": 2.2740356937248127,
819
- "grad_norm": 3.271005392074585,
820
- "learning_rate": 1.2664840540011738e-06,
821
- "loss": 5.3098,
822
- "mean_token_accuracy": 0.31092607975006104,
823
- "num_tokens": 4148172.0,
824
  "step": 3950
825
  },
826
  {
827
- "entropy": 5.679584302902222,
828
  "epoch": 2.3028209556706964,
829
- "grad_norm": 3.6198225021362305,
830
- "learning_rate": 1.256701232635492e-06,
831
- "loss": 5.5076,
832
- "mean_token_accuracy": 0.2872884130477905,
833
- "num_tokens": 4202733.0,
834
  "step": 4000
835
  },
836
  {
837
- "entropy": 5.527384333610534,
838
  "epoch": 2.33160621761658,
839
- "grad_norm": 3.32027268409729,
840
- "learning_rate": 1.2469184112698101e-06,
841
- "loss": 5.3586,
842
- "mean_token_accuracy": 0.304608636200428,
843
- "num_tokens": 4255738.0,
844
  "step": 4050
845
  },
846
  {
847
- "entropy": 5.556081314086914,
848
  "epoch": 2.360391479562464,
849
- "grad_norm": 3.729132652282715,
850
- "learning_rate": 1.2371355899041283e-06,
851
- "loss": 5.387,
852
- "mean_token_accuracy": 0.3014680635929108,
853
- "num_tokens": 4308638.0,
854
  "step": 4100
855
  },
856
  {
857
- "entropy": 5.623760852813721,
858
  "epoch": 2.3891767415083476,
859
- "grad_norm": 3.670278549194336,
860
- "learning_rate": 1.2273527685384464e-06,
861
- "loss": 5.4573,
862
- "mean_token_accuracy": 0.2922948771715164,
863
- "num_tokens": 4363170.0,
864
  "step": 4150
865
  },
866
  {
867
- "entropy": 5.532849233150483,
868
  "epoch": 2.4179620034542313,
869
- "grad_norm": 1.8806607723236084,
870
- "learning_rate": 1.2175699471727646e-06,
871
- "loss": 5.368,
872
- "mean_token_accuracy": 0.3040569290518761,
873
- "num_tokens": 4415607.0,
874
  "step": 4200
875
  },
876
  {
877
- "entropy": 5.491175107955932,
878
  "epoch": 2.446747265400115,
879
- "grad_norm": 1.9178470373153687,
880
- "learning_rate": 1.2077871258070827e-06,
881
- "loss": 5.3246,
882
- "mean_token_accuracy": 0.309355776309967,
883
- "num_tokens": 4467608.0,
884
  "step": 4250
885
  },
886
  {
887
- "entropy": 5.539590878486633,
888
  "epoch": 2.4755325273459987,
889
- "grad_norm": 2.679412364959717,
890
- "learning_rate": 1.1980043044414007e-06,
891
- "loss": 5.3729,
892
- "mean_token_accuracy": 0.3032041811943054,
893
- "num_tokens": 4520299.0,
894
  "step": 4300
895
  },
896
  {
897
- "entropy": 5.5143190240859985,
898
  "epoch": 2.5043177892918824,
899
- "grad_norm": 2.913151741027832,
900
- "learning_rate": 1.1882214830757188e-06,
901
- "loss": 5.3514,
902
- "mean_token_accuracy": 0.30634128242731096,
903
- "num_tokens": 4573065.0,
904
  "step": 4350
905
  },
906
  {
907
- "entropy": 5.676259469985962,
908
  "epoch": 2.533103051237766,
909
- "grad_norm": 3.4577906131744385,
910
- "learning_rate": 1.178438661710037e-06,
911
- "loss": 5.5081,
912
- "mean_token_accuracy": 0.28829957485198976,
913
- "num_tokens": 4626807.0,
914
  "step": 4400
915
  },
916
  {
917
- "entropy": 5.377592206001282,
918
  "epoch": 2.56188831318365,
919
- "grad_norm": 2.2610204219818115,
920
- "learning_rate": 1.1686558403443551e-06,
921
- "loss": 5.2171,
922
- "mean_token_accuracy": 0.32207680940628053,
923
- "num_tokens": 4677663.0,
924
  "step": 4450
925
  },
926
  {
927
- "entropy": 5.646258478164673,
928
  "epoch": 2.5906735751295336,
929
- "grad_norm": 2.0913829803466797,
930
- "learning_rate": 1.1588730189786733e-06,
931
- "loss": 5.4762,
932
- "mean_token_accuracy": 0.2905121323466301,
933
- "num_tokens": 4731830.0,
934
  "step": 4500
935
  },
936
  {
937
- "entropy": 5.469613900184632,
938
  "epoch": 2.6194588370754173,
939
- "grad_norm": 2.4179136753082275,
940
- "learning_rate": 1.1490901976129917e-06,
941
- "loss": 5.3066,
942
- "mean_token_accuracy": 0.31084585636854173,
943
- "num_tokens": 4784694.0,
944
  "step": 4550
945
  },
946
  {
947
- "entropy": 5.552228803634644,
948
  "epoch": 2.648244099021301,
949
- "grad_norm": 2.213059663772583,
950
- "learning_rate": 1.1393073762473098e-06,
951
- "loss": 5.3872,
952
- "mean_token_accuracy": 0.3006081366539001,
953
- "num_tokens": 4838534.0,
954
  "step": 4600
955
  },
956
  {
957
- "entropy": 5.5307044506073,
958
  "epoch": 2.6770293609671847,
959
- "grad_norm": 2.6984758377075195,
960
- "learning_rate": 1.129524554881628e-06,
961
- "loss": 5.3655,
962
- "mean_token_accuracy": 0.30505997538566587,
963
- "num_tokens": 4890611.0,
964
  "step": 4650
965
  },
966
  {
967
- "entropy": 5.501814904212952,
968
  "epoch": 2.7058146229130684,
969
- "grad_norm": 3.783916711807251,
970
- "learning_rate": 1.1197417335159461e-06,
971
- "loss": 5.3385,
972
- "mean_token_accuracy": 0.3082756090164185,
973
- "num_tokens": 4943571.0,
974
  "step": 4700
975
  },
976
  {
977
- "entropy": 5.301318726539612,
978
  "epoch": 2.734599884858952,
979
- "grad_norm": 3.2715396881103516,
980
- "learning_rate": 1.109958912150264e-06,
981
- "loss": 5.1434,
982
- "mean_token_accuracy": 0.3303494158387184,
983
- "num_tokens": 4993871.0,
984
  "step": 4750
985
  },
986
  {
987
- "entropy": 5.483540773391724,
988
  "epoch": 2.763385146804836,
989
- "grad_norm": 8.074889183044434,
990
- "learning_rate": 1.1001760907845822e-06,
991
- "loss": 5.3212,
992
- "mean_token_accuracy": 0.30983769208192824,
993
- "num_tokens": 5046187.0,
994
  "step": 4800
995
  },
996
  {
997
- "entropy": 5.468323736190796,
998
  "epoch": 2.7921704087507195,
999
- "grad_norm": 2.2561752796173096,
1000
- "learning_rate": 1.0903932694189004e-06,
1001
- "loss": 5.3088,
1002
- "mean_token_accuracy": 0.30977604538202286,
1003
- "num_tokens": 5098644.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
- "entropy": 5.326909036636352,
1008
  "epoch": 2.8209556706966032,
1009
- "grad_norm": 2.868459701538086,
1010
- "learning_rate": 1.0806104480532185e-06,
1011
- "loss": 5.1694,
1012
- "mean_token_accuracy": 0.32808348089456557,
1013
- "num_tokens": 5149345.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
- "entropy": 5.490628228187561,
1018
  "epoch": 2.849740932642487,
1019
- "grad_norm": 7.378853797912598,
1020
- "learning_rate": 1.0708276266875367e-06,
1021
- "loss": 5.3273,
1022
- "mean_token_accuracy": 0.309133038520813,
1023
- "num_tokens": 5202028.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
- "entropy": 5.505302619934082,
1028
  "epoch": 2.8785261945883707,
1029
- "grad_norm": 2.8309922218322754,
1030
- "learning_rate": 1.0610448053218548e-06,
1031
- "loss": 5.3511,
1032
- "mean_token_accuracy": 0.3045568335056305,
1033
- "num_tokens": 5255082.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
- "entropy": 5.354332094192505,
1038
  "epoch": 2.9073114565342544,
1039
- "grad_norm": 2.403330087661743,
1040
- "learning_rate": 1.051261983956173e-06,
1041
- "loss": 5.1958,
1042
- "mean_token_accuracy": 0.3243831008672714,
1043
- "num_tokens": 5305042.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
- "entropy": 5.354864113330841,
1048
  "epoch": 2.936096718480138,
1049
- "grad_norm": 3.686944007873535,
1050
- "learning_rate": 1.0414791625904911e-06,
1051
- "loss": 5.1976,
1052
- "mean_token_accuracy": 0.3240946170687675,
1053
- "num_tokens": 5356310.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
- "entropy": 5.5070520734786985,
1058
  "epoch": 2.964881980426022,
1059
- "grad_norm": 3.5234930515289307,
1060
- "learning_rate": 1.0316963412248093e-06,
1061
- "loss": 5.348,
1062
- "mean_token_accuracy": 0.3057019948959351,
1063
- "num_tokens": 5409283.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
- "entropy": 5.598388237953186,
1068
  "epoch": 2.9936672423719055,
1069
- "grad_norm": 1.5409276485443115,
1070
- "learning_rate": 1.0219135198591272e-06,
1071
- "loss": 5.4331,
1072
- "mean_token_accuracy": 0.29459414482116697,
1073
- "num_tokens": 5464332.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
- "eval_entropy": 5.769902462234145,
1079
- "eval_loss": 5.6168341636657715,
1080
- "eval_mean_token_accuracy": 0.266466455456848,
1081
- "eval_model_preparation_time": 0.0048,
1082
- "eval_num_tokens": 5475321.0,
1083
- "eval_runtime": 79.9248,
1084
- "eval_samples_per_second": 5.43,
1085
- "eval_steps_per_second": 2.715,
1086
  "step": 5211
1087
  },
1088
  {
1089
- "entropy": 5.303410301208496,
1090
  "epoch": 3.0224525043177892,
1091
- "grad_norm": 2.317394256591797,
1092
- "learning_rate": 1.0121306984934454e-06,
1093
- "loss": 5.1472,
1094
- "mean_token_accuracy": 0.32964429914951326,
1095
- "num_tokens": 5514450.0,
1096
  "step": 5250
1097
  },
1098
  {
1099
- "entropy": 5.48880750656128,
1100
  "epoch": 3.051237766263673,
1101
- "grad_norm": 1.6131863594055176,
1102
- "learning_rate": 1.0023478771277635e-06,
1103
- "loss": 5.3251,
1104
- "mean_token_accuracy": 0.30899042904376983,
1105
- "num_tokens": 5567345.0,
1106
  "step": 5300
1107
  },
1108
  {
1109
- "entropy": 5.434209570884705,
1110
  "epoch": 3.0800230282095566,
1111
- "grad_norm": 2.421576499938965,
1112
- "learning_rate": 9.925650557620817e-07,
1113
- "loss": 5.2747,
1114
- "mean_token_accuracy": 0.3147905930876732,
1115
- "num_tokens": 5619654.0,
1116
  "step": 5350
1117
  },
1118
  {
1119
- "entropy": 5.4664768409729,
1120
  "epoch": 3.1088082901554404,
1121
- "grad_norm": 1.8281446695327759,
1122
- "learning_rate": 9.827822343963998e-07,
1123
- "loss": 5.3059,
1124
- "mean_token_accuracy": 0.3102376765012741,
1125
- "num_tokens": 5672269.0,
1126
  "step": 5400
1127
  },
1128
  {
1129
- "entropy": 5.550107531547546,
1130
  "epoch": 3.137593552101324,
1131
- "grad_norm": 1.9315296411514282,
1132
- "learning_rate": 9.72999413030718e-07,
1133
- "loss": 5.3911,
1134
- "mean_token_accuracy": 0.301893512904644,
1135
- "num_tokens": 5725059.0,
1136
  "step": 5450
1137
  },
1138
  {
1139
- "entropy": 5.48068968296051,
1140
  "epoch": 3.166378814047208,
1141
- "grad_norm": 1.4690601825714111,
1142
- "learning_rate": 9.632165916650362e-07,
1143
- "loss": 5.3265,
1144
- "mean_token_accuracy": 0.3086855486035347,
1145
- "num_tokens": 5776784.0,
1146
  "step": 5500
1147
  },
1148
  {
1149
- "entropy": 5.644747486114502,
1150
  "epoch": 3.1951640759930915,
1151
- "grad_norm": 2.134573221206665,
1152
- "learning_rate": 9.534337702993543e-07,
1153
- "loss": 5.4804,
1154
- "mean_token_accuracy": 0.2887866684794426,
1155
- "num_tokens": 5832296.0,
1156
  "step": 5550
1157
  },
1158
  {
1159
- "entropy": 5.4956268882751464,
1160
  "epoch": 3.223949337938975,
1161
- "grad_norm": 1.5776804685592651,
1162
- "learning_rate": 9.436509489336725e-07,
1163
- "loss": 5.3328,
1164
- "mean_token_accuracy": 0.30928332000970843,
1165
- "num_tokens": 5885122.0,
1166
  "step": 5600
1167
  },
1168
  {
1169
- "entropy": 5.480846815109253,
1170
  "epoch": 3.252734599884859,
1171
- "grad_norm": 1.858426809310913,
1172
- "learning_rate": 9.338681275679906e-07,
1173
- "loss": 5.3237,
1174
- "mean_token_accuracy": 0.30814250469207766,
1175
- "num_tokens": 5938386.0,
1176
  "step": 5650
1177
  },
1178
  {
1179
- "entropy": 5.49823664188385,
1180
  "epoch": 3.2815198618307426,
1181
- "grad_norm": 2.0530879497528076,
1182
- "learning_rate": 9.240853062023088e-07,
1183
- "loss": 5.3393,
1184
- "mean_token_accuracy": 0.3074695497751236,
1185
- "num_tokens": 5989784.0,
1186
  "step": 5700
1187
  },
1188
  {
1189
- "entropy": 5.524387803077698,
1190
  "epoch": 3.3103051237766263,
1191
- "grad_norm": 2.564458131790161,
1192
- "learning_rate": 9.143024848366268e-07,
1193
- "loss": 5.3623,
1194
- "mean_token_accuracy": 0.3044606932997704,
1195
- "num_tokens": 6042646.0,
1196
  "step": 5750
1197
  },
1198
  {
1199
- "entropy": 5.369810061454773,
1200
  "epoch": 3.33909038572251,
1201
- "grad_norm": 2.5260229110717773,
1202
- "learning_rate": 9.04519663470945e-07,
1203
- "loss": 5.2143,
1204
- "mean_token_accuracy": 0.3225390288233757,
1205
- "num_tokens": 6093550.0,
1206
  "step": 5800
1207
  },
1208
  {
1209
- "entropy": 5.216504397392273,
1210
  "epoch": 3.3678756476683938,
1211
- "grad_norm": 2.785037040710449,
1212
- "learning_rate": 8.947368421052631e-07,
1213
- "loss": 5.0643,
1214
- "mean_token_accuracy": 0.3421778553724289,
1215
- "num_tokens": 6142299.0,
1216
  "step": 5850
1217
  },
1218
  {
1219
- "entropy": 5.439607830047607,
1220
  "epoch": 3.3966609096142775,
1221
- "grad_norm": 1.3990309238433838,
1222
- "learning_rate": 8.849540207395813e-07,
1223
- "loss": 5.283,
1224
- "mean_token_accuracy": 0.3137792694568634,
1225
- "num_tokens": 6194315.0,
1226
  "step": 5900
1227
  },
1228
  {
1229
- "entropy": 5.506643385887146,
1230
  "epoch": 3.425446171560161,
1231
- "grad_norm": 2.5383260250091553,
1232
- "learning_rate": 8.751711993738994e-07,
1233
- "loss": 5.3444,
1234
- "mean_token_accuracy": 0.30654319524765017,
1235
- "num_tokens": 6246032.0,
1236
  "step": 5950
1237
  },
1238
  {
1239
- "entropy": 5.555388352870941,
1240
  "epoch": 3.454231433506045,
1241
- "grad_norm": 2.3208353519439697,
1242
- "learning_rate": 8.653883780082175e-07,
1243
- "loss": 5.3934,
1244
- "mean_token_accuracy": 0.29988031029701234,
1245
- "num_tokens": 6300018.0,
1246
  "step": 6000
1247
  },
1248
  {
1249
- "entropy": 5.493673405647278,
1250
  "epoch": 3.4830166954519286,
1251
- "grad_norm": 2.0232412815093994,
1252
- "learning_rate": 8.556055566425356e-07,
1253
- "loss": 5.3325,
1254
- "mean_token_accuracy": 0.308107231259346,
1255
- "num_tokens": 6352988.0,
1256
  "step": 6050
1257
  },
1258
  {
1259
- "entropy": 5.408542203903198,
1260
  "epoch": 3.5118019573978123,
1261
- "grad_norm": 1.9977768659591675,
1262
- "learning_rate": 8.458227352768538e-07,
1263
- "loss": 5.2492,
1264
- "mean_token_accuracy": 0.3188119521737099,
1265
- "num_tokens": 6405116.0,
1266
  "step": 6100
1267
  },
1268
  {
1269
- "entropy": 5.5706956100463865,
1270
  "epoch": 3.540587219343696,
1271
- "grad_norm": 1.855686902999878,
1272
- "learning_rate": 8.360399139111719e-07,
1273
- "loss": 5.4116,
1274
- "mean_token_accuracy": 0.2984398019313812,
1275
- "num_tokens": 6458789.0,
1276
  "step": 6150
1277
  },
1278
  {
1279
- "entropy": 5.402184357643128,
1280
  "epoch": 3.5693724812895797,
1281
- "grad_norm": 2.1038777828216553,
1282
- "learning_rate": 8.2625709254549e-07,
1283
- "loss": 5.2451,
1284
- "mean_token_accuracy": 0.3192365264892578,
1285
- "num_tokens": 6510168.0,
1286
  "step": 6200
1287
  },
1288
  {
1289
- "entropy": 5.387748742103577,
1290
  "epoch": 3.5981577432354634,
1291
- "grad_norm": 4.078054904937744,
1292
- "learning_rate": 8.164742711798082e-07,
1293
- "loss": 5.2308,
1294
- "mean_token_accuracy": 0.3177107447385788,
1295
- "num_tokens": 6562528.0,
1296
  "step": 6250
1297
  },
1298
  {
1299
- "entropy": 5.636818246841431,
1300
  "epoch": 3.626943005181347,
1301
- "grad_norm": 2.3809945583343506,
1302
- "learning_rate": 8.066914498141264e-07,
1303
- "loss": 5.4791,
1304
- "mean_token_accuracy": 0.2893213045597076,
1305
- "num_tokens": 6617308.0,
1306
  "step": 6300
1307
  },
1308
  {
1309
- "entropy": 5.424606614112854,
1310
  "epoch": 3.655728267127231,
1311
- "grad_norm": 1.5998064279556274,
1312
- "learning_rate": 7.969086284484446e-07,
1313
- "loss": 5.2694,
1314
- "mean_token_accuracy": 0.31542753279209135,
1315
- "num_tokens": 6669969.0,
1316
  "step": 6350
1317
  },
1318
  {
1319
- "entropy": 5.585567483901977,
1320
  "epoch": 3.6845135290731146,
1321
- "grad_norm": 2.5134034156799316,
1322
- "learning_rate": 7.871258070827627e-07,
1323
- "loss": 5.4298,
1324
- "mean_token_accuracy": 0.2948091793060303,
1325
- "num_tokens": 6724425.0,
1326
  "step": 6400
1327
  },
1328
  {
1329
- "entropy": 5.370481524467468,
1330
  "epoch": 3.7132987910189983,
1331
- "grad_norm": 3.1583099365234375,
1332
- "learning_rate": 7.773429857170808e-07,
1333
- "loss": 5.2138,
1334
- "mean_token_accuracy": 0.32175571620464327,
1335
- "num_tokens": 6775236.0,
1336
  "step": 6450
1337
  },
1338
  {
1339
- "entropy": 5.492145628929138,
1340
  "epoch": 3.742084052964882,
1341
- "grad_norm": 2.628059148788452,
1342
- "learning_rate": 7.675601643513989e-07,
1343
- "loss": 5.3357,
1344
- "mean_token_accuracy": 0.3072090440988541,
1345
- "num_tokens": 6828545.0,
1346
  "step": 6500
1347
  },
1348
  {
1349
- "entropy": 5.651436891555786,
1350
  "epoch": 3.7708693149107657,
1351
- "grad_norm": 2.5757639408111572,
1352
- "learning_rate": 7.577773429857171e-07,
1353
- "loss": 5.4907,
1354
- "mean_token_accuracy": 0.2872760292887688,
1355
- "num_tokens": 6884005.0,
1356
  "step": 6550
1357
  },
1358
  {
1359
- "entropy": 5.404571523666382,
1360
  "epoch": 3.7996545768566494,
1361
- "grad_norm": 3.1881847381591797,
1362
- "learning_rate": 7.479945216200352e-07,
1363
- "loss": 5.2462,
1364
- "mean_token_accuracy": 0.31890535563230515,
1365
- "num_tokens": 6935209.0,
1366
  "step": 6600
1367
  },
1368
  {
1369
- "entropy": 5.4135113716125485,
1370
  "epoch": 3.828439838802533,
1371
- "grad_norm": 2.6072287559509277,
1372
- "learning_rate": 7.382117002543533e-07,
1373
- "loss": 5.2552,
1374
- "mean_token_accuracy": 0.3171507343649864,
1375
- "num_tokens": 6987396.0,
1376
  "step": 6650
1377
  },
1378
  {
1379
- "entropy": 5.564729566574097,
1380
  "epoch": 3.857225100748417,
1381
- "grad_norm": 3.593822956085205,
1382
- "learning_rate": 7.284288788886714e-07,
1383
- "loss": 5.4031,
1384
- "mean_token_accuracy": 0.2990047359466553,
1385
- "num_tokens": 7041132.0,
1386
  "step": 6700
1387
  },
1388
  {
1389
- "entropy": 5.472572989463806,
1390
  "epoch": 3.8860103626943006,
1391
- "grad_norm": 1.4898858070373535,
1392
- "learning_rate": 7.186460575229896e-07,
1393
- "loss": 5.3129,
1394
- "mean_token_accuracy": 0.3096827921271324,
1395
- "num_tokens": 7094059.0,
1396
  "step": 6750
1397
  },
1398
  {
1399
- "entropy": 5.370901441574096,
1400
  "epoch": 3.9147956246401843,
1401
- "grad_norm": 1.974563717842102,
1402
- "learning_rate": 7.088632361573077e-07,
1403
- "loss": 5.2186,
1404
- "mean_token_accuracy": 0.31926353454589845,
1405
- "num_tokens": 7144964.0,
1406
  "step": 6800
1407
  },
1408
  {
1409
- "entropy": 5.640433759689331,
1410
  "epoch": 3.943580886586068,
1411
- "grad_norm": 1.1786425113677979,
1412
- "learning_rate": 6.990804147916259e-07,
1413
- "loss": 5.4773,
1414
- "mean_token_accuracy": 0.28867036461830137,
1415
- "num_tokens": 7200805.0,
1416
  "step": 6850
1417
  },
1418
  {
1419
- "entropy": 5.216271538734436,
1420
  "epoch": 3.9723661485319517,
1421
- "grad_norm": 5.045248985290527,
1422
- "learning_rate": 6.892975934259439e-07,
1423
- "loss": 5.0688,
1424
- "mean_token_accuracy": 0.3398125246167183,
1425
- "num_tokens": 7250918.0,
1426
  "step": 6900
1427
  },
1428
  {
1429
  "epoch": 4.0,
1430
- "eval_entropy": 5.757329591408303,
1431
- "eval_loss": 5.606628894805908,
1432
- "eval_mean_token_accuracy": 0.26740948622402505,
1433
- "eval_model_preparation_time": 0.0048,
1434
- "eval_num_tokens": 7300428.0,
1435
- "eval_runtime": 81.2062,
1436
- "eval_samples_per_second": 5.344,
1437
- "eval_steps_per_second": 2.672,
1438
  "step": 6948
1439
  },
1440
  {
1441
- "entropy": 5.476274309158325,
1442
  "epoch": 4.001151410477835,
1443
- "grad_norm": 1.600521206855774,
1444
- "learning_rate": 6.795147720602622e-07,
1445
- "loss": 5.3151,
1446
- "mean_token_accuracy": 0.30966584622859955,
1447
- "num_tokens": 7302803.0,
1448
  "step": 6950
1449
  },
1450
  {
1451
- "entropy": 5.241420259475708,
1452
  "epoch": 4.029936672423719,
1453
- "grad_norm": 2.3073410987854004,
1454
- "learning_rate": 6.697319506945803e-07,
1455
- "loss": 5.0865,
1456
- "mean_token_accuracy": 0.3380747744441032,
1457
- "num_tokens": 7352390.0,
1458
  "step": 7000
1459
  },
1460
  {
1461
- "entropy": 5.641773633956909,
1462
  "epoch": 4.058721934369602,
1463
- "grad_norm": 1.6614909172058105,
1464
- "learning_rate": 6.599491293288985e-07,
1465
- "loss": 5.4819,
1466
- "mean_token_accuracy": 0.2898050233721733,
1467
- "num_tokens": 7406538.0,
1468
  "step": 7050
1469
  },
1470
  {
1471
- "entropy": 5.347248024940491,
1472
  "epoch": 4.087507196315486,
1473
- "grad_norm": 1.927628517150879,
1474
- "learning_rate": 6.501663079632165e-07,
1475
- "loss": 5.1913,
1476
- "mean_token_accuracy": 0.3241666054725647,
1477
- "num_tokens": 7457482.0,
1478
  "step": 7100
1479
  },
1480
  {
1481
- "entropy": 5.493693404197693,
1482
  "epoch": 4.11629245826137,
1483
- "grad_norm": 1.5919839143753052,
1484
- "learning_rate": 6.403834865975347e-07,
1485
- "loss": 5.3338,
1486
- "mean_token_accuracy": 0.3070410805940628,
1487
- "num_tokens": 7510037.0,
1488
  "step": 7150
1489
  },
1490
  {
1491
- "entropy": 5.3547215032577515,
1492
  "epoch": 4.1450777202072535,
1493
- "grad_norm": 1.4471710920333862,
1494
- "learning_rate": 6.306006652318528e-07,
1495
- "loss": 5.2021,
1496
- "mean_token_accuracy": 0.3222602027654648,
1497
- "num_tokens": 7561667.0,
1498
  "step": 7200
1499
  },
1500
  {
1501
- "entropy": 5.425107836723328,
1502
  "epoch": 4.173862982153137,
1503
- "grad_norm": 1.9010282754898071,
1504
- "learning_rate": 6.20817843866171e-07,
1505
- "loss": 5.2617,
1506
- "mean_token_accuracy": 0.3172686892747879,
1507
- "num_tokens": 7614240.0,
1508
  "step": 7250
1509
  },
1510
  {
1511
- "entropy": 5.351503825187683,
1512
  "epoch": 4.202648244099021,
1513
- "grad_norm": 2.7427563667297363,
1514
- "learning_rate": 6.110350225004892e-07,
1515
- "loss": 5.1957,
1516
- "mean_token_accuracy": 0.32644627422094347,
1517
- "num_tokens": 7665244.0,
1518
  "step": 7300
1519
  },
1520
  {
1521
- "entropy": 5.642507076263428,
1522
  "epoch": 4.231433506044905,
1523
- "grad_norm": 1.773911952972412,
1524
- "learning_rate": 6.012522011348072e-07,
1525
- "loss": 5.4893,
1526
- "mean_token_accuracy": 0.2874875500798225,
1527
- "num_tokens": 7719566.0,
1528
  "step": 7350
1529
  },
1530
  {
1531
- "entropy": 5.471351361274719,
1532
  "epoch": 4.260218767990788,
1533
- "grad_norm": 2.5300779342651367,
1534
- "learning_rate": 5.914693797691254e-07,
1535
- "loss": 5.3174,
1536
- "mean_token_accuracy": 0.3082795682549477,
1537
- "num_tokens": 7772208.0,
1538
  "step": 7400
1539
  },
1540
  {
1541
- "entropy": 5.494040999412537,
1542
  "epoch": 4.289004029936672,
1543
- "grad_norm": 1.5798758268356323,
1544
- "learning_rate": 5.816865584034435e-07,
1545
- "loss": 5.3364,
1546
- "mean_token_accuracy": 0.30637153565883635,
1547
- "num_tokens": 7825098.0,
1548
  "step": 7450
1549
  },
1550
  {
1551
- "entropy": 5.483609600067139,
1552
  "epoch": 4.317789291882556,
1553
- "grad_norm": 2.237882614135742,
1554
- "learning_rate": 5.719037370377617e-07,
1555
- "loss": 5.3241,
1556
- "mean_token_accuracy": 0.309529247879982,
1557
- "num_tokens": 7878204.0,
1558
  "step": 7500
1559
  },
1560
  {
1561
- "entropy": 5.5503676223754885,
1562
  "epoch": 4.3465745538284395,
1563
- "grad_norm": 1.932957649230957,
1564
- "learning_rate": 5.621209156720797e-07,
1565
- "loss": 5.3927,
1566
- "mean_token_accuracy": 0.29939470887184144,
1567
- "num_tokens": 7932907.0,
1568
  "step": 7550
1569
  },
1570
  {
1571
- "entropy": 5.424697647094726,
1572
  "epoch": 4.375359815774323,
1573
- "grad_norm": 3.6192433834075928,
1574
- "learning_rate": 5.52338094306398e-07,
1575
- "loss": 5.2715,
1576
- "mean_token_accuracy": 0.31552656859159467,
1577
- "num_tokens": 7984257.0,
1578
  "step": 7600
1579
  },
1580
  {
1581
- "entropy": 5.484405131340027,
1582
  "epoch": 4.404145077720207,
1583
- "grad_norm": 1.48371160030365,
1584
- "learning_rate": 5.425552729407161e-07,
1585
- "loss": 5.3238,
1586
- "mean_token_accuracy": 0.3086192473769188,
1587
- "num_tokens": 8036820.0,
1588
  "step": 7650
1589
  },
1590
  {
1591
- "entropy": 5.558430523872375,
1592
  "epoch": 4.432930339666091,
1593
- "grad_norm": 2.446159839630127,
1594
- "learning_rate": 5.327724515750343e-07,
1595
- "loss": 5.4013,
1596
- "mean_token_accuracy": 0.29872660636901854,
1597
- "num_tokens": 8090915.0,
1598
  "step": 7700
1599
  },
1600
  {
1601
- "entropy": 5.417011890411377,
1602
  "epoch": 4.461715601611974,
1603
- "grad_norm": 2.102670669555664,
1604
- "learning_rate": 5.229896302093524e-07,
1605
- "loss": 5.2629,
1606
- "mean_token_accuracy": 0.31736556023359297,
1607
- "num_tokens": 8142122.0,
1608
  "step": 7750
1609
  },
1610
  {
1611
- "entropy": 5.323890132904053,
1612
  "epoch": 4.490500863557858,
1613
- "grad_norm": 1.9391731023788452,
1614
- "learning_rate": 5.132068088436705e-07,
1615
- "loss": 5.1701,
1616
- "mean_token_accuracy": 0.32738196045160295,
1617
- "num_tokens": 8192836.0,
1618
  "step": 7800
1619
  },
1620
  {
1621
- "entropy": 5.333034319877624,
1622
  "epoch": 4.519286125503742,
1623
- "grad_norm": 3.0682387351989746,
1624
- "learning_rate": 5.034239874779886e-07,
1625
- "loss": 5.1845,
1626
- "mean_token_accuracy": 0.32533166408538816,
1627
- "num_tokens": 8244147.0,
1628
  "step": 7850
1629
  },
1630
  {
1631
- "entropy": 5.598471412658691,
1632
  "epoch": 4.5480713874496255,
1633
- "grad_norm": 3.0168399810791016,
1634
- "learning_rate": 4.936411661123068e-07,
1635
- "loss": 5.4375,
1636
- "mean_token_accuracy": 0.2929014927148819,
1637
- "num_tokens": 8299242.0,
1638
  "step": 7900
1639
  },
1640
  {
1641
- "entropy": 5.568692264556884,
1642
  "epoch": 4.576856649395509,
1643
- "grad_norm": 2.0127766132354736,
1644
- "learning_rate": 4.838583447466249e-07,
1645
- "loss": 5.4099,
1646
- "mean_token_accuracy": 0.29787965178489684,
1647
- "num_tokens": 8353432.0,
1648
  "step": 7950
1649
  },
1650
  {
1651
- "entropy": 5.416954412460327,
1652
  "epoch": 4.605641911341393,
1653
- "grad_norm": 3.8953006267547607,
1654
- "learning_rate": 4.7407552338094304e-07,
1655
- "loss": 5.2623,
1656
- "mean_token_accuracy": 0.31632343590259554,
1657
- "num_tokens": 8405979.0,
1658
  "step": 8000
1659
  },
1660
  {
1661
- "entropy": 5.688009605407715,
1662
  "epoch": 4.634427173287277,
1663
- "grad_norm": 2.4968953132629395,
1664
- "learning_rate": 4.6429270201526114e-07,
1665
- "loss": 5.5266,
1666
- "mean_token_accuracy": 0.28443060010671617,
1667
- "num_tokens": 8460859.0,
1668
  "step": 8050
1669
  },
1670
  {
1671
- "entropy": 5.4750879192352295,
1672
  "epoch": 4.66321243523316,
1673
- "grad_norm": 1.6564769744873047,
1674
- "learning_rate": 4.545098806495793e-07,
1675
- "loss": 5.3167,
1676
- "mean_token_accuracy": 0.30900719910860064,
1677
- "num_tokens": 8513937.0,
1678
  "step": 8100
1679
  },
1680
  {
1681
- "entropy": 5.329486901760101,
1682
  "epoch": 4.691997697179044,
1683
- "grad_norm": 2.582631826400757,
1684
- "learning_rate": 4.4472705928389745e-07,
1685
- "loss": 5.1773,
1686
- "mean_token_accuracy": 0.325746659040451,
1687
- "num_tokens": 8564547.0,
1688
  "step": 8150
1689
  },
1690
  {
1691
- "entropy": 5.467891716957093,
1692
  "epoch": 4.720782959124928,
1693
- "grad_norm": 3.1262214183807373,
1694
- "learning_rate": 4.349442379182156e-07,
1695
- "loss": 5.3117,
1696
- "mean_token_accuracy": 0.31063371926546096,
1697
- "num_tokens": 8615609.0,
1698
  "step": 8200
1699
  },
1700
  {
1701
- "entropy": 5.448032402992249,
1702
  "epoch": 4.7495682210708114,
1703
- "grad_norm": 2.420801877975464,
1704
- "learning_rate": 4.2516141655253376e-07,
1705
- "loss": 5.2861,
1706
- "mean_token_accuracy": 0.31280285567045213,
1707
- "num_tokens": 8668388.0,
1708
  "step": 8250
1709
  },
1710
  {
1711
- "entropy": 5.474744281768799,
1712
  "epoch": 4.778353483016695,
1713
- "grad_norm": 2.4817428588867188,
1714
- "learning_rate": 4.1537859518685186e-07,
1715
- "loss": 5.3191,
1716
- "mean_token_accuracy": 0.3090439081192017,
1717
- "num_tokens": 8720158.0,
1718
  "step": 8300
1719
  },
1720
  {
1721
- "entropy": 5.3045838880538945,
1722
  "epoch": 4.807138744962579,
1723
- "grad_norm": 2.723879814147949,
1724
- "learning_rate": 4.0559577382117e-07,
1725
- "loss": 5.1525,
1726
- "mean_token_accuracy": 0.3299118718504906,
1727
- "num_tokens": 8770236.0,
1728
  "step": 8350
1729
  },
1730
  {
1731
- "entropy": 5.680503091812134,
1732
  "epoch": 4.835924006908463,
1733
- "grad_norm": 1.730972409248352,
1734
- "learning_rate": 3.958129524554881e-07,
1735
- "loss": 5.5194,
1736
- "mean_token_accuracy": 0.2835462909936905,
1737
- "num_tokens": 8825543.0,
1738
  "step": 8400
1739
  },
1740
  {
1741
- "entropy": 5.447949981689453,
1742
  "epoch": 4.864709268854346,
1743
- "grad_norm": 1.9867438077926636,
1744
- "learning_rate": 3.860301310898063e-07,
1745
- "loss": 5.2872,
1746
- "mean_token_accuracy": 0.3123849251866341,
1747
- "num_tokens": 8878664.0,
1748
  "step": 8450
1749
  },
1750
  {
1751
- "entropy": 5.5367047977447506,
1752
  "epoch": 4.89349453080023,
1753
- "grad_norm": 2.616309881210327,
1754
- "learning_rate": 3.762473097241244e-07,
1755
- "loss": 5.3804,
1756
- "mean_token_accuracy": 0.30146124720573425,
1757
- "num_tokens": 8931957.0,
1758
  "step": 8500
1759
  },
1760
  {
1761
- "entropy": 5.510420970916748,
1762
  "epoch": 4.922279792746114,
1763
- "grad_norm": 2.0481622219085693,
1764
- "learning_rate": 3.664644883584426e-07,
1765
- "loss": 5.3497,
1766
- "mean_token_accuracy": 0.30415455549955367,
1767
- "num_tokens": 8985201.0,
1768
  "step": 8550
1769
  },
1770
  {
1771
- "entropy": 5.335472793579101,
1772
  "epoch": 4.951065054691997,
1773
- "grad_norm": 1.7575494050979614,
1774
- "learning_rate": 3.566816669927607e-07,
1775
- "loss": 5.1842,
1776
- "mean_token_accuracy": 0.3253355652093887,
1777
- "num_tokens": 9035766.0,
1778
  "step": 8600
1779
  },
1780
  {
1781
- "entropy": 5.562930407524109,
1782
  "epoch": 4.979850316637881,
1783
- "grad_norm": 1.838478446006775,
1784
- "learning_rate": 3.4689884562707883e-07,
1785
- "loss": 5.4048,
1786
- "mean_token_accuracy": 0.3003630799055099,
1787
- "num_tokens": 9089369.0,
1788
  "step": 8650
1789
  },
1790
  {
1791
  "epoch": 5.0,
1792
- "eval_entropy": 5.752057710550897,
1793
- "eval_loss": 5.603951930999756,
1794
- "eval_mean_token_accuracy": 0.2677862888657003,
1795
- "eval_model_preparation_time": 0.0048,
1796
- "eval_num_tokens": 9125535.0,
1797
- "eval_runtime": 80.6223,
1798
- "eval_samples_per_second": 5.383,
1799
- "eval_steps_per_second": 2.692,
1800
  "step": 8685
1801
  }
1802
  ],
1803
  "logging_steps": 50,
1804
- "max_steps": 10422,
1805
  "num_input_tokens_seen": 0,
1806
- "num_train_epochs": 6,
1807
  "save_steps": 500,
1808
  "stateful_callbacks": {
1809
  "TrainerControl": {
@@ -1812,12 +1812,12 @@
1812
  "should_evaluate": false,
1813
  "should_log": false,
1814
  "should_save": true,
1815
- "should_training_stop": false
1816
  },
1817
  "attributes": {}
1818
  }
1819
  },
1820
- "total_flos": 1.2497546465467392e+17,
1821
  "train_batch_size": 2,
1822
  "trial_name": null,
1823
  "trial_params": null
 
1
  {
2
  "best_global_step": 8685,
3
+ "best_metric": 5.523473739624023,
4
  "best_model_checkpoint": "./output/checkpoint-8685",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 3.606692385673523,
14
  "epoch": 0.028785261945883708,
15
+ "grad_norm": 3.2999913692474365,
16
  "learning_rate": 4.9e-07,
17
+ "loss": 13.6598,
18
+ "mean_token_accuracy": 0.16028020828962325,
19
+ "num_tokens": 53993.0,
20
  "step": 50
21
  },
22
  {
23
+ "entropy": 3.618675880432129,
24
  "epoch": 0.057570523891767415,
25
+ "grad_norm": 3.101252555847168,
26
  "learning_rate": 9.9e-07,
27
+ "loss": 14.0188,
28
+ "mean_token_accuracy": 0.1508466500043869,
29
+ "num_tokens": 110134.0,
30
  "step": 100
31
  },
32
  {
33
+ "entropy": 3.5215235900878907,
34
  "epoch": 0.08635578583765112,
35
+ "grad_norm": 3.513662815093994,
36
  "learning_rate": 1.49e-06,
37
+ "loss": 12.8555,
38
+ "mean_token_accuracy": 0.18527640983462335,
39
+ "num_tokens": 160191.0,
40
  "step": 150
41
  },
42
  {
43
+ "entropy": 3.667909698486328,
44
  "epoch": 0.11514104778353483,
45
+ "grad_norm": 4.327610492706299,
46
  "learning_rate": 1.99e-06,
47
+ "loss": 13.5394,
48
+ "mean_token_accuracy": 0.157139780074358,
49
+ "num_tokens": 214993.0,
50
  "step": 200
51
  },
52
  {
53
+ "entropy": 3.768263258934021,
54
  "epoch": 0.14392630972941853,
55
+ "grad_norm": 4.290107250213623,
56
+ "learning_rate": 1.988450206246317e-06,
57
+ "loss": 12.8912,
58
+ "mean_token_accuracy": 0.17374794125556947,
59
+ "num_tokens": 268184.0,
60
  "step": 250
61
  },
62
  {
63
+ "entropy": 3.990619196891785,
64
  "epoch": 0.17271157167530224,
65
+ "grad_norm": 4.444278717041016,
66
+ "learning_rate": 1.976664702416028e-06,
67
+ "loss": 12.455,
68
+ "mean_token_accuracy": 0.17780130118131637,
69
+ "num_tokens": 319458.0,
70
  "step": 300
71
  },
72
  {
73
+ "entropy": 4.162646284103394,
74
  "epoch": 0.20149683362118595,
75
+ "grad_norm": 5.615262508392334,
76
+ "learning_rate": 1.9648791985857395e-06,
77
+ "loss": 12.0893,
78
+ "mean_token_accuracy": 0.18191319867968558,
79
+ "num_tokens": 373337.0,
80
  "step": 350
81
  },
82
  {
83
+ "entropy": 4.532100868225098,
84
  "epoch": 0.23028209556706966,
85
+ "grad_norm": 10.074016571044922,
86
+ "learning_rate": 1.9530936947554507e-06,
87
+ "loss": 11.9261,
88
+ "mean_token_accuracy": 0.169477596282959,
89
+ "num_tokens": 427526.0,
90
  "step": 400
91
  },
92
  {
93
+ "entropy": 4.923871030807495,
94
  "epoch": 0.25906735751295334,
95
+ "grad_norm": 16.220163345336914,
96
+ "learning_rate": 1.9413081909251622e-06,
97
+ "loss": 11.0048,
98
+ "mean_token_accuracy": 0.1704501649737358,
99
+ "num_tokens": 480528.0,
100
  "step": 450
101
  },
102
  {
103
+ "entropy": 5.521005854606629,
104
  "epoch": 0.28785261945883706,
105
+ "grad_norm": 29.904008865356445,
106
+ "learning_rate": 1.9295226870948733e-06,
107
+ "loss": 9.6524,
108
+ "mean_token_accuracy": 0.16450899541378022,
109
+ "num_tokens": 535314.0,
110
  "step": 500
111
  },
112
  {
113
+ "entropy": 6.092623329162597,
114
  "epoch": 0.31663788140472077,
115
+ "grad_norm": 17.821575164794922,
116
+ "learning_rate": 1.9177371832645845e-06,
117
+ "loss": 8.1054,
118
+ "mean_token_accuracy": 0.17205011785030366,
119
+ "num_tokens": 588410.0,
120
  "step": 550
121
  },
122
  {
123
+ "entropy": 6.385262680053711,
124
  "epoch": 0.3454231433506045,
125
+ "grad_norm": 5.502202987670898,
126
+ "learning_rate": 1.9059516794342958e-06,
127
+ "loss": 7.4313,
128
+ "mean_token_accuracy": 0.1734227080643177,
129
+ "num_tokens": 641736.0,
130
  "step": 600
131
  },
132
  {
133
+ "entropy": 6.278562617301941,
134
  "epoch": 0.3742084052964882,
135
+ "grad_norm": 5.4657697677612305,
136
+ "learning_rate": 1.8941661756040071e-06,
137
+ "loss": 6.9266,
138
+ "mean_token_accuracy": 0.18680249139666558,
139
+ "num_tokens": 692200.0,
140
  "step": 650
141
  },
142
  {
143
+ "entropy": 6.553266277313233,
144
  "epoch": 0.4029936672423719,
145
+ "grad_norm": 4.955812931060791,
146
+ "learning_rate": 1.8823806717737183e-06,
147
+ "loss": 6.9847,
148
+ "mean_token_accuracy": 0.16679802387952805,
149
+ "num_tokens": 745830.0,
150
  "step": 700
151
  },
152
  {
153
+ "entropy": 6.470935583114624,
154
  "epoch": 0.4317789291882556,
155
+ "grad_norm": 4.198381423950195,
156
+ "learning_rate": 1.8705951679434296e-06,
157
+ "loss": 6.7277,
158
+ "mean_token_accuracy": 0.17847734570503235,
159
+ "num_tokens": 798872.0,
160
  "step": 750
161
  },
162
  {
163
+ "entropy": 6.5620588779449465,
164
  "epoch": 0.4605641911341393,
165
+ "grad_norm": 3.1793746948242188,
166
+ "learning_rate": 1.8588096641131407e-06,
167
+ "loss": 6.7032,
168
+ "mean_token_accuracy": 0.17336134731769562,
169
+ "num_tokens": 853045.0,
170
  "step": 800
171
  },
172
  {
173
+ "entropy": 6.532204885482788,
174
  "epoch": 0.48934945308002303,
175
+ "grad_norm": 3.824537515640259,
176
+ "learning_rate": 1.847024160282852e-06,
177
+ "loss": 6.5762,
178
+ "mean_token_accuracy": 0.1805124071240425,
179
+ "num_tokens": 907679.0,
180
  "step": 850
181
  },
182
  {
183
+ "entropy": 6.535988225936889,
184
  "epoch": 0.5181347150259067,
185
+ "grad_norm": 4.350001811981201,
186
+ "learning_rate": 1.8352386564525632e-06,
187
+ "loss": 6.505,
188
+ "mean_token_accuracy": 0.1842605724930763,
189
+ "num_tokens": 964170.0,
190
  "step": 900
191
  },
192
  {
193
+ "entropy": 6.204533562660218,
194
  "epoch": 0.5469199769717904,
195
+ "grad_norm": 2.193660020828247,
196
+ "learning_rate": 1.8234531526222745e-06,
197
+ "loss": 6.1211,
198
+ "mean_token_accuracy": 0.21968430042266845,
199
+ "num_tokens": 1015909.0,
200
  "step": 950
201
  },
202
  {
203
+ "entropy": 6.308737449645996,
204
  "epoch": 0.5757052389176741,
205
+ "grad_norm": 2.325622320175171,
206
+ "learning_rate": 1.8116676487919857e-06,
207
+ "loss": 6.1653,
208
+ "mean_token_accuracy": 0.21636426240205764,
209
+ "num_tokens": 1068859.0,
210
  "step": 1000
211
  },
212
  {
213
+ "entropy": 6.332560749053955,
214
  "epoch": 0.6044905008635578,
215
+ "grad_norm": 2.0439090728759766,
216
+ "learning_rate": 1.799882144961697e-06,
217
+ "loss": 6.1559,
218
+ "mean_token_accuracy": 0.21859725564718246,
219
+ "num_tokens": 1123202.0,
220
  "step": 1050
221
  },
222
  {
223
+ "entropy": 6.042124252319336,
224
  "epoch": 0.6332757628094415,
225
+ "grad_norm": 3.621903657913208,
226
+ "learning_rate": 1.7880966411314081e-06,
227
+ "loss": 5.8441,
228
+ "mean_token_accuracy": 0.24906315237283708,
229
+ "num_tokens": 1173403.0,
230
  "step": 1100
231
  },
232
  {
233
+ "entropy": 5.921343173980713,
234
  "epoch": 0.6620610247553252,
235
+ "grad_norm": 5.658033847808838,
236
+ "learning_rate": 1.7763111373011195e-06,
237
+ "loss": 5.7104,
238
+ "mean_token_accuracy": 0.2625067520141602,
239
+ "num_tokens": 1225026.0,
240
  "step": 1150
241
  },
242
  {
243
+ "entropy": 6.093586492538452,
244
  "epoch": 0.690846286701209,
245
+ "grad_norm": 2.4292995929718018,
246
+ "learning_rate": 1.7645256334708308e-06,
247
+ "loss": 5.8658,
248
+ "mean_token_accuracy": 0.24842385441064835,
249
+ "num_tokens": 1279013.0,
250
  "step": 1200
251
  },
252
  {
253
+ "entropy": 6.119112596511841,
254
  "epoch": 0.7196315486470927,
255
+ "grad_norm": 3.369384288787842,
256
+ "learning_rate": 1.752740129640542e-06,
257
+ "loss": 5.8784,
258
+ "mean_token_accuracy": 0.24857850253582,
259
+ "num_tokens": 1332547.0,
260
  "step": 1250
261
  },
262
  {
263
+ "entropy": 6.025163550376892,
264
  "epoch": 0.7484168105929764,
265
+ "grad_norm": 2.5110116004943848,
266
+ "learning_rate": 1.7409546258102533e-06,
267
+ "loss": 5.7769,
268
+ "mean_token_accuracy": 0.25835376888513567,
269
+ "num_tokens": 1385192.0,
270
  "step": 1300
271
  },
272
  {
273
+ "entropy": 5.877259612083435,
274
  "epoch": 0.7772020725388601,
275
+ "grad_norm": 2.4179303646087646,
276
+ "learning_rate": 1.7291691219799646e-06,
277
+ "loss": 5.6284,
278
+ "mean_token_accuracy": 0.2756252554059029,
279
+ "num_tokens": 1437071.0,
280
  "step": 1350
281
  },
282
  {
283
+ "entropy": 6.002246947288513,
284
  "epoch": 0.8059873344847438,
285
+ "grad_norm": 3.494359016418457,
286
+ "learning_rate": 1.717383618149676e-06,
287
+ "loss": 5.747,
288
+ "mean_token_accuracy": 0.26462210685014725,
289
+ "num_tokens": 1490818.0,
290
  "step": 1400
291
  },
292
  {
293
+ "entropy": 5.991955623626709,
294
  "epoch": 0.8347725964306275,
295
+ "grad_norm": 2.340975761413574,
296
+ "learning_rate": 1.705598114319387e-06,
297
+ "loss": 5.7379,
298
+ "mean_token_accuracy": 0.26444981098175047,
299
+ "num_tokens": 1544997.0,
300
  "step": 1450
301
  },
302
  {
303
+ "entropy": 5.91768889427185,
304
  "epoch": 0.8635578583765112,
305
+ "grad_norm": 2.2394514083862305,
306
+ "learning_rate": 1.6938126104890984e-06,
307
+ "loss": 5.6564,
308
+ "mean_token_accuracy": 0.2730415526032448,
309
+ "num_tokens": 1598302.0,
310
  "step": 1500
311
  },
312
  {
313
+ "entropy": 5.982716989517212,
314
  "epoch": 0.8923431203223949,
315
+ "grad_norm": 1.876839518547058,
316
+ "learning_rate": 1.6820271066588098e-06,
317
+ "loss": 5.7215,
318
+ "mean_token_accuracy": 0.26642445534467696,
319
+ "num_tokens": 1655267.0,
320
  "step": 1550
321
  },
322
  {
323
+ "entropy": 5.820467872619629,
324
  "epoch": 0.9211283822682786,
325
+ "grad_norm": 2.219966173171997,
326
+ "learning_rate": 1.6702416028285209e-06,
327
+ "loss": 5.5555,
328
+ "mean_token_accuracy": 0.2856418335437775,
329
+ "num_tokens": 1709199.0,
330
  "step": 1600
331
  },
332
  {
333
+ "entropy": 5.996349005699158,
334
  "epoch": 0.9499136442141624,
335
+ "grad_norm": 2.247213840484619,
336
+ "learning_rate": 1.6584560989982322e-06,
337
+ "loss": 5.7283,
338
+ "mean_token_accuracy": 0.2696125540137291,
339
+ "num_tokens": 1765443.0,
340
  "step": 1650
341
  },
342
  {
343
+ "entropy": 5.696683068275451,
344
  "epoch": 0.9786989061600461,
345
+ "grad_norm": 2.8499979972839355,
346
+ "learning_rate": 1.6466705951679433e-06,
347
+ "loss": 5.4335,
348
+ "mean_token_accuracy": 0.29918427973985673,
349
+ "num_tokens": 1817494.0,
350
  "step": 1700
351
  },
352
  {
353
  "epoch": 1.0,
354
+ "eval_entropy": 5.993559589034401,
355
+ "eval_loss": 5.737204551696777,
356
+ "eval_mean_token_accuracy": 0.2618687468739699,
357
+ "eval_model_preparation_time": 0.0045,
358
+ "eval_num_tokens": 1856362.0,
359
+ "eval_runtime": 50.5332,
360
+ "eval_samples_per_second": 8.588,
361
+ "eval_steps_per_second": 4.294,
362
  "step": 1737
363
  },
364
  {
365
+ "entropy": 5.746842083930969,
366
  "epoch": 1.0074841681059297,
367
+ "grad_norm": 2.33052921295166,
368
+ "learning_rate": 1.6348850913376547e-06,
369
+ "loss": 5.4796,
370
+ "mean_token_accuracy": 0.2966849410533905,
371
+ "num_tokens": 1870353.0,
372
  "step": 1750
373
  },
374
  {
375
+ "entropy": 5.859029049873352,
376
  "epoch": 1.0362694300518134,
377
+ "grad_norm": 1.6248886585235596,
378
+ "learning_rate": 1.6230995875073658e-06,
379
+ "loss": 5.5975,
380
+ "mean_token_accuracy": 0.2838129925727844,
381
+ "num_tokens": 1926205.0,
382
  "step": 1800
383
  },
384
  {
385
+ "entropy": 5.731445336341858,
386
  "epoch": 1.065054691997697,
387
+ "grad_norm": 1.6941566467285156,
388
+ "learning_rate": 1.6113140836770771e-06,
389
+ "loss": 5.476,
390
+ "mean_token_accuracy": 0.2992346465587616,
391
+ "num_tokens": 1979821.0,
392
  "step": 1850
393
  },
394
  {
395
+ "entropy": 5.6993954515457155,
396
  "epoch": 1.0938399539435808,
397
+ "grad_norm": 1.1746597290039062,
398
+ "learning_rate": 1.5995285798467883e-06,
399
+ "loss": 5.4608,
400
+ "mean_token_accuracy": 0.3000726142525673,
401
+ "num_tokens": 2034373.0,
402
  "step": 1900
403
  },
404
  {
405
+ "entropy": 5.668873124122619,
406
  "epoch": 1.1226252158894645,
407
+ "grad_norm": 1.728211760520935,
408
+ "learning_rate": 1.5877430760164996e-06,
409
+ "loss": 5.4347,
410
+ "mean_token_accuracy": 0.3033922725915909,
411
+ "num_tokens": 2087339.0,
412
  "step": 1950
413
  },
414
  {
415
+ "entropy": 5.624621086120605,
416
  "epoch": 1.1514104778353482,
417
+ "grad_norm": 1.4078539609909058,
418
+ "learning_rate": 1.5759575721862107e-06,
419
+ "loss": 5.3954,
420
+ "mean_token_accuracy": 0.30784171640872954,
421
+ "num_tokens": 2139520.0,
422
  "step": 2000
423
  },
424
  {
425
+ "entropy": 5.7141213130950925,
426
  "epoch": 1.180195739781232,
427
+ "grad_norm": 2.186459541320801,
428
+ "learning_rate": 1.564172068355922e-06,
429
+ "loss": 5.4847,
430
+ "mean_token_accuracy": 0.29594049394130706,
431
+ "num_tokens": 2193987.0,
432
  "step": 2050
433
  },
434
  {
435
+ "entropy": 5.632415266036987,
436
  "epoch": 1.2089810017271156,
437
+ "grad_norm": 1.3601349592208862,
438
+ "learning_rate": 1.5523865645256334e-06,
439
+ "loss": 5.4135,
440
+ "mean_token_accuracy": 0.30366597563028336,
441
+ "num_tokens": 2249616.0,
442
  "step": 2100
443
  },
444
  {
445
+ "entropy": 5.510904269218445,
446
  "epoch": 1.2377662636729994,
447
+ "grad_norm": 2.065760612487793,
448
+ "learning_rate": 1.5406010606953445e-06,
449
+ "loss": 5.2904,
450
+ "mean_token_accuracy": 0.3211754837632179,
451
+ "num_tokens": 2300863.0,
452
  "step": 2150
453
  },
454
  {
455
+ "entropy": 5.703383626937867,
456
  "epoch": 1.266551525618883,
457
+ "grad_norm": 1.1172698736190796,
458
+ "learning_rate": 1.5288155568650559e-06,
459
+ "loss": 5.4802,
460
+ "mean_token_accuracy": 0.29713701367378237,
461
+ "num_tokens": 2356029.0,
462
  "step": 2200
463
  },
464
  {
465
+ "entropy": 5.565930342674255,
466
  "epoch": 1.2953367875647668,
467
+ "grad_norm": 1.7528513669967651,
468
+ "learning_rate": 1.5170300530347672e-06,
469
+ "loss": 5.3518,
470
+ "mean_token_accuracy": 0.31301232606172563,
471
+ "num_tokens": 2408957.0,
472
  "step": 2250
473
  },
474
  {
475
+ "entropy": 5.496430187225342,
476
  "epoch": 1.3241220495106505,
477
+ "grad_norm": 1.892640233039856,
478
+ "learning_rate": 1.5052445492044786e-06,
479
+ "loss": 5.2967,
480
+ "mean_token_accuracy": 0.3181899458169937,
481
+ "num_tokens": 2462569.0,
482
  "step": 2300
483
  },
484
  {
485
+ "entropy": 5.725150098800659,
486
  "epoch": 1.3529073114565342,
487
+ "grad_norm": 1.774940848350525,
488
+ "learning_rate": 1.4934590453741897e-06,
489
+ "loss": 5.5215,
490
+ "mean_token_accuracy": 0.29055028676986694,
491
+ "num_tokens": 2518544.0,
492
  "step": 2350
493
  },
494
  {
495
+ "entropy": 5.4884827613830565,
496
  "epoch": 1.381692573402418,
497
+ "grad_norm": 2.2167599201202393,
498
+ "learning_rate": 1.481673541543901e-06,
499
+ "loss": 5.2917,
500
+ "mean_token_accuracy": 0.31803421139717103,
501
+ "num_tokens": 2570863.0,
502
  "step": 2400
503
  },
504
  {
505
+ "entropy": 5.697079472541809,
506
  "epoch": 1.4104778353483016,
507
+ "grad_norm": 1.6489030122756958,
508
+ "learning_rate": 1.4698880377136124e-06,
509
+ "loss": 5.4982,
510
+ "mean_token_accuracy": 0.2925163987278938,
511
+ "num_tokens": 2626998.0,
512
  "step": 2450
513
  },
514
  {
515
+ "entropy": 5.46209939956665,
516
  "epoch": 1.4392630972941853,
517
+ "grad_norm": 1.153914451599121,
518
+ "learning_rate": 1.4581025338833235e-06,
519
+ "loss": 5.2736,
520
+ "mean_token_accuracy": 0.3182168474793434,
521
+ "num_tokens": 2681568.0,
522
  "step": 2500
523
  },
524
  {
525
+ "entropy": 5.4405768728256225,
526
  "epoch": 1.468048359240069,
527
+ "grad_norm": 3.6614978313446045,
528
+ "learning_rate": 1.4463170300530348e-06,
529
+ "loss": 5.2515,
530
+ "mean_token_accuracy": 0.3218736210465431,
531
+ "num_tokens": 2733587.0,
532
  "step": 2550
533
  },
534
  {
535
+ "entropy": 5.528175053596496,
536
  "epoch": 1.4968336211859528,
537
+ "grad_norm": 1.0849746465682983,
538
+ "learning_rate": 1.434531526222746e-06,
539
+ "loss": 5.3378,
540
+ "mean_token_accuracy": 0.31061659604310987,
541
+ "num_tokens": 2787003.0,
542
  "step": 2600
543
  },
544
  {
545
+ "entropy": 5.46110897064209,
546
  "epoch": 1.5256188831318365,
547
+ "grad_norm": 1.8315683603286743,
548
+ "learning_rate": 1.4227460223924573e-06,
549
+ "loss": 5.2782,
550
+ "mean_token_accuracy": 0.31781029611825945,
551
+ "num_tokens": 2840263.0,
552
  "step": 2650
553
  },
554
  {
555
+ "entropy": 5.455560960769653,
556
  "epoch": 1.5544041450777202,
557
+ "grad_norm": 1.1859091520309448,
558
+ "learning_rate": 1.4109605185621684e-06,
559
+ "loss": 5.2735,
560
+ "mean_token_accuracy": 0.3194814011454582,
561
+ "num_tokens": 2894186.0,
562
  "step": 2700
563
  },
564
  {
565
+ "entropy": 5.430496115684509,
566
  "epoch": 1.583189407023604,
567
+ "grad_norm": 2.3500001430511475,
568
+ "learning_rate": 1.3991750147318797e-06,
569
+ "loss": 5.2464,
570
+ "mean_token_accuracy": 0.32140792965888976,
571
+ "num_tokens": 2948171.0,
572
  "step": 2750
573
  },
574
  {
575
+ "entropy": 5.588023023605347,
576
  "epoch": 1.6119746689694876,
577
+ "grad_norm": 1.727825403213501,
578
+ "learning_rate": 1.3873895109015909e-06,
579
+ "loss": 5.4028,
580
+ "mean_token_accuracy": 0.3039530631899834,
581
+ "num_tokens": 3002678.0,
582
  "step": 2800
583
  },
584
  {
585
+ "entropy": 5.410525422096253,
586
  "epoch": 1.6407599309153713,
587
+ "grad_norm": 1.3401474952697754,
588
+ "learning_rate": 1.3756040070713022e-06,
589
+ "loss": 5.2298,
590
+ "mean_token_accuracy": 0.324065263569355,
591
+ "num_tokens": 3055844.0,
592
  "step": 2850
593
  },
594
  {
595
+ "entropy": 5.36959942817688,
596
  "epoch": 1.669545192861255,
597
+ "grad_norm": 1.1892589330673218,
598
+ "learning_rate": 1.3638185032410133e-06,
599
+ "loss": 5.1956,
600
+ "mean_token_accuracy": 0.32639502108097074,
601
+ "num_tokens": 3108636.0,
602
  "step": 2900
603
  },
604
  {
605
+ "entropy": 5.53826907157898,
606
  "epoch": 1.6983304548071387,
607
+ "grad_norm": 1.2652360200881958,
608
+ "learning_rate": 1.3520329994107247e-06,
609
+ "loss": 5.3583,
610
+ "mean_token_accuracy": 0.3074926760792732,
611
+ "num_tokens": 3162627.0,
612
  "step": 2950
613
  },
614
  {
615
+ "entropy": 5.417449145317078,
616
  "epoch": 1.7271157167530224,
617
+ "grad_norm": 1.584312915802002,
618
+ "learning_rate": 1.340247495580436e-06,
619
+ "loss": 5.2388,
620
+ "mean_token_accuracy": 0.32019727885723115,
621
+ "num_tokens": 3216409.0,
622
  "step": 3000
623
  },
624
  {
625
+ "entropy": 5.241390740871429,
626
  "epoch": 1.7559009786989062,
627
+ "grad_norm": 1.5219439268112183,
628
+ "learning_rate": 1.3284619917501471e-06,
629
+ "loss": 5.0645,
630
+ "mean_token_accuracy": 0.3445430138707161,
631
+ "num_tokens": 3266967.0,
632
  "step": 3050
633
  },
634
  {
635
+ "entropy": 5.405424036979675,
636
  "epoch": 1.7846862406447899,
637
+ "grad_norm": 2.1165153980255127,
638
+ "learning_rate": 1.3166764879198585e-06,
639
+ "loss": 5.232,
640
+ "mean_token_accuracy": 0.32085000157356264,
641
+ "num_tokens": 3319877.0,
642
  "step": 3100
643
  },
644
  {
645
+ "entropy": 5.123006024360657,
646
  "epoch": 1.8134715025906736,
647
+ "grad_norm": 1.2189785242080688,
648
+ "learning_rate": 1.3048909840895698e-06,
649
+ "loss": 4.9582,
650
+ "mean_token_accuracy": 0.356108532845974,
651
+ "num_tokens": 3368569.0,
652
  "step": 3150
653
  },
654
  {
655
+ "entropy": 5.417610831260681,
656
  "epoch": 1.8422567645365573,
657
+ "grad_norm": 1.5157604217529297,
658
+ "learning_rate": 1.2931054802592812e-06,
659
+ "loss": 5.2454,
660
+ "mean_token_accuracy": 0.31976755023002623,
661
+ "num_tokens": 3422449.0,
662
  "step": 3200
663
  },
664
  {
665
+ "entropy": 5.409690895080566,
666
  "epoch": 1.871042026482441,
667
+ "grad_norm": 1.3088161945343018,
668
+ "learning_rate": 1.2813199764289923e-06,
669
+ "loss": 5.2348,
670
+ "mean_token_accuracy": 0.32325415283441544,
671
+ "num_tokens": 3474399.0,
672
  "step": 3250
673
  },
674
  {
675
+ "entropy": 5.44662567615509,
676
  "epoch": 1.8998272884283247,
677
+ "grad_norm": 2.178372621536255,
678
+ "learning_rate": 1.2695344725987036e-06,
679
+ "loss": 5.2661,
680
+ "mean_token_accuracy": 0.3182847076654434,
681
+ "num_tokens": 3527726.0,
682
  "step": 3300
683
  },
684
  {
685
+ "entropy": 5.512614865303039,
686
  "epoch": 1.9286125503742084,
687
+ "grad_norm": 1.3050425052642822,
688
+ "learning_rate": 1.2577489687684147e-06,
689
+ "loss": 5.3416,
690
+ "mean_token_accuracy": 0.3084403133392334,
691
+ "num_tokens": 3581980.0,
692
  "step": 3350
693
  },
694
  {
695
+ "entropy": 5.379772834777832,
696
  "epoch": 1.9573978123200921,
697
+ "grad_norm": 1.4584404230117798,
698
+ "learning_rate": 1.245963464938126e-06,
699
+ "loss": 5.2087,
700
+ "mean_token_accuracy": 0.32388432770967485,
701
+ "num_tokens": 3635393.0,
702
  "step": 3400
703
  },
704
  {
705
+ "entropy": 5.483665924072266,
706
  "epoch": 1.9861830742659758,
707
+ "grad_norm": 1.2157734632492065,
708
+ "learning_rate": 1.2341779611078374e-06,
709
+ "loss": 5.3101,
710
+ "mean_token_accuracy": 0.3121953472495079,
711
+ "num_tokens": 3689894.0,
712
  "step": 3450
713
  },
714
  {
715
  "epoch": 2.0,
716
+ "eval_entropy": 5.711394641805904,
717
+ "eval_loss": 5.55628776550293,
718
+ "eval_mean_token_accuracy": 0.2764948787777105,
719
+ "eval_model_preparation_time": 0.0045,
720
+ "eval_num_tokens": 3712724.0,
721
+ "eval_runtime": 50.187,
722
+ "eval_samples_per_second": 8.648,
723
+ "eval_steps_per_second": 4.324,
724
  "step": 3474
725
  },
726
  {
727
+ "entropy": 5.349283556938172,
728
  "epoch": 2.0149683362118593,
729
+ "grad_norm": 1.1696771383285522,
730
+ "learning_rate": 1.2223924572775486e-06,
731
+ "loss": 5.1782,
732
+ "mean_token_accuracy": 0.33028870791196824,
733
+ "num_tokens": 3740861.0,
734
  "step": 3500
735
  },
736
  {
737
+ "entropy": 5.4721107006073,
738
  "epoch": 2.043753598157743,
739
+ "grad_norm": 1.8449370861053467,
740
+ "learning_rate": 1.2106069534472599e-06,
741
+ "loss": 5.2978,
742
+ "mean_token_accuracy": 0.31511022299528124,
743
+ "num_tokens": 3794869.0,
744
  "step": 3550
745
  },
746
  {
747
+ "entropy": 5.404226851463318,
748
  "epoch": 2.0725388601036268,
749
+ "grad_norm": 3.789496660232544,
750
+ "learning_rate": 1.198821449616971e-06,
751
+ "loss": 5.2371,
752
+ "mean_token_accuracy": 0.32092176616191864,
753
+ "num_tokens": 3848573.0,
754
  "step": 3600
755
  },
756
  {
757
+ "entropy": 5.435445628166199,
758
  "epoch": 2.1013241220495105,
759
+ "grad_norm": 2.2847959995269775,
760
+ "learning_rate": 1.1870359457866824e-06,
761
+ "loss": 5.2662,
762
+ "mean_token_accuracy": 0.3186633634567261,
763
+ "num_tokens": 3901204.0,
764
  "step": 3650
765
  },
766
  {
767
+ "entropy": 5.4066293334960935,
768
  "epoch": 2.130109383995394,
769
+ "grad_norm": 1.0950902700424194,
770
+ "learning_rate": 1.1752504419563935e-06,
771
+ "loss": 5.2345,
772
+ "mean_token_accuracy": 0.32156052827835085,
773
+ "num_tokens": 3953753.0,
774
  "step": 3700
775
  },
776
  {
777
+ "entropy": 5.272332944869995,
778
  "epoch": 2.158894645941278,
779
+ "grad_norm": 2.1477339267730713,
780
+ "learning_rate": 1.1634649381261048e-06,
781
+ "loss": 5.1091,
782
+ "mean_token_accuracy": 0.3380983591079712,
783
+ "num_tokens": 4005481.0,
784
  "step": 3750
785
  },
786
  {
787
+ "entropy": 5.4118804311752315,
788
  "epoch": 2.1876799078871616,
789
+ "grad_norm": 1.4509484767913818,
790
+ "learning_rate": 1.151679434295816e-06,
791
+ "loss": 5.2448,
792
+ "mean_token_accuracy": 0.3208243528008461,
793
+ "num_tokens": 4058829.0,
794
  "step": 3800
795
  },
796
  {
797
+ "entropy": 5.4763900089263915,
798
  "epoch": 2.2164651698330453,
799
+ "grad_norm": 1.0856804847717285,
800
+ "learning_rate": 1.1398939304655273e-06,
801
+ "loss": 5.3042,
802
+ "mean_token_accuracy": 0.31338351368904116,
803
+ "num_tokens": 4113326.0,
804
  "step": 3850
805
  },
806
  {
807
+ "entropy": 5.328452725410461,
808
  "epoch": 2.245250431778929,
809
+ "grad_norm": 3.2843880653381348,
810
+ "learning_rate": 1.1281084266352386e-06,
811
+ "loss": 5.1624,
812
+ "mean_token_accuracy": 0.3305218696594238,
813
+ "num_tokens": 4165454.0,
814
  "step": 3900
815
  },
816
  {
817
+ "entropy": 5.383157343864441,
818
  "epoch": 2.2740356937248127,
819
+ "grad_norm": 2.207082748413086,
820
+ "learning_rate": 1.1163229228049497e-06,
821
+ "loss": 5.2163,
822
+ "mean_token_accuracy": 0.32331310987472534,
823
+ "num_tokens": 4219250.0,
824
  "step": 3950
825
  },
826
  {
827
+ "entropy": 5.585261764526368,
828
  "epoch": 2.3028209556706964,
829
+ "grad_norm": 2.7102835178375244,
830
+ "learning_rate": 1.104537418974661e-06,
831
+ "loss": 5.4137,
832
+ "mean_token_accuracy": 0.29959124475717547,
833
+ "num_tokens": 4274711.0,
834
  "step": 4000
835
  },
836
  {
837
+ "entropy": 5.434073266983032,
838
  "epoch": 2.33160621761658,
839
+ "grad_norm": 1.3775779008865356,
840
+ "learning_rate": 1.0927519151443724e-06,
841
+ "loss": 5.2644,
842
+ "mean_token_accuracy": 0.3175011593103409,
843
+ "num_tokens": 4328616.0,
844
  "step": 4050
845
  },
846
  {
847
+ "entropy": 5.462391858100891,
848
  "epoch": 2.360391479562464,
849
+ "grad_norm": 1.4101024866104126,
850
+ "learning_rate": 1.0809664113140838e-06,
851
+ "loss": 5.2924,
852
+ "mean_token_accuracy": 0.3137941011786461,
853
+ "num_tokens": 4382416.0,
854
  "step": 4100
855
  },
856
  {
857
+ "entropy": 5.529892563819885,
858
  "epoch": 2.3891767415083476,
859
+ "grad_norm": 1.2311837673187256,
860
+ "learning_rate": 1.0691809074837949e-06,
861
+ "loss": 5.364,
862
+ "mean_token_accuracy": 0.3046491605043411,
863
+ "num_tokens": 4437848.0,
864
  "step": 4150
865
  },
866
  {
867
+ "entropy": 5.4370484542846675,
868
  "epoch": 2.4179620034542313,
869
+ "grad_norm": 1.0929864645004272,
870
+ "learning_rate": 1.0573954036535062e-06,
871
+ "loss": 5.2734,
872
+ "mean_token_accuracy": 0.3169013774394989,
873
+ "num_tokens": 4491185.0,
874
  "step": 4200
875
  },
876
  {
877
+ "entropy": 5.395377616882325,
878
  "epoch": 2.446747265400115,
879
+ "grad_norm": 1.5457273721694946,
880
+ "learning_rate": 1.0456098998232174e-06,
881
+ "loss": 5.2276,
882
+ "mean_token_accuracy": 0.32221508473157884,
883
+ "num_tokens": 4544086.0,
884
  "step": 4250
885
  },
886
  {
887
+ "entropy": 5.443737335205078,
888
  "epoch": 2.4755325273459987,
889
+ "grad_norm": 1.4844346046447754,
890
+ "learning_rate": 1.0338243959929287e-06,
891
+ "loss": 5.2786,
892
+ "mean_token_accuracy": 0.3157751387357712,
893
+ "num_tokens": 4597677.0,
894
  "step": 4300
895
  },
896
  {
897
+ "entropy": 5.419876251220703,
898
  "epoch": 2.5043177892918824,
899
+ "grad_norm": 1.2481963634490967,
900
+ "learning_rate": 1.02203889216264e-06,
901
+ "loss": 5.2564,
902
+ "mean_token_accuracy": 0.31889803290367125,
903
+ "num_tokens": 4651343.0,
904
  "step": 4350
905
  },
906
  {
907
+ "entropy": 5.578677978515625,
908
  "epoch": 2.533103051237766,
909
+ "grad_norm": 2.0005414485931396,
910
+ "learning_rate": 1.0102533883323512e-06,
911
+ "loss": 5.4145,
912
+ "mean_token_accuracy": 0.30037090003490446,
913
+ "num_tokens": 4705985.0,
914
  "step": 4400
915
  },
916
  {
917
+ "entropy": 5.279946126937866,
918
  "epoch": 2.56188831318365,
919
+ "grad_norm": 1.080521821975708,
920
+ "learning_rate": 9.984678845020625e-07,
921
+ "loss": 5.1226,
922
+ "mean_token_accuracy": 0.3341303279995918,
923
+ "num_tokens": 4757741.0,
924
  "step": 4450
925
  },
926
  {
927
+ "entropy": 5.551463279724121,
928
  "epoch": 2.5906735751295336,
929
+ "grad_norm": 1.28898024559021,
930
+ "learning_rate": 9.866823806717736e-07,
931
+ "loss": 5.3832,
932
+ "mean_token_accuracy": 0.3028248634934425,
933
+ "num_tokens": 4812808.0,
934
  "step": 4500
935
  },
936
  {
937
+ "entropy": 5.3787487554550175,
938
  "epoch": 2.6194588370754173,
939
+ "grad_norm": 1.5697983503341675,
940
+ "learning_rate": 9.74896876841485e-07,
941
+ "loss": 5.2141,
942
+ "mean_token_accuracy": 0.3227942296862602,
943
+ "num_tokens": 4866572.0,
944
  "step": 4550
945
  },
946
  {
947
+ "entropy": 5.460358958244324,
948
  "epoch": 2.648244099021301,
949
+ "grad_norm": 1.3180441856384277,
950
+ "learning_rate": 9.63111373011196e-07,
951
+ "loss": 5.2954,
952
+ "mean_token_accuracy": 0.31269474506378175,
953
+ "num_tokens": 4921312.0,
954
  "step": 4600
955
  },
956
  {
957
+ "entropy": 5.434084935188293,
958
  "epoch": 2.6770293609671847,
959
+ "grad_norm": 1.2409590482711792,
960
+ "learning_rate": 9.513258691809074e-07,
961
+ "loss": 5.271,
962
+ "mean_token_accuracy": 0.3172155100107193,
963
+ "num_tokens": 4974289.0,
964
  "step": 4650
965
  },
966
  {
967
+ "entropy": 5.406955418586731,
968
  "epoch": 2.7058146229130684,
969
+ "grad_norm": 1.4782609939575195,
970
+ "learning_rate": 9.395403653506187e-07,
971
+ "loss": 5.2473,
972
+ "mean_token_accuracy": 0.32031788885593415,
973
+ "num_tokens": 5028149.0,
974
  "step": 4700
975
  },
976
  {
977
+ "entropy": 5.206603040695191,
978
  "epoch": 2.734599884858952,
979
+ "grad_norm": 2.351633071899414,
980
+ "learning_rate": 9.2775486152033e-07,
981
+ "loss": 5.0478,
982
+ "mean_token_accuracy": 0.3428420132398605,
983
+ "num_tokens": 5079349.0,
984
  "step": 4750
985
  },
986
  {
987
+ "entropy": 5.388812799453735,
988
  "epoch": 2.763385146804836,
989
+ "grad_norm": 7.564618110656738,
990
+ "learning_rate": 9.159693576900412e-07,
991
+ "loss": 5.2281,
992
+ "mean_token_accuracy": 0.3222071170806885,
993
+ "num_tokens": 5132564.0,
994
  "step": 4800
995
  },
996
  {
997
+ "entropy": 5.374106278419495,
998
  "epoch": 2.7921704087507195,
999
+ "grad_norm": 1.4734679460525513,
1000
+ "learning_rate": 9.041838538597525e-07,
1001
+ "loss": 5.2161,
1002
+ "mean_token_accuracy": 0.3219477406144142,
1003
+ "num_tokens": 5185921.0,
1004
  "step": 4850
1005
  },
1006
  {
1007
+ "entropy": 5.232998585700988,
1008
  "epoch": 2.8209556706966032,
1009
+ "grad_norm": 1.4175471067428589,
1010
+ "learning_rate": 8.923983500294637e-07,
1011
+ "loss": 5.0769,
1012
+ "mean_token_accuracy": 0.3403926733136177,
1013
+ "num_tokens": 5237521.0,
1014
  "step": 4900
1015
  },
1016
  {
1017
+ "entropy": 5.394891719818116,
1018
  "epoch": 2.849740932642487,
1019
+ "grad_norm": 4.951873779296875,
1020
+ "learning_rate": 8.806128461991749e-07,
1021
+ "loss": 5.2344,
1022
+ "mean_token_accuracy": 0.3213117456436157,
1023
+ "num_tokens": 5291104.0,
1024
  "step": 4950
1025
  },
1026
  {
1027
+ "entropy": 5.413805012702942,
1028
  "epoch": 2.8785261945883707,
1029
+ "grad_norm": 1.679518461227417,
1030
+ "learning_rate": 8.688273423688863e-07,
1031
+ "loss": 5.2597,
1032
+ "mean_token_accuracy": 0.3165634173154831,
1033
+ "num_tokens": 5345058.0,
1034
  "step": 5000
1035
  },
1036
  {
1037
+ "entropy": 5.256177935600281,
1038
  "epoch": 2.9073114565342544,
1039
+ "grad_norm": 1.8892916440963745,
1040
+ "learning_rate": 8.570418385385975e-07,
1041
+ "loss": 5.1004,
1042
+ "mean_token_accuracy": 0.3369427987933159,
1043
+ "num_tokens": 5395918.0,
1044
  "step": 5050
1045
  },
1046
  {
1047
+ "entropy": 5.259814453125,
1048
  "epoch": 2.936096718480138,
1049
+ "grad_norm": 1.3802675008773804,
1050
+ "learning_rate": 8.452563347083087e-07,
1051
+ "loss": 5.1057,
1052
+ "mean_token_accuracy": 0.3362414276599884,
1053
+ "num_tokens": 5448086.0,
1054
  "step": 5100
1055
  },
1056
  {
1057
+ "entropy": 5.416206178665161,
1058
  "epoch": 2.964881980426022,
1059
+ "grad_norm": 1.7677236795425415,
1060
+ "learning_rate": 8.3347083087802e-07,
1061
+ "loss": 5.2562,
1062
+ "mean_token_accuracy": 0.31725785195827483,
1063
+ "num_tokens": 5501959.0,
1064
  "step": 5150
1065
  },
1066
  {
1067
+ "entropy": 5.507337794303894,
1068
  "epoch": 2.9936672423719055,
1069
+ "grad_norm": 1.021727442741394,
1070
+ "learning_rate": 8.216853270477313e-07,
1071
+ "loss": 5.344,
1072
+ "mean_token_accuracy": 0.30679062128067014,
1073
+ "num_tokens": 5557908.0,
1074
  "step": 5200
1075
  },
1076
  {
1077
  "epoch": 3.0,
1078
+ "eval_entropy": 5.682707933786278,
1079
+ "eval_loss": 5.53223991394043,
1080
+ "eval_mean_token_accuracy": 0.27747743456594404,
1081
+ "eval_model_preparation_time": 0.0045,
1082
+ "eval_num_tokens": 5569086.0,
1083
+ "eval_runtime": 49.9944,
1084
+ "eval_samples_per_second": 8.681,
1085
+ "eval_steps_per_second": 4.34,
1086
  "step": 5211
1087
  },
1088
  {
1089
+ "entropy": 5.209756035804748,
1090
  "epoch": 3.0224525043177892,
1091
+ "grad_norm": 1.725786566734314,
1092
+ "learning_rate": 8.098998232174425e-07,
1093
+ "loss": 5.0541,
1094
+ "mean_token_accuracy": 0.34166110813617706,
1095
+ "num_tokens": 5608917.0,
1096
  "step": 5250
1097
  },
1098
  {
1099
+ "entropy": 5.396296281814575,
1100
  "epoch": 3.051237766263673,
1101
+ "grad_norm": 0.7720207571983337,
1102
+ "learning_rate": 7.981143193871538e-07,
1103
+ "loss": 5.2337,
1104
+ "mean_token_accuracy": 0.32116260558366777,
1105
+ "num_tokens": 5662712.0,
1106
  "step": 5300
1107
  },
1108
  {
1109
+ "entropy": 5.341518473625183,
1110
  "epoch": 3.0800230282095566,
1111
+ "grad_norm": 2.2686808109283447,
1112
+ "learning_rate": 7.86328815556865e-07,
1113
+ "loss": 5.1824,
1114
+ "mean_token_accuracy": 0.32726580530405047,
1115
+ "num_tokens": 5715921.0,
1116
  "step": 5350
1117
  },
1118
  {
1119
+ "entropy": 5.376176896095276,
1120
  "epoch": 3.1088082901554404,
1121
+ "grad_norm": 1.2420796155929565,
1122
+ "learning_rate": 7.745433117265762e-07,
1123
+ "loss": 5.2162,
1124
+ "mean_token_accuracy": 0.32142678707838057,
1125
+ "num_tokens": 5769436.0,
1126
  "step": 5400
1127
  },
1128
  {
1129
+ "entropy": 5.4553061914443965,
1130
  "epoch": 3.137593552101324,
1131
+ "grad_norm": 1.2402859926223755,
1132
+ "learning_rate": 7.627578078962876e-07,
1133
+ "loss": 5.2971,
1134
+ "mean_token_accuracy": 0.31396267503499986,
1135
+ "num_tokens": 5823126.0,
1136
  "step": 5450
1137
  },
1138
  {
1139
+ "entropy": 5.385247969627381,
1140
  "epoch": 3.166378814047208,
1141
+ "grad_norm": 1.112062931060791,
1142
+ "learning_rate": 7.509723040659988e-07,
1143
+ "loss": 5.2324,
1144
+ "mean_token_accuracy": 0.3207343602180481,
1145
+ "num_tokens": 5875751.0,
1146
  "step": 5500
1147
  },
1148
  {
1149
+ "entropy": 5.55422221660614,
1150
  "epoch": 3.1951640759930915,
1151
+ "grad_norm": 1.5440446138381958,
1152
+ "learning_rate": 7.3918680023571e-07,
1153
+ "loss": 5.3902,
1154
+ "mean_token_accuracy": 0.3006985321640968,
1155
+ "num_tokens": 5932163.0,
1156
  "step": 5550
1157
  },
1158
  {
1159
+ "entropy": 5.403217372894287,
1160
  "epoch": 3.223949337938975,
1161
+ "grad_norm": 0.8481096625328064,
1162
+ "learning_rate": 7.274012964054213e-07,
1163
+ "loss": 5.2417,
1164
+ "mean_token_accuracy": 0.3210747820138931,
1165
+ "num_tokens": 5985889.0,
1166
  "step": 5600
1167
  },
1168
  {
1169
+ "entropy": 5.388293180465698,
1170
  "epoch": 3.252734599884859,
1171
+ "grad_norm": 0.9305989146232605,
1172
+ "learning_rate": 7.156157925751326e-07,
1173
+ "loss": 5.2319,
1174
+ "mean_token_accuracy": 0.3206030324101448,
1175
+ "num_tokens": 6040052.0,
1176
  "step": 5650
1177
  },
1178
  {
1179
+ "entropy": 5.401709322929382,
1180
  "epoch": 3.2815198618307426,
1181
+ "grad_norm": 0.8080459237098694,
1182
+ "learning_rate": 7.038302887448438e-07,
1183
+ "loss": 5.2438,
1184
+ "mean_token_accuracy": 0.3199671137332916,
1185
+ "num_tokens": 6092350.0,
1186
  "step": 5700
1187
  },
1188
  {
1189
+ "entropy": 5.4320423412323,
1190
  "epoch": 3.3103051237766263,
1191
+ "grad_norm": 1.9186089038848877,
1192
+ "learning_rate": 6.920447849145551e-07,
1193
+ "loss": 5.2696,
1194
+ "mean_token_accuracy": 0.31657984614372253,
1195
+ "num_tokens": 6146112.0,
1196
  "step": 5750
1197
  },
1198
  {
1199
+ "entropy": 5.276471285820008,
1200
  "epoch": 3.33909038572251,
1201
+ "grad_norm": 1.032879114151001,
1202
+ "learning_rate": 6.802592810842663e-07,
1203
+ "loss": 5.1224,
1204
+ "mean_token_accuracy": 0.3347566506266594,
1205
+ "num_tokens": 6197916.0,
1206
  "step": 5800
1207
  },
1208
  {
1209
+ "entropy": 5.122317051887512,
1210
  "epoch": 3.3678756476683938,
1211
+ "grad_norm": 3.156858444213867,
1212
+ "learning_rate": 6.684737772539775e-07,
1213
+ "loss": 4.9706,
1214
+ "mean_token_accuracy": 0.35455317378044127,
1215
+ "num_tokens": 6247565.0,
1216
  "step": 5850
1217
  },
1218
  {
1219
+ "entropy": 5.346597375869751,
1220
  "epoch": 3.3966609096142775,
1221
+ "grad_norm": 1.2619549036026,
1222
+ "learning_rate": 6.566882734236889e-07,
1223
+ "loss": 5.1902,
1224
+ "mean_token_accuracy": 0.3258721518516541,
1225
+ "num_tokens": 6300481.0,
1226
  "step": 5900
1227
  },
1228
  {
1229
+ "entropy": 5.413151068687439,
1230
  "epoch": 3.425446171560161,
1231
+ "grad_norm": 1.801740050315857,
1232
+ "learning_rate": 6.449027695934001e-07,
1233
+ "loss": 5.2513,
1234
+ "mean_token_accuracy": 0.3187857499718666,
1235
+ "num_tokens": 6353098.0,
1236
  "step": 5950
1237
  },
1238
  {
1239
+ "entropy": 5.464186942577362,
1240
  "epoch": 3.454231433506045,
1241
+ "grad_norm": 1.6306997537612915,
1242
+ "learning_rate": 6.331172657631113e-07,
1243
+ "loss": 5.3043,
1244
+ "mean_token_accuracy": 0.31154109388589857,
1245
+ "num_tokens": 6407984.0,
1246
  "step": 6000
1247
  },
1248
  {
1249
+ "entropy": 5.401795778274536,
1250
  "epoch": 3.4830166954519286,
1251
+ "grad_norm": 1.1694583892822266,
1252
+ "learning_rate": 6.213317619328226e-07,
1253
+ "loss": 5.2427,
1254
+ "mean_token_accuracy": 0.31954523265361784,
1255
+ "num_tokens": 6461854.0,
1256
  "step": 6050
1257
  },
1258
  {
1259
+ "entropy": 5.317689285278321,
1260
  "epoch": 3.5118019573978123,
1261
+ "grad_norm": 0.9361855387687683,
1262
+ "learning_rate": 6.095462581025339e-07,
1263
+ "loss": 5.1588,
1264
+ "mean_token_accuracy": 0.330586878657341,
1265
+ "num_tokens": 6514882.0,
1266
  "step": 6100
1267
  },
1268
  {
1269
+ "entropy": 5.478708257675171,
1270
  "epoch": 3.540587219343696,
1271
+ "grad_norm": 1.05711030960083,
1272
+ "learning_rate": 5.977607542722451e-07,
1273
+ "loss": 5.321,
1274
+ "mean_token_accuracy": 0.3104448106884956,
1275
+ "num_tokens": 6569455.0,
1276
  "step": 6150
1277
  },
1278
  {
1279
+ "entropy": 5.309361801147461,
1280
  "epoch": 3.5693724812895797,
1281
+ "grad_norm": 1.3499550819396973,
1282
+ "learning_rate": 5.859752504419564e-07,
1283
+ "loss": 5.153,
1284
+ "mean_token_accuracy": 0.331512533724308,
1285
+ "num_tokens": 6621734.0,
1286
  "step": 6200
1287
  },
1288
  {
1289
+ "entropy": 5.296572666168213,
1290
  "epoch": 3.5981577432354634,
1291
+ "grad_norm": 1.940708875656128,
1292
+ "learning_rate": 5.741897466116676e-07,
1293
+ "loss": 5.14,
1294
+ "mean_token_accuracy": 0.3299832499027252,
1295
+ "num_tokens": 6674994.0,
1296
  "step": 6250
1297
  },
1298
  {
1299
+ "entropy": 5.544284400939941,
1300
  "epoch": 3.626943005181347,
1301
+ "grad_norm": 1.8903827667236328,
1302
+ "learning_rate": 5.624042427813788e-07,
1303
+ "loss": 5.3885,
1304
+ "mean_token_accuracy": 0.3016947290301323,
1305
+ "num_tokens": 6730674.0,
1306
  "step": 6300
1307
  },
1308
  {
1309
+ "entropy": 5.333053431510925,
1310
  "epoch": 3.655728267127231,
1311
+ "grad_norm": 1.1618578433990479,
1312
+ "learning_rate": 5.506187389510902e-07,
1313
+ "loss": 5.1781,
1314
+ "mean_token_accuracy": 0.3275001719594002,
1315
+ "num_tokens": 6784235.0,
1316
  "step": 6350
1317
  },
1318
  {
1319
+ "entropy": 5.4938449716568,
1320
  "epoch": 3.6845135290731146,
1321
+ "grad_norm": 1.384329080581665,
1322
+ "learning_rate": 5.388332351208014e-07,
1323
+ "loss": 5.3399,
1324
+ "mean_token_accuracy": 0.3068840709328651,
1325
+ "num_tokens": 6839590.0,
1326
  "step": 6400
1327
  },
1328
  {
1329
+ "entropy": 5.277545223236084,
1330
  "epoch": 3.7132987910189983,
1331
+ "grad_norm": 1.8918265104293823,
1332
+ "learning_rate": 5.270477312905126e-07,
1333
+ "loss": 5.1221,
1334
+ "mean_token_accuracy": 0.33364981949329375,
1335
+ "num_tokens": 6891301.0,
1336
  "step": 6450
1337
  },
1338
  {
1339
+ "entropy": 5.40100293636322,
1340
  "epoch": 3.742084052964882,
1341
+ "grad_norm": 1.6968809366226196,
1342
+ "learning_rate": 5.152622274602239e-07,
1343
+ "loss": 5.2471,
1344
+ "mean_token_accuracy": 0.31912936180830004,
1345
+ "num_tokens": 6945510.0,
1346
  "step": 6500
1347
  },
1348
  {
1349
+ "entropy": 5.561220169067383,
1350
  "epoch": 3.7708693149107657,
1351
+ "grad_norm": 2.066960573196411,
1352
+ "learning_rate": 5.034767236299352e-07,
1353
+ "loss": 5.4026,
1354
+ "mean_token_accuracy": 0.2984810543060303,
1355
+ "num_tokens": 7001870.0,
1356
  "step": 6550
1357
  },
1358
  {
1359
+ "entropy": 5.3108087682724,
1360
  "epoch": 3.7996545768566494,
1361
+ "grad_norm": 1.6065007448196411,
1362
+ "learning_rate": 4.916912197996464e-07,
1363
+ "loss": 5.155,
1364
+ "mean_token_accuracy": 0.3304683968424797,
1365
+ "num_tokens": 7053974.0,
1366
  "step": 6600
1367
  },
1368
  {
1369
+ "entropy": 5.323807754516602,
1370
  "epoch": 3.828439838802533,
1371
+ "grad_norm": 2.6806318759918213,
1372
+ "learning_rate": 4.799057159693577e-07,
1373
+ "loss": 5.1653,
1374
+ "mean_token_accuracy": 0.3294159671664238,
1375
+ "num_tokens": 7107061.0,
1376
  "step": 6650
1377
  },
1378
  {
1379
+ "entropy": 5.4716163873672485,
1380
  "epoch": 3.857225100748417,
1381
+ "grad_norm": 1.8264856338500977,
1382
+ "learning_rate": 4.6812021213906895e-07,
1383
+ "loss": 5.3124,
1384
+ "mean_token_accuracy": 0.3109353107213974,
1385
+ "num_tokens": 7161697.0,
1386
  "step": 6700
1387
  },
1388
  {
1389
+ "entropy": 5.382365622520447,
1390
  "epoch": 3.8860103626943006,
1391
+ "grad_norm": 0.9954923987388611,
1392
+ "learning_rate": 4.563347083087802e-07,
1393
+ "loss": 5.2237,
1394
+ "mean_token_accuracy": 0.32161149621009827,
1395
+ "num_tokens": 7215524.0,
1396
  "step": 6750
1397
  },
1398
  {
1399
+ "entropy": 5.277496585845947,
1400
  "epoch": 3.9147956246401843,
1401
+ "grad_norm": 1.267786979675293,
1402
+ "learning_rate": 4.445492044784914e-07,
1403
+ "loss": 5.1265,
1404
+ "mean_token_accuracy": 0.3319795566797257,
1405
+ "num_tokens": 7267329.0,
1406
  "step": 6800
1407
  },
1408
  {
1409
+ "entropy": 5.550942025184631,
1410
  "epoch": 3.943580886586068,
1411
+ "grad_norm": 0.9425063133239746,
1412
+ "learning_rate": 4.3276370064820265e-07,
1413
+ "loss": 5.3898,
1414
+ "mean_token_accuracy": 0.30050904959440233,
1415
+ "num_tokens": 7324070.0,
1416
  "step": 6850
1417
  },
1418
  {
1419
+ "entropy": 5.125799627304077,
1420
  "epoch": 3.9723661485319517,
1421
+ "grad_norm": 5.447021007537842,
1422
+ "learning_rate": 4.20978196817914e-07,
1423
+ "loss": 4.9781,
1424
+ "mean_token_accuracy": 0.3520450854301453,
1425
+ "num_tokens": 7375083.0,
1426
  "step": 6900
1427
  },
1428
  {
1429
  "epoch": 4.0,
1430
+ "eval_entropy": 5.6681923492712905,
1431
+ "eval_loss": 5.525067329406738,
1432
+ "eval_mean_token_accuracy": 0.2779707208893816,
1433
+ "eval_model_preparation_time": 0.0045,
1434
+ "eval_num_tokens": 7425448.0,
1435
+ "eval_runtime": 49.7944,
1436
+ "eval_samples_per_second": 8.716,
1437
+ "eval_steps_per_second": 4.358,
1438
  "step": 6948
1439
  },
1440
  {
1441
+ "entropy": 5.38487633228302,
1442
  "epoch": 4.001151410477835,
1443
+ "grad_norm": 1.2222857475280762,
1444
+ "learning_rate": 4.091926929876252e-07,
1445
+ "loss": 5.2243,
1446
+ "mean_token_accuracy": 0.32152373433113096,
1447
+ "num_tokens": 7427859.0,
1448
  "step": 6950
1449
  },
1450
  {
1451
+ "entropy": 5.151427125930786,
1452
  "epoch": 4.029936672423719,
1453
+ "grad_norm": 1.804520606994629,
1454
+ "learning_rate": 3.9740718915733645e-07,
1455
+ "loss": 4.9966,
1456
+ "mean_token_accuracy": 0.3502719843387604,
1457
+ "num_tokens": 7478346.0,
1458
  "step": 7000
1459
  },
1460
  {
1461
+ "entropy": 5.550741171836853,
1462
  "epoch": 4.058721934369602,
1463
+ "grad_norm": 0.9588176608085632,
1464
+ "learning_rate": 3.856216853270477e-07,
1465
+ "loss": 5.3924,
1466
+ "mean_token_accuracy": 0.30148445934057233,
1467
+ "num_tokens": 7533394.0,
1468
  "step": 7050
1469
  },
1470
  {
1471
+ "entropy": 5.254554944038391,
1472
  "epoch": 4.087507196315486,
1473
+ "grad_norm": 2.456322431564331,
1474
+ "learning_rate": 3.7383618149675897e-07,
1475
+ "loss": 5.1002,
1476
+ "mean_token_accuracy": 0.3362416020035744,
1477
+ "num_tokens": 7585238.0,
1478
  "step": 7100
1479
  },
1480
  {
1481
+ "entropy": 5.400679998397827,
1482
  "epoch": 4.11629245826137,
1483
+ "grad_norm": 1.2165497541427612,
1484
+ "learning_rate": 3.6205067766647026e-07,
1485
+ "loss": 5.2434,
1486
+ "mean_token_accuracy": 0.3191445592045784,
1487
+ "num_tokens": 7638693.0,
1488
  "step": 7150
1489
  },
1490
  {
1491
+ "entropy": 5.261249952316284,
1492
  "epoch": 4.1450777202072535,
1493
+ "grad_norm": 1.1991236209869385,
1494
+ "learning_rate": 3.502651738361815e-07,
1495
+ "loss": 5.1129,
1496
+ "mean_token_accuracy": 0.33393161594867704,
1497
+ "num_tokens": 7691223.0,
1498
  "step": 7200
1499
  },
1500
  {
1501
+ "entropy": 5.33568838596344,
1502
  "epoch": 4.173862982153137,
1503
+ "grad_norm": 1.2175132036209106,
1504
+ "learning_rate": 3.384796700058927e-07,
1505
+ "loss": 5.1725,
1506
+ "mean_token_accuracy": 0.32868497937917707,
1507
+ "num_tokens": 7744696.0,
1508
  "step": 7250
1509
  },
1510
  {
1511
+ "entropy": 5.25973867893219,
1512
  "epoch": 4.202648244099021,
1513
+ "grad_norm": 1.4588052034378052,
1514
+ "learning_rate": 3.2669416617560395e-07,
1515
+ "loss": 5.105,
1516
+ "mean_token_accuracy": 0.3384602865576744,
1517
+ "num_tokens": 7796600.0,
1518
  "step": 7300
1519
  },
1520
  {
1521
+ "entropy": 5.551795811653137,
1522
  "epoch": 4.231433506044905,
1523
+ "grad_norm": 1.017745852470398,
1524
+ "learning_rate": 3.149086623453153e-07,
1525
+ "loss": 5.3996,
1526
+ "mean_token_accuracy": 0.29976913034915925,
1527
+ "num_tokens": 7851822.0,
1528
  "step": 7350
1529
  },
1530
  {
1531
+ "entropy": 5.379249300956726,
1532
  "epoch": 4.260218767990788,
1533
+ "grad_norm": 1.7130656242370605,
1534
+ "learning_rate": 3.031231585150265e-07,
1535
+ "loss": 5.2267,
1536
+ "mean_token_accuracy": 0.3196685525774956,
1537
+ "num_tokens": 7905364.0,
1538
  "step": 7400
1539
  },
1540
  {
1541
+ "entropy": 5.402966260910034,
1542
  "epoch": 4.289004029936672,
1543
+ "grad_norm": 1.3162118196487427,
1544
+ "learning_rate": 2.9133765468473776e-07,
1545
+ "loss": 5.2473,
1546
+ "mean_token_accuracy": 0.31836180537939074,
1547
+ "num_tokens": 7959154.0,
1548
  "step": 7450
1549
  },
1550
  {
1551
+ "entropy": 5.393479719161987,
1552
  "epoch": 4.317789291882556,
1553
+ "grad_norm": 1.5286513566970825,
1554
+ "learning_rate": 2.79552150854449e-07,
1555
+ "loss": 5.2345,
1556
+ "mean_token_accuracy": 0.32134009718894957,
1557
+ "num_tokens": 8013160.0,
1558
  "step": 7500
1559
  },
1560
  {
1561
+ "entropy": 5.46117250919342,
1562
  "epoch": 4.3465745538284395,
1563
+ "grad_norm": 1.3793199062347412,
1564
+ "learning_rate": 2.6776664702416027e-07,
1565
+ "loss": 5.3056,
1566
+ "mean_token_accuracy": 0.311002559363842,
1567
+ "num_tokens": 8068763.0,
1568
  "step": 7550
1569
  },
1570
  {
1571
+ "entropy": 5.331483993530274,
1572
  "epoch": 4.375359815774323,
1573
+ "grad_norm": 2.3478105068206787,
1574
+ "learning_rate": 2.5598114319387156e-07,
1575
+ "loss": 5.1795,
1576
+ "mean_token_accuracy": 0.32778155684471133,
1577
+ "num_tokens": 8121013.0,
1578
  "step": 7600
1579
  },
1580
  {
1581
+ "entropy": 5.39360936164856,
1582
  "epoch": 4.404145077720207,
1583
+ "grad_norm": 1.095144510269165,
1584
+ "learning_rate": 2.441956393635828e-07,
1585
+ "loss": 5.2346,
1586
+ "mean_token_accuracy": 0.3206799927353859,
1587
+ "num_tokens": 8174476.0,
1588
  "step": 7650
1589
  },
1590
  {
1591
+ "entropy": 5.468272385597229,
1592
  "epoch": 4.432930339666091,
1593
+ "grad_norm": 1.161205768585205,
1594
+ "learning_rate": 2.3241013553329402e-07,
1595
+ "loss": 5.3126,
1596
+ "mean_token_accuracy": 0.3104386702179909,
1597
+ "num_tokens": 8229471.0,
1598
  "step": 7700
1599
  },
1600
  {
1601
+ "entropy": 5.325088725090027,
1602
  "epoch": 4.461715601611974,
1603
+ "grad_norm": 1.2813605070114136,
1604
+ "learning_rate": 2.206246317030053e-07,
1605
+ "loss": 5.1722,
1606
+ "mean_token_accuracy": 0.32928301066160204,
1607
+ "num_tokens": 8281578.0,
1608
  "step": 7750
1609
  },
1610
  {
1611
+ "entropy": 5.231600880622864,
1612
  "epoch": 4.490500863557858,
1613
+ "grad_norm": 1.0202534198760986,
1614
+ "learning_rate": 2.0883912787271654e-07,
1615
+ "loss": 5.0794,
1616
+ "mean_token_accuracy": 0.3391410967707634,
1617
+ "num_tokens": 8333192.0,
1618
  "step": 7800
1619
  },
1620
  {
1621
+ "entropy": 5.243114166259765,
1622
  "epoch": 4.519286125503742,
1623
+ "grad_norm": 1.8178458213806152,
1624
+ "learning_rate": 1.9705362404242783e-07,
1625
+ "loss": 5.0944,
1626
+ "mean_token_accuracy": 0.33702247083187103,
1627
+ "num_tokens": 8385403.0,
1628
  "step": 7850
1629
  },
1630
  {
1631
+ "entropy": 5.509276785850525,
1632
  "epoch": 4.5480713874496255,
1633
+ "grad_norm": 1.2947449684143066,
1634
+ "learning_rate": 1.8526812021213906e-07,
1635
+ "loss": 5.3504,
1636
+ "mean_token_accuracy": 0.30472870826721193,
1637
+ "num_tokens": 8441398.0,
1638
  "step": 7900
1639
  },
1640
  {
1641
+ "entropy": 5.47810329914093,
1642
  "epoch": 4.576856649395509,
1643
+ "grad_norm": 1.3030011653900146,
1644
+ "learning_rate": 1.7348261638185032e-07,
1645
+ "loss": 5.3205,
1646
+ "mean_token_accuracy": 0.3097413584589958,
1647
+ "num_tokens": 8496488.0,
1648
  "step": 7950
1649
  },
1650
  {
1651
+ "entropy": 5.325540552139282,
1652
  "epoch": 4.605641911341393,
1653
+ "grad_norm": 1.807919979095459,
1654
+ "learning_rate": 1.6169711255156158e-07,
1655
+ "loss": 5.1724,
1656
+ "mean_token_accuracy": 0.32793802350759504,
1657
+ "num_tokens": 8549935.0,
1658
  "step": 8000
1659
  },
1660
  {
1661
+ "entropy": 5.597971448898315,
1662
  "epoch": 4.634427173287277,
1663
+ "grad_norm": 2.5746006965637207,
1664
+ "learning_rate": 1.499116087212728e-07,
1665
+ "loss": 5.4384,
1666
+ "mean_token_accuracy": 0.2962429064512253,
1667
+ "num_tokens": 8605715.0,
1668
  "step": 8050
1669
  },
1670
  {
1671
+ "entropy": 5.385147652626038,
1672
  "epoch": 4.66321243523316,
1673
+ "grad_norm": 1.3230502605438232,
1674
+ "learning_rate": 1.381261048909841e-07,
1675
+ "loss": 5.2287,
1676
+ "mean_token_accuracy": 0.32063129514455796,
1677
+ "num_tokens": 8659693.0,
1678
  "step": 8100
1679
  },
1680
  {
1681
+ "entropy": 5.237564296722412,
1682
  "epoch": 4.691997697179044,
1683
+ "grad_norm": 1.5461162328720093,
1684
+ "learning_rate": 1.2634060106069533e-07,
1685
+ "loss": 5.0868,
1686
+ "mean_token_accuracy": 0.3380086237192154,
1687
+ "num_tokens": 8711203.0,
1688
  "step": 8150
1689
  },
1690
  {
1691
+ "entropy": 5.3754426288604735,
1692
  "epoch": 4.720782959124928,
1693
+ "grad_norm": 1.540111780166626,
1694
+ "learning_rate": 1.145550972304066e-07,
1695
+ "loss": 5.2201,
1696
+ "mean_token_accuracy": 0.3232385951280594,
1697
+ "num_tokens": 8763164.0,
1698
  "step": 8200
1699
  },
1700
  {
1701
+ "entropy": 5.358118782043457,
1702
  "epoch": 4.7495682210708114,
1703
+ "grad_norm": 1.160130500793457,
1704
+ "learning_rate": 1.0276959340011786e-07,
1705
+ "loss": 5.1986,
1706
+ "mean_token_accuracy": 0.32435122996568677,
1707
+ "num_tokens": 8816843.0,
1708
  "step": 8250
1709
  },
1710
  {
1711
+ "entropy": 5.382785997390747,
1712
  "epoch": 4.778353483016695,
1713
+ "grad_norm": 1.19467031955719,
1714
+ "learning_rate": 9.09840895698291e-08,
1715
+ "loss": 5.2294,
1716
+ "mean_token_accuracy": 0.320581151843071,
1717
+ "num_tokens": 8869513.0,
1718
  "step": 8300
1719
  },
1720
  {
1721
+ "entropy": 5.213022809028626,
1722
  "epoch": 4.807138744962579,
1723
+ "grad_norm": 2.9769742488861084,
1724
+ "learning_rate": 7.919858573954036e-08,
1725
+ "loss": 5.0619,
1726
+ "mean_token_accuracy": 0.3414875140786171,
1727
+ "num_tokens": 8920491.0,
1728
  "step": 8350
1729
  },
1730
  {
1731
+ "entropy": 5.5911472082138065,
1732
  "epoch": 4.835924006908463,
1733
+ "grad_norm": 1.083928108215332,
1734
+ "learning_rate": 6.741308190925162e-08,
1735
+ "loss": 5.4315,
1736
+ "mean_token_accuracy": 0.2953196579217911,
1737
+ "num_tokens": 8976698.0,
1738
  "step": 8400
1739
  },
1740
  {
1741
+ "entropy": 5.360013060569763,
1742
  "epoch": 4.864709268854346,
1743
+ "grad_norm": 0.9796308279037476,
1744
+ "learning_rate": 5.562757807896287e-08,
1745
+ "loss": 5.1997,
1746
+ "mean_token_accuracy": 0.32382034361362455,
1747
+ "num_tokens": 9030719.0,
1748
  "step": 8450
1749
  },
1750
  {
1751
+ "entropy": 5.447426581382752,
1752
  "epoch": 4.89349453080023,
1753
+ "grad_norm": 1.2438750267028809,
1754
+ "learning_rate": 4.384207424867413e-08,
1755
+ "loss": 5.2921,
1756
+ "mean_token_accuracy": 0.31339066684246064,
1757
+ "num_tokens": 9084912.0,
1758
  "step": 8500
1759
  },
1760
  {
1761
+ "entropy": 5.419556441307068,
1762
  "epoch": 4.922279792746114,
1763
+ "grad_norm": 1.0203313827514648,
1764
+ "learning_rate": 3.205657041838539e-08,
1765
+ "loss": 5.2623,
1766
+ "mean_token_accuracy": 0.31564354538917544,
1767
+ "num_tokens": 9139056.0,
1768
  "step": 8550
1769
  },
1770
  {
1771
+ "entropy": 5.24355613231659,
1772
  "epoch": 4.951065054691997,
1773
+ "grad_norm": 1.5694708824157715,
1774
+ "learning_rate": 2.027106658809664e-08,
1775
+ "loss": 5.0942,
1776
+ "mean_token_accuracy": 0.3373285266757011,
1777
+ "num_tokens": 9190520.0,
1778
  "step": 8600
1779
  },
1780
  {
1781
+ "entropy": 5.4742378282547,
1782
  "epoch": 4.979850316637881,
1783
+ "grad_norm": 1.4123504161834717,
1784
+ "learning_rate": 8.485562757807896e-09,
1785
+ "loss": 5.3169,
1786
+ "mean_token_accuracy": 0.3118377339839935,
1787
+ "num_tokens": 9245023.0,
1788
  "step": 8650
1789
  },
1790
  {
1791
  "epoch": 5.0,
1792
+ "eval_entropy": 5.6646331444313995,
1793
+ "eval_loss": 5.523473739624023,
1794
+ "eval_mean_token_accuracy": 0.27800820795347064,
1795
+ "eval_model_preparation_time": 0.0045,
1796
+ "eval_num_tokens": 9281810.0,
1797
+ "eval_runtime": 49.9701,
1798
+ "eval_samples_per_second": 8.685,
1799
+ "eval_steps_per_second": 4.343,
1800
  "step": 8685
1801
  }
1802
  ],
1803
  "logging_steps": 50,
1804
+ "max_steps": 8685,
1805
  "num_input_tokens_seen": 0,
1806
+ "num_train_epochs": 5,
1807
  "save_steps": 500,
1808
  "stateful_callbacks": {
1809
  "TrainerControl": {
 
1812
  "should_evaluate": false,
1813
  "should_log": false,
1814
  "should_save": true,
1815
+ "should_training_stop": true
1816
  },
1817
  "attributes": {}
1818
  }
1819
  },
1820
+ "total_flos": 1.2710932022552064e+17,
1821
  "train_batch_size": 2,
1822
  "trial_name": null,
1823
  "trial_params": null
checkpoint-8685/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5475e33e58f46f793ad6cd889040e12bcec7c861a0875b6fe270cefbfc94cccc
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a666397e6243ddba6f7279c90610ed552907ef4de0be511faece3826d13e618
3
  size 6225
runs/Dec04_11-47-13_129-213-84-8/events.out.tfevents.1764848895.129-213-84-8.25442.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d77d61fb6af4433636be0b2505f03048f2b7390f5cb67124dae3bb4156bd4898
3
+ size 77711
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8db5c304963110404ebb6947b83ba95bd9b8aad1f9b8b578cc33c46d601e13dc
3
  size 6225
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a666397e6243ddba6f7279c90610ed552907ef4de0be511faece3826d13e618
3
  size 6225