bluuluu committed on
Commit
558ee2e
·
verified ·
1 Parent(s): dd8a44b

Model save

Browse files
Files changed (4) hide show
  1. README.md +4 -6
  2. all_results.json +4 -9
  3. train_results.json +4 -4
  4. trainer_state.json +912 -928
README.md CHANGED
@@ -1,16 +1,14 @@
1
  ---
2
- datasets: HuggingFaceH4/Bespoke-Stratos-17k
3
  library_name: transformers
4
  model_name: Qwen2.5-1.5B-Open-R1-Distill
5
  tags:
6
  - generated_from_trainer
7
- - open-r1
8
  licence: license
9
  ---
10
 
11
  # Model Card for Qwen2.5-1.5B-Open-R1-Distill
12
 
13
- This model is a fine-tuned version of [None](https://huggingface.co/None) on the [HuggingFaceH4/Bespoke-Stratos-17k](https://huggingface.co/datasets/HuggingFaceH4/Bespoke-Stratos-17k) dataset.
14
  It has been trained using [TRL](https://github.com/huggingface/trl).
15
 
16
  ## Quick start
@@ -26,7 +24,7 @@ print(output["generated_text"])
26
 
27
  ## Training procedure
28
 
29
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/2741919970-hustvl/huggingface/runs/ibmpuibf)
30
 
31
 
32
  This model was trained with SFT.
@@ -35,8 +33,8 @@ This model was trained with SFT.
35
 
36
  - TRL: 0.16.0.dev0
37
  - Transformers: 4.50.0.dev0
38
- - Pytorch: 2.5.1
39
- - Datasets: 3.3.1
40
  - Tokenizers: 0.21.0
41
 
42
  ## Citations
 
1
  ---
 
2
  library_name: transformers
3
  model_name: Qwen2.5-1.5B-Open-R1-Distill
4
  tags:
5
  - generated_from_trainer
 
6
  licence: license
7
  ---
8
 
9
  # Model Card for Qwen2.5-1.5B-Open-R1-Distill
10
 
11
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
12
  It has been trained using [TRL](https://github.com/huggingface/trl).
13
 
14
  ## Quick start
 
24
 
25
  ## Training procedure
26
 
27
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/2741919970-hustvl/huggingface/runs/3jcwqhpk)
28
 
29
 
30
  This model was trained with SFT.
 
33
 
34
  - TRL: 0.16.0.dev0
35
  - Transformers: 4.50.0.dev0
36
+ - Pytorch: 2.6.0
37
+ - Datasets: 3.3.2
38
  - Tokenizers: 0.21.0
39
 
40
  ## Citations
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
- "eval_loss": 0.9801562428474426,
3
- "eval_runtime": 8.4113,
4
- "eval_samples": 100,
5
- "eval_samples_per_second": 11.889,
6
- "eval_steps_per_second": 1.546,
7
  "total_flos": 0.0,
8
- "train_loss": 1.0554086321351157,
9
- "train_runtime": 8232.1254,
10
  "train_samples": 16610,
11
- "train_samples_per_second": 2.018,
12
- "train_steps_per_second": 0.126
13
  }
 
1
  {
 
 
 
 
 
2
  "total_flos": 0.0,
3
+ "train_loss": 0.9946218774200528,
4
+ "train_runtime": 21041.6427,
5
  "train_samples": 16610,
6
+ "train_samples_per_second": 1.566,
7
+ "train_steps_per_second": 0.049
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 1.0554086321351157,
4
- "train_runtime": 8232.1254,
5
  "train_samples": 16610,
6
- "train_samples_per_second": 2.018,
7
- "train_steps_per_second": 0.126
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.9946218774200528,
4
+ "train_runtime": 21041.6427,
5
  "train_samples": 16610,
6
+ "train_samples_per_second": 1.566,
7
+ "train_steps_per_second": 0.049
8
  }
trainer_state.json CHANGED
@@ -1,1774 +1,1758 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9997592102094871,
5
  "eval_steps": 100,
6
- "global_step": 1038,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.004815795810257645,
13
- "grad_norm": 1.2147226333618164,
14
- "learning_rate": 6.930673784291687e-06,
15
- "loss": 1.5557,
16
- "mean_token_accuracy": 0.630362007021904,
17
  "step": 5
18
  },
19
  {
20
- "epoch": 0.00963159162051529,
21
- "grad_norm": 0.8979360461235046,
22
- "learning_rate": 9.915552514839931e-06,
23
- "loss": 1.5975,
24
- "mean_token_accuracy": 0.620550400018692,
25
  "step": 10
26
  },
27
  {
28
- "epoch": 0.014447387430772935,
29
- "grad_norm": 0.6861452460289001,
30
- "learning_rate": 1.1661594641410821e-05,
31
- "loss": 1.4971,
32
- "mean_token_accuracy": 0.6368873298168183,
33
  "step": 15
34
  },
35
  {
36
- "epoch": 0.01926318324103058,
37
- "grad_norm": 0.6750317215919495,
38
- "learning_rate": 1.2900431245388171e-05,
39
- "loss": 1.4613,
40
- "mean_token_accuracy": 0.6366240382194519,
41
  "step": 20
42
  },
43
  {
44
- "epoch": 0.024078979051288224,
45
- "grad_norm": 0.6331086158752441,
46
- "learning_rate": 1.3861347568583374e-05,
47
- "loss": 1.3617,
48
- "mean_token_accuracy": 0.6534930646419526,
49
  "step": 25
50
  },
51
  {
52
- "epoch": 0.02889477486154587,
53
- "grad_norm": 0.6154865622520447,
54
- "learning_rate": 1.4646473371959063e-05,
55
- "loss": 1.3332,
56
- "mean_token_accuracy": 0.6580578774213791,
57
  "step": 30
58
  },
59
  {
60
- "epoch": 0.03371057067180352,
61
- "grad_norm": 0.546588122844696,
62
- "learning_rate": 1.5310287780241346e-05,
63
- "loss": 1.3098,
64
- "mean_token_accuracy": 0.6614556431770324,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.03852636648206116,
69
- "grad_norm": 0.5029358863830566,
70
- "learning_rate": 1.5885309975936413e-05,
71
- "loss": 1.2399,
72
- "mean_token_accuracy": 0.6782816559076309,
73
  "step": 40
74
  },
75
  {
76
- "epoch": 0.043342162292318805,
77
- "grad_norm": 0.5402973890304565,
78
- "learning_rate": 1.639251549852995e-05,
79
- "loss": 1.243,
80
- "mean_token_accuracy": 0.6727433562278747,
81
  "step": 45
82
  },
83
  {
84
- "epoch": 0.04815795810257645,
85
- "grad_norm": 0.4923946261405945,
86
- "learning_rate": 1.684622629913162e-05,
87
- "loss": 1.211,
88
- "mean_token_accuracy": 0.6798111200332642,
89
  "step": 50
90
  },
91
  {
92
- "epoch": 0.0529737539128341,
93
- "grad_norm": 0.4764332175254822,
94
- "learning_rate": 1.7256657642548236e-05,
95
- "loss": 1.2481,
96
- "mean_token_accuracy": 0.6730508714914322,
97
  "step": 55
98
  },
99
  {
100
- "epoch": 0.05778954972309174,
101
- "grad_norm": 0.4757765829563141,
102
- "learning_rate": 1.76313521025073e-05,
103
- "loss": 1.2497,
104
- "mean_token_accuracy": 0.6692704439163208,
105
  "step": 60
106
  },
107
  {
108
- "epoch": 0.06260534553334939,
109
- "grad_norm": 0.4728986918926239,
110
- "learning_rate": 1.7976037592646964e-05,
111
- "loss": 1.2109,
112
- "mean_token_accuracy": 0.6769249439239502,
113
  "step": 65
114
  },
115
  {
116
- "epoch": 0.06742114134360704,
117
- "grad_norm": 0.44879817962646484,
118
- "learning_rate": 1.8295166510789593e-05,
119
- "loss": 1.2003,
120
- "mean_token_accuracy": 0.6800386220216751,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.07223693715386467,
125
- "grad_norm": 0.4452269971370697,
126
- "learning_rate": 1.8592268425702507e-05,
127
- "loss": 1.1783,
128
- "mean_token_accuracy": 0.6860542267560958,
129
  "step": 75
130
  },
131
  {
132
- "epoch": 0.07705273296412232,
133
- "grad_norm": 0.4323599636554718,
134
- "learning_rate": 1.8870188706484653e-05,
135
- "loss": 1.1889,
136
- "mean_token_accuracy": 0.6835256606340409,
137
  "step": 80
138
  },
139
  {
140
- "epoch": 0.08186852877437997,
141
- "grad_norm": 0.5153534412384033,
142
- "learning_rate": 1.9131254681046113e-05,
143
- "loss": 1.1452,
144
- "mean_token_accuracy": 0.6931861788034439,
145
  "step": 85
146
  },
147
  {
148
- "epoch": 0.08668432458463761,
149
- "grad_norm": 0.48632609844207764,
150
- "learning_rate": 1.9377394229078192e-05,
151
- "loss": 1.1546,
152
- "mean_token_accuracy": 0.6885204911231995,
153
  "step": 90
154
  },
155
  {
156
- "epoch": 0.09150012039489526,
157
- "grad_norm": 0.46052947640419006,
158
- "learning_rate": 1.9610222268080128e-05,
159
- "loss": 1.1688,
160
- "mean_token_accuracy": 0.6875470966100693,
161
  "step": 95
162
  },
163
  {
164
- "epoch": 0.0963159162051529,
165
- "grad_norm": 0.4639471471309662,
166
- "learning_rate": 1.9831105029679863e-05,
167
- "loss": 1.1842,
168
- "mean_token_accuracy": 0.6816806256771087,
169
  "step": 100
170
  },
171
  {
172
- "epoch": 0.0963159162051529,
173
- "eval_loss": 1.1612499952316284,
174
- "eval_mean_token_accuracy": 0.6914980319830087,
175
- "eval_runtime": 8.642,
176
- "eval_samples_per_second": 11.571,
177
- "eval_steps_per_second": 1.504,
178
  "step": 100
179
  },
180
  {
181
- "epoch": 0.10113171201541055,
182
- "grad_norm": 0.4894552528858185,
183
  "learning_rate": 2e-05,
184
- "loss": 1.167,
185
- "mean_token_accuracy": 0.6845104664564132,
186
  "step": 105
187
  },
188
  {
189
- "epoch": 0.1059475078256682,
190
- "grad_norm": 0.4709993004798889,
191
  "learning_rate": 2e-05,
192
- "loss": 1.18,
193
- "mean_token_accuracy": 0.683196634054184,
194
  "step": 110
195
  },
196
  {
197
- "epoch": 0.11076330363592583,
198
- "grad_norm": 0.47942236065864563,
199
  "learning_rate": 2e-05,
200
- "loss": 1.1273,
201
- "mean_token_accuracy": 0.6949256509542465,
202
  "step": 115
203
  },
204
  {
205
- "epoch": 0.11557909944618348,
206
- "grad_norm": 0.48427385091781616,
207
  "learning_rate": 2e-05,
208
- "loss": 1.1286,
209
- "mean_token_accuracy": 0.6935882419347763,
210
  "step": 120
211
  },
212
  {
213
- "epoch": 0.12039489525644112,
214
- "grad_norm": 0.5041924118995667,
215
  "learning_rate": 2e-05,
216
- "loss": 1.1458,
217
- "mean_token_accuracy": 0.6929215133190155,
218
  "step": 125
219
  },
220
  {
221
- "epoch": 0.12521069106669877,
222
- "grad_norm": 0.44542378187179565,
223
  "learning_rate": 2e-05,
224
- "loss": 1.1207,
225
- "mean_token_accuracy": 0.6961991012096405,
226
  "step": 130
227
  },
228
  {
229
- "epoch": 0.13002648687695642,
230
- "grad_norm": 0.47258466482162476,
231
  "learning_rate": 2e-05,
232
- "loss": 1.0835,
233
- "mean_token_accuracy": 0.7047353446483612,
234
  "step": 135
235
  },
236
  {
237
- "epoch": 0.13484228268721407,
238
- "grad_norm": 0.4928162097930908,
239
  "learning_rate": 2e-05,
240
- "loss": 1.1012,
241
- "mean_token_accuracy": 0.6981151014566421,
242
  "step": 140
243
  },
244
  {
245
- "epoch": 0.1396580784974717,
246
- "grad_norm": 0.47707924246788025,
247
  "learning_rate": 2e-05,
248
- "loss": 1.1487,
249
- "mean_token_accuracy": 0.6873587876558304,
250
  "step": 145
251
  },
252
  {
253
- "epoch": 0.14447387430772934,
254
- "grad_norm": 0.4788207709789276,
255
  "learning_rate": 2e-05,
256
- "loss": 1.1288,
257
- "mean_token_accuracy": 0.6957270383834839,
258
  "step": 150
259
  },
260
  {
261
- "epoch": 0.149289670117987,
262
- "grad_norm": 0.4736686050891876,
263
  "learning_rate": 2e-05,
264
- "loss": 1.1175,
265
- "mean_token_accuracy": 0.6955223828554153,
266
  "step": 155
267
  },
268
  {
269
- "epoch": 0.15410546592824464,
270
- "grad_norm": 0.46731919050216675,
271
  "learning_rate": 2e-05,
272
- "loss": 1.0898,
273
- "mean_token_accuracy": 0.7016006588935852,
274
  "step": 160
275
  },
276
  {
277
- "epoch": 0.1589212617385023,
278
- "grad_norm": 0.46417754888534546,
279
  "learning_rate": 2e-05,
280
- "loss": 1.1243,
281
- "mean_token_accuracy": 0.6959028095006943,
282
  "step": 165
283
  },
284
  {
285
- "epoch": 0.16373705754875995,
286
- "grad_norm": 0.4850030839443207,
287
  "learning_rate": 2e-05,
288
- "loss": 1.113,
289
- "mean_token_accuracy": 0.6952222913503647,
290
  "step": 170
291
  },
292
  {
293
- "epoch": 0.16855285335901757,
294
- "grad_norm": 0.47516992688179016,
295
  "learning_rate": 2e-05,
296
- "loss": 1.1213,
297
- "mean_token_accuracy": 0.6963592380285263,
298
  "step": 175
299
  },
300
  {
301
- "epoch": 0.17336864916927522,
302
- "grad_norm": 0.4773334860801697,
303
  "learning_rate": 2e-05,
304
- "loss": 1.1026,
305
- "mean_token_accuracy": 0.7007035493850708,
306
  "step": 180
307
  },
308
  {
309
- "epoch": 0.17818444497953287,
310
- "grad_norm": 0.4331914484500885,
311
  "learning_rate": 2e-05,
312
- "loss": 1.0891,
313
- "mean_token_accuracy": 0.7041437685489654,
314
  "step": 185
315
  },
316
  {
317
- "epoch": 0.18300024078979052,
318
- "grad_norm": 0.4739309847354889,
319
  "learning_rate": 2e-05,
320
- "loss": 1.0904,
321
- "mean_token_accuracy": 0.7006603956222535,
322
  "step": 190
323
  },
324
  {
325
- "epoch": 0.18781603660004817,
326
- "grad_norm": 0.5098666548728943,
327
  "learning_rate": 2e-05,
328
- "loss": 1.09,
329
- "mean_token_accuracy": 0.7024506121873856,
330
  "step": 195
331
  },
332
  {
333
- "epoch": 0.1926318324103058,
334
- "grad_norm": 0.47458314895629883,
335
  "learning_rate": 2e-05,
336
- "loss": 1.0966,
337
- "mean_token_accuracy": 0.6993225693702698,
338
  "step": 200
339
  },
340
  {
341
- "epoch": 0.1926318324103058,
342
- "eval_loss": 1.1015625,
343
- "eval_mean_token_accuracy": 0.7014527458410996,
344
- "eval_runtime": 8.6243,
345
- "eval_samples_per_second": 11.595,
346
- "eval_steps_per_second": 1.507,
347
  "step": 200
348
  },
349
  {
350
- "epoch": 0.19744762822056344,
351
- "grad_norm": 0.4341985285282135,
352
  "learning_rate": 2e-05,
353
- "loss": 1.0608,
354
- "mean_token_accuracy": 0.7089000910520553,
355
  "step": 205
356
  },
357
  {
358
- "epoch": 0.2022634240308211,
359
- "grad_norm": 0.4928109645843506,
360
  "learning_rate": 2e-05,
361
- "loss": 1.1052,
362
- "mean_token_accuracy": 0.6985082745552063,
363
  "step": 210
364
  },
365
  {
366
- "epoch": 0.20707921984107874,
367
- "grad_norm": 0.4761582314968109,
368
  "learning_rate": 2e-05,
369
- "loss": 1.0525,
370
- "mean_token_accuracy": 0.709627678990364,
371
  "step": 215
372
  },
373
  {
374
- "epoch": 0.2118950156513364,
375
- "grad_norm": 0.48992499709129333,
376
  "learning_rate": 2e-05,
377
- "loss": 1.0987,
378
- "mean_token_accuracy": 0.7000472754240036,
379
  "step": 220
380
  },
381
  {
382
- "epoch": 0.21671081146159402,
383
- "grad_norm": 0.4606421887874603,
384
  "learning_rate": 2e-05,
385
- "loss": 1.1326,
386
- "mean_token_accuracy": 0.6904013842344284,
387
  "step": 225
388
  },
389
  {
390
- "epoch": 0.22152660727185167,
391
- "grad_norm": 0.4784524738788605,
392
  "learning_rate": 2e-05,
393
- "loss": 1.0829,
394
- "mean_token_accuracy": 0.7019407242536545,
395
  "step": 230
396
  },
397
  {
398
- "epoch": 0.22634240308210932,
399
- "grad_norm": 0.4791426360607147,
400
  "learning_rate": 2e-05,
401
- "loss": 1.0741,
402
- "mean_token_accuracy": 0.7062644183635711,
403
  "step": 235
404
  },
405
  {
406
- "epoch": 0.23115819889236697,
407
- "grad_norm": 0.5347750186920166,
408
  "learning_rate": 2e-05,
409
- "loss": 1.0513,
410
- "mean_token_accuracy": 0.7106054097414016,
411
  "step": 240
412
  },
413
  {
414
- "epoch": 0.23597399470262462,
415
- "grad_norm": 0.5430010557174683,
416
  "learning_rate": 2e-05,
417
- "loss": 1.0813,
418
- "mean_token_accuracy": 0.7051879912614822,
419
  "step": 245
420
  },
421
  {
422
- "epoch": 0.24078979051288224,
423
- "grad_norm": 0.5055649876594543,
424
  "learning_rate": 2e-05,
425
- "loss": 1.0936,
426
- "mean_token_accuracy": 0.7014876544475556,
427
  "step": 250
428
  },
429
  {
430
- "epoch": 0.2456055863231399,
431
- "grad_norm": 0.4852530360221863,
432
  "learning_rate": 2e-05,
433
- "loss": 1.0804,
434
- "mean_token_accuracy": 0.7026357978582383,
435
  "step": 255
436
  },
437
  {
438
- "epoch": 0.25042138213339754,
439
- "grad_norm": 0.5060104727745056,
440
  "learning_rate": 2e-05,
441
- "loss": 1.0673,
442
- "mean_token_accuracy": 0.7043526142835617,
443
  "step": 260
444
  },
445
  {
446
- "epoch": 0.2552371779436552,
447
- "grad_norm": 0.4761098027229309,
448
  "learning_rate": 2e-05,
449
- "loss": 1.074,
450
- "mean_token_accuracy": 0.7045256137847901,
451
  "step": 265
452
  },
453
  {
454
- "epoch": 0.26005297375391284,
455
- "grad_norm": 0.47418275475502014,
456
  "learning_rate": 2e-05,
457
- "loss": 1.0739,
458
- "mean_token_accuracy": 0.7012639313936233,
459
  "step": 270
460
  },
461
  {
462
- "epoch": 0.2648687695641705,
463
- "grad_norm": 0.4582968056201935,
464
  "learning_rate": 2e-05,
465
- "loss": 1.0838,
466
- "mean_token_accuracy": 0.7021653652191162,
467
  "step": 275
468
  },
469
  {
470
- "epoch": 0.26968456537442814,
471
- "grad_norm": 0.47323882579803467,
472
  "learning_rate": 2e-05,
473
- "loss": 1.0757,
474
- "mean_token_accuracy": 0.7021588802337646,
475
  "step": 280
476
  },
477
  {
478
- "epoch": 0.2745003611846858,
479
- "grad_norm": 0.47690513730049133,
480
  "learning_rate": 2e-05,
481
- "loss": 1.09,
482
- "mean_token_accuracy": 0.7010543704032898,
483
  "step": 285
484
  },
485
  {
486
- "epoch": 0.2793161569949434,
487
- "grad_norm": 0.5104436278343201,
488
  "learning_rate": 2e-05,
489
- "loss": 1.0699,
490
- "mean_token_accuracy": 0.7045444875955582,
491
  "step": 290
492
  },
493
  {
494
- "epoch": 0.28413195280520104,
495
- "grad_norm": 0.46935001015663147,
496
  "learning_rate": 2e-05,
497
- "loss": 1.0901,
498
- "mean_token_accuracy": 0.6992536067962647,
499
  "step": 295
500
  },
501
  {
502
- "epoch": 0.2889477486154587,
503
- "grad_norm": 0.4757845401763916,
504
  "learning_rate": 2e-05,
505
- "loss": 1.051,
506
- "mean_token_accuracy": 0.7108464986085892,
507
  "step": 300
508
  },
509
  {
510
- "epoch": 0.2889477486154587,
511
- "eval_loss": 1.0712499618530273,
512
- "eval_mean_token_accuracy": 0.7071489003988413,
513
- "eval_runtime": 8.6239,
514
- "eval_samples_per_second": 11.596,
515
- "eval_steps_per_second": 1.507,
516
  "step": 300
517
  },
518
  {
519
- "epoch": 0.29376354442571634,
520
- "grad_norm": 0.46388521790504456,
521
  "learning_rate": 2e-05,
522
- "loss": 1.0548,
523
- "mean_token_accuracy": 0.7095113962888717,
524
  "step": 305
525
  },
526
  {
527
- "epoch": 0.298579340235974,
528
- "grad_norm": 0.48812395334243774,
529
  "learning_rate": 2e-05,
530
- "loss": 1.0941,
531
- "mean_token_accuracy": 0.6993561059236526,
532
  "step": 310
533
  },
534
  {
535
- "epoch": 0.30339513604623164,
536
- "grad_norm": 0.42772531509399414,
537
  "learning_rate": 2e-05,
538
- "loss": 1.0995,
539
- "mean_token_accuracy": 0.6970925956964493,
540
  "step": 315
541
  },
542
  {
543
- "epoch": 0.3082109318564893,
544
- "grad_norm": 0.4491855800151825,
545
  "learning_rate": 2e-05,
546
- "loss": 1.1388,
547
- "mean_token_accuracy": 0.6878453463315963,
548
  "step": 320
549
  },
550
  {
551
- "epoch": 0.31302672766674694,
552
- "grad_norm": 0.4632098376750946,
553
  "learning_rate": 2e-05,
554
- "loss": 1.0551,
555
- "mean_token_accuracy": 0.7084896057844162,
556
  "step": 325
557
  },
558
  {
559
- "epoch": 0.3178425234770046,
560
- "grad_norm": 0.5304334759712219,
561
  "learning_rate": 2e-05,
562
- "loss": 1.0636,
563
- "mean_token_accuracy": 0.706383016705513,
564
  "step": 330
565
  },
566
  {
567
- "epoch": 0.32265831928726224,
568
- "grad_norm": 0.45156118273735046,
569
  "learning_rate": 2e-05,
570
- "loss": 1.0381,
571
- "mean_token_accuracy": 0.7106526464223861,
572
  "step": 335
573
  },
574
  {
575
- "epoch": 0.3274741150975199,
576
- "grad_norm": 0.4394085705280304,
577
  "learning_rate": 2e-05,
578
- "loss": 1.0858,
579
- "mean_token_accuracy": 0.7021456390619278,
580
  "step": 340
581
  },
582
  {
583
- "epoch": 0.3322899109077775,
584
- "grad_norm": 0.45397478342056274,
585
  "learning_rate": 2e-05,
586
- "loss": 1.034,
587
- "mean_token_accuracy": 0.7121325343847275,
588
  "step": 345
589
  },
590
  {
591
- "epoch": 0.33710570671803514,
592
- "grad_norm": 0.4641047418117523,
593
  "learning_rate": 2e-05,
594
- "loss": 1.025,
595
- "mean_token_accuracy": 0.7169907033443451,
596
  "step": 350
597
  },
598
  {
599
- "epoch": 0.3419215025282928,
600
- "grad_norm": 0.4733441174030304,
601
  "learning_rate": 2e-05,
602
- "loss": 1.0207,
603
- "mean_token_accuracy": 0.7163655549287796,
604
  "step": 355
605
  },
606
  {
607
- "epoch": 0.34673729833855044,
608
- "grad_norm": 0.49704018235206604,
609
  "learning_rate": 2e-05,
610
- "loss": 1.0638,
611
- "mean_token_accuracy": 0.7053900867700577,
612
  "step": 360
613
  },
614
  {
615
- "epoch": 0.3515530941488081,
616
- "grad_norm": 0.4795050024986267,
617
  "learning_rate": 2e-05,
618
- "loss": 1.0723,
619
- "mean_token_accuracy": 0.7046771228313446,
620
  "step": 365
621
  },
622
  {
623
- "epoch": 0.35636888995906574,
624
- "grad_norm": 0.4925204813480377,
625
  "learning_rate": 2e-05,
626
- "loss": 1.0333,
627
- "mean_token_accuracy": 0.7136953860521317,
628
  "step": 370
629
  },
630
  {
631
- "epoch": 0.3611846857693234,
632
- "grad_norm": 0.4750489294528961,
633
  "learning_rate": 2e-05,
634
- "loss": 1.0482,
635
- "mean_token_accuracy": 0.7103820770978928,
636
  "step": 375
637
  },
638
  {
639
- "epoch": 0.36600048157958104,
640
- "grad_norm": 0.48978808522224426,
641
  "learning_rate": 2e-05,
642
- "loss": 1.0436,
643
- "mean_token_accuracy": 0.7093423455953598,
644
  "step": 380
645
  },
646
  {
647
- "epoch": 0.3708162773898387,
648
- "grad_norm": 0.5102350115776062,
649
  "learning_rate": 2e-05,
650
- "loss": 1.0036,
651
- "mean_token_accuracy": 0.7225006580352783,
652
  "step": 385
653
  },
654
  {
655
- "epoch": 0.37563207320009634,
656
- "grad_norm": 0.4660072922706604,
657
  "learning_rate": 2e-05,
658
- "loss": 1.063,
659
- "mean_token_accuracy": 0.7056207448244095,
660
  "step": 390
661
  },
662
  {
663
- "epoch": 0.38044786901035393,
664
- "grad_norm": 0.45591866970062256,
665
  "learning_rate": 2e-05,
666
- "loss": 1.0513,
667
- "mean_token_accuracy": 0.7092340558767318,
668
  "step": 395
669
  },
670
  {
671
- "epoch": 0.3852636648206116,
672
- "grad_norm": 0.4969187378883362,
673
  "learning_rate": 2e-05,
674
- "loss": 1.024,
675
- "mean_token_accuracy": 0.7148027062416077,
676
  "step": 400
677
  },
678
  {
679
- "epoch": 0.3852636648206116,
680
- "eval_loss": 1.050937533378601,
681
- "eval_mean_token_accuracy": 0.7114615348669199,
682
- "eval_runtime": 8.565,
683
- "eval_samples_per_second": 11.675,
684
- "eval_steps_per_second": 1.518,
685
  "step": 400
686
  },
687
  {
688
- "epoch": 0.39007946063086923,
689
- "grad_norm": 0.49762463569641113,
690
  "learning_rate": 2e-05,
691
- "loss": 1.0973,
692
- "mean_token_accuracy": 0.6980582684278488,
693
  "step": 405
694
  },
695
  {
696
- "epoch": 0.3948952564411269,
697
- "grad_norm": 0.47561171650886536,
698
  "learning_rate": 2e-05,
699
- "loss": 1.0179,
700
- "mean_token_accuracy": 0.7156326770782471,
701
  "step": 410
702
  },
703
  {
704
- "epoch": 0.39971105225138454,
705
- "grad_norm": 0.47592732310295105,
706
  "learning_rate": 2e-05,
707
- "loss": 1.076,
708
- "mean_token_accuracy": 0.7023025274276733,
709
  "step": 415
710
  },
711
  {
712
- "epoch": 0.4045268480616422,
713
- "grad_norm": 0.4640940725803375,
714
  "learning_rate": 2e-05,
715
- "loss": 1.038,
716
- "mean_token_accuracy": 0.7106958746910095,
717
  "step": 420
718
  },
719
  {
720
- "epoch": 0.40934264387189984,
721
- "grad_norm": 0.4999053478240967,
722
  "learning_rate": 2e-05,
723
- "loss": 1.0565,
724
- "mean_token_accuracy": 0.7072661310434342,
725
  "step": 425
726
  },
727
  {
728
- "epoch": 0.4141584396821575,
729
- "grad_norm": 0.4585224390029907,
730
  "learning_rate": 2e-05,
731
- "loss": 1.0373,
732
- "mean_token_accuracy": 0.710054212808609,
733
  "step": 430
734
  },
735
  {
736
- "epoch": 0.41897423549241514,
737
- "grad_norm": 0.4704591631889343,
738
  "learning_rate": 2e-05,
739
- "loss": 1.0255,
740
- "mean_token_accuracy": 0.7148306041955947,
741
  "step": 435
742
  },
743
  {
744
- "epoch": 0.4237900313026728,
745
- "grad_norm": 0.4596955478191376,
746
  "learning_rate": 2e-05,
747
- "loss": 1.0681,
748
- "mean_token_accuracy": 0.7043379992246628,
749
  "step": 440
750
  },
751
  {
752
- "epoch": 0.42860582711293044,
753
- "grad_norm": 0.502312958240509,
754
  "learning_rate": 2e-05,
755
- "loss": 1.0014,
756
- "mean_token_accuracy": 0.7187342494726181,
757
  "step": 445
758
  },
759
  {
760
- "epoch": 0.43342162292318803,
761
- "grad_norm": 0.503431499004364,
762
  "learning_rate": 2e-05,
763
- "loss": 1.0492,
764
- "mean_token_accuracy": 0.7069419741630554,
765
  "step": 450
766
  },
767
  {
768
- "epoch": 0.4382374187334457,
769
- "grad_norm": 0.5078609585762024,
770
  "learning_rate": 2e-05,
771
- "loss": 1.0445,
772
- "mean_token_accuracy": 0.7107535511255264,
773
  "step": 455
774
  },
775
  {
776
- "epoch": 0.44305321454370333,
777
- "grad_norm": 0.492558091878891,
778
  "learning_rate": 2e-05,
779
- "loss": 1.0243,
780
- "mean_token_accuracy": 0.7159764289855957,
781
  "step": 460
782
  },
783
  {
784
- "epoch": 0.447869010353961,
785
- "grad_norm": 0.46418413519859314,
786
  "learning_rate": 2e-05,
787
- "loss": 1.0096,
788
- "mean_token_accuracy": 0.7168796956539154,
789
  "step": 465
790
  },
791
  {
792
- "epoch": 0.45268480616421863,
793
- "grad_norm": 0.44123366475105286,
794
  "learning_rate": 2e-05,
795
- "loss": 1.0108,
796
- "mean_token_accuracy": 0.7186406791210175,
797
  "step": 470
798
  },
799
  {
800
- "epoch": 0.4575006019744763,
801
- "grad_norm": 0.4705427885055542,
802
  "learning_rate": 2e-05,
803
- "loss": 1.0173,
804
- "mean_token_accuracy": 0.7164324700832367,
805
  "step": 475
806
  },
807
  {
808
- "epoch": 0.46231639778473393,
809
- "grad_norm": 0.43676939606666565,
810
  "learning_rate": 2e-05,
811
- "loss": 1.0316,
812
- "mean_token_accuracy": 0.7118833988904953,
813
  "step": 480
814
  },
815
  {
816
- "epoch": 0.4671321935949916,
817
- "grad_norm": 0.4746619164943695,
818
  "learning_rate": 2e-05,
819
- "loss": 1.0304,
820
- "mean_token_accuracy": 0.712785804271698,
821
  "step": 485
822
  },
823
  {
824
- "epoch": 0.47194798940524924,
825
- "grad_norm": 0.4496391713619232,
826
  "learning_rate": 2e-05,
827
- "loss": 1.0064,
828
- "mean_token_accuracy": 0.7193306088447571,
829
  "step": 490
830
  },
831
  {
832
- "epoch": 0.4767637852155069,
833
- "grad_norm": 0.4668291211128235,
834
  "learning_rate": 2e-05,
835
- "loss": 1.0087,
836
- "mean_token_accuracy": 0.717681935429573,
837
  "step": 495
838
  },
839
  {
840
- "epoch": 0.4815795810257645,
841
- "grad_norm": 0.49410480260849,
842
  "learning_rate": 2e-05,
843
- "loss": 0.9928,
844
- "mean_token_accuracy": 0.7212499767541886,
845
  "step": 500
846
  },
847
  {
848
- "epoch": 0.4815795810257645,
849
- "eval_loss": 1.0334374904632568,
850
- "eval_mean_token_accuracy": 0.7148144520246066,
851
- "eval_runtime": 8.5672,
852
- "eval_samples_per_second": 11.672,
853
- "eval_steps_per_second": 1.517,
854
  "step": 500
855
  },
856
  {
857
- "epoch": 0.48639537683602213,
858
- "grad_norm": 0.4331699311733246,
859
  "learning_rate": 2e-05,
860
- "loss": 1.0531,
861
- "mean_token_accuracy": 0.7062293201684952,
862
  "step": 505
863
  },
864
  {
865
- "epoch": 0.4912111726462798,
866
- "grad_norm": 0.4359816312789917,
867
  "learning_rate": 2e-05,
868
- "loss": 1.0122,
869
- "mean_token_accuracy": 0.7192949712276459,
870
  "step": 510
871
  },
872
  {
873
- "epoch": 0.49602696845653743,
874
- "grad_norm": 0.5358240008354187,
875
  "learning_rate": 2e-05,
876
- "loss": 1.0434,
877
- "mean_token_accuracy": 0.7085084766149521,
878
  "step": 515
879
  },
880
  {
881
- "epoch": 0.5008427642667951,
882
- "grad_norm": 0.4746890366077423,
883
  "learning_rate": 2e-05,
884
- "loss": 1.0096,
885
- "mean_token_accuracy": 0.716804152727127,
886
  "step": 520
887
  },
888
  {
889
- "epoch": 0.5056585600770528,
890
- "grad_norm": 0.48278114199638367,
891
  "learning_rate": 2e-05,
892
- "loss": 0.9902,
893
- "mean_token_accuracy": 0.7223504304885864,
894
  "step": 525
895
  },
896
  {
897
- "epoch": 0.5104743558873104,
898
- "grad_norm": 0.49195483326911926,
899
  "learning_rate": 2e-05,
900
- "loss": 1.018,
901
- "mean_token_accuracy": 0.716482064127922,
902
  "step": 530
903
  },
904
  {
905
- "epoch": 0.515290151697568,
906
- "grad_norm": 0.47295352816581726,
907
  "learning_rate": 2e-05,
908
- "loss": 1.0475,
909
- "mean_token_accuracy": 0.7076182216405869,
910
  "step": 535
911
  },
912
  {
913
- "epoch": 0.5201059475078257,
914
- "grad_norm": 0.4583546221256256,
915
  "learning_rate": 2e-05,
916
- "loss": 1.0188,
917
- "mean_token_accuracy": 0.7161876708269119,
918
  "step": 540
919
  },
920
  {
921
- "epoch": 0.5249217433180833,
922
- "grad_norm": 0.4691885709762573,
923
  "learning_rate": 2e-05,
924
- "loss": 1.0316,
925
- "mean_token_accuracy": 0.7126393973827362,
926
  "step": 545
927
  },
928
  {
929
- "epoch": 0.529737539128341,
930
- "grad_norm": 0.5013365149497986,
931
  "learning_rate": 2e-05,
932
- "loss": 0.9992,
933
- "mean_token_accuracy": 0.7188058078289032,
934
  "step": 550
935
  },
936
  {
937
- "epoch": 0.5345533349385986,
938
- "grad_norm": 0.4390871822834015,
939
  "learning_rate": 2e-05,
940
- "loss": 1.0112,
941
- "mean_token_accuracy": 0.7182704269886017,
942
  "step": 555
943
  },
944
  {
945
- "epoch": 0.5393691307488563,
946
- "grad_norm": 0.4545508027076721,
947
  "learning_rate": 2e-05,
948
- "loss": 1.0161,
949
- "mean_token_accuracy": 0.7146501332521439,
950
  "step": 560
951
  },
952
  {
953
- "epoch": 0.5441849265591139,
954
- "grad_norm": 0.46719858050346375,
955
  "learning_rate": 2e-05,
956
- "loss": 1.0514,
957
- "mean_token_accuracy": 0.7060492634773254,
958
  "step": 565
959
  },
960
  {
961
- "epoch": 0.5490007223693716,
962
- "grad_norm": 0.4321208596229553,
963
  "learning_rate": 2e-05,
964
- "loss": 0.9971,
965
- "mean_token_accuracy": 0.7195345431566238,
966
  "step": 570
967
  },
968
  {
969
- "epoch": 0.5538165181796292,
970
- "grad_norm": 0.4826374053955078,
971
  "learning_rate": 2e-05,
972
- "loss": 1.0532,
973
- "mean_token_accuracy": 0.7067425459623337,
974
  "step": 575
975
  },
976
  {
977
- "epoch": 0.5586323139898868,
978
- "grad_norm": 0.48376429080963135,
979
  "learning_rate": 2e-05,
980
- "loss": 0.9979,
981
- "mean_token_accuracy": 0.7204372942447662,
982
  "step": 580
983
  },
984
  {
985
- "epoch": 0.5634481098001445,
986
- "grad_norm": 0.5080297589302063,
987
  "learning_rate": 2e-05,
988
- "loss": 1.0432,
989
- "mean_token_accuracy": 0.7083571314811706,
990
  "step": 585
991
  },
992
  {
993
- "epoch": 0.5682639056104021,
994
- "grad_norm": 0.4134162664413452,
995
  "learning_rate": 2e-05,
996
- "loss": 1.0107,
997
- "mean_token_accuracy": 0.7153516292572022,
998
  "step": 590
999
  },
1000
  {
1001
- "epoch": 0.5730797014206598,
1002
- "grad_norm": 0.45565879344940186,
1003
  "learning_rate": 2e-05,
1004
- "loss": 1.0531,
1005
- "mean_token_accuracy": 0.7076080977916718,
1006
  "step": 595
1007
  },
1008
  {
1009
- "epoch": 0.5778954972309174,
1010
- "grad_norm": 0.49836477637290955,
1011
  "learning_rate": 2e-05,
1012
- "loss": 1.0051,
1013
- "mean_token_accuracy": 0.7182355552911759,
1014
  "step": 600
1015
  },
1016
  {
1017
- "epoch": 0.5778954972309174,
1018
- "eval_loss": 1.0204687118530273,
1019
- "eval_mean_token_accuracy": 0.7171955704689026,
1020
- "eval_runtime": 8.482,
1021
- "eval_samples_per_second": 11.79,
1022
- "eval_steps_per_second": 1.533,
1023
  "step": 600
1024
  },
1025
  {
1026
- "epoch": 0.5827112930411751,
1027
- "grad_norm": 0.46696388721466064,
1028
  "learning_rate": 2e-05,
1029
- "loss": 1.0366,
1030
- "mean_token_accuracy": 0.7119109451770782,
1031
  "step": 605
1032
  },
1033
  {
1034
- "epoch": 0.5875270888514327,
1035
- "grad_norm": 0.4734061062335968,
1036
  "learning_rate": 2e-05,
1037
- "loss": 1.0322,
1038
- "mean_token_accuracy": 0.7130448073148727,
1039
  "step": 610
1040
  },
1041
  {
1042
- "epoch": 0.5923428846616904,
1043
- "grad_norm": 0.4501568078994751,
1044
  "learning_rate": 2e-05,
1045
- "loss": 0.9959,
1046
- "mean_token_accuracy": 0.7202717989683152,
1047
  "step": 615
1048
  },
1049
  {
1050
- "epoch": 0.597158680471948,
1051
- "grad_norm": 0.46749380230903625,
1052
  "learning_rate": 2e-05,
1053
- "loss": 1.0644,
1054
- "mean_token_accuracy": 0.7030862450599671,
1055
  "step": 620
1056
  },
1057
  {
1058
- "epoch": 0.6019744762822057,
1059
- "grad_norm": 0.4468248784542084,
1060
  "learning_rate": 2e-05,
1061
- "loss": 0.9866,
1062
- "mean_token_accuracy": 0.7225469410419464,
1063
  "step": 625
1064
  },
1065
  {
1066
- "epoch": 0.6067902720924633,
1067
- "grad_norm": 0.47491517663002014,
1068
  "learning_rate": 2e-05,
1069
- "loss": 1.0006,
1070
- "mean_token_accuracy": 0.7195418655872345,
1071
  "step": 630
1072
  },
1073
  {
1074
- "epoch": 0.6116060679027209,
1075
- "grad_norm": 0.46158623695373535,
1076
  "learning_rate": 2e-05,
1077
- "loss": 1.0398,
1078
- "mean_token_accuracy": 0.7092684119939804,
1079
  "step": 635
1080
  },
1081
  {
1082
- "epoch": 0.6164218637129786,
1083
- "grad_norm": 0.4365028738975525,
1084
  "learning_rate": 2e-05,
1085
- "loss": 1.0264,
1086
- "mean_token_accuracy": 0.7130223363637924,
1087
  "step": 640
1088
  },
1089
  {
1090
- "epoch": 0.6212376595232362,
1091
- "grad_norm": 0.49602553248405457,
1092
  "learning_rate": 2e-05,
1093
- "loss": 1.0078,
1094
- "mean_token_accuracy": 0.7170433759689331,
1095
  "step": 645
1096
  },
1097
  {
1098
- "epoch": 0.6260534553334939,
1099
- "grad_norm": 0.4484293758869171,
1100
  "learning_rate": 2e-05,
1101
- "loss": 1.0257,
1102
- "mean_token_accuracy": 0.7127807974815369,
1103
  "step": 650
1104
  },
1105
  {
1106
- "epoch": 0.6308692511437515,
1107
- "grad_norm": 0.544967532157898,
1108
  "learning_rate": 2e-05,
1109
- "loss": 1.0318,
1110
- "mean_token_accuracy": 0.7124882370233536,
1111
  "step": 655
1112
  },
1113
  {
1114
- "epoch": 0.6356850469540092,
1115
- "grad_norm": 0.49626436829566956,
1116
  "learning_rate": 2e-05,
1117
- "loss": 1.0418,
1118
- "mean_token_accuracy": 0.7077222436666488,
1119
  "step": 660
1120
  },
1121
  {
1122
- "epoch": 0.6405008427642668,
1123
- "grad_norm": 0.43986326456069946,
1124
  "learning_rate": 2e-05,
1125
- "loss": 0.9766,
1126
- "mean_token_accuracy": 0.7237329006195068,
1127
  "step": 665
1128
  },
1129
  {
1130
- "epoch": 0.6453166385745245,
1131
- "grad_norm": 0.48558539152145386,
1132
  "learning_rate": 2e-05,
1133
- "loss": 1.0335,
1134
- "mean_token_accuracy": 0.7114945560693741,
1135
  "step": 670
1136
  },
1137
  {
1138
- "epoch": 0.6501324343847821,
1139
- "grad_norm": 0.4739987552165985,
1140
  "learning_rate": 2e-05,
1141
- "loss": 0.9827,
1142
- "mean_token_accuracy": 0.7252886116504669,
1143
  "step": 675
1144
  },
1145
  {
1146
- "epoch": 0.6549482301950398,
1147
- "grad_norm": 0.469598650932312,
1148
  "learning_rate": 2e-05,
1149
- "loss": 1.0184,
1150
- "mean_token_accuracy": 0.7135869711637497,
1151
  "step": 680
1152
  },
1153
  {
1154
- "epoch": 0.6597640260052974,
1155
- "grad_norm": 0.44300341606140137,
1156
  "learning_rate": 2e-05,
1157
- "loss": 0.9925,
1158
- "mean_token_accuracy": 0.7203141242265702,
1159
  "step": 685
1160
  },
1161
  {
1162
- "epoch": 0.664579821815555,
1163
- "grad_norm": 0.4621056318283081,
1164
  "learning_rate": 2e-05,
1165
- "loss": 1.0253,
1166
- "mean_token_accuracy": 0.7131445229053497,
1167
  "step": 690
1168
  },
1169
  {
1170
- "epoch": 0.6693956176258127,
1171
- "grad_norm": 0.4649716913700104,
1172
  "learning_rate": 2e-05,
1173
- "loss": 1.0125,
1174
- "mean_token_accuracy": 0.7149119585752487,
1175
  "step": 695
1176
  },
1177
  {
1178
- "epoch": 0.6742114134360703,
1179
- "grad_norm": 0.49358126521110535,
1180
  "learning_rate": 2e-05,
1181
- "loss": 1.0136,
1182
- "mean_token_accuracy": 0.7160834163427353,
1183
  "step": 700
1184
  },
1185
  {
1186
- "epoch": 0.6742114134360703,
1187
- "eval_loss": 1.0096875429153442,
1188
- "eval_mean_token_accuracy": 0.7198267166431134,
1189
- "eval_runtime": 8.6585,
1190
- "eval_samples_per_second": 11.549,
1191
- "eval_steps_per_second": 1.501,
1192
  "step": 700
1193
  },
1194
  {
1195
- "epoch": 0.679027209246328,
1196
- "grad_norm": 0.4873298406600952,
1197
  "learning_rate": 2e-05,
1198
- "loss": 1.0203,
1199
- "mean_token_accuracy": 0.7126143038272857,
1200
  "step": 705
1201
  },
1202
  {
1203
- "epoch": 0.6838430050565856,
1204
- "grad_norm": 0.4695189893245697,
1205
  "learning_rate": 2e-05,
1206
- "loss": 1.0062,
1207
- "mean_token_accuracy": 0.7176679968833923,
1208
  "step": 710
1209
  },
1210
  {
1211
- "epoch": 0.6886588008668433,
1212
- "grad_norm": 0.46720319986343384,
1213
  "learning_rate": 2e-05,
1214
- "loss": 1.0077,
1215
- "mean_token_accuracy": 0.7165515303611756,
1216
  "step": 715
1217
  },
1218
  {
1219
- "epoch": 0.6934745966771009,
1220
- "grad_norm": 0.4640096426010132,
1221
  "learning_rate": 2e-05,
1222
- "loss": 1.0396,
1223
- "mean_token_accuracy": 0.7095631629228591,
1224
  "step": 720
1225
  },
1226
  {
1227
- "epoch": 0.6982903924873586,
1228
- "grad_norm": 0.4294661283493042,
1229
  "learning_rate": 2e-05,
1230
- "loss": 1.0619,
1231
- "mean_token_accuracy": 0.7026620030403137,
1232
  "step": 725
1233
  },
1234
  {
1235
- "epoch": 0.7031061882976162,
1236
- "grad_norm": 0.4739510416984558,
1237
  "learning_rate": 2e-05,
1238
- "loss": 0.9951,
1239
- "mean_token_accuracy": 0.7200070083141327,
1240
  "step": 730
1241
  },
1242
  {
1243
- "epoch": 0.7079219841078739,
1244
- "grad_norm": 0.492569237947464,
1245
  "learning_rate": 2e-05,
1246
- "loss": 1.0022,
1247
- "mean_token_accuracy": 0.7179245352745056,
1248
  "step": 735
1249
  },
1250
  {
1251
- "epoch": 0.7127377799181315,
1252
- "grad_norm": 0.48600509762763977,
1253
  "learning_rate": 2e-05,
1254
- "loss": 1.0159,
1255
- "mean_token_accuracy": 0.7153581887483597,
1256
  "step": 740
1257
  },
1258
  {
1259
- "epoch": 0.7175535757283891,
1260
- "grad_norm": 0.5048158764839172,
1261
  "learning_rate": 2e-05,
1262
- "loss": 0.9899,
1263
- "mean_token_accuracy": 0.7200082540512085,
1264
  "step": 745
1265
  },
1266
  {
1267
- "epoch": 0.7223693715386468,
1268
- "grad_norm": 0.4951934516429901,
1269
  "learning_rate": 2e-05,
1270
- "loss": 0.9982,
1271
- "mean_token_accuracy": 0.7204364091157913,
1272
  "step": 750
1273
  },
1274
  {
1275
- "epoch": 0.7271851673489044,
1276
- "grad_norm": 0.49931296706199646,
1277
  "learning_rate": 2e-05,
1278
- "loss": 0.998,
1279
- "mean_token_accuracy": 0.7174190193414688,
1280
  "step": 755
1281
  },
1282
  {
1283
- "epoch": 0.7320009631591621,
1284
- "grad_norm": 0.4547603726387024,
1285
  "learning_rate": 2e-05,
1286
- "loss": 1.0261,
1287
- "mean_token_accuracy": 0.7145938724279404,
1288
  "step": 760
1289
  },
1290
  {
1291
- "epoch": 0.7368167589694197,
1292
- "grad_norm": 0.43703392148017883,
1293
  "learning_rate": 2e-05,
1294
- "loss": 0.9416,
1295
- "mean_token_accuracy": 0.7310493141412735,
1296
  "step": 765
1297
  },
1298
  {
1299
- "epoch": 0.7416325547796774,
1300
- "grad_norm": 0.44958263635635376,
1301
  "learning_rate": 2e-05,
1302
- "loss": 0.9991,
1303
- "mean_token_accuracy": 0.7186336666345596,
1304
  "step": 770
1305
  },
1306
  {
1307
- "epoch": 0.746448350589935,
1308
- "grad_norm": 0.4758422374725342,
1309
  "learning_rate": 2e-05,
1310
- "loss": 1.0164,
1311
- "mean_token_accuracy": 0.7137081116437912,
1312
  "step": 775
1313
  },
1314
  {
1315
- "epoch": 0.7512641464001927,
1316
- "grad_norm": 0.488331139087677,
1317
  "learning_rate": 2e-05,
1318
- "loss": 0.9698,
1319
- "mean_token_accuracy": 0.7264787226915359,
1320
  "step": 780
1321
  },
1322
  {
1323
- "epoch": 0.7560799422104503,
1324
- "grad_norm": 0.4642072021961212,
1325
  "learning_rate": 2e-05,
1326
- "loss": 0.9731,
1327
- "mean_token_accuracy": 0.7251051425933838,
1328
  "step": 785
1329
  },
1330
  {
1331
- "epoch": 0.7608957380207079,
1332
- "grad_norm": 0.4261150360107422,
1333
  "learning_rate": 2e-05,
1334
- "loss": 1.0072,
1335
- "mean_token_accuracy": 0.7165987342596054,
1336
  "step": 790
1337
  },
1338
  {
1339
- "epoch": 0.7657115338309656,
1340
- "grad_norm": 0.4364739656448364,
1341
  "learning_rate": 2e-05,
1342
- "loss": 0.9603,
1343
- "mean_token_accuracy": 0.7281170040369034,
1344
  "step": 795
1345
  },
1346
  {
1347
- "epoch": 0.7705273296412232,
1348
- "grad_norm": 0.471077024936676,
1349
  "learning_rate": 2e-05,
1350
- "loss": 1.0483,
1351
- "mean_token_accuracy": 0.7061151295900345,
1352
  "step": 800
1353
  },
1354
  {
1355
- "epoch": 0.7705273296412232,
1356
- "eval_loss": 1.0003124475479126,
1357
- "eval_mean_token_accuracy": 0.7220352200361398,
1358
- "eval_runtime": 8.5998,
1359
- "eval_samples_per_second": 11.628,
1360
- "eval_steps_per_second": 1.512,
1361
  "step": 800
1362
  },
1363
  {
1364
- "epoch": 0.7753431254514809,
1365
- "grad_norm": 0.4885280728340149,
1366
  "learning_rate": 2e-05,
1367
- "loss": 1.0073,
1368
- "mean_token_accuracy": 0.7162327229976654,
1369
  "step": 805
1370
  },
1371
  {
1372
- "epoch": 0.7801589212617385,
1373
- "grad_norm": 0.5416684746742249,
1374
  "learning_rate": 2e-05,
1375
- "loss": 1.0244,
1376
- "mean_token_accuracy": 0.7137985616922379,
1377
  "step": 810
1378
  },
1379
  {
1380
- "epoch": 0.7849747170719962,
1381
- "grad_norm": 0.47809484601020813,
1382
  "learning_rate": 2e-05,
1383
- "loss": 1.0041,
1384
- "mean_token_accuracy": 0.7155015915632248,
1385
  "step": 815
1386
  },
1387
  {
1388
- "epoch": 0.7897905128822538,
1389
- "grad_norm": 0.4929503798484802,
1390
  "learning_rate": 2e-05,
1391
- "loss": 0.9961,
1392
- "mean_token_accuracy": 0.7197568088769912,
1393
  "step": 820
1394
  },
1395
  {
1396
- "epoch": 0.7946063086925115,
1397
- "grad_norm": 0.442008912563324,
1398
  "learning_rate": 2e-05,
1399
- "loss": 0.9856,
1400
- "mean_token_accuracy": 0.7208785116672516,
1401
  "step": 825
1402
  },
1403
  {
1404
- "epoch": 0.7994221045027691,
1405
- "grad_norm": 0.4885089099407196,
1406
  "learning_rate": 2e-05,
1407
- "loss": 0.9814,
1408
- "mean_token_accuracy": 0.7215597033500671,
1409
  "step": 830
1410
  },
1411
  {
1412
- "epoch": 0.8042379003130268,
1413
- "grad_norm": 0.4534910023212433,
1414
  "learning_rate": 2e-05,
1415
- "loss": 0.971,
1416
- "mean_token_accuracy": 0.7238455027341842,
1417
  "step": 835
1418
  },
1419
  {
1420
- "epoch": 0.8090536961232844,
1421
- "grad_norm": 0.4507865011692047,
1422
  "learning_rate": 2e-05,
1423
- "loss": 0.9817,
1424
- "mean_token_accuracy": 0.7246822834014892,
1425
  "step": 840
1426
  },
1427
  {
1428
- "epoch": 0.813869491933542,
1429
- "grad_norm": 0.4892081618309021,
1430
  "learning_rate": 2e-05,
1431
- "loss": 1.0188,
1432
- "mean_token_accuracy": 0.7129034757614136,
1433
  "step": 845
1434
  },
1435
  {
1436
- "epoch": 0.8186852877437997,
1437
- "grad_norm": 0.45840218663215637,
1438
  "learning_rate": 2e-05,
1439
- "loss": 1.001,
1440
- "mean_token_accuracy": 0.7185318768024445,
1441
  "step": 850
1442
  },
1443
  {
1444
- "epoch": 0.8235010835540573,
1445
- "grad_norm": 0.4619063436985016,
1446
  "learning_rate": 2e-05,
1447
- "loss": 0.9486,
1448
- "mean_token_accuracy": 0.7325035721063614,
1449
  "step": 855
1450
  },
1451
  {
1452
- "epoch": 0.828316879364315,
1453
- "grad_norm": 0.44516247510910034,
1454
  "learning_rate": 2e-05,
1455
- "loss": 1.0055,
1456
- "mean_token_accuracy": 0.7171078979969024,
1457
  "step": 860
1458
  },
1459
  {
1460
- "epoch": 0.8331326751745726,
1461
- "grad_norm": 0.5152034163475037,
1462
  "learning_rate": 2e-05,
1463
- "loss": 1.026,
1464
- "mean_token_accuracy": 0.7162949174642563,
1465
  "step": 865
1466
  },
1467
  {
1468
- "epoch": 0.8379484709848303,
1469
- "grad_norm": 0.4782991111278534,
1470
  "learning_rate": 2e-05,
1471
- "loss": 0.9898,
1472
- "mean_token_accuracy": 0.7220347046852111,
1473
  "step": 870
1474
  },
1475
  {
1476
- "epoch": 0.8427642667950879,
1477
- "grad_norm": 0.43797022104263306,
1478
  "learning_rate": 2e-05,
1479
- "loss": 0.973,
1480
- "mean_token_accuracy": 0.7254473388195037,
1481
  "step": 875
1482
  },
1483
  {
1484
- "epoch": 0.8475800626053456,
1485
- "grad_norm": 0.42285850644111633,
1486
  "learning_rate": 2e-05,
1487
- "loss": 0.948,
1488
- "mean_token_accuracy": 0.7321132332086563,
1489
  "step": 880
1490
  },
1491
  {
1492
- "epoch": 0.8523958584156032,
1493
- "grad_norm": 0.4795195460319519,
1494
  "learning_rate": 2e-05,
1495
- "loss": 1.0138,
1496
- "mean_token_accuracy": 0.7152723044157028,
1497
  "step": 885
1498
  },
1499
  {
1500
- "epoch": 0.8572116542258609,
1501
- "grad_norm": 0.4528616666793823,
1502
  "learning_rate": 2e-05,
1503
- "loss": 0.9734,
1504
- "mean_token_accuracy": 0.7251696765422821,
1505
  "step": 890
1506
  },
1507
  {
1508
- "epoch": 0.8620274500361185,
1509
- "grad_norm": 0.45938703417778015,
1510
  "learning_rate": 2e-05,
1511
- "loss": 0.9851,
1512
- "mean_token_accuracy": 0.7221759200096131,
1513
  "step": 895
1514
  },
1515
  {
1516
- "epoch": 0.8668432458463761,
1517
- "grad_norm": 0.48591721057891846,
1518
  "learning_rate": 2e-05,
1519
- "loss": 0.9815,
1520
- "mean_token_accuracy": 0.7246751219034195,
1521
  "step": 900
1522
  },
1523
  {
1524
- "epoch": 0.8668432458463761,
1525
- "eval_loss": 0.9910937547683716,
1526
- "eval_mean_token_accuracy": 0.7242004642119775,
1527
- "eval_runtime": 8.6229,
1528
- "eval_samples_per_second": 11.597,
1529
- "eval_steps_per_second": 1.508,
1530
  "step": 900
1531
  },
1532
  {
1533
- "epoch": 0.8716590416566338,
1534
- "grad_norm": 0.548160970211029,
1535
  "learning_rate": 2e-05,
1536
- "loss": 0.9728,
1537
- "mean_token_accuracy": 0.7271899342536926,
1538
  "step": 905
1539
  },
1540
  {
1541
- "epoch": 0.8764748374668914,
1542
- "grad_norm": 0.4917408525943756,
1543
  "learning_rate": 2e-05,
1544
- "loss": 0.983,
1545
- "mean_token_accuracy": 0.722226795554161,
1546
  "step": 910
1547
  },
1548
  {
1549
- "epoch": 0.8812906332771491,
1550
- "grad_norm": 0.43940261006355286,
1551
  "learning_rate": 2e-05,
1552
- "loss": 0.967,
1553
- "mean_token_accuracy": 0.7271131485700607,
1554
  "step": 915
1555
  },
1556
  {
1557
- "epoch": 0.8861064290874067,
1558
- "grad_norm": 0.46846315264701843,
1559
  "learning_rate": 2e-05,
1560
- "loss": 0.9694,
1561
- "mean_token_accuracy": 0.723258101940155,
1562
  "step": 920
1563
  },
1564
  {
1565
- "epoch": 0.8909222248976644,
1566
- "grad_norm": 0.5005144476890564,
1567
  "learning_rate": 2e-05,
1568
- "loss": 1.0068,
1569
- "mean_token_accuracy": 0.7149937510490417,
1570
  "step": 925
1571
  },
1572
  {
1573
- "epoch": 0.895738020707922,
1574
- "grad_norm": 0.43692487478256226,
1575
  "learning_rate": 2e-05,
1576
- "loss": 0.9817,
1577
- "mean_token_accuracy": 0.721362081170082,
1578
  "step": 930
1579
  },
1580
  {
1581
- "epoch": 0.9005538165181797,
1582
- "grad_norm": 0.47597751021385193,
1583
  "learning_rate": 2e-05,
1584
- "loss": 0.9869,
1585
- "mean_token_accuracy": 0.7244188725948334,
1586
  "step": 935
1587
  },
1588
  {
1589
- "epoch": 0.9053696123284373,
1590
- "grad_norm": 0.47237148880958557,
1591
  "learning_rate": 2e-05,
1592
- "loss": 0.9847,
1593
- "mean_token_accuracy": 0.7214221894741059,
1594
  "step": 940
1595
  },
1596
  {
1597
- "epoch": 0.910185408138695,
1598
- "grad_norm": 0.44735458493232727,
1599
  "learning_rate": 2e-05,
1600
- "loss": 0.9859,
1601
- "mean_token_accuracy": 0.7203694522380829,
1602
  "step": 945
1603
  },
1604
  {
1605
- "epoch": 0.9150012039489526,
1606
- "grad_norm": 0.47945865988731384,
1607
  "learning_rate": 2e-05,
1608
- "loss": 0.9398,
1609
- "mean_token_accuracy": 0.7325642824172973,
1610
  "step": 950
1611
  },
1612
  {
1613
- "epoch": 0.9198169997592102,
1614
- "grad_norm": 0.43617284297943115,
1615
  "learning_rate": 2e-05,
1616
- "loss": 0.9659,
1617
- "mean_token_accuracy": 0.7265276938676835,
1618
  "step": 955
1619
  },
1620
  {
1621
- "epoch": 0.9246327955694679,
1622
- "grad_norm": 0.4993121325969696,
1623
  "learning_rate": 2e-05,
1624
- "loss": 0.9695,
1625
- "mean_token_accuracy": 0.7261034786701203,
1626
  "step": 960
1627
  },
1628
  {
1629
- "epoch": 0.9294485913797255,
1630
- "grad_norm": 0.4212028682231903,
1631
  "learning_rate": 2e-05,
1632
- "loss": 1.0269,
1633
- "mean_token_accuracy": 0.7113113075494766,
1634
  "step": 965
1635
  },
1636
  {
1637
- "epoch": 0.9342643871899832,
1638
- "grad_norm": 0.4470522701740265,
1639
  "learning_rate": 2e-05,
1640
- "loss": 0.9729,
1641
- "mean_token_accuracy": 0.72572822868824,
1642
  "step": 970
1643
  },
1644
  {
1645
- "epoch": 0.9390801830002408,
1646
- "grad_norm": 0.442359060049057,
1647
  "learning_rate": 2e-05,
1648
- "loss": 0.9818,
1649
- "mean_token_accuracy": 0.7235529303550721,
1650
  "step": 975
1651
  },
1652
  {
1653
- "epoch": 0.9438959788104985,
1654
- "grad_norm": 0.49310287833213806,
1655
  "learning_rate": 2e-05,
1656
- "loss": 0.9653,
1657
- "mean_token_accuracy": 0.7279765665531158,
1658
  "step": 980
1659
  },
1660
  {
1661
- "epoch": 0.9487117746207561,
1662
- "grad_norm": 0.44634732604026794,
1663
  "learning_rate": 2e-05,
1664
- "loss": 0.9761,
1665
- "mean_token_accuracy": 0.7246888697147369,
1666
  "step": 985
1667
  },
1668
  {
1669
- "epoch": 0.9535275704310138,
1670
- "grad_norm": 0.4682416021823883,
1671
  "learning_rate": 2e-05,
1672
- "loss": 1.0052,
1673
- "mean_token_accuracy": 0.7163143336772919,
1674
  "step": 990
1675
  },
1676
  {
1677
- "epoch": 0.9583433662412714,
1678
- "grad_norm": 0.49107804894447327,
1679
  "learning_rate": 2e-05,
1680
- "loss": 1.0147,
1681
- "mean_token_accuracy": 0.7137022405862808,
1682
  "step": 995
1683
  },
1684
  {
1685
- "epoch": 0.963159162051529,
1686
- "grad_norm": 0.4588830769062042,
1687
  "learning_rate": 2e-05,
1688
- "loss": 0.9901,
1689
- "mean_token_accuracy": 0.7186317384243012,
1690
  "step": 1000
1691
  },
1692
  {
1693
- "epoch": 0.963159162051529,
1694
- "eval_loss": 0.9823437333106995,
1695
- "eval_mean_token_accuracy": 0.7257784238228431,
1696
- "eval_runtime": 8.6223,
1697
- "eval_samples_per_second": 11.598,
1698
- "eval_steps_per_second": 1.508,
1699
  "step": 1000
1700
  },
1701
  {
1702
- "epoch": 0.9679749578617867,
1703
- "grad_norm": 0.4416583180427551,
1704
  "learning_rate": 2e-05,
1705
- "loss": 0.9372,
1706
- "mean_token_accuracy": 0.732732167840004,
1707
  "step": 1005
1708
  },
1709
  {
1710
- "epoch": 0.9727907536720443,
1711
- "grad_norm": 0.4314471185207367,
1712
  "learning_rate": 2e-05,
1713
- "loss": 0.9854,
1714
- "mean_token_accuracy": 0.7232161700725556,
1715
  "step": 1010
1716
  },
1717
  {
1718
- "epoch": 0.977606549482302,
1719
- "grad_norm": 0.4762505888938904,
1720
  "learning_rate": 2e-05,
1721
- "loss": 0.9713,
1722
- "mean_token_accuracy": 0.7264497399330139,
1723
  "step": 1015
1724
  },
1725
  {
1726
- "epoch": 0.9824223452925596,
1727
- "grad_norm": 0.4602217972278595,
1728
  "learning_rate": 2e-05,
1729
- "loss": 0.9487,
1730
- "mean_token_accuracy": 0.7303879648447037,
1731
  "step": 1020
1732
  },
1733
  {
1734
- "epoch": 0.9872381411028173,
1735
- "grad_norm": 0.40625375509262085,
1736
  "learning_rate": 2e-05,
1737
- "loss": 0.9711,
1738
- "mean_token_accuracy": 0.7242156893014908,
1739
  "step": 1025
1740
  },
1741
  {
1742
- "epoch": 0.9920539369130749,
1743
- "grad_norm": 0.5164700746536255,
1744
- "learning_rate": 2e-05,
1745
- "loss": 0.966,
1746
- "mean_token_accuracy": 0.7264703720808029,
1747
- "step": 1030
1748
- },
1749
- {
1750
- "epoch": 0.9968697327233326,
1751
- "grad_norm": 0.45738255977630615,
1752
- "learning_rate": 2e-05,
1753
- "loss": 0.9525,
1754
- "mean_token_accuracy": 0.7287515312433243,
1755
- "step": 1035
1756
- },
1757
- {
1758
- "epoch": 0.9997592102094871,
1759
- "mean_token_accuracy": 0.7337231040000916,
1760
- "step": 1038,
1761
  "total_flos": 0.0,
1762
- "train_loss": 1.0554086321351157,
1763
- "train_runtime": 8232.1254,
1764
- "train_samples_per_second": 2.018,
1765
- "train_steps_per_second": 0.126
1766
  }
1767
  ],
1768
  "logging_steps": 5,
1769
- "max_steps": 1038,
1770
  "num_input_tokens_seen": 0,
1771
- "num_train_epochs": 1,
1772
  "save_steps": 500,
1773
  "stateful_callbacks": {
1774
  "TrainerControl": {
@@ -1783,7 +1767,7 @@
1783
  }
1784
  },
1785
  "total_flos": 0.0,
1786
- "train_batch_size": 2,
1787
  "trial_name": null,
1788
  "trial_params": null
1789
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9989078995267566,
5
  "eval_steps": 100,
6
+ "global_step": 1029,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.014561339643247179,
13
+ "grad_norm": 37.66851806640625,
14
+ "learning_rate": 6.94512199751671e-06,
15
+ "loss": 2.2001,
16
+ "mean_token_accuracy": 0.5431830704212188,
17
  "step": 5
18
  },
19
  {
20
+ "epoch": 0.029122679286494358,
21
+ "grad_norm": 35.93154525756836,
22
+ "learning_rate": 9.936223234807016e-06,
23
+ "loss": 2.0917,
24
+ "mean_token_accuracy": 0.5551130786538124,
25
  "step": 10
26
  },
27
  {
28
+ "epoch": 0.04368401892974154,
29
+ "grad_norm": 11.33205509185791,
30
+ "learning_rate": 1.1685905294482498e-05,
31
+ "loss": 1.6615,
32
+ "mean_token_accuracy": 0.6157955378293991,
33
  "step": 15
34
  },
35
  {
36
+ "epoch": 0.058245358572988716,
37
+ "grad_norm": 0.5927252769470215,
38
+ "learning_rate": 1.292732447209732e-05,
39
+ "loss": 1.4254,
40
+ "mean_token_accuracy": 0.6511392086744309,
41
  "step": 20
42
  },
43
  {
44
+ "epoch": 0.0728066982162359,
45
+ "grad_norm": 0.420287549495697,
46
+ "learning_rate": 1.389024399503342e-05,
47
+ "loss": 1.3534,
48
+ "mean_token_accuracy": 0.6588488951325416,
49
  "step": 25
50
  },
51
  {
52
+ "epoch": 0.08736803785948308,
53
+ "grad_norm": 0.3345254361629486,
54
+ "learning_rate": 1.4677006531772802e-05,
55
+ "loss": 1.3017,
56
+ "mean_token_accuracy": 0.6658114358782768,
57
  "step": 30
58
  },
59
  {
60
+ "epoch": 0.10192937750273025,
61
+ "grad_norm": 0.28586578369140625,
62
+ "learning_rate": 1.5342204778396236e-05,
63
+ "loss": 1.2668,
64
+ "mean_token_accuracy": 0.6703020080924034,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.11649071714597743,
69
+ "grad_norm": 0.2802036702632904,
70
+ "learning_rate": 1.5918425709387625e-05,
71
+ "loss": 1.2384,
72
+ "mean_token_accuracy": 0.6751068279147148,
73
  "step": 40
74
  },
75
  {
76
+ "epoch": 0.1310520567892246,
77
+ "grad_norm": 0.2390078604221344,
78
+ "learning_rate": 1.6426688591448284e-05,
79
+ "loss": 1.2159,
80
+ "mean_token_accuracy": 0.6786533042788505,
81
  "step": 45
82
  },
83
  {
84
+ "epoch": 0.1456133964324718,
85
+ "grad_norm": 0.22002895176410675,
86
+ "learning_rate": 1.6881345232323726e-05,
87
+ "loss": 1.1892,
88
+ "mean_token_accuracy": 0.6840800851583481,
89
  "step": 50
90
  },
91
  {
92
+ "epoch": 0.16017473607571897,
93
+ "grad_norm": 0.24121476709842682,
94
+ "learning_rate": 1.7292632192343935e-05,
95
+ "loss": 1.1866,
96
+ "mean_token_accuracy": 0.6831674978137017,
97
  "step": 55
98
  },
99
  {
100
+ "epoch": 0.17473607571896616,
101
+ "grad_norm": 0.2153487652540207,
102
+ "learning_rate": 1.7668107769063104e-05,
103
+ "loss": 1.1414,
104
+ "mean_token_accuracy": 0.6937515258789062,
105
  "step": 60
106
  },
107
  {
108
+ "epoch": 0.1892974153622133,
109
+ "grad_norm": 0.19544684886932373,
110
+ "learning_rate": 1.8013511816966716e-05,
111
+ "loss": 1.1364,
112
+ "mean_token_accuracy": 0.6935119941830635,
113
  "step": 65
114
  },
115
  {
116
+ "epoch": 0.2038587550054605,
117
+ "grad_norm": 0.20832200348377228,
118
+ "learning_rate": 1.8333306015686545e-05,
119
+ "loss": 1.1522,
120
+ "mean_token_accuracy": 0.6890207245945931,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.21842009464870768,
125
+ "grad_norm": 0.20898529887199402,
126
+ "learning_rate": 1.8631027291999205e-05,
127
+ "loss": 1.1359,
128
+ "mean_token_accuracy": 0.6920820102095604,
129
  "step": 75
130
  },
131
  {
132
+ "epoch": 0.23298143429195486,
133
+ "grad_norm": 0.21433797478675842,
134
+ "learning_rate": 1.890952694667793e-05,
135
+ "loss": 1.1177,
136
+ "mean_token_accuracy": 0.6967326954007149,
137
  "step": 80
138
  },
139
  {
140
+ "epoch": 0.24754277393520205,
141
+ "grad_norm": 0.19631509482860565,
142
+ "learning_rate": 1.9171137159358747e-05,
143
+ "loss": 1.1084,
144
+ "mean_token_accuracy": 0.6984182327985764,
145
  "step": 85
146
  },
147
  {
148
+ "epoch": 0.2621041135784492,
149
+ "grad_norm": 0.18118107318878174,
150
+ "learning_rate": 1.941778982873859e-05,
151
+ "loss": 1.1085,
152
+ "mean_token_accuracy": 0.6977160558104515,
153
  "step": 90
154
  },
155
  {
156
+ "epoch": 0.2766654532216964,
157
+ "grad_norm": 0.20939502120018005,
158
+ "learning_rate": 1.965110323889735e-05,
159
+ "loss": 1.1065,
160
+ "mean_token_accuracy": 0.6980448961257935,
161
  "step": 95
162
  },
163
  {
164
+ "epoch": 0.2912267928649436,
165
+ "grad_norm": 0.1910240799188614,
166
+ "learning_rate": 1.987244646961403e-05,
167
+ "loss": 1.098,
168
+ "mean_token_accuracy": 0.6989446982741356,
169
  "step": 100
170
  },
171
  {
172
+ "epoch": 0.2912267928649436,
173
+ "eval_loss": 1.1112689971923828,
174
+ "eval_mean_token_accuracy": 0.6931903600692749,
175
+ "eval_runtime": 13.1717,
176
+ "eval_samples_per_second": 5.011,
177
+ "eval_steps_per_second": 0.38,
178
  "step": 100
179
  },
180
  {
181
+ "epoch": 0.30578813250819076,
182
+ "grad_norm": 0.19251097738742828,
183
  "learning_rate": 2e-05,
184
+ "loss": 1.0873,
185
+ "mean_token_accuracy": 0.7030002444982528,
186
  "step": 105
187
  },
188
  {
189
+ "epoch": 0.32034947215143794,
190
+ "grad_norm": 0.1976984441280365,
191
  "learning_rate": 2e-05,
192
+ "loss": 1.0954,
193
+ "mean_token_accuracy": 0.6988905444741249,
194
  "step": 110
195
  },
196
  {
197
+ "epoch": 0.3349108117946851,
198
+ "grad_norm": 0.18475359678268433,
199
  "learning_rate": 2e-05,
200
+ "loss": 1.0734,
201
+ "mean_token_accuracy": 0.7050403520464897,
202
  "step": 115
203
  },
204
  {
205
+ "epoch": 0.3494721514379323,
206
+ "grad_norm": 0.21479089558124542,
207
  "learning_rate": 2e-05,
208
+ "loss": 1.0792,
209
+ "mean_token_accuracy": 0.7049864172935486,
210
  "step": 120
211
  },
212
  {
213
+ "epoch": 0.3640334910811795,
214
+ "grad_norm": 0.20598657429218292,
215
  "learning_rate": 2e-05,
216
+ "loss": 1.0853,
217
+ "mean_token_accuracy": 0.7013437002897263,
218
  "step": 125
219
  },
220
  {
221
+ "epoch": 0.3785948307244266,
222
+ "grad_norm": 0.2035069763660431,
223
  "learning_rate": 2e-05,
224
+ "loss": 1.0758,
225
+ "mean_token_accuracy": 0.7027781993150711,
226
  "step": 130
227
  },
228
  {
229
+ "epoch": 0.3931561703676738,
230
+ "grad_norm": 0.20786674320697784,
231
  "learning_rate": 2e-05,
232
+ "loss": 1.086,
233
+ "mean_token_accuracy": 0.7012925714254379,
234
  "step": 135
235
  },
236
  {
237
+ "epoch": 0.407717510010921,
238
+ "grad_norm": 0.19930203258991241,
239
  "learning_rate": 2e-05,
240
+ "loss": 1.0819,
241
+ "mean_token_accuracy": 0.7024676471948623,
242
  "step": 140
243
  },
244
  {
245
+ "epoch": 0.4222788496541682,
246
+ "grad_norm": 0.2310095727443695,
247
  "learning_rate": 2e-05,
248
+ "loss": 1.0658,
249
+ "mean_token_accuracy": 0.7064277902245522,
250
  "step": 145
251
  },
252
  {
253
+ "epoch": 0.43684018929741536,
254
+ "grad_norm": 0.2163601964712143,
255
  "learning_rate": 2e-05,
256
+ "loss": 1.0741,
257
+ "mean_token_accuracy": 0.7032444015145302,
258
  "step": 150
259
  },
260
  {
261
+ "epoch": 0.45140152894066254,
262
+ "grad_norm": 0.22852301597595215,
263
  "learning_rate": 2e-05,
264
+ "loss": 1.0647,
265
+ "mean_token_accuracy": 0.7054958015680313,
266
  "step": 155
267
  },
268
  {
269
+ "epoch": 0.46596286858390973,
270
+ "grad_norm": 0.21535199880599976,
271
  "learning_rate": 2e-05,
272
+ "loss": 1.0753,
273
+ "mean_token_accuracy": 0.7024859562516212,
274
  "step": 160
275
  },
276
  {
277
+ "epoch": 0.4805242082271569,
278
+ "grad_norm": 0.1874823421239853,
279
  "learning_rate": 2e-05,
280
+ "loss": 1.0466,
281
+ "mean_token_accuracy": 0.710732813179493,
282
  "step": 165
283
  },
284
  {
285
+ "epoch": 0.4950855478704041,
286
+ "grad_norm": 0.17631468176841736,
287
  "learning_rate": 2e-05,
288
+ "loss": 1.0599,
289
+ "mean_token_accuracy": 0.7064499124884606,
290
  "step": 170
291
  },
292
  {
293
+ "epoch": 0.5096468875136513,
294
+ "grad_norm": 0.20174729824066162,
295
  "learning_rate": 2e-05,
296
+ "loss": 1.0447,
297
+ "mean_token_accuracy": 0.7100659251213074,
298
  "step": 175
299
  },
300
  {
301
+ "epoch": 0.5242082271568984,
302
+ "grad_norm": 0.1740773767232895,
303
  "learning_rate": 2e-05,
304
+ "loss": 1.0538,
305
+ "mean_token_accuracy": 0.7067421570420265,
306
  "step": 180
307
  },
308
  {
309
+ "epoch": 0.5387695668001456,
310
+ "grad_norm": 0.18247714638710022,
311
  "learning_rate": 2e-05,
312
+ "loss": 1.0487,
313
+ "mean_token_accuracy": 0.7085062861442566,
314
  "step": 185
315
  },
316
  {
317
+ "epoch": 0.5533309064433928,
318
+ "grad_norm": 0.19148823618888855,
319
  "learning_rate": 2e-05,
320
+ "loss": 1.0597,
321
+ "mean_token_accuracy": 0.7060058280825615,
322
  "step": 190
323
  },
324
  {
325
+ "epoch": 0.56789224608664,
326
+ "grad_norm": 0.1849949061870575,
327
  "learning_rate": 2e-05,
328
+ "loss": 1.0422,
329
+ "mean_token_accuracy": 0.7095287501811981,
330
  "step": 195
331
  },
332
  {
333
+ "epoch": 0.5824535857298871,
334
+ "grad_norm": 0.18132270872592926,
335
  "learning_rate": 2e-05,
336
+ "loss": 1.0468,
337
+ "mean_token_accuracy": 0.7086604192852974,
338
  "step": 200
339
  },
340
  {
341
+ "epoch": 0.5824535857298871,
342
+ "eval_loss": 1.0518466234207153,
343
+ "eval_mean_token_accuracy": 0.7042254090309144,
344
+ "eval_runtime": 9.0776,
345
+ "eval_samples_per_second": 7.271,
346
+ "eval_steps_per_second": 0.551,
347
  "step": 200
348
  },
349
  {
350
+ "epoch": 0.5970149253731343,
351
+ "grad_norm": 0.18448956310749054,
352
  "learning_rate": 2e-05,
353
+ "loss": 1.037,
354
+ "mean_token_accuracy": 0.7115538448095322,
355
  "step": 205
356
  },
357
  {
358
+ "epoch": 0.6115762650163815,
359
+ "grad_norm": 0.18751472234725952,
360
  "learning_rate": 2e-05,
361
+ "loss": 1.0445,
362
+ "mean_token_accuracy": 0.7096409142017365,
363
  "step": 210
364
  },
365
  {
366
+ "epoch": 0.6261376046596286,
367
+ "grad_norm": 0.1885630488395691,
368
  "learning_rate": 2e-05,
369
+ "loss": 1.03,
370
+ "mean_token_accuracy": 0.7122749090194702,
371
  "step": 215
372
  },
373
  {
374
+ "epoch": 0.6406989443028759,
375
+ "grad_norm": 0.16414892673492432,
376
  "learning_rate": 2e-05,
377
+ "loss": 1.0203,
378
+ "mean_token_accuracy": 0.7152423396706581,
379
  "step": 220
380
  },
381
  {
382
+ "epoch": 0.655260283946123,
383
+ "grad_norm": 0.19612085819244385,
384
  "learning_rate": 2e-05,
385
+ "loss": 1.0325,
386
+ "mean_token_accuracy": 0.7118910998106003,
387
  "step": 225
388
  },
389
  {
390
+ "epoch": 0.6698216235893703,
391
+ "grad_norm": 0.21399515867233276,
392
  "learning_rate": 2e-05,
393
+ "loss": 1.0264,
394
+ "mean_token_accuracy": 0.7130562499165535,
395
  "step": 230
396
  },
397
  {
398
+ "epoch": 0.6843829632326174,
399
+ "grad_norm": 0.19661396741867065,
400
  "learning_rate": 2e-05,
401
+ "loss": 1.043,
402
+ "mean_token_accuracy": 0.7081087440252304,
403
  "step": 235
404
  },
405
  {
406
+ "epoch": 0.6989443028758646,
407
+ "grad_norm": 0.1965187042951584,
408
  "learning_rate": 2e-05,
409
+ "loss": 1.0232,
410
+ "mean_token_accuracy": 0.7128947824239731,
411
  "step": 240
412
  },
413
  {
414
+ "epoch": 0.7135056425191117,
415
+ "grad_norm": 0.19409024715423584,
416
  "learning_rate": 2e-05,
417
+ "loss": 1.0328,
418
+ "mean_token_accuracy": 0.7117997944355011,
419
  "step": 245
420
  },
421
  {
422
+ "epoch": 0.728066982162359,
423
+ "grad_norm": 0.19648292660713196,
424
  "learning_rate": 2e-05,
425
+ "loss": 1.038,
426
+ "mean_token_accuracy": 0.7104994520545006,
427
  "step": 250
428
  },
429
  {
430
+ "epoch": 0.7426283218056061,
431
+ "grad_norm": 0.16873674094676971,
432
  "learning_rate": 2e-05,
433
+ "loss": 1.0204,
434
+ "mean_token_accuracy": 0.7144693836569787,
435
  "step": 255
436
  },
437
  {
438
+ "epoch": 0.7571896614488532,
439
+ "grad_norm": 0.196326345205307,
440
  "learning_rate": 2e-05,
441
+ "loss": 1.0213,
442
+ "mean_token_accuracy": 0.7139619678258896,
443
  "step": 260
444
  },
445
  {
446
+ "epoch": 0.7717510010921005,
447
+ "grad_norm": 0.2151579111814499,
448
  "learning_rate": 2e-05,
449
+ "loss": 1.0218,
450
+ "mean_token_accuracy": 0.7128418371081352,
451
  "step": 265
452
  },
453
  {
454
+ "epoch": 0.7863123407353476,
455
+ "grad_norm": 0.17913345992565155,
456
  "learning_rate": 2e-05,
457
+ "loss": 1.0265,
458
+ "mean_token_accuracy": 0.7119010165333748,
459
  "step": 270
460
  },
461
  {
462
+ "epoch": 0.8008736803785949,
463
+ "grad_norm": 0.1754622459411621,
464
  "learning_rate": 2e-05,
465
+ "loss": 1.0154,
466
+ "mean_token_accuracy": 0.7142972201108932,
467
  "step": 275
468
  },
469
  {
470
+ "epoch": 0.815435020021842,
471
+ "grad_norm": 0.19478586316108704,
472
  "learning_rate": 2e-05,
473
+ "loss": 1.0302,
474
+ "mean_token_accuracy": 0.711237944662571,
475
  "step": 280
476
  },
477
  {
478
+ "epoch": 0.8299963596650892,
479
+ "grad_norm": 0.18054774403572083,
480
  "learning_rate": 2e-05,
481
+ "loss": 1.0102,
482
+ "mean_token_accuracy": 0.7163647577166558,
483
  "step": 285
484
  },
485
  {
486
+ "epoch": 0.8445576993083364,
487
+ "grad_norm": 0.2034561187028885,
488
  "learning_rate": 2e-05,
489
+ "loss": 0.9997,
490
+ "mean_token_accuracy": 0.7193443968892097,
491
  "step": 290
492
  },
493
  {
494
+ "epoch": 0.8591190389515836,
495
+ "grad_norm": 0.202008917927742,
496
  "learning_rate": 2e-05,
497
+ "loss": 1.0059,
498
+ "mean_token_accuracy": 0.718295231461525,
499
  "step": 295
500
  },
501
  {
502
+ "epoch": 0.8736803785948307,
503
+ "grad_norm": 0.207170769572258,
504
  "learning_rate": 2e-05,
505
+ "loss": 1.0078,
506
+ "mean_token_accuracy": 0.7157909572124481,
507
  "step": 300
508
  },
509
  {
510
+ "epoch": 0.8736803785948307,
511
+ "eval_loss": 1.019649624824524,
512
+ "eval_mean_token_accuracy": 0.7100205540657043,
513
+ "eval_runtime": 9.0111,
514
+ "eval_samples_per_second": 7.324,
515
+ "eval_steps_per_second": 0.555,
516
  "step": 300
517
  },
518
  {
519
+ "epoch": 0.8882417182380778,
520
+ "grad_norm": 0.21266800165176392,
521
  "learning_rate": 2e-05,
522
+ "loss": 1.0074,
523
+ "mean_token_accuracy": 0.7171560257673264,
524
  "step": 305
525
  },
526
  {
527
+ "epoch": 0.9028030578813251,
528
+ "grad_norm": 0.19650128483772278,
529
  "learning_rate": 2e-05,
530
+ "loss": 1.0142,
531
+ "mean_token_accuracy": 0.7148646369576455,
532
  "step": 310
533
  },
534
  {
535
+ "epoch": 0.9173643975245722,
536
+ "grad_norm": 0.22513243556022644,
537
  "learning_rate": 2e-05,
538
+ "loss": 0.9986,
539
+ "mean_token_accuracy": 0.7187789976596832,
540
  "step": 315
541
  },
542
  {
543
+ "epoch": 0.9319257371678195,
544
+ "grad_norm": 0.21226869523525238,
545
  "learning_rate": 2e-05,
546
+ "loss": 1.0095,
547
+ "mean_token_accuracy": 0.7166241943836212,
548
  "step": 320
549
  },
550
  {
551
+ "epoch": 0.9464870768110666,
552
+ "grad_norm": 0.20326650142669678,
553
  "learning_rate": 2e-05,
554
+ "loss": 1.0193,
555
+ "mean_token_accuracy": 0.7132607430219651,
556
  "step": 325
557
  },
558
  {
559
+ "epoch": 0.9610484164543138,
560
+ "grad_norm": 0.2005230039358139,
561
  "learning_rate": 2e-05,
562
+ "loss": 0.9969,
563
+ "mean_token_accuracy": 0.7200944602489472,
564
  "step": 330
565
  },
566
  {
567
+ "epoch": 0.975609756097561,
568
+ "grad_norm": 0.18904553353786469,
569
  "learning_rate": 2e-05,
570
+ "loss": 1.0021,
571
+ "mean_token_accuracy": 0.7175276219844818,
572
  "step": 335
573
  },
574
  {
575
+ "epoch": 0.9901710957408082,
576
+ "grad_norm": 0.19811247289180756,
577
  "learning_rate": 2e-05,
578
+ "loss": 0.9993,
579
+ "mean_token_accuracy": 0.7185588896274566,
580
  "step": 340
581
  },
582
  {
583
+ "epoch": 1.0058245358572988,
584
+ "grad_norm": 0.1984570026397705,
585
  "learning_rate": 2e-05,
586
+ "loss": 1.0395,
587
+ "mean_token_accuracy": 0.719094998779751,
588
  "step": 345
589
  },
590
  {
591
+ "epoch": 1.0203858755005462,
592
+ "grad_norm": 0.19051611423492432,
593
  "learning_rate": 2e-05,
594
+ "loss": 0.9733,
595
+ "mean_token_accuracy": 0.724665792286396,
596
  "step": 350
597
  },
598
  {
599
+ "epoch": 1.0349472151437933,
600
+ "grad_norm": 0.17580822110176086,
601
  "learning_rate": 2e-05,
602
+ "loss": 0.9811,
603
+ "mean_token_accuracy": 0.7220699548721313,
604
  "step": 355
605
  },
606
  {
607
+ "epoch": 1.0495085547870404,
608
+ "grad_norm": 0.18404695391654968,
609
  "learning_rate": 2e-05,
610
+ "loss": 0.9757,
611
+ "mean_token_accuracy": 0.7235838070511817,
612
  "step": 360
613
  },
614
  {
615
+ "epoch": 1.0640698944302875,
616
+ "grad_norm": 0.19152525067329407,
617
  "learning_rate": 2e-05,
618
+ "loss": 0.9828,
619
+ "mean_token_accuracy": 0.7204630091786385,
620
  "step": 365
621
  },
622
  {
623
+ "epoch": 1.0786312340735347,
624
+ "grad_norm": 0.19752167165279388,
625
  "learning_rate": 2e-05,
626
+ "loss": 0.9974,
627
+ "mean_token_accuracy": 0.7173234552145005,
628
  "step": 370
629
  },
630
  {
631
+ "epoch": 1.093192573716782,
632
+ "grad_norm": 0.1899857074022293,
633
  "learning_rate": 2e-05,
634
+ "loss": 0.979,
635
+ "mean_token_accuracy": 0.7223904326558113,
636
  "step": 375
637
  },
638
  {
639
+ "epoch": 1.1077539133600292,
640
+ "grad_norm": 0.19329330325126648,
641
  "learning_rate": 2e-05,
642
+ "loss": 0.9859,
643
+ "mean_token_accuracy": 0.7198724135756492,
644
  "step": 380
645
  },
646
  {
647
+ "epoch": 1.1223152530032763,
648
+ "grad_norm": 0.18269023299217224,
649
  "learning_rate": 2e-05,
650
+ "loss": 0.9642,
651
+ "mean_token_accuracy": 0.727630938589573,
652
  "step": 385
653
  },
654
  {
655
+ "epoch": 1.1368765926465234,
656
+ "grad_norm": 0.1874740570783615,
657
  "learning_rate": 2e-05,
658
+ "loss": 0.9778,
659
+ "mean_token_accuracy": 0.722543029487133,
660
  "step": 390
661
  },
662
  {
663
+ "epoch": 1.1514379322897708,
664
+ "grad_norm": 0.174119770526886,
665
  "learning_rate": 2e-05,
666
+ "loss": 0.9568,
667
+ "mean_token_accuracy": 0.7279941365122795,
668
  "step": 395
669
  },
670
  {
671
+ "epoch": 1.1659992719330179,
672
+ "grad_norm": 0.1855311542749405,
673
  "learning_rate": 2e-05,
674
+ "loss": 0.9688,
675
+ "mean_token_accuracy": 0.7247176736593246,
676
  "step": 400
677
  },
678
  {
679
+ "epoch": 1.1659992719330179,
680
+ "eval_loss": 0.9988163113594055,
681
+ "eval_mean_token_accuracy": 0.7142685413360595,
682
+ "eval_runtime": 9.0627,
683
+ "eval_samples_per_second": 7.283,
684
+ "eval_steps_per_second": 0.552,
685
  "step": 400
686
  },
687
  {
688
+ "epoch": 1.180560611576265,
689
+ "grad_norm": 0.2153940200805664,
690
  "learning_rate": 2e-05,
691
+ "loss": 0.9744,
692
+ "mean_token_accuracy": 0.723403736948967,
693
  "step": 405
694
  },
695
  {
696
+ "epoch": 1.1951219512195121,
697
+ "grad_norm": 0.20522184669971466,
698
  "learning_rate": 2e-05,
699
+ "loss": 0.9725,
700
+ "mean_token_accuracy": 0.7236453905701637,
701
  "step": 410
702
  },
703
  {
704
+ "epoch": 1.2096832908627593,
705
+ "grad_norm": 0.1979447901248932,
706
  "learning_rate": 2e-05,
707
+ "loss": 0.978,
708
+ "mean_token_accuracy": 0.7225941598415375,
709
  "step": 415
710
  },
711
  {
712
+ "epoch": 1.2242446305060066,
713
+ "grad_norm": 0.19050033390522003,
714
  "learning_rate": 2e-05,
715
+ "loss": 0.9645,
716
+ "mean_token_accuracy": 0.7253967747092247,
717
  "step": 420
718
  },
719
  {
720
+ "epoch": 1.2388059701492538,
721
+ "grad_norm": 0.19461773335933685,
722
  "learning_rate": 2e-05,
723
+ "loss": 0.9469,
724
+ "mean_token_accuracy": 0.7304514393210411,
725
  "step": 425
726
  },
727
  {
728
+ "epoch": 1.2533673097925009,
729
+ "grad_norm": 0.19699646532535553,
730
  "learning_rate": 2e-05,
731
+ "loss": 0.9661,
732
+ "mean_token_accuracy": 0.7242850378155709,
733
  "step": 430
734
  },
735
  {
736
+ "epoch": 1.267928649435748,
737
+ "grad_norm": 0.1707252860069275,
738
  "learning_rate": 2e-05,
739
+ "loss": 0.9677,
740
+ "mean_token_accuracy": 0.7248802036046982,
741
  "step": 435
742
  },
743
  {
744
+ "epoch": 1.2824899890789951,
745
+ "grad_norm": 0.191995769739151,
746
  "learning_rate": 2e-05,
747
+ "loss": 0.9526,
748
+ "mean_token_accuracy": 0.7288147822022438,
749
  "step": 440
750
  },
751
  {
752
+ "epoch": 1.2970513287222425,
753
+ "grad_norm": 0.18631628155708313,
754
  "learning_rate": 2e-05,
755
+ "loss": 0.9705,
756
+ "mean_token_accuracy": 0.723578467965126,
757
  "step": 445
758
  },
759
  {
760
+ "epoch": 1.3116126683654896,
761
+ "grad_norm": 0.18883132934570312,
762
  "learning_rate": 2e-05,
763
+ "loss": 0.963,
764
+ "mean_token_accuracy": 0.7254890978336335,
765
  "step": 450
766
  },
767
  {
768
+ "epoch": 1.3261740080087367,
769
+ "grad_norm": 0.19409437477588654,
770
  "learning_rate": 2e-05,
771
+ "loss": 0.9664,
772
+ "mean_token_accuracy": 0.7247894033789635,
773
  "step": 455
774
  },
775
  {
776
+ "epoch": 1.340735347651984,
777
+ "grad_norm": 0.17440907657146454,
778
  "learning_rate": 2e-05,
779
+ "loss": 0.9539,
780
+ "mean_token_accuracy": 0.7279758274555206,
781
  "step": 460
782
  },
783
  {
784
+ "epoch": 1.3552966872952312,
785
+ "grad_norm": 0.19074754416942596,
786
  "learning_rate": 2e-05,
787
+ "loss": 0.9577,
788
+ "mean_token_accuracy": 0.726808387041092,
789
  "step": 465
790
  },
791
  {
792
+ "epoch": 1.3698580269384784,
793
+ "grad_norm": 0.17158101499080658,
794
  "learning_rate": 2e-05,
795
+ "loss": 0.9624,
796
+ "mean_token_accuracy": 0.7265920951962471,
797
  "step": 470
798
  },
799
  {
800
+ "epoch": 1.3844193665817255,
801
+ "grad_norm": 0.18509717285633087,
802
  "learning_rate": 2e-05,
803
+ "loss": 0.9537,
804
+ "mean_token_accuracy": 0.7284008353948593,
805
  "step": 475
806
  },
807
  {
808
+ "epoch": 1.3989807062249726,
809
+ "grad_norm": 0.18281057476997375,
810
  "learning_rate": 2e-05,
811
+ "loss": 0.9676,
812
+ "mean_token_accuracy": 0.7247360810637474,
813
  "step": 480
814
  },
815
  {
816
+ "epoch": 1.41354204586822,
817
+ "grad_norm": 0.17993324995040894,
818
  "learning_rate": 2e-05,
819
+ "loss": 0.9662,
820
+ "mean_token_accuracy": 0.7256203427910805,
821
  "step": 485
822
  },
823
  {
824
+ "epoch": 1.428103385511467,
825
+ "grad_norm": 0.21310538053512573,
826
  "learning_rate": 2e-05,
827
+ "loss": 0.9601,
828
+ "mean_token_accuracy": 0.7264375537633896,
829
  "step": 490
830
  },
831
  {
832
+ "epoch": 1.4426647251547142,
833
+ "grad_norm": 0.18315577507019043,
834
  "learning_rate": 2e-05,
835
+ "loss": 0.9434,
836
+ "mean_token_accuracy": 0.73150485008955,
837
  "step": 495
838
  },
839
  {
840
+ "epoch": 1.4572260647979614,
841
+ "grad_norm": 0.19212491810321808,
842
  "learning_rate": 2e-05,
843
+ "loss": 0.949,
844
+ "mean_token_accuracy": 0.7304915532469749,
845
  "step": 500
846
  },
847
  {
848
+ "epoch": 1.4572260647979614,
849
+ "eval_loss": 0.9828361868858337,
850
+ "eval_mean_token_accuracy": 0.7173765182495118,
851
+ "eval_runtime": 9.0411,
852
+ "eval_samples_per_second": 7.3,
853
+ "eval_steps_per_second": 0.553,
854
  "step": 500
855
  },
856
  {
857
+ "epoch": 1.4717874044412085,
858
+ "grad_norm": 0.17383301258087158,
859
  "learning_rate": 2e-05,
860
+ "loss": 0.9673,
861
+ "mean_token_accuracy": 0.7242163941264153,
862
  "step": 505
863
  },
864
  {
865
+ "epoch": 1.4863487440844558,
866
+ "grad_norm": 0.1923714131116867,
867
  "learning_rate": 2e-05,
868
+ "loss": 0.9525,
869
+ "mean_token_accuracy": 0.7281391903758049,
870
  "step": 510
871
  },
872
  {
873
+ "epoch": 1.500910083727703,
874
+ "grad_norm": 0.20028460025787354,
875
  "learning_rate": 2e-05,
876
+ "loss": 0.9684,
877
+ "mean_token_accuracy": 0.723628830909729,
878
  "step": 515
879
  },
880
  {
881
+ "epoch": 1.51547142337095,
882
+ "grad_norm": 0.20677019655704498,
883
  "learning_rate": 2e-05,
884
+ "loss": 0.9802,
885
+ "mean_token_accuracy": 0.7201669454574585,
886
  "step": 520
887
  },
888
  {
889
+ "epoch": 1.5300327630141974,
890
+ "grad_norm": 0.17966614663600922,
891
  "learning_rate": 2e-05,
892
+ "loss": 0.9509,
893
+ "mean_token_accuracy": 0.7284548431634903,
894
  "step": 525
895
  },
896
  {
897
+ "epoch": 1.5445941026574443,
898
+ "grad_norm": 0.19962792098522186,
899
  "learning_rate": 2e-05,
900
+ "loss": 0.9556,
901
+ "mean_token_accuracy": 0.727879686653614,
902
  "step": 530
903
  },
904
  {
905
+ "epoch": 1.5591554423006917,
906
+ "grad_norm": 0.1987488865852356,
907
  "learning_rate": 2e-05,
908
+ "loss": 0.9441,
909
+ "mean_token_accuracy": 0.7310003876686096,
910
  "step": 535
911
  },
912
  {
913
+ "epoch": 1.5737167819439388,
914
+ "grad_norm": 0.19578705728054047,
915
  "learning_rate": 2e-05,
916
+ "loss": 0.9646,
917
+ "mean_token_accuracy": 0.7239905044436454,
918
  "step": 540
919
  },
920
  {
921
+ "epoch": 1.588278121587186,
922
+ "grad_norm": 0.18124082684516907,
923
  "learning_rate": 2e-05,
924
+ "loss": 0.9648,
925
+ "mean_token_accuracy": 0.7232480734586716,
926
  "step": 545
927
  },
928
  {
929
+ "epoch": 1.6028394612304333,
930
+ "grad_norm": 0.18559084832668304,
931
  "learning_rate": 2e-05,
932
+ "loss": 0.9374,
933
+ "mean_token_accuracy": 0.7308333814144135,
934
  "step": 550
935
  },
936
  {
937
+ "epoch": 1.6174008008736804,
938
+ "grad_norm": 0.20631375908851624,
939
  "learning_rate": 2e-05,
940
+ "loss": 0.9618,
941
+ "mean_token_accuracy": 0.7258706241846085,
942
  "step": 555
943
  },
944
  {
945
+ "epoch": 1.6319621405169276,
946
+ "grad_norm": 0.1777167022228241,
947
  "learning_rate": 2e-05,
948
+ "loss": 0.9447,
949
+ "mean_token_accuracy": 0.7300238028168679,
950
  "step": 560
951
  },
952
  {
953
+ "epoch": 1.6465234801601747,
954
+ "grad_norm": 0.17086541652679443,
955
  "learning_rate": 2e-05,
956
+ "loss": 0.9396,
957
+ "mean_token_accuracy": 0.7313713192939758,
958
  "step": 565
959
  },
960
  {
961
+ "epoch": 1.6610848198034218,
962
+ "grad_norm": 0.21147526800632477,
963
  "learning_rate": 2e-05,
964
+ "loss": 0.9512,
965
+ "mean_token_accuracy": 0.7282726511359214,
966
  "step": 570
967
  },
968
  {
969
+ "epoch": 1.6756461594466692,
970
+ "grad_norm": 0.1869203746318817,
971
  "learning_rate": 2e-05,
972
+ "loss": 0.9442,
973
+ "mean_token_accuracy": 0.7296415269374847,
974
  "step": 575
975
  },
976
  {
977
+ "epoch": 1.6902074990899163,
978
+ "grad_norm": 0.19998286664485931,
979
  "learning_rate": 2e-05,
980
+ "loss": 0.9495,
981
+ "mean_token_accuracy": 0.7277965158224106,
982
  "step": 580
983
  },
984
  {
985
+ "epoch": 1.7047688387331634,
986
+ "grad_norm": 0.1844228357076645,
987
  "learning_rate": 2e-05,
988
+ "loss": 0.9535,
989
+ "mean_token_accuracy": 0.7272509470582008,
990
  "step": 585
991
  },
992
  {
993
+ "epoch": 1.7193301783764108,
994
+ "grad_norm": 0.22390896081924438,
995
  "learning_rate": 2e-05,
996
+ "loss": 0.9526,
997
+ "mean_token_accuracy": 0.7276457890868187,
998
  "step": 590
999
  },
1000
  {
1001
+ "epoch": 1.7338915180196577,
1002
+ "grad_norm": 0.18321609497070312,
1003
  "learning_rate": 2e-05,
1004
+ "loss": 0.9534,
1005
+ "mean_token_accuracy": 0.7267900720238686,
1006
  "step": 595
1007
  },
1008
  {
1009
+ "epoch": 1.748452857662905,
1010
+ "grad_norm": 0.1791963130235672,
1011
  "learning_rate": 2e-05,
1012
+ "loss": 0.9656,
1013
+ "mean_token_accuracy": 0.7226544409990311,
1014
  "step": 600
1015
  },
1016
  {
1017
+ "epoch": 1.748452857662905,
1018
+ "eval_loss": 0.9683948755264282,
1019
+ "eval_mean_token_accuracy": 0.719802176952362,
1020
+ "eval_runtime": 9.0575,
1021
+ "eval_samples_per_second": 7.287,
1022
+ "eval_steps_per_second": 0.552,
1023
  "step": 600
1024
  },
1025
  {
1026
+ "epoch": 1.7630141973061522,
1027
+ "grad_norm": 0.2111322432756424,
1028
  "learning_rate": 2e-05,
1029
+ "loss": 0.9594,
1030
+ "mean_token_accuracy": 0.7252815589308739,
1031
  "step": 605
1032
  },
1033
  {
1034
+ "epoch": 1.7775755369493993,
1035
+ "grad_norm": 0.1824427843093872,
1036
  "learning_rate": 2e-05,
1037
+ "loss": 0.9448,
1038
+ "mean_token_accuracy": 0.730655600130558,
1039
  "step": 610
1040
  },
1041
  {
1042
+ "epoch": 1.7921368765926466,
1043
+ "grad_norm": 0.1896345168352127,
1044
  "learning_rate": 2e-05,
1045
+ "loss": 0.9429,
1046
+ "mean_token_accuracy": 0.7295209676027298,
1047
  "step": 615
1048
  },
1049
  {
1050
+ "epoch": 1.8066982162358936,
1051
+ "grad_norm": 0.19917869567871094,
1052
  "learning_rate": 2e-05,
1053
+ "loss": 0.9481,
1054
+ "mean_token_accuracy": 0.72839834690094,
1055
  "step": 620
1056
  },
1057
  {
1058
+ "epoch": 1.821259555879141,
1059
+ "grad_norm": 0.18886856734752655,
1060
  "learning_rate": 2e-05,
1061
+ "loss": 0.9494,
1062
+ "mean_token_accuracy": 0.7279918506741524,
1063
  "step": 625
1064
  },
1065
  {
1066
+ "epoch": 1.835820895522388,
1067
+ "grad_norm": 0.18665249645709991,
1068
  "learning_rate": 2e-05,
1069
+ "loss": 0.941,
1070
+ "mean_token_accuracy": 0.7308425426483154,
1071
  "step": 630
1072
  },
1073
  {
1074
+ "epoch": 1.8503822351656352,
1075
+ "grad_norm": 0.1790022999048233,
1076
  "learning_rate": 2e-05,
1077
+ "loss": 0.9459,
1078
+ "mean_token_accuracy": 0.7287953227758408,
1079
  "step": 635
1080
  },
1081
  {
1082
+ "epoch": 1.8649435748088825,
1083
+ "grad_norm": 0.18598653376102448,
1084
  "learning_rate": 2e-05,
1085
+ "loss": 0.9344,
1086
+ "mean_token_accuracy": 0.7317764893174171,
1087
  "step": 640
1088
  },
1089
  {
1090
+ "epoch": 1.8795049144521296,
1091
+ "grad_norm": 0.18939712643623352,
1092
  "learning_rate": 2e-05,
1093
+ "loss": 0.9473,
1094
+ "mean_token_accuracy": 0.7287281811237335,
1095
  "step": 645
1096
  },
1097
  {
1098
+ "epoch": 1.8940662540953768,
1099
+ "grad_norm": 0.1971713751554489,
1100
  "learning_rate": 2e-05,
1101
+ "loss": 0.9363,
1102
+ "mean_token_accuracy": 0.731384290754795,
1103
  "step": 650
1104
  },
1105
  {
1106
+ "epoch": 1.9086275937386241,
1107
+ "grad_norm": 0.19053973257541656,
1108
  "learning_rate": 2e-05,
1109
+ "loss": 0.9419,
1110
+ "mean_token_accuracy": 0.7299284294247628,
1111
  "step": 655
1112
  },
1113
  {
1114
+ "epoch": 1.923188933381871,
1115
+ "grad_norm": 0.1968354731798172,
1116
  "learning_rate": 2e-05,
1117
+ "loss": 0.9427,
1118
+ "mean_token_accuracy": 0.7299772590398789,
1119
  "step": 660
1120
  },
1121
  {
1122
+ "epoch": 1.9377502730251184,
1123
+ "grad_norm": 0.19762156903743744,
1124
  "learning_rate": 2e-05,
1125
+ "loss": 0.9517,
1126
+ "mean_token_accuracy": 0.7256975680589676,
1127
  "step": 665
1128
  },
1129
  {
1130
+ "epoch": 1.9523116126683655,
1131
+ "grad_norm": 0.18599362671375275,
1132
  "learning_rate": 2e-05,
1133
+ "loss": 0.9346,
1134
+ "mean_token_accuracy": 0.7319954812526703,
1135
  "step": 670
1136
  },
1137
  {
1138
+ "epoch": 1.9668729523116126,
1139
+ "grad_norm": 0.18457342684268951,
1140
  "learning_rate": 2e-05,
1141
+ "loss": 0.9555,
1142
+ "mean_token_accuracy": 0.726130048930645,
1143
  "step": 675
1144
  },
1145
  {
1146
+ "epoch": 1.98143429195486,
1147
+ "grad_norm": 0.1952061802148819,
1148
  "learning_rate": 2e-05,
1149
+ "loss": 0.9396,
1150
+ "mean_token_accuracy": 0.7293583780527115,
1151
  "step": 680
1152
  },
1153
  {
1154
+ "epoch": 1.995995631598107,
1155
+ "grad_norm": 0.24200007319450378,
1156
  "learning_rate": 2e-05,
1157
+ "loss": 0.9461,
1158
+ "mean_token_accuracy": 0.7285397097468376,
1159
  "step": 685
1160
  },
1161
  {
1162
+ "epoch": 2.0116490717145976,
1163
+ "grad_norm": 0.23615935444831848,
1164
  "learning_rate": 2e-05,
1165
+ "loss": 0.9642,
1166
+ "mean_token_accuracy": 0.735066328729902,
1167
  "step": 690
1168
  },
1169
  {
1170
+ "epoch": 2.026210411357845,
1171
+ "grad_norm": 0.20251308381557465,
1172
  "learning_rate": 2e-05,
1173
+ "loss": 0.9104,
1174
+ "mean_token_accuracy": 0.7364416778087616,
1175
  "step": 695
1176
  },
1177
  {
1178
+ "epoch": 2.0407717510010923,
1179
+ "grad_norm": 0.17524279654026031,
1180
  "learning_rate": 2e-05,
1181
+ "loss": 0.9201,
1182
+ "mean_token_accuracy": 0.7344257399439812,
1183
  "step": 700
1184
  },
1185
  {
1186
+ "epoch": 2.0407717510010923,
1187
+ "eval_loss": 0.9588068127632141,
1188
+ "eval_mean_token_accuracy": 0.7223332643508911,
1189
+ "eval_runtime": 9.0725,
1190
+ "eval_samples_per_second": 7.275,
1191
+ "eval_steps_per_second": 0.551,
1192
  "step": 700
1193
  },
1194
  {
1195
+ "epoch": 2.0553330906443392,
1196
+ "grad_norm": 0.177406445145607,
1197
  "learning_rate": 2e-05,
1198
+ "loss": 0.8955,
1199
+ "mean_token_accuracy": 0.741851630806923,
1200
  "step": 705
1201
  },
1202
  {
1203
+ "epoch": 2.0698944302875866,
1204
+ "grad_norm": 0.17523570358753204,
1205
  "learning_rate": 2e-05,
1206
+ "loss": 0.9214,
1207
+ "mean_token_accuracy": 0.7350994989275932,
1208
  "step": 710
1209
  },
1210
  {
1211
+ "epoch": 2.0844557699308335,
1212
+ "grad_norm": 0.18342241644859314,
1213
  "learning_rate": 2e-05,
1214
+ "loss": 0.9071,
1215
+ "mean_token_accuracy": 0.7384255573153495,
1216
  "step": 715
1217
  },
1218
  {
1219
+ "epoch": 2.099017109574081,
1220
+ "grad_norm": 0.17445097863674164,
1221
  "learning_rate": 2e-05,
1222
+ "loss": 0.901,
1223
+ "mean_token_accuracy": 0.7399073630571366,
1224
  "step": 720
1225
  },
1226
  {
1227
+ "epoch": 2.113578449217328,
1228
+ "grad_norm": 0.18421486020088196,
1229
  "learning_rate": 2e-05,
1230
+ "loss": 0.9381,
1231
+ "mean_token_accuracy": 0.7290249973535537,
1232
  "step": 725
1233
  },
1234
  {
1235
+ "epoch": 2.128139788860575,
1236
+ "grad_norm": 0.18407249450683594,
1237
  "learning_rate": 2e-05,
1238
+ "loss": 0.9094,
1239
+ "mean_token_accuracy": 0.7381876617670059,
1240
  "step": 730
1241
  },
1242
  {
1243
+ "epoch": 2.1427011285038224,
1244
+ "grad_norm": 0.19551889598369598,
1245
  "learning_rate": 2e-05,
1246
+ "loss": 0.902,
1247
+ "mean_token_accuracy": 0.7397311061620713,
1248
  "step": 735
1249
  },
1250
  {
1251
+ "epoch": 2.1572624681470693,
1252
+ "grad_norm": 0.19646836817264557,
1253
  "learning_rate": 2e-05,
1254
+ "loss": 0.9189,
1255
+ "mean_token_accuracy": 0.7339320570230484,
1256
  "step": 740
1257
  },
1258
  {
1259
+ "epoch": 2.1718238077903167,
1260
+ "grad_norm": 0.19153904914855957,
1261
  "learning_rate": 2e-05,
1262
+ "loss": 0.9209,
1263
+ "mean_token_accuracy": 0.7338812783360481,
1264
  "step": 745
1265
  },
1266
  {
1267
+ "epoch": 2.186385147433564,
1268
+ "grad_norm": 0.18896165490150452,
1269
  "learning_rate": 2e-05,
1270
+ "loss": 0.9115,
1271
+ "mean_token_accuracy": 0.7361234918236732,
1272
  "step": 750
1273
  },
1274
  {
1275
+ "epoch": 2.200946487076811,
1276
+ "grad_norm": 0.18773645162582397,
1277
  "learning_rate": 2e-05,
1278
+ "loss": 0.9167,
1279
+ "mean_token_accuracy": 0.7351216241717339,
1280
  "step": 755
1281
  },
1282
  {
1283
+ "epoch": 2.2155078267200583,
1284
+ "grad_norm": 0.16787868738174438,
1285
  "learning_rate": 2e-05,
1286
+ "loss": 0.9127,
1287
+ "mean_token_accuracy": 0.7366888895630836,
1288
  "step": 760
1289
  },
1290
  {
1291
+ "epoch": 2.2300691663633057,
1292
+ "grad_norm": 0.1758783459663391,
1293
  "learning_rate": 2e-05,
1294
+ "loss": 0.9042,
1295
+ "mean_token_accuracy": 0.738301183283329,
1296
  "step": 765
1297
  },
1298
  {
1299
+ "epoch": 2.2446305060065526,
1300
+ "grad_norm": 0.17144909501075745,
1301
  "learning_rate": 2e-05,
1302
+ "loss": 0.8967,
1303
+ "mean_token_accuracy": 0.7403302609920501,
1304
  "step": 770
1305
  },
1306
  {
1307
+ "epoch": 2.2591918456498,
1308
+ "grad_norm": 0.16952501237392426,
1309
  "learning_rate": 2e-05,
1310
+ "loss": 0.9238,
1311
+ "mean_token_accuracy": 0.7333216354250908,
1312
  "step": 775
1313
  },
1314
  {
1315
+ "epoch": 2.273753185293047,
1316
+ "grad_norm": 0.19771642982959747,
1317
  "learning_rate": 2e-05,
1318
+ "loss": 0.9211,
1319
+ "mean_token_accuracy": 0.7324410900473595,
1320
  "step": 780
1321
  },
1322
  {
1323
+ "epoch": 2.288314524936294,
1324
+ "grad_norm": 0.20366059243679047,
1325
  "learning_rate": 2e-05,
1326
+ "loss": 0.9051,
1327
+ "mean_token_accuracy": 0.7371811017394065,
1328
  "step": 785
1329
  },
1330
  {
1331
+ "epoch": 2.3028758645795415,
1332
+ "grad_norm": 0.18298108875751495,
1333
  "learning_rate": 2e-05,
1334
+ "loss": 0.915,
1335
+ "mean_token_accuracy": 0.7352513417601585,
1336
  "step": 790
1337
  },
1338
  {
1339
+ "epoch": 2.3174372042227884,
1340
+ "grad_norm": 0.17126043140888214,
1341
  "learning_rate": 2e-05,
1342
+ "loss": 0.9078,
1343
+ "mean_token_accuracy": 0.7370902448892593,
1344
  "step": 795
1345
  },
1346
  {
1347
+ "epoch": 2.3319985438660358,
1348
+ "grad_norm": 0.17268440127372742,
1349
  "learning_rate": 2e-05,
1350
+ "loss": 0.9083,
1351
+ "mean_token_accuracy": 0.7362028434872627,
1352
  "step": 800
1353
  },
1354
  {
1355
+ "epoch": 2.3319985438660358,
1356
+ "eval_loss": 0.9501657485961914,
1357
+ "eval_mean_token_accuracy": 0.7240748167037964,
1358
+ "eval_runtime": 9.038,
1359
+ "eval_samples_per_second": 7.303,
1360
+ "eval_steps_per_second": 0.553,
1361
  "step": 800
1362
  },
1363
  {
1364
+ "epoch": 2.3465598835092827,
1365
+ "grad_norm": 0.18702249228954315,
1366
  "learning_rate": 2e-05,
1367
+ "loss": 0.912,
1368
+ "mean_token_accuracy": 0.7360319286584854,
1369
  "step": 805
1370
  },
1371
  {
1372
+ "epoch": 2.36112122315253,
1373
+ "grad_norm": 0.18535007536411285,
1374
  "learning_rate": 2e-05,
1375
+ "loss": 0.9181,
1376
+ "mean_token_accuracy": 0.7348553270101548,
1377
  "step": 810
1378
  },
1379
  {
1380
+ "epoch": 2.3756825627957774,
1381
+ "grad_norm": 0.19746533036231995,
1382
  "learning_rate": 2e-05,
1383
+ "loss": 0.9119,
1384
+ "mean_token_accuracy": 0.7346351534128189,
1385
  "step": 815
1386
  },
1387
  {
1388
+ "epoch": 2.3902439024390243,
1389
+ "grad_norm": 0.1812228113412857,
1390
  "learning_rate": 2e-05,
1391
+ "loss": 0.9166,
1392
+ "mean_token_accuracy": 0.7355435863137245,
1393
  "step": 820
1394
  },
1395
  {
1396
+ "epoch": 2.4048052420822716,
1397
+ "grad_norm": 0.1950427144765854,
1398
  "learning_rate": 2e-05,
1399
+ "loss": 0.9186,
1400
+ "mean_token_accuracy": 0.7341731756925582,
1401
  "step": 825
1402
  },
1403
  {
1404
+ "epoch": 2.4193665817255186,
1405
+ "grad_norm": 0.19460786879062653,
1406
  "learning_rate": 2e-05,
1407
+ "loss": 0.9136,
1408
+ "mean_token_accuracy": 0.7349476546049118,
1409
  "step": 830
1410
  },
1411
  {
1412
+ "epoch": 2.433927921368766,
1413
+ "grad_norm": 0.1719018816947937,
1414
  "learning_rate": 2e-05,
1415
+ "loss": 0.9104,
1416
+ "mean_token_accuracy": 0.7361870780587196,
1417
  "step": 835
1418
  },
1419
  {
1420
+ "epoch": 2.4484892610120133,
1421
+ "grad_norm": 0.16659170389175415,
1422
  "learning_rate": 2e-05,
1423
+ "loss": 0.8925,
1424
+ "mean_token_accuracy": 0.740250737965107,
1425
  "step": 840
1426
  },
1427
  {
1428
+ "epoch": 2.46305060065526,
1429
+ "grad_norm": 0.20289891958236694,
1430
  "learning_rate": 2e-05,
1431
+ "loss": 0.8955,
1432
+ "mean_token_accuracy": 0.7406818434596062,
1433
  "step": 845
1434
  },
1435
  {
1436
+ "epoch": 2.4776119402985075,
1437
+ "grad_norm": 0.19275344908237457,
1438
  "learning_rate": 2e-05,
1439
+ "loss": 0.9111,
1440
+ "mean_token_accuracy": 0.7354253143072128,
1441
  "step": 850
1442
  },
1443
  {
1444
+ "epoch": 2.4921732799417544,
1445
+ "grad_norm": 0.23027832806110382,
1446
  "learning_rate": 2e-05,
1447
+ "loss": 0.903,
1448
+ "mean_token_accuracy": 0.7381577342748642,
1449
  "step": 855
1450
  },
1451
  {
1452
+ "epoch": 2.5067346195850018,
1453
+ "grad_norm": 0.1945352405309677,
1454
  "learning_rate": 2e-05,
1455
+ "loss": 0.9107,
1456
+ "mean_token_accuracy": 0.7365095824003219,
1457
  "step": 860
1458
  },
1459
  {
1460
+ "epoch": 2.521295959228249,
1461
+ "grad_norm": 0.18727873265743256,
1462
  "learning_rate": 2e-05,
1463
+ "loss": 0.9044,
1464
+ "mean_token_accuracy": 0.7381272122263909,
1465
  "step": 865
1466
  },
1467
  {
1468
+ "epoch": 2.535857298871496,
1469
+ "grad_norm": 0.1687106490135193,
1470
  "learning_rate": 2e-05,
1471
+ "loss": 0.9016,
1472
+ "mean_token_accuracy": 0.7383408591151237,
1473
  "step": 870
1474
  },
1475
  {
1476
+ "epoch": 2.5504186385147434,
1477
+ "grad_norm": 0.18417419493198395,
1478
  "learning_rate": 2e-05,
1479
+ "loss": 0.9023,
1480
+ "mean_token_accuracy": 0.7390033379197121,
1481
  "step": 875
1482
  },
1483
  {
1484
+ "epoch": 2.5649799781579903,
1485
+ "grad_norm": 0.18553201854228973,
1486
  "learning_rate": 2e-05,
1487
+ "loss": 0.8995,
1488
+ "mean_token_accuracy": 0.7391414895653725,
1489
  "step": 880
1490
  },
1491
  {
1492
+ "epoch": 2.5795413178012376,
1493
+ "grad_norm": 0.17184361815452576,
1494
  "learning_rate": 2e-05,
1495
+ "loss": 0.8977,
1496
+ "mean_token_accuracy": 0.7395716354250907,
1497
  "step": 885
1498
  },
1499
  {
1500
+ "epoch": 2.594102657444485,
1501
+ "grad_norm": 0.19258299469947815,
1502
  "learning_rate": 2e-05,
1503
+ "loss": 0.9003,
1504
+ "mean_token_accuracy": 0.7391077131032944,
1505
  "step": 890
1506
  },
1507
  {
1508
+ "epoch": 2.6086639970877323,
1509
+ "grad_norm": 0.18599550426006317,
1510
  "learning_rate": 2e-05,
1511
+ "loss": 0.9007,
1512
+ "mean_token_accuracy": 0.7381867274641991,
1513
  "step": 895
1514
  },
1515
  {
1516
+ "epoch": 2.6232253367309792,
1517
+ "grad_norm": 0.1717829406261444,
1518
  "learning_rate": 2e-05,
1519
+ "loss": 0.9141,
1520
+ "mean_token_accuracy": 0.7353848740458488,
1521
  "step": 900
1522
  },
1523
  {
1524
+ "epoch": 2.6232253367309792,
1525
+ "eval_loss": 0.9417613744735718,
1526
+ "eval_mean_token_accuracy": 0.7259013175964355,
1527
+ "eval_runtime": 9.0308,
1528
+ "eval_samples_per_second": 7.308,
1529
+ "eval_steps_per_second": 0.554,
1530
  "step": 900
1531
  },
1532
  {
1533
+ "epoch": 2.6377866763742266,
1534
+ "grad_norm": 0.1715007722377777,
1535
  "learning_rate": 2e-05,
1536
+ "loss": 0.8983,
1537
+ "mean_token_accuracy": 0.7399233922362327,
1538
  "step": 905
1539
  },
1540
  {
1541
+ "epoch": 2.6523480160174735,
1542
+ "grad_norm": 0.20033277571201324,
1543
  "learning_rate": 2e-05,
1544
+ "loss": 0.8892,
1545
+ "mean_token_accuracy": 0.7421796754002571,
1546
  "step": 910
1547
  },
1548
  {
1549
+ "epoch": 2.666909355660721,
1550
+ "grad_norm": 0.19205278158187866,
1551
  "learning_rate": 2e-05,
1552
+ "loss": 0.9151,
1553
+ "mean_token_accuracy": 0.7349491819739342,
1554
  "step": 915
1555
  },
1556
  {
1557
+ "epoch": 2.681470695303968,
1558
+ "grad_norm": 0.1759193241596222,
1559
  "learning_rate": 2e-05,
1560
+ "loss": 0.8897,
1561
+ "mean_token_accuracy": 0.7411266922950744,
1562
  "step": 920
1563
  },
1564
  {
1565
+ "epoch": 2.696032034947215,
1566
+ "grad_norm": 0.18072772026062012,
1567
  "learning_rate": 2e-05,
1568
+ "loss": 0.9085,
1569
+ "mean_token_accuracy": 0.737930352985859,
1570
  "step": 925
1571
  },
1572
  {
1573
+ "epoch": 2.7105933745904625,
1574
+ "grad_norm": 0.18214493989944458,
1575
  "learning_rate": 2e-05,
1576
+ "loss": 0.9032,
1577
+ "mean_token_accuracy": 0.737660813331604,
1578
  "step": 930
1579
  },
1580
  {
1581
+ "epoch": 2.7251547142337094,
1582
+ "grad_norm": 0.1743292659521103,
1583
  "learning_rate": 2e-05,
1584
+ "loss": 0.8834,
1585
+ "mean_token_accuracy": 0.7436462283134461,
1586
  "step": 935
1587
  },
1588
  {
1589
+ "epoch": 2.7397160538769567,
1590
+ "grad_norm": 0.21689608693122864,
1591
  "learning_rate": 2e-05,
1592
+ "loss": 0.8943,
1593
+ "mean_token_accuracy": 0.7397776529192924,
1594
  "step": 940
1595
  },
1596
  {
1597
+ "epoch": 2.754277393520204,
1598
+ "grad_norm": 0.19008708000183105,
1599
  "learning_rate": 2e-05,
1600
+ "loss": 0.9243,
1601
+ "mean_token_accuracy": 0.7323350265622139,
1602
  "step": 945
1603
  },
1604
  {
1605
+ "epoch": 2.768838733163451,
1606
+ "grad_norm": 0.20445488393306732,
1607
  "learning_rate": 2e-05,
1608
+ "loss": 0.8923,
1609
+ "mean_token_accuracy": 0.7400558426976204,
1610
  "step": 950
1611
  },
1612
  {
1613
+ "epoch": 2.7834000728066983,
1614
+ "grad_norm": 0.18380148708820343,
1615
  "learning_rate": 2e-05,
1616
+ "loss": 0.8938,
1617
+ "mean_token_accuracy": 0.7402835443615914,
1618
  "step": 955
1619
  },
1620
  {
1621
+ "epoch": 2.7979614124499452,
1622
+ "grad_norm": 0.17851661145687103,
1623
  "learning_rate": 2e-05,
1624
+ "loss": 0.8992,
1625
+ "mean_token_accuracy": 0.7384199738502503,
1626
  "step": 960
1627
  },
1628
  {
1629
+ "epoch": 2.8125227520931926,
1630
+ "grad_norm": 0.20929712057113647,
1631
  "learning_rate": 2e-05,
1632
+ "loss": 0.895,
1633
+ "mean_token_accuracy": 0.740375104546547,
1634
  "step": 965
1635
  },
1636
  {
1637
+ "epoch": 2.82708409173644,
1638
+ "grad_norm": 0.19595912098884583,
1639
  "learning_rate": 2e-05,
1640
+ "loss": 0.9008,
1641
+ "mean_token_accuracy": 0.7384095326066017,
1642
  "step": 970
1643
  },
1644
  {
1645
+ "epoch": 2.841645431379687,
1646
+ "grad_norm": 0.17353329062461853,
1647
  "learning_rate": 2e-05,
1648
+ "loss": 0.9021,
1649
+ "mean_token_accuracy": 0.7383004203438759,
1650
  "step": 975
1651
  },
1652
  {
1653
+ "epoch": 2.856206771022934,
1654
+ "grad_norm": 0.1858338862657547,
1655
  "learning_rate": 2e-05,
1656
+ "loss": 0.9004,
1657
+ "mean_token_accuracy": 0.7389308467507363,
1658
  "step": 980
1659
  },
1660
  {
1661
+ "epoch": 2.870768110666181,
1662
+ "grad_norm": 0.18422286212444305,
1663
  "learning_rate": 2e-05,
1664
+ "loss": 0.9109,
1665
+ "mean_token_accuracy": 0.7359930142760277,
1666
  "step": 985
1667
  },
1668
  {
1669
+ "epoch": 2.8853294503094284,
1670
+ "grad_norm": 0.17848879098892212,
1671
  "learning_rate": 2e-05,
1672
+ "loss": 0.8953,
1673
+ "mean_token_accuracy": 0.7397311091423034,
1674
  "step": 990
1675
  },
1676
  {
1677
+ "epoch": 2.899890789952676,
1678
+ "grad_norm": 0.1943521797657013,
1679
  "learning_rate": 2e-05,
1680
+ "loss": 0.8961,
1681
+ "mean_token_accuracy": 0.7394258737564087,
1682
  "step": 995
1683
  },
1684
  {
1685
+ "epoch": 2.9144521295959227,
1686
+ "grad_norm": 0.19629698991775513,
1687
  "learning_rate": 2e-05,
1688
+ "loss": 0.9044,
1689
+ "mean_token_accuracy": 0.7370201960206032,
1690
  "step": 1000
1691
  },
1692
  {
1693
+ "epoch": 2.9144521295959227,
1694
+ "eval_loss": 0.9340672492980957,
1695
+ "eval_mean_token_accuracy": 0.7274513006210327,
1696
+ "eval_runtime": 9.0532,
1697
+ "eval_samples_per_second": 7.29,
1698
+ "eval_steps_per_second": 0.552,
1699
  "step": 1000
1700
  },
1701
  {
1702
+ "epoch": 2.92901346923917,
1703
+ "grad_norm": 0.19953298568725586,
1704
  "learning_rate": 2e-05,
1705
+ "loss": 0.9018,
1706
+ "mean_token_accuracy": 0.7379929170012474,
1707
  "step": 1005
1708
  },
1709
  {
1710
+ "epoch": 2.943574808882417,
1711
+ "grad_norm": 0.19467906653881073,
1712
  "learning_rate": 2e-05,
1713
+ "loss": 0.9054,
1714
+ "mean_token_accuracy": 0.7371795266866684,
1715
  "step": 1010
1716
  },
1717
  {
1718
+ "epoch": 2.9581361485256643,
1719
+ "grad_norm": 0.18607822060585022,
1720
  "learning_rate": 2e-05,
1721
+ "loss": 0.9015,
1722
+ "mean_token_accuracy": 0.7378426045179367,
1723
  "step": 1015
1724
  },
1725
  {
1726
+ "epoch": 2.9726974881689117,
1727
+ "grad_norm": 0.18633662164211273,
1728
  "learning_rate": 2e-05,
1729
+ "loss": 0.9047,
1730
+ "mean_token_accuracy": 0.737396989762783,
1731
  "step": 1020
1732
  },
1733
  {
1734
+ "epoch": 2.9872588278121586,
1735
+ "grad_norm": 0.17410708963871002,
1736
  "learning_rate": 2e-05,
1737
+ "loss": 0.8927,
1738
+ "mean_token_accuracy": 0.7408069744706154,
1739
  "step": 1025
1740
  },
1741
  {
1742
+ "epoch": 2.9989078995267566,
1743
+ "mean_token_accuracy": 0.7377504613250494,
1744
+ "step": 1029,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1745
  "total_flos": 0.0,
1746
+ "train_loss": 0.9946218774200528,
1747
+ "train_runtime": 21041.6427,
1748
+ "train_samples_per_second": 1.566,
1749
+ "train_steps_per_second": 0.049
1750
  }
1751
  ],
1752
  "logging_steps": 5,
1753
+ "max_steps": 1029,
1754
  "num_input_tokens_seen": 0,
1755
+ "num_train_epochs": 3,
1756
  "save_steps": 500,
1757
  "stateful_callbacks": {
1758
  "TrainerControl": {
 
1767
  }
1768
  },
1769
  "total_flos": 0.0,
1770
+ "train_batch_size": 1,
1771
  "trial_name": null,
1772
  "trial_params": null
1773
  }