flyingbugs committed on
Commit
bb1bde1
·
verified ·
1 Parent(s): badd0f6

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -4
  2. all_results.json +5 -10
  3. train_results.json +5 -5
  4. trainer_state.json +222 -2036
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-1.5B-Instruct
3
- datasets: open-r1/OpenR1-Math-220k
4
  library_name: transformers
5
  model_name: Qwen2.5-1.5B-Open-R1-Distill
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - sft
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen2.5-1.5B-Open-R1-Distill
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jjh233/huggingface/runs/21iwqcw5)
33
 
34
 
35
  This model was trained with SFT.
 
1
  ---
2
  base_model: Qwen/Qwen2.5-1.5B-Instruct
 
3
  library_name: transformers
4
  model_name: Qwen2.5-1.5B-Open-R1-Distill
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - sft
9
  licence: license
 
11
 
12
  # Model Card for Qwen2.5-1.5B-Open-R1-Distill
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jjh233/huggingface/runs/2sdy2t94)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
- "eval_loss": 0.8238936066627502,
3
- "eval_runtime": 26.0938,
4
- "eval_samples": 100,
5
- "eval_samples_per_second": 4.944,
6
- "eval_steps_per_second": 1.265,
7
- "total_flos": 65635690217472.0,
8
- "train_loss": 0.0,
9
- "train_runtime": 0.9025,
10
  "train_samples": 93733,
11
- "train_samples_per_second": 38008.288,
12
- "train_steps_per_second": 296.966
13
  }
 
1
  {
2
+ "total_flos": 487709642588160.0,
3
+ "train_loss": 0.5792717831348305,
4
+ "train_runtime": 20654.3187,
 
 
 
 
 
5
  "train_samples": 93733,
6
+ "train_samples_per_second": 1.661,
7
+ "train_steps_per_second": 0.013
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 65635690217472.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 0.9025,
5
  "train_samples": 93733,
6
- "train_samples_per_second": 38008.288,
7
- "train_steps_per_second": 296.966
8
  }
 
1
  {
2
+ "total_flos": 487709642588160.0,
3
+ "train_loss": 0.5792717831348305,
4
+ "train_runtime": 20654.3187,
5
  "train_samples": 93733,
6
+ "train_samples_per_second": 1.661,
7
+ "train_steps_per_second": 0.013
8
  }
trainer_state.json CHANGED
@@ -2,2205 +2,391 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
- "eval_steps": 5,
6
- "global_step": 676,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0073964497041420114,
13
- "grad_norm": 4.584371191542279,
14
- "learning_rate": 7.3529411764705884e-06,
15
- "loss": 2.7395,
16
  "step": 5
17
  },
18
  {
19
- "epoch": 0.0073964497041420114,
20
- "eval_reasoning_loss": 1.308870553970337,
21
- "eval_reasoning_runtime": 4.1594,
22
- "eval_reasoning_samples_per_second": 128.624,
23
- "eval_reasoning_steps_per_second": 1.202,
24
- "step": 5
25
- },
26
- {
27
- "epoch": 0.0073964497041420114,
28
- "eval_utility_loss": 1.4049550294876099,
29
- "eval_utility_runtime": 1.7594,
30
- "eval_utility_samples_per_second": 120.498,
31
- "eval_utility_steps_per_second": 1.137,
32
- "step": 5
33
- },
34
- {
35
- "epoch": 0.014792899408284023,
36
- "grad_norm": 2.225119430385955,
37
- "learning_rate": 1.4705882352941177e-05,
38
- "loss": 2.5801,
39
- "step": 10
40
- },
41
- {
42
- "epoch": 0.014792899408284023,
43
- "eval_reasoning_loss": 1.1661676168441772,
44
- "eval_reasoning_runtime": 4.4119,
45
- "eval_reasoning_samples_per_second": 121.264,
46
- "eval_reasoning_steps_per_second": 1.133,
47
- "step": 10
48
- },
49
- {
50
- "epoch": 0.014792899408284023,
51
- "eval_utility_loss": 1.3157464265823364,
52
- "eval_utility_runtime": 1.7506,
53
- "eval_utility_samples_per_second": 121.103,
54
- "eval_utility_steps_per_second": 1.142,
55
  "step": 10
56
  },
57
  {
58
- "epoch": 0.022189349112426034,
59
- "grad_norm": 1.6694116716156027,
60
- "learning_rate": 2.2058823529411766e-05,
61
- "loss": 2.4429,
62
- "step": 15
63
- },
64
- {
65
- "epoch": 0.022189349112426034,
66
- "eval_reasoning_loss": 1.0858653783798218,
67
- "eval_reasoning_runtime": 4.5635,
68
- "eval_reasoning_samples_per_second": 117.233,
69
- "eval_reasoning_steps_per_second": 1.096,
70
- "step": 15
71
- },
72
- {
73
- "epoch": 0.022189349112426034,
74
- "eval_utility_loss": 1.2957844734191895,
75
- "eval_utility_runtime": 1.8291,
76
- "eval_utility_samples_per_second": 115.901,
77
- "eval_utility_steps_per_second": 1.093,
78
  "step": 15
79
  },
80
  {
81
- "epoch": 0.029585798816568046,
82
- "grad_norm": 1.305750192627527,
83
- "learning_rate": 2.9411764705882354e-05,
84
- "loss": 2.3256,
85
- "step": 20
86
- },
87
- {
88
- "epoch": 0.029585798816568046,
89
- "eval_reasoning_loss": 1.0403348207473755,
90
- "eval_reasoning_runtime": 4.8727,
91
- "eval_reasoning_samples_per_second": 109.796,
92
- "eval_reasoning_steps_per_second": 1.026,
93
- "step": 20
94
- },
95
- {
96
- "epoch": 0.029585798816568046,
97
- "eval_utility_loss": 1.2852095365524292,
98
- "eval_utility_runtime": 1.8864,
99
- "eval_utility_samples_per_second": 112.384,
100
- "eval_utility_steps_per_second": 1.06,
101
  "step": 20
102
  },
103
  {
104
- "epoch": 0.03698224852071006,
105
- "grad_norm": 1.3576019842769356,
106
- "learning_rate": 3.6764705882352945e-05,
107
- "loss": 2.3609,
108
  "step": 25
109
  },
110
  {
111
- "epoch": 0.03698224852071006,
112
- "eval_reasoning_loss": 1.0135447978973389,
113
- "eval_reasoning_runtime": 4.6881,
114
- "eval_reasoning_samples_per_second": 114.12,
115
- "eval_reasoning_steps_per_second": 1.067,
116
- "step": 25
117
- },
118
- {
119
- "epoch": 0.03698224852071006,
120
- "eval_utility_loss": 1.274791955947876,
121
- "eval_utility_runtime": 1.8048,
122
- "eval_utility_samples_per_second": 117.463,
123
- "eval_utility_steps_per_second": 1.108,
124
- "step": 25
125
- },
126
- {
127
- "epoch": 0.04437869822485207,
128
- "grad_norm": 1.3591976835447737,
129
- "learning_rate": 4.411764705882353e-05,
130
- "loss": 2.219,
131
- "step": 30
132
- },
133
- {
134
- "epoch": 0.04437869822485207,
135
- "eval_reasoning_loss": 0.9981840252876282,
136
- "eval_reasoning_runtime": 4.7288,
137
- "eval_reasoning_samples_per_second": 113.138,
138
- "eval_reasoning_steps_per_second": 1.057,
139
- "step": 30
140
- },
141
- {
142
- "epoch": 0.04437869822485207,
143
- "eval_utility_loss": 1.2681363821029663,
144
- "eval_utility_runtime": 2.0119,
145
- "eval_utility_samples_per_second": 105.371,
146
- "eval_utility_steps_per_second": 0.994,
147
  "step": 30
148
  },
149
  {
150
- "epoch": 0.051775147928994084,
151
- "grad_norm": 1.268058485040803,
152
- "learning_rate": 4.999973061007767e-05,
153
- "loss": 2.2399,
154
- "step": 35
155
- },
156
- {
157
- "epoch": 0.051775147928994084,
158
- "eval_reasoning_loss": 0.9858483076095581,
159
- "eval_reasoning_runtime": 4.6465,
160
- "eval_reasoning_samples_per_second": 115.139,
161
- "eval_reasoning_steps_per_second": 1.076,
162
- "step": 35
163
- },
164
- {
165
- "epoch": 0.051775147928994084,
166
- "eval_utility_loss": 1.2649621963500977,
167
- "eval_utility_runtime": 1.8766,
168
- "eval_utility_samples_per_second": 112.968,
169
- "eval_utility_steps_per_second": 1.066,
170
  "step": 35
171
  },
172
  {
173
- "epoch": 0.05917159763313609,
174
- "grad_norm": 1.2103607828033616,
175
- "learning_rate": 4.999030264010747e-05,
176
- "loss": 2.2228,
177
- "step": 40
178
- },
179
- {
180
- "epoch": 0.05917159763313609,
181
- "eval_reasoning_loss": 0.9769134521484375,
182
- "eval_reasoning_runtime": 4.7512,
183
- "eval_reasoning_samples_per_second": 112.604,
184
- "eval_reasoning_steps_per_second": 1.052,
185
- "step": 40
186
- },
187
- {
188
- "epoch": 0.05917159763313609,
189
- "eval_utility_loss": 1.2631217241287231,
190
- "eval_utility_runtime": 1.8913,
191
- "eval_utility_samples_per_second": 112.092,
192
- "eval_utility_steps_per_second": 1.057,
193
  "step": 40
194
  },
195
  {
196
- "epoch": 0.06656804733727811,
197
- "grad_norm": 1.2292426525096565,
198
- "learning_rate": 4.996741162407576e-05,
199
- "loss": 2.2372,
200
  "step": 45
201
  },
202
  {
203
- "epoch": 0.06656804733727811,
204
- "eval_reasoning_loss": 0.9719375371932983,
205
- "eval_reasoning_runtime": 4.7238,
206
- "eval_reasoning_samples_per_second": 113.256,
207
- "eval_reasoning_steps_per_second": 1.058,
208
- "step": 45
209
- },
210
- {
211
- "epoch": 0.06656804733727811,
212
- "eval_utility_loss": 1.262032389640808,
213
- "eval_utility_runtime": 1.8046,
214
- "eval_utility_samples_per_second": 117.475,
215
- "eval_utility_steps_per_second": 1.108,
216
- "step": 45
217
- },
218
- {
219
- "epoch": 0.07396449704142012,
220
- "grad_norm": 1.1574292202420797,
221
- "learning_rate": 4.993107126490191e-05,
222
- "loss": 2.1602,
223
- "step": 50
224
- },
225
- {
226
- "epoch": 0.07396449704142012,
227
- "eval_reasoning_loss": 0.965384840965271,
228
- "eval_reasoning_runtime": 4.7496,
229
- "eval_reasoning_samples_per_second": 112.641,
230
- "eval_reasoning_steps_per_second": 1.053,
231
- "step": 50
232
- },
233
- {
234
- "epoch": 0.07396449704142012,
235
- "eval_utility_loss": 1.2599124908447266,
236
- "eval_utility_runtime": 1.8864,
237
- "eval_utility_samples_per_second": 112.382,
238
- "eval_utility_steps_per_second": 1.06,
239
  "step": 50
240
  },
241
  {
242
- "epoch": 0.08136094674556213,
243
- "grad_norm": 1.2385754103472393,
244
- "learning_rate": 4.988130331649192e-05,
245
- "loss": 2.1932,
246
- "step": 55
247
- },
248
- {
249
- "epoch": 0.08136094674556213,
250
- "eval_reasoning_loss": 0.9624860882759094,
251
- "eval_reasoning_runtime": 4.6346,
252
- "eval_reasoning_samples_per_second": 115.436,
253
- "eval_reasoning_steps_per_second": 1.079,
254
- "step": 55
255
- },
256
- {
257
- "epoch": 0.08136094674556213,
258
- "eval_utility_loss": 1.2610447406768799,
259
- "eval_utility_runtime": 1.81,
260
- "eval_utility_samples_per_second": 117.129,
261
- "eval_utility_steps_per_second": 1.105,
262
  "step": 55
263
  },
264
  {
265
- "epoch": 0.08875739644970414,
266
- "grad_norm": 1.3366791934715165,
267
- "learning_rate": 4.981813757071618e-05,
268
- "loss": 2.154,
269
- "step": 60
270
- },
271
- {
272
- "epoch": 0.08875739644970414,
273
- "eval_reasoning_loss": 0.9584144949913025,
274
- "eval_reasoning_runtime": 4.7014,
275
- "eval_reasoning_samples_per_second": 113.796,
276
- "eval_reasoning_steps_per_second": 1.064,
277
- "step": 60
278
- },
279
- {
280
- "epoch": 0.08875739644970414,
281
- "eval_utility_loss": 1.2589086294174194,
282
- "eval_utility_runtime": 1.8524,
283
- "eval_utility_samples_per_second": 114.446,
284
- "eval_utility_steps_per_second": 1.08,
285
  "step": 60
286
  },
287
  {
288
- "epoch": 0.09615384615384616,
289
- "grad_norm": 1.153623603608571,
290
- "learning_rate": 4.974161183957565e-05,
291
- "loss": 2.1082,
292
  "step": 65
293
  },
294
  {
295
- "epoch": 0.09615384615384616,
296
- "eval_reasoning_loss": 0.9537850022315979,
297
- "eval_reasoning_runtime": 5.0172,
298
- "eval_reasoning_samples_per_second": 106.632,
299
- "eval_reasoning_steps_per_second": 0.997,
300
- "step": 65
301
- },
302
- {
303
- "epoch": 0.09615384615384616,
304
- "eval_utility_loss": 1.2607706785202026,
305
- "eval_utility_runtime": 1.7411,
306
- "eval_utility_samples_per_second": 121.759,
307
- "eval_utility_steps_per_second": 1.149,
308
- "step": 65
309
- },
310
- {
311
- "epoch": 0.10355029585798817,
312
- "grad_norm": 1.1278141586447843,
313
- "learning_rate": 4.965177193256699e-05,
314
- "loss": 2.1095,
315
- "step": 70
316
- },
317
- {
318
- "epoch": 0.10355029585798817,
319
- "eval_reasoning_loss": 0.9497982859611511,
320
- "eval_reasoning_runtime": 4.9054,
321
- "eval_reasoning_samples_per_second": 109.063,
322
- "eval_reasoning_steps_per_second": 1.019,
323
- "step": 70
324
- },
325
- {
326
- "epoch": 0.10355029585798817,
327
- "eval_utility_loss": 1.25819993019104,
328
- "eval_utility_runtime": 1.7991,
329
- "eval_utility_samples_per_second": 117.834,
330
- "eval_utility_steps_per_second": 1.112,
331
  "step": 70
332
  },
333
  {
334
- "epoch": 0.11094674556213018,
335
- "grad_norm": 1.2061200225092161,
336
- "learning_rate": 4.9548671629260224e-05,
337
- "loss": 2.0884,
338
- "step": 75
339
- },
340
- {
341
- "epoch": 0.11094674556213018,
342
- "eval_reasoning_loss": 0.9482110142707825,
343
- "eval_reasoning_runtime": 4.6282,
344
- "eval_reasoning_samples_per_second": 115.596,
345
- "eval_reasoning_steps_per_second": 1.08,
346
- "step": 75
347
- },
348
- {
349
- "epoch": 0.11094674556213018,
350
- "eval_utility_loss": 1.2583030462265015,
351
- "eval_utility_runtime": 1.8216,
352
- "eval_utility_samples_per_second": 116.379,
353
- "eval_utility_steps_per_second": 1.098,
354
  "step": 75
355
  },
356
  {
357
- "epoch": 0.11834319526627218,
358
- "grad_norm": 1.0632096757239367,
359
- "learning_rate": 4.943237264710554e-05,
360
- "loss": 2.0952,
361
- "step": 80
362
- },
363
- {
364
- "epoch": 0.11834319526627218,
365
- "eval_reasoning_loss": 0.9451742768287659,
366
- "eval_reasoning_runtime": 4.6101,
367
- "eval_reasoning_samples_per_second": 116.049,
368
- "eval_reasoning_steps_per_second": 1.085,
369
- "step": 80
370
- },
371
- {
372
- "epoch": 0.11834319526627218,
373
- "eval_utility_loss": 1.2601256370544434,
374
- "eval_utility_runtime": 1.8824,
375
- "eval_utility_samples_per_second": 112.621,
376
- "eval_utility_steps_per_second": 1.062,
377
  "step": 80
378
  },
379
  {
380
- "epoch": 0.1257396449704142,
381
- "grad_norm": 1.1561942737227944,
382
- "learning_rate": 4.9302944604488165e-05,
383
- "loss": 2.0352,
384
  "step": 85
385
  },
386
  {
387
- "epoch": 0.1257396449704142,
388
- "eval_reasoning_loss": 0.9423359036445618,
389
- "eval_reasoning_runtime": 4.5969,
390
- "eval_reasoning_samples_per_second": 116.383,
391
- "eval_reasoning_steps_per_second": 1.088,
392
- "step": 85
393
- },
394
- {
395
- "epoch": 0.1257396449704142,
396
- "eval_utility_loss": 1.2616825103759766,
397
- "eval_utility_runtime": 1.8916,
398
- "eval_utility_samples_per_second": 112.077,
399
- "eval_utility_steps_per_second": 1.057,
400
- "step": 85
401
- },
402
- {
403
- "epoch": 0.13313609467455623,
404
- "grad_norm": 1.0927334096818122,
405
- "learning_rate": 4.916046497905381e-05,
406
- "loss": 2.0583,
407
- "step": 90
408
- },
409
- {
410
- "epoch": 0.13313609467455623,
411
- "eval_reasoning_loss": 0.940236508846283,
412
- "eval_reasoning_runtime": 4.5778,
413
- "eval_reasoning_samples_per_second": 116.869,
414
- "eval_reasoning_steps_per_second": 1.092,
415
- "step": 90
416
- },
417
- {
418
- "epoch": 0.13313609467455623,
419
- "eval_utility_loss": 1.2627779245376587,
420
- "eval_utility_runtime": 1.8424,
421
- "eval_utility_samples_per_second": 115.065,
422
- "eval_utility_steps_per_second": 1.086,
423
  "step": 90
424
  },
425
  {
426
- "epoch": 0.14053254437869822,
427
- "grad_norm": 1.3461330902573267,
428
- "learning_rate": 4.9005019061329266e-05,
429
- "loss": 2.0494,
430
- "step": 95
431
- },
432
- {
433
- "epoch": 0.14053254437869822,
434
- "eval_reasoning_loss": 0.9385782480239868,
435
- "eval_reasoning_runtime": 4.7307,
436
- "eval_reasoning_samples_per_second": 113.091,
437
- "eval_reasoning_steps_per_second": 1.057,
438
- "step": 95
439
- },
440
- {
441
- "epoch": 0.14053254437869822,
442
- "eval_utility_loss": 1.2633188962936401,
443
- "eval_utility_runtime": 1.8137,
444
- "eval_utility_samples_per_second": 116.888,
445
- "eval_utility_steps_per_second": 1.103,
446
  "step": 95
447
  },
448
  {
449
- "epoch": 0.14792899408284024,
450
- "grad_norm": 1.3413228946173852,
451
- "learning_rate": 4.88366999036662e-05,
452
- "loss": 2.0453,
453
- "step": 100
454
- },
455
- {
456
- "epoch": 0.14792899408284024,
457
- "eval_reasoning_loss": 0.9367873668670654,
458
- "eval_reasoning_runtime": 4.6168,
459
- "eval_reasoning_samples_per_second": 115.881,
460
- "eval_reasoning_steps_per_second": 1.083,
461
- "step": 100
462
- },
463
- {
464
- "epoch": 0.14792899408284024,
465
- "eval_utility_loss": 1.2648777961730957,
466
- "eval_utility_runtime": 1.8308,
467
- "eval_utility_samples_per_second": 115.798,
468
- "eval_utility_steps_per_second": 1.092,
469
  "step": 100
470
  },
471
  {
472
- "epoch": 0.15532544378698224,
473
- "grad_norm": 0.8217045735024919,
474
- "learning_rate": 4.865560826453856e-05,
475
- "loss": 0.9177,
476
  "step": 105
477
  },
478
  {
479
- "epoch": 0.15532544378698224,
480
- "eval_loss": 0.9352405667304993,
481
- "eval_runtime": 3.8923,
482
- "eval_samples_per_second": 137.449,
483
- "eval_steps_per_second": 1.285,
484
- "step": 105
485
- },
486
- {
487
- "epoch": 0.16272189349112426,
488
- "grad_norm": 0.9788732678912112,
489
- "learning_rate": 4.846185254822698e-05,
490
- "loss": 0.9177,
491
- "step": 110
492
- },
493
- {
494
- "epoch": 0.16272189349112426,
495
- "eval_loss": 0.935136616230011,
496
- "eval_runtime": 3.9303,
497
- "eval_samples_per_second": 136.121,
498
- "eval_steps_per_second": 1.272,
499
  "step": 110
500
  },
501
  {
502
- "epoch": 0.17011834319526628,
503
- "grad_norm": 0.898568312527589,
504
- "learning_rate": 4.825554873992628e-05,
505
- "loss": 0.9127,
506
- "step": 115
507
- },
508
- {
509
- "epoch": 0.17011834319526628,
510
- "eval_loss": 0.9333159327507019,
511
- "eval_runtime": 4.2183,
512
- "eval_samples_per_second": 126.828,
513
- "eval_steps_per_second": 1.185,
514
  "step": 115
515
  },
516
  {
517
- "epoch": 0.17751479289940827,
518
- "grad_norm": 0.7542611234569648,
519
- "learning_rate": 4.803682033631494e-05,
520
- "loss": 0.9267,
521
- "step": 120
522
- },
523
- {
524
- "epoch": 0.17751479289940827,
525
- "eval_loss": 0.9326120018959045,
526
- "eval_runtime": 4.33,
527
- "eval_samples_per_second": 123.557,
528
- "eval_steps_per_second": 1.155,
529
  "step": 120
530
  },
531
  {
532
- "epoch": 0.1849112426035503,
533
- "grad_norm": 0.8285256031012249,
534
- "learning_rate": 4.780579827162803e-05,
535
- "loss": 0.9205,
536
  "step": 125
537
  },
538
  {
539
- "epoch": 0.1849112426035503,
540
- "eval_loss": 0.9305017590522766,
541
- "eval_runtime": 4.3606,
542
- "eval_samples_per_second": 122.69,
543
- "eval_steps_per_second": 1.147,
544
- "step": 125
545
- },
546
- {
547
- "epoch": 0.19230769230769232,
548
- "grad_norm": 0.9745000016888925,
549
- "learning_rate": 4.756262083927795e-05,
550
- "loss": 0.917,
551
- "step": 130
552
- },
553
- {
554
- "epoch": 0.19230769230769232,
555
- "eval_loss": 0.9290282130241394,
556
- "eval_runtime": 4.3713,
557
- "eval_samples_per_second": 122.388,
558
- "eval_steps_per_second": 1.144,
559
  "step": 130
560
  },
561
  {
562
- "epoch": 0.1997041420118343,
563
- "grad_norm": 0.8868792126499849,
564
- "learning_rate": 4.730743360906986e-05,
565
- "loss": 0.9034,
566
- "step": 135
567
- },
568
- {
569
- "epoch": 0.1997041420118343,
570
- "eval_loss": 0.9284210205078125,
571
- "eval_runtime": 4.4499,
572
- "eval_samples_per_second": 120.228,
573
- "eval_steps_per_second": 1.124,
574
  "step": 135
575
  },
576
  {
577
- "epoch": 0.20710059171597633,
578
- "grad_norm": 0.9306937022571846,
579
- "learning_rate": 4.704038934006124e-05,
580
- "loss": 0.9223,
581
- "step": 140
582
- },
583
- {
584
- "epoch": 0.20710059171597633,
585
- "eval_loss": 0.9253069758415222,
586
- "eval_runtime": 4.4492,
587
- "eval_samples_per_second": 120.247,
588
- "eval_steps_per_second": 1.124,
589
  "step": 140
590
  },
591
  {
592
- "epoch": 0.21449704142011836,
593
- "grad_norm": 0.9044379683040512,
594
- "learning_rate": 4.676164788911806e-05,
595
- "loss": 0.9215,
596
  "step": 145
597
  },
598
  {
599
- "epoch": 0.21449704142011836,
600
- "eval_loss": 0.9232907891273499,
601
- "eval_runtime": 4.5393,
602
- "eval_samples_per_second": 117.859,
603
- "eval_steps_per_second": 1.101,
604
- "step": 145
605
- },
606
- {
607
- "epoch": 0.22189349112426035,
608
- "grad_norm": 0.9600510868099964,
609
- "learning_rate": 4.647137611522186e-05,
610
- "loss": 0.8967,
611
- "step": 150
612
- },
613
- {
614
- "epoch": 0.22189349112426035,
615
- "eval_loss": 0.921413242816925,
616
- "eval_runtime": 4.5563,
617
- "eval_samples_per_second": 117.419,
618
- "eval_steps_per_second": 1.097,
619
  "step": 150
620
  },
621
  {
622
- "epoch": 0.22928994082840237,
623
- "grad_norm": 0.8577020835622902,
624
- "learning_rate": 4.6169747779585416e-05,
625
- "loss": 0.9124,
626
- "step": 155
627
- },
628
- {
629
- "epoch": 0.22928994082840237,
630
- "eval_loss": 0.9195402264595032,
631
- "eval_runtime": 4.4933,
632
- "eval_samples_per_second": 119.067,
633
- "eval_steps_per_second": 1.113,
634
  "step": 155
635
  },
636
  {
637
- "epoch": 0.23668639053254437,
638
- "grad_norm": 0.8280268017058183,
639
- "learning_rate": 4.585694344163654e-05,
640
- "loss": 0.8906,
641
- "step": 160
642
- },
643
- {
644
- "epoch": 0.23668639053254437,
645
- "eval_loss": 0.9192004203796387,
646
- "eval_runtime": 4.5417,
647
- "eval_samples_per_second": 117.797,
648
- "eval_steps_per_second": 1.101,
649
  "step": 160
650
  },
651
  {
652
- "epoch": 0.2440828402366864,
653
- "grad_norm": 0.8048304210000593,
654
- "learning_rate": 4.553315035093241e-05,
655
- "loss": 0.9004,
656
  "step": 165
657
  },
658
  {
659
- "epoch": 0.2440828402366864,
660
- "eval_loss": 0.919334352016449,
661
- "eval_runtime": 4.5374,
662
- "eval_samples_per_second": 117.909,
663
- "eval_steps_per_second": 1.102,
664
- "step": 165
665
- },
666
- {
667
- "epoch": 0.2514792899408284,
668
- "grad_norm": 0.9532036102972735,
669
- "learning_rate": 4.5198562335069036e-05,
670
- "loss": 0.9106,
671
- "step": 170
672
- },
673
- {
674
- "epoch": 0.2514792899408284,
675
- "eval_loss": 0.9174200892448425,
676
- "eval_runtime": 4.5738,
677
- "eval_samples_per_second": 116.971,
678
- "eval_steps_per_second": 1.093,
679
  "step": 170
680
  },
681
  {
682
- "epoch": 0.2588757396449704,
683
- "grad_norm": 0.8190721687575792,
684
- "learning_rate": 4.485337968365309e-05,
685
- "loss": 0.9131,
686
- "step": 175
687
- },
688
- {
689
- "epoch": 0.2588757396449704,
690
- "eval_loss": 0.9161617755889893,
691
- "eval_runtime": 4.5176,
692
- "eval_samples_per_second": 118.425,
693
- "eval_steps_per_second": 1.107,
694
  "step": 175
695
  },
696
  {
697
- "epoch": 0.26627218934911245,
698
- "grad_norm": 1.2006932003749058,
699
- "learning_rate": 4.4497809028405335e-05,
700
- "loss": 0.8932,
701
- "step": 180
702
- },
703
- {
704
- "epoch": 0.26627218934911245,
705
- "eval_loss": 0.9148706793785095,
706
- "eval_runtime": 4.4688,
707
- "eval_samples_per_second": 119.718,
708
- "eval_steps_per_second": 1.119,
709
  "step": 180
710
  },
711
  {
712
- "epoch": 0.27366863905325445,
713
- "grad_norm": 0.8025440479927234,
714
- "learning_rate": 4.413206321946775e-05,
715
- "loss": 0.8863,
716
  "step": 185
717
  },
718
  {
719
- "epoch": 0.27366863905325445,
720
- "eval_loss": 0.9131314158439636,
721
- "eval_runtime": 4.4717,
722
- "eval_samples_per_second": 119.641,
723
- "eval_steps_per_second": 1.118,
724
- "step": 185
725
- },
726
- {
727
- "epoch": 0.28106508875739644,
728
- "grad_norm": 0.8083730916154258,
729
- "learning_rate": 4.3756361197988056e-05,
730
- "loss": 0.8857,
731
- "step": 190
732
- },
733
- {
734
- "epoch": 0.28106508875739644,
735
- "eval_loss": 0.9111117124557495,
736
- "eval_runtime": 4.5302,
737
- "eval_samples_per_second": 118.096,
738
- "eval_steps_per_second": 1.104,
739
  "step": 190
740
  },
741
  {
742
- "epoch": 0.28846153846153844,
743
- "grad_norm": 0.795831165853354,
744
- "learning_rate": 4.337092786505812e-05,
745
- "loss": 0.8931,
746
- "step": 195
747
- },
748
- {
749
- "epoch": 0.28846153846153844,
750
- "eval_loss": 0.9104787707328796,
751
- "eval_runtime": 4.5367,
752
- "eval_samples_per_second": 117.928,
753
- "eval_steps_per_second": 1.102,
754
  "step": 195
755
  },
756
  {
757
- "epoch": 0.2958579881656805,
758
- "grad_norm": 0.7368270359113853,
759
- "learning_rate": 4.297599394708471e-05,
760
- "loss": 0.8869,
761
- "step": 200
762
- },
763
- {
764
- "epoch": 0.2958579881656805,
765
- "eval_loss": 0.9094380736351013,
766
- "eval_runtime": 4.5088,
767
- "eval_samples_per_second": 118.657,
768
- "eval_steps_per_second": 1.109,
769
  "step": 200
770
  },
771
  {
772
- "epoch": 0.3032544378698225,
773
- "grad_norm": 0.9433916024554357,
774
- "learning_rate": 4.257179585767301e-05,
775
- "loss": 0.8868,
776
  "step": 205
777
  },
778
  {
779
- "epoch": 0.3032544378698225,
780
- "eval_loss": 0.9086852669715881,
781
- "eval_runtime": 4.5444,
782
- "eval_samples_per_second": 117.726,
783
- "eval_steps_per_second": 1.1,
784
- "step": 205
785
- },
786
- {
787
- "epoch": 0.3106508875739645,
788
- "grad_norm": 0.840098337392708,
789
- "learning_rate": 4.2158575556105764e-05,
790
- "loss": 0.8914,
791
- "step": 210
792
- },
793
- {
794
- "epoch": 0.3106508875739645,
795
- "eval_loss": 0.9070125818252563,
796
- "eval_runtime": 4.6468,
797
- "eval_samples_per_second": 115.134,
798
- "eval_steps_per_second": 1.076,
799
  "step": 210
800
  },
801
  {
802
- "epoch": 0.3180473372781065,
803
- "grad_norm": 0.7502300558297901,
804
- "learning_rate": 4.17365804025027e-05,
805
- "loss": 0.8804,
806
- "step": 215
807
- },
808
- {
809
- "epoch": 0.3180473372781065,
810
- "eval_loss": 0.9057661890983582,
811
- "eval_runtime": 4.582,
812
- "eval_samples_per_second": 116.76,
813
- "eval_steps_per_second": 1.091,
814
  "step": 215
815
  },
816
  {
817
- "epoch": 0.3254437869822485,
818
- "grad_norm": 0.7844009332707674,
819
- "learning_rate": 4.130606300974686e-05,
820
- "loss": 0.8826,
821
- "step": 220
822
- },
823
- {
824
- "epoch": 0.3254437869822485,
825
- "eval_loss": 0.9040650725364685,
826
- "eval_runtime": 4.6541,
827
- "eval_samples_per_second": 114.953,
828
- "eval_steps_per_second": 1.074,
829
  "step": 220
830
  },
831
  {
832
- "epoch": 0.3328402366863905,
833
- "grad_norm": 0.7605732508216542,
834
- "learning_rate": 4.0867281092266644e-05,
835
- "loss": 0.8966,
836
  "step": 225
837
  },
838
  {
839
- "epoch": 0.3328402366863905,
840
- "eval_loss": 0.9045411944389343,
841
- "eval_runtime": 4.5979,
842
- "eval_samples_per_second": 116.357,
843
- "eval_steps_per_second": 1.087,
844
- "step": 225
845
- },
846
- {
847
- "epoch": 0.34023668639053256,
848
- "grad_norm": 0.7511534980546334,
849
- "learning_rate": 4.042049731176386e-05,
850
- "loss": 0.8845,
851
- "step": 230
852
- },
853
- {
854
- "epoch": 0.34023668639053256,
855
- "eval_loss": 0.9032608270645142,
856
- "eval_runtime": 4.5071,
857
- "eval_samples_per_second": 118.703,
858
- "eval_steps_per_second": 1.109,
859
  "step": 230
860
  },
861
  {
862
- "epoch": 0.34763313609467456,
863
- "grad_norm": 0.7495402660706051,
864
- "learning_rate": 3.996597911998038e-05,
865
- "loss": 0.8654,
866
- "step": 235
867
- },
868
- {
869
- "epoch": 0.34763313609467456,
870
- "eval_loss": 0.9020082950592041,
871
- "eval_runtime": 4.7865,
872
- "eval_samples_per_second": 111.774,
873
- "eval_steps_per_second": 1.045,
874
  "step": 235
875
  },
876
  {
877
- "epoch": 0.35502958579881655,
878
- "grad_norm": 0.7270437363614586,
879
- "learning_rate": 3.950399859859737e-05,
880
- "loss": 0.877,
881
- "step": 240
882
- },
883
- {
884
- "epoch": 0.35502958579881655,
885
- "eval_loss": 0.9021012187004089,
886
- "eval_runtime": 4.4736,
887
- "eval_samples_per_second": 119.59,
888
- "eval_steps_per_second": 1.118,
889
  "step": 240
890
  },
891
  {
892
- "epoch": 0.3624260355029586,
893
- "grad_norm": 0.7847060294005617,
894
- "learning_rate": 3.9034832296362885e-05,
895
- "loss": 0.8899,
896
  "step": 245
897
  },
898
  {
899
- "epoch": 0.3624260355029586,
900
- "eval_loss": 0.9012376070022583,
901
- "eval_runtime": 4.4956,
902
- "eval_samples_per_second": 119.004,
903
- "eval_steps_per_second": 1.112,
904
- "step": 245
905
- },
906
- {
907
- "epoch": 0.3698224852071006,
908
- "grad_norm": 0.6874768710018212,
909
- "learning_rate": 3.855876106354553e-05,
910
- "loss": 0.8662,
911
- "step": 250
912
- },
913
- {
914
- "epoch": 0.3698224852071006,
915
- "eval_loss": 0.8995451331138611,
916
- "eval_runtime": 4.5879,
917
- "eval_samples_per_second": 116.611,
918
- "eval_steps_per_second": 1.09,
919
  "step": 250
920
  },
921
  {
922
- "epoch": 0.3772189349112426,
923
- "grad_norm": 0.7086167285320196,
924
- "learning_rate": 3.807606988381309e-05,
925
- "loss": 0.8878,
926
- "step": 255
927
- },
928
- {
929
- "epoch": 0.3772189349112426,
930
- "eval_loss": 0.8975517749786377,
931
- "eval_runtime": 4.5179,
932
- "eval_samples_per_second": 118.417,
933
- "eval_steps_per_second": 1.107,
934
  "step": 255
935
  },
936
  {
937
- "epoch": 0.38461538461538464,
938
- "grad_norm": 0.8083616193730793,
939
- "learning_rate": 3.758704770363688e-05,
940
- "loss": 0.8641,
941
- "step": 260
942
- },
943
- {
944
- "epoch": 0.38461538461538464,
945
- "eval_loss": 0.8961707353591919,
946
- "eval_runtime": 4.6188,
947
- "eval_samples_per_second": 115.83,
948
- "eval_steps_per_second": 1.083,
949
  "step": 260
950
  },
951
  {
952
- "epoch": 0.39201183431952663,
953
- "grad_norm": 0.80186012197302,
954
- "learning_rate": 3.7091987259323813e-05,
955
- "loss": 0.8701,
956
  "step": 265
957
  },
958
- {
959
- "epoch": 0.39201183431952663,
960
- "eval_loss": 0.8945226073265076,
961
- "eval_runtime": 4.6059,
962
- "eval_samples_per_second": 116.155,
963
- "eval_steps_per_second": 1.086,
964
- "step": 265
965
- },
966
- {
967
- "epoch": 0.3994082840236686,
968
- "grad_norm": 0.7672004326970635,
969
- "learning_rate": 3.6591184901779965e-05,
970
- "loss": 0.884,
971
- "step": 270
972
- },
973
- {
974
- "epoch": 0.3994082840236686,
975
- "eval_loss": 0.8951981663703918,
976
- "eval_runtime": 4.5266,
977
- "eval_samples_per_second": 118.191,
978
- "eval_steps_per_second": 1.105,
979
- "step": 270
980
- },
981
- {
982
- "epoch": 0.4068047337278107,
983
- "grad_norm": 0.8279425083821433,
984
- "learning_rate": 3.6084940419110235e-05,
985
- "loss": 0.865,
986
- "step": 275
987
- },
988
- {
989
- "epoch": 0.4068047337278107,
990
- "eval_loss": 0.8942187428474426,
991
- "eval_runtime": 4.4718,
992
- "eval_samples_per_second": 119.638,
993
- "eval_steps_per_second": 1.118,
994
- "step": 275
995
- },
996
- {
997
- "epoch": 0.41420118343195267,
998
- "grad_norm": 0.6924917194060726,
999
- "learning_rate": 3.557355685716056e-05,
1000
- "loss": 0.8963,
1001
- "step": 280
1002
- },
1003
- {
1004
- "epoch": 0.41420118343195267,
1005
- "eval_loss": 0.8927004337310791,
1006
- "eval_runtime": 4.5867,
1007
- "eval_samples_per_second": 116.642,
1008
- "eval_steps_per_second": 1.09,
1009
- "step": 280
1010
- },
1011
- {
1012
- "epoch": 0.42159763313609466,
1013
- "grad_norm": 0.7300617000412236,
1014
- "learning_rate": 3.505734033810989e-05,
1015
- "loss": 0.878,
1016
- "step": 285
1017
- },
1018
- {
1019
- "epoch": 0.42159763313609466,
1020
- "eval_loss": 0.8926271796226501,
1021
- "eval_runtime": 4.5293,
1022
- "eval_samples_per_second": 118.121,
1023
- "eval_steps_per_second": 1.104,
1024
- "step": 285
1025
- },
1026
- {
1027
- "epoch": 0.4289940828402367,
1028
- "grad_norm": 0.8294306981958416,
1029
- "learning_rate": 3.45365998772207e-05,
1030
- "loss": 0.876,
1031
- "step": 290
1032
- },
1033
- {
1034
- "epoch": 0.4289940828402367,
1035
- "eval_loss": 0.8919618725776672,
1036
- "eval_runtime": 4.533,
1037
- "eval_samples_per_second": 118.024,
1038
- "eval_steps_per_second": 1.103,
1039
- "step": 290
1040
- },
1041
- {
1042
- "epoch": 0.4363905325443787,
1043
- "grad_norm": 0.6786743799594892,
1044
- "learning_rate": 3.4011647197857654e-05,
1045
- "loss": 0.8816,
1046
- "step": 295
1047
- },
1048
- {
1049
- "epoch": 0.4363905325443787,
1050
- "eval_loss": 0.8911965489387512,
1051
- "eval_runtime": 4.6093,
1052
- "eval_samples_per_second": 116.069,
1053
- "eval_steps_per_second": 1.085,
1054
- "step": 295
1055
- },
1056
- {
1057
- "epoch": 0.4437869822485207,
1058
- "grad_norm": 0.833036937401054,
1059
- "learning_rate": 3.34827965448851e-05,
1060
- "loss": 0.8727,
1061
- "step": 300
1062
- },
1063
- {
1064
- "epoch": 0.4437869822485207,
1065
- "eval_loss": 0.8890377879142761,
1066
- "eval_runtime": 4.5425,
1067
- "eval_samples_per_second": 117.776,
1068
- "eval_steps_per_second": 1.101,
1069
- "step": 300
1070
- },
1071
- {
1072
- "epoch": 0.4511834319526627,
1073
- "grad_norm": 0.7813589947732351,
1074
- "learning_rate": 3.2950364496555214e-05,
1075
- "loss": 0.8738,
1076
- "step": 305
1077
- },
1078
- {
1079
- "epoch": 0.4511834319526627,
1080
- "eval_loss": 0.8887431621551514,
1081
- "eval_runtime": 4.6379,
1082
- "eval_samples_per_second": 115.353,
1083
- "eval_steps_per_second": 1.078,
1084
- "step": 305
1085
- },
1086
- {
1087
- "epoch": 0.45857988165680474,
1088
- "grad_norm": 0.7325656989385956,
1089
- "learning_rate": 3.241466977499929e-05,
1090
- "loss": 0.8658,
1091
- "step": 310
1092
- },
1093
- {
1094
- "epoch": 0.45857988165680474,
1095
- "eval_loss": 0.887874186038971,
1096
- "eval_runtime": 4.6002,
1097
- "eval_samples_per_second": 116.301,
1098
- "eval_steps_per_second": 1.087,
1099
- "step": 310
1100
- },
1101
- {
1102
- "epoch": 0.46597633136094674,
1103
- "grad_norm": 0.9022362063604372,
1104
- "learning_rate": 3.187603305543577e-05,
1105
- "loss": 0.8705,
1106
- "step": 315
1107
- },
1108
- {
1109
- "epoch": 0.46597633136094674,
1110
- "eval_loss": 0.8875709772109985,
1111
- "eval_runtime": 4.59,
1112
- "eval_samples_per_second": 116.559,
1113
- "eval_steps_per_second": 1.089,
1114
- "step": 315
1115
- },
1116
- {
1117
- "epoch": 0.47337278106508873,
1118
- "grad_norm": 0.7741483099078358,
1119
- "learning_rate": 3.133477677420894e-05,
1120
- "loss": 0.8922,
1121
- "step": 320
1122
- },
1123
- {
1124
- "epoch": 0.47337278106508873,
1125
- "eval_loss": 0.8861641883850098,
1126
- "eval_runtime": 4.5295,
1127
- "eval_samples_per_second": 118.115,
1128
- "eval_steps_per_second": 1.104,
1129
- "step": 320
1130
- },
1131
- {
1132
- "epoch": 0.4807692307692308,
1133
- "grad_norm": 0.6898160412918697,
1134
- "learning_rate": 3.0791224935773624e-05,
1135
- "loss": 0.8716,
1136
- "step": 325
1137
- },
1138
- {
1139
- "epoch": 0.4807692307692308,
1140
- "eval_loss": 0.8853756189346313,
1141
- "eval_runtime": 4.5133,
1142
- "eval_samples_per_second": 118.54,
1143
- "eval_steps_per_second": 1.108,
1144
- "step": 325
1145
- },
1146
- {
1147
- "epoch": 0.4881656804733728,
1148
- "grad_norm": 0.6815469807890937,
1149
- "learning_rate": 3.0245702918740964e-05,
1150
- "loss": 0.8554,
1151
- "step": 330
1152
- },
1153
- {
1154
- "epoch": 0.4881656804733728,
1155
- "eval_loss": 0.8842912912368774,
1156
- "eval_runtime": 4.5403,
1157
- "eval_samples_per_second": 117.834,
1158
- "eval_steps_per_second": 1.101,
1159
- "step": 330
1160
- },
1161
- {
1162
- "epoch": 0.49556213017751477,
1163
- "grad_norm": 0.6536960015603316,
1164
- "learning_rate": 2.969853728110179e-05,
1165
- "loss": 0.8469,
1166
- "step": 335
1167
- },
1168
- {
1169
- "epoch": 0.49556213017751477,
1170
- "eval_loss": 0.8840665817260742,
1171
- "eval_runtime": 4.5356,
1172
- "eval_samples_per_second": 117.957,
1173
- "eval_steps_per_second": 1.102,
1174
- "step": 335
1175
- },
1176
- {
1177
- "epoch": 0.5029585798816568,
1178
- "grad_norm": 0.6782286300780161,
1179
- "learning_rate": 2.915005556474384e-05,
1180
- "loss": 0.8699,
1181
- "step": 340
1182
- },
1183
- {
1184
- "epoch": 0.5029585798816568,
1185
- "eval_loss": 0.8830198645591736,
1186
- "eval_runtime": 4.4892,
1187
- "eval_samples_per_second": 119.175,
1188
- "eval_steps_per_second": 1.114,
1189
- "step": 340
1190
- },
1191
- {
1192
- "epoch": 0.5103550295857988,
1193
- "grad_norm": 0.6416520739158129,
1194
- "learning_rate": 2.8600586099380123e-05,
1195
- "loss": 0.8633,
1196
- "step": 345
1197
- },
1198
- {
1199
- "epoch": 0.5103550295857988,
1200
- "eval_loss": 0.8818948268890381,
1201
- "eval_runtime": 4.4716,
1202
- "eval_samples_per_second": 119.643,
1203
- "eval_steps_per_second": 1.118,
1204
- "step": 345
1205
- },
1206
- {
1207
- "epoch": 0.5177514792899408,
1208
- "grad_norm": 0.7185733539593375,
1209
- "learning_rate": 2.8050457806005613e-05,
1210
- "loss": 0.8604,
1211
- "step": 350
1212
- },
1213
- {
1214
- "epoch": 0.5177514792899408,
1215
- "eval_loss": 0.8818439245223999,
1216
- "eval_runtime": 4.5556,
1217
- "eval_samples_per_second": 117.438,
1218
- "eval_steps_per_second": 1.098,
1219
- "step": 350
1220
- },
1221
- {
1222
- "epoch": 0.5251479289940828,
1223
- "grad_norm": 0.7685402090063009,
1224
- "learning_rate": 2.7500000000000004e-05,
1225
- "loss": 0.8607,
1226
- "step": 355
1227
- },
1228
- {
1229
- "epoch": 0.5251479289940828,
1230
- "eval_loss": 0.8809483051300049,
1231
- "eval_runtime": 4.4752,
1232
- "eval_samples_per_second": 119.546,
1233
- "eval_steps_per_second": 1.117,
1234
- "step": 355
1235
- },
1236
- {
1237
- "epoch": 0.5325443786982249,
1238
- "grad_norm": 0.7342686897340025,
1239
- "learning_rate": 2.69495421939944e-05,
1240
- "loss": 0.842,
1241
- "step": 360
1242
- },
1243
- {
1244
- "epoch": 0.5325443786982249,
1245
- "eval_loss": 0.8807406425476074,
1246
- "eval_runtime": 4.4623,
1247
- "eval_samples_per_second": 119.894,
1248
- "eval_steps_per_second": 1.121,
1249
- "step": 360
1250
- },
1251
- {
1252
- "epoch": 0.5399408284023669,
1253
- "grad_norm": 0.8172803037562146,
1254
- "learning_rate": 2.639941390061988e-05,
1255
- "loss": 0.8625,
1256
- "step": 365
1257
- },
1258
- {
1259
- "epoch": 0.5399408284023669,
1260
- "eval_loss": 0.8791869878768921,
1261
- "eval_runtime": 4.5731,
1262
- "eval_samples_per_second": 116.988,
1263
- "eval_steps_per_second": 1.093,
1264
- "step": 365
1265
- },
1266
- {
1267
- "epoch": 0.5473372781065089,
1268
- "grad_norm": 0.7569175685957509,
1269
- "learning_rate": 2.584994443525617e-05,
1270
- "loss": 0.8662,
1271
- "step": 370
1272
- },
1273
- {
1274
- "epoch": 0.5473372781065089,
1275
- "eval_loss": 0.8787435293197632,
1276
- "eval_runtime": 4.578,
1277
- "eval_samples_per_second": 116.863,
1278
- "eval_steps_per_second": 1.092,
1279
- "step": 370
1280
- },
1281
- {
1282
- "epoch": 0.5547337278106509,
1283
- "grad_norm": 0.7873241841048441,
1284
- "learning_rate": 2.5301462718898215e-05,
1285
- "loss": 0.8552,
1286
- "step": 375
1287
- },
1288
- {
1289
- "epoch": 0.5547337278106509,
1290
- "eval_loss": 0.8779678344726562,
1291
- "eval_runtime": 4.532,
1292
- "eval_samples_per_second": 118.05,
1293
- "eval_steps_per_second": 1.103,
1294
- "step": 375
1295
- },
1296
- {
1297
- "epoch": 0.5621301775147929,
1298
- "grad_norm": 0.7232681596622134,
1299
- "learning_rate": 2.4754297081259048e-05,
1300
- "loss": 0.8741,
1301
- "step": 380
1302
- },
1303
- {
1304
- "epoch": 0.5621301775147929,
1305
- "eval_loss": 0.87770015001297,
1306
- "eval_runtime": 4.585,
1307
- "eval_samples_per_second": 116.685,
1308
- "eval_steps_per_second": 1.091,
1309
- "step": 380
1310
- },
1311
- {
1312
- "epoch": 0.5695266272189349,
1313
- "grad_norm": 0.7943811735564513,
1314
- "learning_rate": 2.4208775064226384e-05,
1315
- "loss": 0.8499,
1316
- "step": 385
1317
- },
1318
- {
1319
- "epoch": 0.5695266272189349,
1320
- "eval_loss": 0.8768277764320374,
1321
- "eval_runtime": 4.532,
1322
- "eval_samples_per_second": 118.051,
1323
- "eval_steps_per_second": 1.103,
1324
- "step": 385
1325
- },
1326
- {
1327
- "epoch": 0.5769230769230769,
1328
- "grad_norm": 0.6788490872865308,
1329
- "learning_rate": 2.3665223225791074e-05,
1330
- "loss": 0.8509,
1331
- "step": 390
1332
- },
1333
- {
1334
- "epoch": 0.5769230769230769,
1335
- "eval_loss": 0.876204788684845,
1336
- "eval_runtime": 4.5571,
1337
- "eval_samples_per_second": 117.399,
1338
- "eval_steps_per_second": 1.097,
1339
- "step": 390
1340
- },
1341
- {
1342
- "epoch": 0.584319526627219,
1343
- "grad_norm": 0.6283691221546651,
1344
- "learning_rate": 2.3123966944564242e-05,
1345
- "loss": 0.8546,
1346
- "step": 395
1347
- },
1348
- {
1349
- "epoch": 0.584319526627219,
1350
- "eval_loss": 0.8753672242164612,
1351
- "eval_runtime": 4.4802,
1352
- "eval_samples_per_second": 119.414,
1353
- "eval_steps_per_second": 1.116,
1354
- "step": 395
1355
- },
1356
- {
1357
- "epoch": 0.591715976331361,
1358
- "grad_norm": 0.6786610256300601,
1359
- "learning_rate": 2.258533022500071e-05,
1360
- "loss": 0.836,
1361
- "step": 400
1362
- },
1363
- {
1364
- "epoch": 0.591715976331361,
1365
- "eval_loss": 0.874183714389801,
1366
- "eval_runtime": 4.569,
1367
- "eval_samples_per_second": 117.093,
1368
- "eval_steps_per_second": 1.094,
1369
- "step": 400
1370
- },
1371
- {
1372
- "epoch": 0.599112426035503,
1373
- "grad_norm": 0.6813901972627211,
1374
- "learning_rate": 2.2049635503444792e-05,
1375
- "loss": 0.8555,
1376
- "step": 405
1377
- },
1378
- {
1379
- "epoch": 0.599112426035503,
1380
- "eval_loss": 0.8732807636260986,
1381
- "eval_runtime": 4.5619,
1382
- "eval_samples_per_second": 117.275,
1383
- "eval_steps_per_second": 1.096,
1384
- "step": 405
1385
- },
1386
- {
1387
- "epoch": 0.606508875739645,
1388
- "grad_norm": 0.660113771455314,
1389
- "learning_rate": 2.151720345511491e-05,
1390
- "loss": 0.8483,
1391
- "step": 410
1392
- },
1393
- {
1394
- "epoch": 0.606508875739645,
1395
- "eval_loss": 0.8721917867660522,
1396
- "eval_runtime": 4.7031,
1397
- "eval_samples_per_second": 113.756,
1398
- "eval_steps_per_second": 1.063,
1399
- "step": 410
1400
- },
1401
- {
1402
- "epoch": 0.613905325443787,
1403
- "grad_norm": 0.6554745701701462,
1404
- "learning_rate": 2.0988352802142352e-05,
1405
- "loss": 0.8344,
1406
- "step": 415
1407
- },
1408
- {
1409
- "epoch": 0.613905325443787,
1410
- "eval_loss": 0.8716893792152405,
1411
- "eval_runtime": 4.6708,
1412
- "eval_samples_per_second": 114.541,
1413
- "eval_steps_per_second": 1.07,
1414
- "step": 415
1415
- },
1416
- {
1417
- "epoch": 0.621301775147929,
1418
- "grad_norm": 0.6532273385705507,
1419
- "learning_rate": 2.0463400122779307e-05,
1420
- "loss": 0.8336,
1421
- "step": 420
1422
- },
1423
- {
1424
- "epoch": 0.621301775147929,
1425
- "eval_loss": 0.871632993221283,
1426
- "eval_runtime": 4.6552,
1427
- "eval_samples_per_second": 114.924,
1428
- "eval_steps_per_second": 1.074,
1429
- "step": 420
1430
- },
1431
- {
1432
- "epoch": 0.628698224852071,
1433
- "grad_norm": 0.6375832377334109,
1434
- "learning_rate": 1.994265966189012e-05,
1435
- "loss": 0.8452,
1436
- "step": 425
1437
- },
1438
- {
1439
- "epoch": 0.628698224852071,
1440
- "eval_loss": 0.8717252016067505,
1441
- "eval_runtime": 4.5491,
1442
- "eval_samples_per_second": 117.607,
1443
- "eval_steps_per_second": 1.099,
1444
- "step": 425
1445
- },
1446
- {
1447
- "epoch": 0.636094674556213,
1448
- "grad_norm": 0.6981207777177686,
1449
- "learning_rate": 1.9426443142839447e-05,
1450
- "loss": 0.8429,
1451
- "step": 430
1452
- },
1453
- {
1454
- "epoch": 0.636094674556213,
1455
- "eval_loss": 0.8708469271659851,
1456
- "eval_runtime": 4.5282,
1457
- "eval_samples_per_second": 118.148,
1458
- "eval_steps_per_second": 1.104,
1459
- "step": 430
1460
- },
1461
- {
1462
- "epoch": 0.643491124260355,
1463
- "grad_norm": 0.652634344146103,
1464
- "learning_rate": 1.891505958088977e-05,
1465
- "loss": 0.8405,
1466
- "step": 435
1467
- },
1468
- {
1469
- "epoch": 0.643491124260355,
1470
- "eval_loss": 0.8699945211410522,
1471
- "eval_runtime": 4.5588,
1472
- "eval_samples_per_second": 117.354,
1473
- "eval_steps_per_second": 1.097,
1474
- "step": 435
1475
- },
1476
- {
1477
- "epoch": 0.650887573964497,
1478
- "grad_norm": 0.6201611689922387,
1479
- "learning_rate": 1.8408815098220043e-05,
1480
- "loss": 0.8566,
1481
- "step": 440
1482
- },
1483
- {
1484
- "epoch": 0.650887573964497,
1485
- "eval_loss": 0.8690354228019714,
1486
- "eval_runtime": 4.5688,
1487
- "eval_samples_per_second": 117.098,
1488
- "eval_steps_per_second": 1.094,
1489
- "step": 440
1490
- },
1491
- {
1492
- "epoch": 0.658284023668639,
1493
- "grad_norm": 0.6371435519919459,
1494
- "learning_rate": 1.7908012740676195e-05,
1495
- "loss": 0.8412,
1496
- "step": 445
1497
- },
1498
- {
1499
- "epoch": 0.658284023668639,
1500
- "eval_loss": 0.8690587282180786,
1501
- "eval_runtime": 4.5404,
1502
- "eval_samples_per_second": 117.832,
1503
- "eval_steps_per_second": 1.101,
1504
- "step": 445
1505
- },
1506
- {
1507
- "epoch": 0.665680473372781,
1508
- "grad_norm": 0.6528979398382087,
1509
- "learning_rate": 1.7412952296363133e-05,
1510
- "loss": 0.8395,
1511
- "step": 450
1512
- },
1513
- {
1514
- "epoch": 0.665680473372781,
1515
- "eval_loss": 0.8680040240287781,
1516
- "eval_runtime": 4.4831,
1517
- "eval_samples_per_second": 119.337,
1518
- "eval_steps_per_second": 1.115,
1519
- "step": 450
1520
- },
1521
- {
1522
- "epoch": 0.6730769230769231,
1523
- "grad_norm": 0.7567869297925619,
1524
- "learning_rate": 1.6923930116186907e-05,
1525
- "loss": 0.8481,
1526
- "step": 455
1527
- },
1528
- {
1529
- "epoch": 0.6730769230769231,
1530
- "eval_loss": 0.867551863193512,
1531
- "eval_runtime": 4.4437,
1532
- "eval_samples_per_second": 120.395,
1533
- "eval_steps_per_second": 1.125,
1534
- "step": 455
1535
- },
1536
- {
1537
- "epoch": 0.6804733727810651,
1538
- "grad_norm": 0.6773475295723909,
1539
- "learning_rate": 1.644123893645448e-05,
1540
- "loss": 0.8605,
1541
- "step": 460
1542
- },
1543
- {
1544
- "epoch": 0.6804733727810651,
1545
- "eval_loss": 0.8671652674674988,
1546
- "eval_runtime": 4.5312,
1547
- "eval_samples_per_second": 118.069,
1548
- "eval_steps_per_second": 1.103,
1549
- "step": 460
1550
- },
1551
- {
1552
- "epoch": 0.6878698224852071,
1553
- "grad_norm": 0.7010338732044618,
1554
- "learning_rate": 1.5965167703637124e-05,
1555
- "loss": 0.8395,
1556
- "step": 465
1557
- },
1558
- {
1559
- "epoch": 0.6878698224852071,
1560
- "eval_loss": 0.8658307790756226,
1561
- "eval_runtime": 4.586,
1562
- "eval_samples_per_second": 116.659,
1563
- "eval_steps_per_second": 1.09,
1564
- "step": 465
1565
- },
1566
- {
1567
- "epoch": 0.6952662721893491,
1568
- "grad_norm": 0.6702003501907882,
1569
- "learning_rate": 1.5496001401402644e-05,
1570
- "loss": 0.8418,
1571
- "step": 470
1572
- },
1573
- {
1574
- "epoch": 0.6952662721893491,
1575
- "eval_loss": 0.8654137253761292,
1576
- "eval_runtime": 4.4934,
1577
- "eval_samples_per_second": 119.062,
1578
- "eval_steps_per_second": 1.113,
1579
- "step": 470
1580
- },
1581
- {
1582
- "epoch": 0.7026627218934911,
1583
- "grad_norm": 0.6408882433617089,
1584
- "learning_rate": 1.5034020880019619e-05,
1585
- "loss": 0.8528,
1586
- "step": 475
1587
- },
1588
- {
1589
- "epoch": 0.7026627218934911,
1590
- "eval_loss": 0.8655184507369995,
1591
- "eval_runtime": 4.427,
1592
- "eval_samples_per_second": 120.85,
1593
- "eval_steps_per_second": 1.129,
1594
- "step": 475
1595
- },
1596
- {
1597
- "epoch": 0.7100591715976331,
1598
- "grad_norm": 0.6451588589935744,
1599
- "learning_rate": 1.4579502688236146e-05,
1600
- "loss": 0.856,
1601
- "step": 480
1602
- },
1603
- {
1604
- "epoch": 0.7100591715976331,
1605
- "eval_loss": 0.8649392127990723,
1606
- "eval_runtime": 4.5976,
1607
- "eval_samples_per_second": 116.366,
1608
- "eval_steps_per_second": 1.088,
1609
- "step": 480
1610
- },
1611
- {
1612
- "epoch": 0.7174556213017751,
1613
- "grad_norm": 0.6270246011748759,
1614
- "learning_rate": 1.4132718907733361e-05,
1615
- "loss": 0.8361,
1616
- "step": 485
1617
- },
1618
- {
1619
- "epoch": 0.7174556213017751,
1620
- "eval_loss": 0.864596962928772,
1621
- "eval_runtime": 4.5791,
1622
- "eval_samples_per_second": 116.835,
1623
- "eval_steps_per_second": 1.092,
1624
- "step": 485
1625
- },
1626
- {
1627
- "epoch": 0.7248520710059172,
1628
- "grad_norm": 0.7162779140578355,
1629
- "learning_rate": 1.3693936990253142e-05,
1630
- "loss": 0.8391,
1631
- "step": 490
1632
- },
1633
- {
1634
- "epoch": 0.7248520710059172,
1635
- "eval_loss": 0.8636202216148376,
1636
- "eval_runtime": 4.8532,
1637
- "eval_samples_per_second": 110.238,
1638
- "eval_steps_per_second": 1.03,
1639
- "step": 490
1640
- },
1641
- {
1642
- "epoch": 0.7322485207100592,
1643
- "grad_norm": 0.6264415975109302,
1644
- "learning_rate": 1.326341959749731e-05,
1645
- "loss": 0.8402,
1646
- "step": 495
1647
- },
1648
- {
1649
- "epoch": 0.7322485207100592,
1650
- "eval_loss": 0.8627746105194092,
1651
- "eval_runtime": 4.5577,
1652
- "eval_samples_per_second": 117.385,
1653
- "eval_steps_per_second": 1.097,
1654
- "step": 495
1655
- },
1656
- {
1657
- "epoch": 0.7396449704142012,
1658
- "grad_norm": 0.5999441934467236,
1659
- "learning_rate": 1.2841424443894246e-05,
1660
- "loss": 0.8356,
1661
- "step": 500
1662
- },
1663
- {
1664
- "epoch": 0.7396449704142012,
1665
- "eval_loss": 0.8623146414756775,
1666
- "eval_runtime": 4.5802,
1667
- "eval_samples_per_second": 116.807,
1668
- "eval_steps_per_second": 1.092,
1669
- "step": 500
1670
- },
1671
- {
1672
- "epoch": 0.7470414201183432,
1673
- "grad_norm": 0.6129410145276629,
1674
- "learning_rate": 1.2428204142327e-05,
1675
- "loss": 0.8462,
1676
- "step": 505
1677
- },
1678
- {
1679
- "epoch": 0.7470414201183432,
1680
- "eval_loss": 0.86202472448349,
1681
- "eval_runtime": 4.5046,
1682
- "eval_samples_per_second": 118.766,
1683
- "eval_steps_per_second": 1.11,
1684
- "step": 505
1685
- },
1686
- {
1687
- "epoch": 0.7544378698224852,
1688
- "grad_norm": 0.8356836512352364,
1689
- "learning_rate": 1.2024006052915295e-05,
1690
- "loss": 0.8276,
1691
- "step": 510
1692
- },
1693
- {
1694
- "epoch": 0.7544378698224852,
1695
- "eval_loss": 0.8616589903831482,
1696
- "eval_runtime": 4.5743,
1697
- "eval_samples_per_second": 116.957,
1698
- "eval_steps_per_second": 1.093,
1699
- "step": 510
1700
- },
1701
- {
1702
- "epoch": 0.7618343195266272,
1703
- "grad_norm": 0.5779272740478268,
1704
- "learning_rate": 1.1629072134941883e-05,
1705
- "loss": 0.8478,
1706
- "step": 515
1707
- },
1708
- {
1709
- "epoch": 0.7618343195266272,
1710
- "eval_loss": 0.8606404066085815,
1711
- "eval_runtime": 4.6303,
1712
- "eval_samples_per_second": 115.544,
1713
- "eval_steps_per_second": 1.08,
1714
- "step": 515
1715
- },
1716
- {
1717
- "epoch": 0.7692307692307693,
1718
- "grad_norm": 0.6069493343307135,
1719
- "learning_rate": 1.1243638802011954e-05,
1720
- "loss": 0.8438,
1721
- "step": 520
1722
- },
1723
- {
1724
- "epoch": 0.7692307692307693,
1725
- "eval_loss": 0.8605805039405823,
1726
- "eval_runtime": 4.5932,
1727
- "eval_samples_per_second": 116.477,
1728
- "eval_steps_per_second": 1.089,
1729
- "step": 520
1730
- },
1731
- {
1732
- "epoch": 0.7766272189349113,
1733
- "grad_norm": 0.6125533080147663,
1734
- "learning_rate": 1.0867936780532248e-05,
1735
- "loss": 0.8439,
1736
- "step": 525
1737
- },
1738
- {
1739
- "epoch": 0.7766272189349113,
1740
- "eval_loss": 0.8604384064674377,
1741
- "eval_runtime": 4.6021,
1742
- "eval_samples_per_second": 116.25,
1743
- "eval_steps_per_second": 1.086,
1744
- "step": 525
1745
- },
1746
- {
1747
- "epoch": 0.7840236686390533,
1748
- "grad_norm": 0.6452403926195849,
1749
- "learning_rate": 1.0502190971594672e-05,
1750
- "loss": 0.8424,
1751
- "step": 530
1752
- },
1753
- {
1754
- "epoch": 0.7840236686390533,
1755
- "eval_loss": 0.859944760799408,
1756
- "eval_runtime": 4.5374,
1757
- "eval_samples_per_second": 117.91,
1758
- "eval_steps_per_second": 1.102,
1759
- "step": 530
1760
- },
1761
- {
1762
- "epoch": 0.7914201183431953,
1763
- "grad_norm": 0.5745825978672692,
1764
- "learning_rate": 1.014662031634692e-05,
1765
- "loss": 0.8235,
1766
- "step": 535
1767
- },
1768
- {
1769
- "epoch": 0.7914201183431953,
1770
- "eval_loss": 0.8596252202987671,
1771
- "eval_runtime": 4.4046,
1772
- "eval_samples_per_second": 121.465,
1773
- "eval_steps_per_second": 1.135,
1774
- "step": 535
1775
- },
1776
- {
1777
- "epoch": 0.7988165680473372,
1778
- "grad_norm": 0.5837129035152734,
1779
- "learning_rate": 9.80143766493097e-06,
1780
- "loss": 0.8289,
1781
- "step": 540
1782
- },
1783
- {
1784
- "epoch": 0.7988165680473372,
1785
- "eval_loss": 0.8592759966850281,
1786
- "eval_runtime": 4.5322,
1787
- "eval_samples_per_second": 118.044,
1788
- "eval_steps_per_second": 1.103,
1789
- "step": 540
1790
- },
1791
- {
1792
- "epoch": 0.8062130177514792,
1793
- "grad_norm": 0.5813377576331795,
1794
- "learning_rate": 9.466849649067596e-06,
1795
- "loss": 0.83,
1796
- "step": 545
1797
- },
1798
- {
1799
- "epoch": 0.8062130177514792,
1800
- "eval_loss": 0.8585328459739685,
1801
- "eval_runtime": 4.5627,
1802
- "eval_samples_per_second": 117.254,
1803
- "eval_steps_per_second": 1.096,
1804
- "step": 545
1805
- },
1806
- {
1807
- "epoch": 0.8136094674556213,
1808
- "grad_norm": 0.6039721338817958,
1809
- "learning_rate": 9.143056558363463e-06,
1810
- "loss": 0.8363,
1811
- "step": 550
1812
- },
1813
- {
1814
- "epoch": 0.8136094674556213,
1815
- "eval_loss": 0.8581274747848511,
1816
- "eval_runtime": 4.6357,
1817
- "eval_samples_per_second": 115.408,
1818
- "eval_steps_per_second": 1.079,
1819
- "step": 550
1820
- },
1821
- {
1822
- "epoch": 0.8210059171597633,
1823
- "grad_norm": 0.5888712234714697,
1824
- "learning_rate": 8.83025222041459e-06,
1825
- "loss": 0.8403,
1826
- "step": 555
1827
- },
1828
- {
1829
- "epoch": 0.8210059171597633,
1830
- "eval_loss": 0.8580217361450195,
1831
- "eval_runtime": 4.5942,
1832
- "eval_samples_per_second": 116.45,
1833
- "eval_steps_per_second": 1.088,
1834
- "step": 555
1835
- },
1836
- {
1837
- "epoch": 0.8284023668639053,
1838
- "grad_norm": 0.6337569313224579,
1839
- "learning_rate": 8.528623884778144e-06,
1840
- "loss": 0.8517,
1841
- "step": 560
1842
- },
1843
- {
1844
- "epoch": 0.8284023668639053,
1845
- "eval_loss": 0.8578224182128906,
1846
- "eval_runtime": 4.9197,
1847
- "eval_samples_per_second": 108.747,
1848
- "eval_steps_per_second": 1.016,
1849
- "step": 560
1850
- },
1851
- {
1852
- "epoch": 0.8357988165680473,
1853
- "grad_norm": 0.5656560440585017,
1854
- "learning_rate": 8.238352110881945e-06,
1855
- "loss": 0.8286,
1856
- "step": 565
1857
- },
1858
- {
1859
- "epoch": 0.8357988165680473,
1860
- "eval_loss": 0.857262372970581,
1861
- "eval_runtime": 4.4021,
1862
- "eval_samples_per_second": 121.533,
1863
- "eval_steps_per_second": 1.136,
1864
- "step": 565
1865
- },
1866
- {
1867
- "epoch": 0.8431952662721893,
1868
- "grad_norm": 0.6428278413031142,
1869
- "learning_rate": 7.959610659938765e-06,
1870
- "loss": 0.8341,
1871
- "step": 570
1872
- },
1873
- {
1874
- "epoch": 0.8431952662721893,
1875
- "eval_loss": 0.8569393157958984,
1876
- "eval_runtime": 4.5023,
1877
- "eval_samples_per_second": 118.829,
1878
- "eval_steps_per_second": 1.111,
1879
- "step": 570
1880
- },
1881
- {
1882
- "epoch": 0.8505917159763313,
1883
- "grad_norm": 0.5744234711786275,
1884
- "learning_rate": 7.69256639093015e-06,
1885
- "loss": 0.8262,
1886
- "step": 575
1887
- },
1888
- {
1889
- "epoch": 0.8505917159763313,
1890
- "eval_loss": 0.8566195368766785,
1891
- "eval_runtime": 4.558,
1892
- "eval_samples_per_second": 117.375,
1893
- "eval_steps_per_second": 1.097,
1894
- "step": 575
1895
- },
1896
- {
1897
- "epoch": 0.8579881656804734,
1898
- "grad_norm": 0.578200912437047,
1899
- "learning_rate": 7.4373791607220455e-06,
1900
- "loss": 0.8351,
1901
- "step": 580
1902
- },
1903
- {
1904
- "epoch": 0.8579881656804734,
1905
- "eval_loss": 0.8565072417259216,
1906
- "eval_runtime": 4.5104,
1907
- "eval_samples_per_second": 118.615,
1908
- "eval_steps_per_second": 1.109,
1909
- "step": 580
1910
- },
1911
- {
1912
- "epoch": 0.8653846153846154,
1913
- "grad_norm": 0.5778075441668177,
1914
- "learning_rate": 7.194201728371964e-06,
1915
- "loss": 0.8329,
1916
- "step": 585
1917
- },
1918
- {
1919
- "epoch": 0.8653846153846154,
1920
- "eval_loss": 0.8561302423477173,
1921
- "eval_runtime": 4.5476,
1922
- "eval_samples_per_second": 117.646,
1923
- "eval_steps_per_second": 1.099,
1924
- "step": 585
1925
- },
1926
- {
1927
- "epoch": 0.8727810650887574,
1928
- "grad_norm": 0.5935938929493908,
1929
- "learning_rate": 6.96317966368506e-06,
1930
- "loss": 0.841,
1931
- "step": 590
1932
- },
1933
- {
1934
- "epoch": 0.8727810650887574,
1935
- "eval_loss": 0.8557109832763672,
1936
- "eval_runtime": 4.567,
1937
- "eval_samples_per_second": 117.145,
1938
- "eval_steps_per_second": 1.095,
1939
- "step": 590
1940
- },
1941
- {
1942
- "epoch": 0.8801775147928994,
1943
- "grad_norm": 0.6038472885423796,
1944
- "learning_rate": 6.744451260073718e-06,
1945
- "loss": 0.8286,
1946
- "step": 595
1947
- },
1948
- {
1949
- "epoch": 0.8801775147928994,
1950
- "eval_loss": 0.8555126786231995,
1951
- "eval_runtime": 4.5524,
1952
- "eval_samples_per_second": 117.519,
1953
- "eval_steps_per_second": 1.098,
1954
- "step": 595
1955
- },
1956
- {
1957
- "epoch": 0.8875739644970414,
1958
- "grad_norm": 0.5943088695918576,
1959
- "learning_rate": 6.538147451773029e-06,
1960
- "loss": 0.835,
1961
- "step": 600
1962
- },
1963
- {
1964
- "epoch": 0.8875739644970414,
1965
- "eval_loss": 0.8550283312797546,
1966
- "eval_runtime": 4.4943,
1967
- "eval_samples_per_second": 119.039,
1968
- "eval_steps_per_second": 1.113,
1969
- "step": 600
1970
- },
1971
- {
1972
- "epoch": 0.8949704142011834,
1973
- "grad_norm": 0.6151479841904817,
1974
- "learning_rate": 6.3443917354614466e-06,
1975
- "loss": 0.8313,
1976
- "step": 605
1977
- },
1978
- {
1979
- "epoch": 0.8949704142011834,
1980
- "eval_loss": 0.8547914028167725,
1981
- "eval_runtime": 4.5869,
1982
- "eval_samples_per_second": 116.638,
1983
- "eval_steps_per_second": 1.09,
1984
- "step": 605
1985
- },
1986
- {
1987
- "epoch": 0.9023668639053254,
1988
- "grad_norm": 0.5914893004098631,
1989
- "learning_rate": 6.163300096333806e-06,
1990
- "loss": 0.8276,
1991
- "step": 610
1992
- },
1993
- {
1994
- "epoch": 0.9023668639053254,
1995
- "eval_loss": 0.8543236255645752,
1996
- "eval_runtime": 4.5789,
1997
- "eval_samples_per_second": 116.84,
1998
- "eval_steps_per_second": 1.092,
1999
- "step": 610
2000
- },
2001
- {
2002
- "epoch": 0.9097633136094675,
2003
- "grad_norm": 0.5915554024359355,
2004
- "learning_rate": 5.9949809386707394e-06,
2005
- "loss": 0.8252,
2006
- "step": 615
2007
- },
2008
- {
2009
- "epoch": 0.9097633136094675,
2010
- "eval_loss": 0.8538296818733215,
2011
- "eval_runtime": 4.7415,
2012
- "eval_samples_per_second": 112.832,
2013
- "eval_steps_per_second": 1.055,
2014
- "step": 615
2015
- },
2016
- {
2017
- "epoch": 0.9171597633136095,
2018
- "grad_norm": 0.5682950068319568,
2019
- "learning_rate": 5.839535020946193e-06,
2020
- "loss": 0.8177,
2021
- "step": 620
2022
- },
2023
- {
2024
- "epoch": 0.9171597633136095,
2025
- "eval_loss": 0.8535985946655273,
2026
- "eval_runtime": 4.5719,
2027
- "eval_samples_per_second": 117.02,
2028
- "eval_steps_per_second": 1.094,
2029
- "step": 620
2030
- },
2031
- {
2032
- "epoch": 0.9245562130177515,
2033
- "grad_norm": 0.5863960670439361,
2034
- "learning_rate": 5.697055395511836e-06,
2035
- "loss": 0.8342,
2036
- "step": 625
2037
- },
2038
- {
2039
- "epoch": 0.9245562130177515,
2040
- "eval_loss": 0.8532615303993225,
2041
- "eval_runtime": 4.5376,
2042
- "eval_samples_per_second": 117.903,
2043
- "eval_steps_per_second": 1.102,
2044
- "step": 625
2045
- },
2046
- {
2047
- "epoch": 0.9319526627218935,
2048
- "grad_norm": 0.5531562271776167,
2049
- "learning_rate": 5.567627352894467e-06,
2050
- "loss": 0.8402,
2051
- "step": 630
2052
- },
2053
- {
2054
- "epoch": 0.9319526627218935,
2055
- "eval_loss": 0.8529289960861206,
2056
- "eval_runtime": 4.5667,
2057
- "eval_samples_per_second": 117.152,
2058
- "eval_steps_per_second": 1.095,
2059
- "step": 630
2060
- },
2061
- {
2062
- "epoch": 0.9393491124260355,
2063
- "grad_norm": 0.595496303433799,
2064
- "learning_rate": 5.451328370739774e-06,
2065
- "loss": 0.8233,
2066
- "step": 635
2067
- },
2068
- {
2069
- "epoch": 0.9393491124260355,
2070
- "eval_loss": 0.8527988195419312,
2071
- "eval_runtime": 4.6311,
2072
- "eval_samples_per_second": 115.523,
2073
- "eval_steps_per_second": 1.08,
2074
- "step": 635
2075
- },
2076
- {
2077
- "epoch": 0.9467455621301775,
2078
- "grad_norm": 0.62602812517354,
2079
- "learning_rate": 5.3482280674330136e-06,
2080
- "loss": 0.8183,
2081
- "step": 640
2082
- },
2083
- {
2084
- "epoch": 0.9467455621301775,
2085
- "eval_loss": 0.852441668510437,
2086
- "eval_runtime": 4.5774,
2087
- "eval_samples_per_second": 116.877,
2088
- "eval_steps_per_second": 1.092,
2089
- "step": 640
2090
- },
2091
- {
2092
- "epoch": 0.9541420118343196,
2093
- "grad_norm": 0.5716364859992062,
2094
- "learning_rate": 5.25838816042435e-06,
2095
- "loss": 0.822,
2096
- "step": 645
2097
- },
2098
- {
2099
- "epoch": 0.9541420118343196,
2100
- "eval_loss": 0.8520421385765076,
2101
- "eval_runtime": 4.5908,
2102
- "eval_samples_per_second": 116.537,
2103
- "eval_steps_per_second": 1.089,
2104
- "step": 645
2105
- },
2106
- {
2107
- "epoch": 0.9615384615384616,
2108
- "grad_norm": 0.5973178018451643,
2109
- "learning_rate": 5.1818624292838275e-06,
2110
- "loss": 0.8326,
2111
- "step": 650
2112
- },
2113
- {
2114
- "epoch": 0.9615384615384616,
2115
- "eval_loss": 0.8517911434173584,
2116
- "eval_runtime": 4.5169,
2117
- "eval_samples_per_second": 118.444,
2118
- "eval_steps_per_second": 1.107,
2119
- "step": 650
2120
- },
2121
- {
2122
- "epoch": 0.9689349112426036,
2123
- "grad_norm": 0.5848676320077507,
2124
- "learning_rate": 5.118696683508087e-06,
2125
- "loss": 0.8224,
2126
- "step": 655
2127
- },
2128
- {
2129
- "epoch": 0.9689349112426036,
2130
- "eval_loss": 0.8514999151229858,
2131
- "eval_runtime": 4.6466,
2132
- "eval_samples_per_second": 115.137,
2133
- "eval_steps_per_second": 1.076,
2134
- "step": 655
2135
- },
2136
- {
2137
- "epoch": 0.9763313609467456,
2138
- "grad_norm": 0.584823125278116,
2139
- "learning_rate": 5.0689287350980886e-06,
2140
- "loss": 0.823,
2141
- "step": 660
2142
- },
2143
- {
2144
- "epoch": 0.9763313609467456,
2145
- "eval_loss": 0.8514819741249084,
2146
- "eval_runtime": 4.4836,
2147
- "eval_samples_per_second": 119.325,
2148
- "eval_steps_per_second": 1.115,
2149
- "step": 660
2150
- },
2151
- {
2152
- "epoch": 0.9837278106508875,
2153
- "grad_norm": 0.5887341532281551,
2154
- "learning_rate": 5.03258837592424e-06,
2155
- "loss": 0.8366,
2156
- "step": 665
2157
- },
2158
- {
2159
- "epoch": 0.9837278106508875,
2160
- "eval_loss": 0.8512169718742371,
2161
- "eval_runtime": 4.4841,
2162
- "eval_samples_per_second": 119.311,
2163
- "eval_steps_per_second": 1.115,
2164
- "step": 665
2165
- },
2166
- {
2167
- "epoch": 0.9911242603550295,
2168
- "grad_norm": 0.6312074358327011,
2169
- "learning_rate": 5.009697359892536e-06,
2170
- "loss": 0.82,
2171
- "step": 670
2172
- },
2173
- {
2174
- "epoch": 0.9911242603550295,
2175
- "eval_loss": 0.8510258197784424,
2176
- "eval_runtime": 4.5807,
2177
- "eval_samples_per_second": 116.794,
2178
- "eval_steps_per_second": 1.092,
2179
- "step": 670
2180
- },
2181
- {
2182
- "epoch": 0.9985207100591716,
2183
- "grad_norm": 0.6109741913754086,
2184
- "learning_rate": 5.0002693899223325e-06,
2185
- "loss": 0.8494,
2186
- "step": 675
2187
- },
2188
- {
2189
- "epoch": 0.9985207100591716,
2190
- "eval_loss": 0.8508756160736084,
2191
- "eval_runtime": 4.5433,
2192
- "eval_samples_per_second": 117.755,
2193
- "eval_steps_per_second": 1.101,
2194
- "step": 675
2195
- },
2196
  {
2197
  "epoch": 1.0,
2198
- "step": 676,
2199
- "total_flos": 65635690217472.0,
2200
- "train_loss": 0.0,
2201
- "train_runtime": 0.9025,
2202
- "train_samples_per_second": 38008.288,
2203
- "train_steps_per_second": 296.966
2204
  }
2205
  ],
2206
  "logging_steps": 5,
@@ -2220,8 +406,8 @@
2220
  "attributes": {}
2221
  }
2222
  },
2223
- "total_flos": 65635690217472.0,
2224
- "train_batch_size": 16,
2225
  "trial_name": null,
2226
  "trial_params": null
2227
  }
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 268,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.018656716417910446,
13
+ "grad_norm": 1.8357123991549114,
14
+ "learning_rate": 1.785714285714286e-05,
15
+ "loss": 0.8456,
16
  "step": 5
17
  },
18
  {
19
+ "epoch": 0.03731343283582089,
20
+ "grad_norm": 0.7888674392020409,
21
+ "learning_rate": 3.571428571428572e-05,
22
+ "loss": 0.7682,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.055970149253731345,
27
+ "grad_norm": 0.5088001784298014,
28
+ "learning_rate": 4.999827900623038e-05,
29
+ "loss": 0.7026,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  "step": 15
31
  },
32
  {
33
+ "epoch": 0.07462686567164178,
34
+ "grad_norm": 0.3967331836313917,
35
+ "learning_rate": 4.993807186343243e-05,
36
+ "loss": 0.6745,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.09328358208955224,
41
+ "grad_norm": 0.38953492827269537,
42
+ "learning_rate": 4.979207812402531e-05,
43
+ "loss": 0.6436,
44
  "step": 25
45
  },
46
  {
47
+ "epoch": 0.11194029850746269,
48
+ "grad_norm": 0.3056894462367877,
49
+ "learning_rate": 4.956085596012407e-05,
50
+ "loss": 0.6362,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.13059701492537312,
55
+ "grad_norm": 0.2848383986273845,
56
+ "learning_rate": 4.924528939432311e-05,
57
+ "loss": 0.6199,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  "step": 35
59
  },
60
  {
61
+ "epoch": 0.14925373134328357,
62
+ "grad_norm": 0.24821786610124724,
63
+ "learning_rate": 4.884658491984735e-05,
64
+ "loss": 0.6106,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.16791044776119404,
69
+ "grad_norm": 0.21630773743864942,
70
+ "learning_rate": 4.8366266887814235e-05,
71
+ "loss": 0.6112,
72
  "step": 45
73
  },
74
  {
75
+ "epoch": 0.1865671641791045,
76
+ "grad_norm": 0.2542614286063211,
77
+ "learning_rate": 4.780617167924209e-05,
78
+ "loss": 0.5939,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 0.20522388059701493,
83
+ "grad_norm": 0.273652684856057,
84
+ "learning_rate": 4.716844068408693e-05,
85
+ "loss": 0.5965,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  "step": 55
87
  },
88
  {
89
+ "epoch": 0.22388059701492538,
90
+ "grad_norm": 0.2919315422944058,
91
+ "learning_rate": 4.6455512114150546e-05,
92
+ "loss": 0.5919,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.24253731343283583,
97
+ "grad_norm": 0.4896363894073648,
98
+ "learning_rate": 4.5670111681161296e-05,
99
+ "loss": 0.5829,
100
  "step": 65
101
  },
102
  {
103
+ "epoch": 0.26119402985074625,
104
+ "grad_norm": 0.35893234362656307,
105
+ "learning_rate": 4.481524217566783e-05,
106
+ "loss": 0.5799,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  "step": 70
108
  },
109
  {
110
+ "epoch": 0.2798507462686567,
111
+ "grad_norm": 0.47821339713150723,
112
+ "learning_rate": 4.3894171986588217e-05,
113
+ "loss": 0.5787,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  "step": 75
115
  },
116
  {
117
+ "epoch": 0.29850746268656714,
118
+ "grad_norm": 0.42779548633021597,
119
+ "learning_rate": 4.29104226053073e-05,
120
+ "loss": 0.5775,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  "step": 80
122
  },
123
  {
124
+ "epoch": 0.31716417910447764,
125
+ "grad_norm": 0.34493135242732464,
126
+ "learning_rate": 4.186775516209732e-05,
127
+ "loss": 0.5728,
128
  "step": 85
129
  },
130
  {
131
+ "epoch": 0.3358208955223881,
132
+ "grad_norm": 0.3450752761118264,
133
+ "learning_rate": 4.077015604633669e-05,
134
+ "loss": 0.5752,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  "step": 90
136
  },
137
  {
138
+ "epoch": 0.35447761194029853,
139
+ "grad_norm": 0.3254883369412446,
140
+ "learning_rate": 3.962182166550441e-05,
141
+ "loss": 0.5737,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  "step": 95
143
  },
144
  {
145
+ "epoch": 0.373134328358209,
146
+ "grad_norm": 0.3021937652365121,
147
+ "learning_rate": 3.8427142401220634e-05,
148
+ "loss": 0.5697,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  "step": 100
150
  },
151
  {
152
+ "epoch": 0.3917910447761194,
153
+ "grad_norm": 0.22543629034027707,
154
+ "learning_rate": 3.71906858236735e-05,
155
+ "loss": 0.566,
156
  "step": 105
157
  },
158
  {
159
+ "epoch": 0.41044776119402987,
160
+ "grad_norm": 0.26150626383864994,
161
+ "learning_rate": 3.591717922860785e-05,
162
+ "loss": 0.5733,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  "step": 110
164
  },
165
  {
166
+ "epoch": 0.4291044776119403,
167
+ "grad_norm": 0.25406069172974854,
168
+ "learning_rate": 3.46114915636416e-05,
169
+ "loss": 0.5641,
 
 
 
 
 
 
 
 
170
  "step": 115
171
  },
172
  {
173
+ "epoch": 0.44776119402985076,
174
+ "grad_norm": 0.2761171780274853,
175
+ "learning_rate": 3.3278614813010034e-05,
176
+ "loss": 0.565,
 
 
 
 
 
 
 
 
177
  "step": 120
178
  },
179
  {
180
+ "epoch": 0.4664179104477612,
181
+ "grad_norm": 0.27444350076630936,
182
+ "learning_rate": 3.1923644911909e-05,
183
+ "loss": 0.5619,
184
  "step": 125
185
  },
186
  {
187
+ "epoch": 0.48507462686567165,
188
+ "grad_norm": 0.22218188459643629,
189
+ "learning_rate": 3.0551762263406576e-05,
190
+ "loss": 0.5606,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  "step": 130
192
  },
193
  {
194
+ "epoch": 0.503731343283582,
195
+ "grad_norm": 0.23885769537130422,
196
+ "learning_rate": 2.9168211932412042e-05,
197
+ "loss": 0.5579,
 
 
 
 
 
 
 
 
198
  "step": 135
199
  },
200
  {
201
+ "epoch": 0.5223880597014925,
202
+ "grad_norm": 0.19581242444906968,
203
+ "learning_rate": 2.777828359242567e-05,
204
+ "loss": 0.5632,
 
 
 
 
 
 
 
 
205
  "step": 140
206
  },
207
  {
208
+ "epoch": 0.5410447761194029,
209
+ "grad_norm": 0.23903378838505404,
210
+ "learning_rate": 2.6387291301738377e-05,
211
+ "loss": 0.5559,
212
  "step": 145
213
  },
214
  {
215
+ "epoch": 0.5597014925373134,
216
+ "grad_norm": 0.22131995247320724,
217
+ "learning_rate": 2.50005531864019e-05,
218
+ "loss": 0.5537,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  "step": 150
220
  },
221
  {
222
+ "epoch": 0.5783582089552238,
223
+ "grad_norm": 0.20388629120046767,
224
+ "learning_rate": 2.362337110764688e-05,
225
+ "loss": 0.554,
 
 
 
 
 
 
 
 
226
  "step": 155
227
  },
228
  {
229
+ "epoch": 0.5970149253731343,
230
+ "grad_norm": 0.19865451240052354,
231
+ "learning_rate": 2.226101039148557e-05,
232
+ "loss": 0.5523,
 
 
 
 
 
 
 
 
233
  "step": 160
234
  },
235
  {
236
+ "epoch": 0.6156716417910447,
237
+ "grad_norm": 0.21792292980419078,
238
+ "learning_rate": 2.0918679697998252e-05,
239
+ "loss": 0.5511,
240
  "step": 165
241
  },
242
  {
243
+ "epoch": 0.6343283582089553,
244
+ "grad_norm": 0.20734976300061925,
245
+ "learning_rate": 1.9601511107268255e-05,
246
+ "loss": 0.5516,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  "step": 170
248
  },
249
  {
250
+ "epoch": 0.6529850746268657,
251
+ "grad_norm": 0.1990824685948915,
252
+ "learning_rate": 1.8314540498102216e-05,
253
+ "loss": 0.5512,
 
 
 
 
 
 
 
 
254
  "step": 175
255
  },
256
  {
257
+ "epoch": 0.6716417910447762,
258
+ "grad_norm": 0.1759699937325297,
259
+ "learning_rate": 1.7062688294552992e-05,
260
+ "loss": 0.5433,
 
 
 
 
 
 
 
 
261
  "step": 180
262
  },
263
  {
264
+ "epoch": 0.6902985074626866,
265
+ "grad_norm": 0.19952904233106478,
266
+ "learning_rate": 1.5850740653856096e-05,
267
+ "loss": 0.5467,
268
  "step": 185
269
  },
270
  {
271
+ "epoch": 0.7089552238805971,
272
+ "grad_norm": 0.17940456799398388,
273
+ "learning_rate": 1.4683331167703218e-05,
274
+ "loss": 0.5503,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  "step": 190
276
  },
277
  {
278
+ "epoch": 0.7276119402985075,
279
+ "grad_norm": 0.17212507506796912,
280
+ "learning_rate": 1.356492314681356e-05,
281
+ "loss": 0.5531,
 
 
 
 
 
 
 
 
282
  "step": 195
283
  },
284
  {
285
+ "epoch": 0.746268656716418,
286
+ "grad_norm": 0.16216686994925128,
287
+ "learning_rate": 1.2499792556533716e-05,
288
+ "loss": 0.5474,
 
 
 
 
 
 
 
 
289
  "step": 200
290
  },
291
  {
292
+ "epoch": 0.7649253731343284,
293
+ "grad_norm": 0.17719961464188644,
294
+ "learning_rate": 1.1492011668707753e-05,
295
+ "loss": 0.5449,
296
  "step": 205
297
  },
298
  {
299
+ "epoch": 0.7835820895522388,
300
+ "grad_norm": 0.17440876918971734,
301
+ "learning_rate": 1.0545433492320603e-05,
302
+ "loss": 0.5501,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  "step": 210
304
  },
305
  {
306
+ "epoch": 0.8022388059701493,
307
+ "grad_norm": 0.1578415381103267,
308
+ "learning_rate": 9.663677042440537e-06,
309
+ "loss": 0.5444,
 
 
 
 
 
 
 
 
310
  "step": 215
311
  },
312
  {
313
+ "epoch": 0.8208955223880597,
314
+ "grad_norm": 0.16294954656469435,
315
+ "learning_rate": 8.850113503781367e-06,
316
+ "loss": 0.5443,
 
 
 
 
 
 
 
 
317
  "step": 220
318
  },
319
  {
320
+ "epoch": 0.8395522388059702,
321
+ "grad_norm": 0.14550383873413048,
322
+ "learning_rate": 8.107853341784671e-06,
323
+ "loss": 0.5507,
324
  "step": 225
325
  },
326
  {
327
+ "epoch": 0.8582089552238806,
328
+ "grad_norm": 0.1571896983780636,
329
+ "learning_rate": 7.439734410499752e-06,
330
+ "loss": 0.547,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  "step": 230
332
  },
333
  {
334
+ "epoch": 0.8768656716417911,
335
+ "grad_norm": 0.1811708441839716,
336
+ "learning_rate": 6.848311102728011e-06,
337
+ "loss": 0.5472,
 
 
 
 
 
 
 
 
338
  "step": 235
339
  },
340
  {
341
+ "epoch": 0.8955223880597015,
342
+ "grad_norm": 0.1768578492896203,
343
+ "learning_rate": 6.335844583913515e-06,
344
+ "loss": 0.5433,
 
 
 
 
 
 
 
 
345
  "step": 240
346
  },
347
  {
348
+ "epoch": 0.914179104477612,
349
+ "grad_norm": 0.14375585775141816,
350
+ "learning_rate": 5.904294147118193e-06,
351
+ "loss": 0.547,
352
  "step": 245
353
  },
354
  {
355
+ "epoch": 0.9328358208955224,
356
+ "grad_norm": 0.14507740115855672,
357
+ "learning_rate": 5.555309722133842e-06,
358
+ "loss": 0.5436,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  "step": 250
360
  },
361
  {
362
+ "epoch": 0.9514925373134329,
363
+ "grad_norm": 0.14189135037515943,
364
+ "learning_rate": 5.290225567370509e-06,
365
+ "loss": 0.5396,
 
 
 
 
 
 
 
 
366
  "step": 255
367
  },
368
  {
369
+ "epoch": 0.9701492537313433,
370
+ "grad_norm": 0.15104419515678771,
371
+ "learning_rate": 5.110055168638854e-06,
372
+ "loss": 0.5433,
 
 
 
 
 
 
 
 
373
  "step": 260
374
  },
375
  {
376
+ "epoch": 0.9888059701492538,
377
+ "grad_norm": 0.14422216331517992,
378
+ "learning_rate": 5.0154873643297575e-06,
379
+ "loss": 0.547,
380
  "step": 265
381
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
  {
383
  "epoch": 1.0,
384
+ "step": 268,
385
+ "total_flos": 487709642588160.0,
386
+ "train_loss": 0.5792717831348305,
387
+ "train_runtime": 20654.3187,
388
+ "train_samples_per_second": 1.661,
389
+ "train_steps_per_second": 0.013
390
  }
391
  ],
392
  "logging_steps": 5,
 
406
  "attributes": {}
407
  }
408
  },
409
+ "total_flos": 487709642588160.0,
410
+ "train_batch_size": 8,
411
  "trial_name": null,
412
  "trial_params": null
413
  }