robertou2 commited on
Commit
3e6dd15
·
verified ·
1 Parent(s): d76e05d

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. rng_state.pth +1 -1
  2. scheduler.pt +1 -1
  3. trainer_state.json +2552 -377
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0241384ebd3d15b1a8164991a445a993df2bd29e7024ac9c77da5909807a7c57
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb8b281d3670780618442404eb45c98293a0d43e46f8e5ca3eb87cb4663d60e7
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8da2a29d769a2c7c6ee654d830f0801577c6076f6346125ad041b3edd166bbc2
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fa59c9185701213ec25411dab80244c017dba754eeac7bca5fb0c59c13e7e9c
3
  size 1465
trainer_state.json CHANGED
@@ -1,844 +1,3019 @@
1
  {
2
- "best_global_step": 55,
3
- "best_metric": 0.7241045236587524,
4
- "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-55",
5
- "epoch": 2.8947368421052633,
6
  "eval_steps": 1,
7
- "global_step": 55,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.05263157894736842,
14
- "grad_norm": 0.7188231945037842,
15
  "learning_rate": 0.0,
16
- "loss": 3.2235,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.05263157894736842,
21
- "eval_loss": 3.15524959564209,
22
- "eval_runtime": 3.3312,
23
- "eval_samples_per_second": 9.006,
24
- "eval_steps_per_second": 1.201,
25
  "step": 1
26
  },
27
  {
28
  "epoch": 0.10526315789473684,
29
- "grad_norm": 0.766629159450531,
30
  "learning_rate": 3.3333333333333335e-05,
31
- "loss": 3.165,
32
  "step": 2
33
  },
34
  {
35
  "epoch": 0.10526315789473684,
36
- "eval_loss": 3.1018595695495605,
37
- "eval_runtime": 3.28,
38
- "eval_samples_per_second": 9.146,
39
- "eval_steps_per_second": 1.219,
40
  "step": 2
41
  },
42
  {
43
  "epoch": 0.15789473684210525,
44
- "grad_norm": 0.6206756234169006,
45
  "learning_rate": 6.666666666666667e-05,
46
- "loss": 2.8628,
47
  "step": 3
48
  },
49
  {
50
  "epoch": 0.15789473684210525,
51
- "eval_loss": 2.97302508354187,
52
- "eval_runtime": 3.2899,
53
- "eval_samples_per_second": 9.119,
54
- "eval_steps_per_second": 1.216,
55
  "step": 3
56
  },
57
  {
58
  "epoch": 0.21052631578947367,
59
- "grad_norm": 0.6644885540008545,
60
  "learning_rate": 0.0001,
61
- "loss": 2.9711,
62
  "step": 4
63
  },
64
  {
65
  "epoch": 0.21052631578947367,
66
- "eval_loss": 2.762944221496582,
67
- "eval_runtime": 3.2987,
68
- "eval_samples_per_second": 9.095,
69
- "eval_steps_per_second": 1.213,
70
  "step": 4
71
  },
72
  {
73
  "epoch": 0.2631578947368421,
74
- "grad_norm": 0.6135285496711731,
75
  "learning_rate": 0.00013333333333333334,
76
- "loss": 2.7061,
77
  "step": 5
78
  },
79
  {
80
  "epoch": 0.2631578947368421,
81
- "eval_loss": 2.5087203979492188,
82
- "eval_runtime": 3.3091,
83
- "eval_samples_per_second": 9.066,
84
- "eval_steps_per_second": 1.209,
85
  "step": 5
86
  },
87
  {
88
  "epoch": 0.3157894736842105,
89
- "grad_norm": 0.5422775745391846,
90
  "learning_rate": 0.00016666666666666666,
91
- "loss": 2.4032,
92
  "step": 6
93
  },
94
  {
95
  "epoch": 0.3157894736842105,
96
- "eval_loss": 2.270092725753784,
97
- "eval_runtime": 3.3142,
98
- "eval_samples_per_second": 9.052,
99
- "eval_steps_per_second": 1.207,
100
  "step": 6
101
  },
102
  {
103
  "epoch": 0.3684210526315789,
104
- "grad_norm": 0.5579596161842346,
105
  "learning_rate": 0.0002,
106
- "loss": 2.272,
107
  "step": 7
108
  },
109
  {
110
  "epoch": 0.3684210526315789,
111
- "eval_loss": 2.0614399909973145,
112
- "eval_runtime": 3.3233,
113
- "eval_samples_per_second": 9.027,
114
- "eval_steps_per_second": 1.204,
115
  "step": 7
116
  },
117
  {
118
  "epoch": 0.42105263157894735,
119
- "grad_norm": 0.7365043759346008,
120
  "learning_rate": 0.00023333333333333333,
121
- "loss": 2.0297,
122
  "step": 8
123
  },
124
  {
125
  "epoch": 0.42105263157894735,
126
- "eval_loss": 1.8437634706497192,
127
- "eval_runtime": 3.3264,
128
- "eval_samples_per_second": 9.019,
129
- "eval_steps_per_second": 1.202,
130
  "step": 8
131
  },
132
  {
133
  "epoch": 0.47368421052631576,
134
- "grad_norm": 0.7677823901176453,
135
  "learning_rate": 0.0002666666666666667,
136
- "loss": 1.8911,
137
  "step": 9
138
  },
139
  {
140
  "epoch": 0.47368421052631576,
141
- "eval_loss": 1.615093469619751,
142
- "eval_runtime": 3.3357,
143
- "eval_samples_per_second": 8.994,
144
- "eval_steps_per_second": 1.199,
145
  "step": 9
146
  },
147
  {
148
  "epoch": 0.5263157894736842,
149
- "grad_norm": 0.7033586502075195,
150
  "learning_rate": 0.0003,
151
- "loss": 1.654,
152
  "step": 10
153
  },
154
  {
155
  "epoch": 0.5263157894736842,
156
- "eval_loss": 1.4461504220962524,
157
- "eval_runtime": 3.3549,
158
- "eval_samples_per_second": 8.942,
159
- "eval_steps_per_second": 1.192,
160
  "step": 10
161
  },
162
  {
163
  "epoch": 0.5789473684210527,
164
- "grad_norm": 0.721517026424408,
165
  "learning_rate": 0.0003333333333333333,
166
- "loss": 1.5364,
167
  "step": 11
168
  },
169
  {
170
  "epoch": 0.5789473684210527,
171
- "eval_loss": 1.3645799160003662,
172
- "eval_runtime": 3.361,
173
- "eval_samples_per_second": 8.926,
174
- "eval_steps_per_second": 1.19,
175
  "step": 11
176
  },
177
  {
178
  "epoch": 0.631578947368421,
179
- "grad_norm": 0.7304323315620422,
180
  "learning_rate": 0.00036666666666666667,
181
- "loss": 1.3689,
182
  "step": 12
183
  },
184
  {
185
  "epoch": 0.631578947368421,
186
- "eval_loss": 1.272360920906067,
187
- "eval_runtime": 3.3759,
188
- "eval_samples_per_second": 8.887,
189
- "eval_steps_per_second": 1.185,
190
  "step": 12
191
  },
192
  {
193
  "epoch": 0.6842105263157895,
194
- "grad_norm": 0.6370911002159119,
195
  "learning_rate": 0.0004,
196
- "loss": 1.329,
197
  "step": 13
198
  },
199
  {
200
  "epoch": 0.6842105263157895,
201
- "eval_loss": 1.19339120388031,
202
- "eval_runtime": 3.3835,
203
- "eval_samples_per_second": 8.867,
204
- "eval_steps_per_second": 1.182,
205
  "step": 13
206
  },
207
  {
208
  "epoch": 0.7368421052631579,
209
- "grad_norm": 0.5493318438529968,
210
  "learning_rate": 0.00043333333333333337,
211
- "loss": 1.1991,
212
  "step": 14
213
  },
214
  {
215
  "epoch": 0.7368421052631579,
216
- "eval_loss": 1.154818058013916,
217
- "eval_runtime": 3.3971,
218
- "eval_samples_per_second": 8.831,
219
- "eval_steps_per_second": 1.177,
220
  "step": 14
221
  },
222
  {
223
  "epoch": 0.7894736842105263,
224
- "grad_norm": 0.4599643051624298,
225
  "learning_rate": 0.00046666666666666666,
226
- "loss": 1.2358,
227
  "step": 15
228
  },
229
  {
230
  "epoch": 0.7894736842105263,
231
- "eval_loss": 1.1299824714660645,
232
- "eval_runtime": 3.4098,
233
- "eval_samples_per_second": 8.798,
234
- "eval_steps_per_second": 1.173,
235
  "step": 15
236
  },
237
  {
238
  "epoch": 0.8421052631578947,
239
- "grad_norm": 0.5700777173042297,
240
  "learning_rate": 0.0005,
241
- "loss": 1.206,
242
  "step": 16
243
  },
244
  {
245
  "epoch": 0.8421052631578947,
246
- "eval_loss": 1.1079914569854736,
247
- "eval_runtime": 3.4063,
248
- "eval_samples_per_second": 8.807,
249
- "eval_steps_per_second": 1.174,
250
  "step": 16
251
  },
252
  {
253
  "epoch": 0.8947368421052632,
254
- "grad_norm": 0.44451233744621277,
255
- "learning_rate": 0.0004993910125649561,
256
- "loss": 1.2374,
257
  "step": 17
258
  },
259
  {
260
  "epoch": 0.8947368421052632,
261
- "eval_loss": 1.076997995376587,
262
- "eval_runtime": 3.4099,
263
- "eval_samples_per_second": 8.798,
264
- "eval_steps_per_second": 1.173,
265
  "step": 17
266
  },
267
  {
268
  "epoch": 0.9473684210526315,
269
- "grad_norm": 0.382600337266922,
270
- "learning_rate": 0.0004975670171853926,
271
- "loss": 1.0959,
272
  "step": 18
273
  },
274
  {
275
  "epoch": 0.9473684210526315,
276
- "eval_loss": 1.0459389686584473,
277
- "eval_runtime": 3.4174,
278
- "eval_samples_per_second": 8.779,
279
- "eval_steps_per_second": 1.17,
280
  "step": 18
281
  },
282
  {
283
  "epoch": 1.0,
284
- "grad_norm": 0.3735465109348297,
285
- "learning_rate": 0.0004945369001834514,
286
- "loss": 1.1433,
287
  "step": 19
288
  },
289
  {
290
  "epoch": 1.0,
291
- "eval_loss": 1.0354558229446411,
292
- "eval_runtime": 3.41,
293
- "eval_samples_per_second": 8.798,
294
- "eval_steps_per_second": 1.173,
295
  "step": 19
296
  },
297
  {
298
  "epoch": 1.0526315789473684,
299
- "grad_norm": 0.36878153681755066,
300
- "learning_rate": 0.0004903154239845797,
301
- "loss": 1.0467,
302
  "step": 20
303
  },
304
  {
305
  "epoch": 1.0526315789473684,
306
- "eval_loss": 1.0118752717971802,
307
- "eval_runtime": 3.4023,
308
- "eval_samples_per_second": 8.818,
309
- "eval_steps_per_second": 1.176,
310
  "step": 20
311
  },
312
  {
313
  "epoch": 1.1052631578947367,
314
- "grad_norm": 0.3709339499473572,
315
- "learning_rate": 0.0004849231551964771,
316
- "loss": 1.0453,
317
  "step": 21
318
  },
319
  {
320
  "epoch": 1.1052631578947367,
321
- "eval_loss": 0.9837953448295593,
322
- "eval_runtime": 3.3826,
323
- "eval_samples_per_second": 8.869,
324
- "eval_steps_per_second": 1.183,
325
  "step": 21
326
  },
327
  {
328
  "epoch": 1.1578947368421053,
329
- "grad_norm": 0.32317909598350525,
330
- "learning_rate": 0.0004783863644106502,
331
- "loss": 1.0573,
332
  "step": 22
333
  },
334
  {
335
  "epoch": 1.1578947368421053,
336
- "eval_loss": 0.9650039076805115,
337
- "eval_runtime": 3.3888,
338
- "eval_samples_per_second": 8.853,
339
- "eval_steps_per_second": 1.18,
340
  "step": 22
341
  },
342
  {
343
  "epoch": 1.2105263157894737,
344
- "grad_norm": 0.3465510606765747,
345
- "learning_rate": 0.00047073689821473173,
346
- "loss": 0.9613,
347
  "step": 23
348
  },
349
  {
350
  "epoch": 1.2105263157894737,
351
- "eval_loss": 0.9524248838424683,
352
- "eval_runtime": 3.389,
353
- "eval_samples_per_second": 8.852,
354
- "eval_steps_per_second": 1.18,
355
  "step": 23
356
  },
357
  {
358
  "epoch": 1.263157894736842,
359
- "grad_norm": 0.341265469789505,
360
- "learning_rate": 0.00046201202403910646,
361
- "loss": 1.0765,
362
  "step": 24
363
  },
364
  {
365
  "epoch": 1.263157894736842,
366
- "eval_loss": 0.9478815197944641,
367
- "eval_runtime": 3.3934,
368
- "eval_samples_per_second": 8.841,
369
- "eval_steps_per_second": 1.179,
370
  "step": 24
371
  },
372
  {
373
  "epoch": 1.3157894736842106,
374
- "grad_norm": 0.32804617285728455,
375
- "learning_rate": 0.0004522542485937369,
376
- "loss": 0.9063,
377
  "step": 25
378
  },
379
  {
380
  "epoch": 1.3157894736842106,
381
- "eval_loss": 0.9379161596298218,
382
- "eval_runtime": 3.394,
383
- "eval_samples_per_second": 8.839,
384
- "eval_steps_per_second": 1.179,
385
  "step": 25
386
  },
387
  {
388
  "epoch": 1.368421052631579,
389
- "grad_norm": 0.31782791018486023,
390
- "learning_rate": 0.0004415111107797445,
391
- "loss": 0.9969,
392
  "step": 26
393
  },
394
  {
395
  "epoch": 1.368421052631579,
396
- "eval_loss": 0.9347817897796631,
397
- "eval_runtime": 3.3909,
398
- "eval_samples_per_second": 8.847,
399
- "eval_steps_per_second": 1.18,
400
  "step": 26
401
  },
402
  {
403
  "epoch": 1.4210526315789473,
404
- "grad_norm": 0.3140616714954376,
405
- "learning_rate": 0.0004298349500846628,
406
- "loss": 0.9423,
407
  "step": 27
408
  },
409
  {
410
  "epoch": 1.4210526315789473,
411
- "eval_loss": 0.9298030138015747,
412
- "eval_runtime": 3.4047,
413
- "eval_samples_per_second": 8.811,
414
- "eval_steps_per_second": 1.175,
415
  "step": 27
416
  },
417
  {
418
  "epoch": 1.4736842105263157,
419
- "grad_norm": 0.3035232126712799,
420
- "learning_rate": 0.0004172826515897146,
421
- "loss": 0.8544,
422
  "step": 28
423
  },
424
  {
425
  "epoch": 1.4736842105263157,
426
- "eval_loss": 0.920465350151062,
427
- "eval_runtime": 3.4152,
428
- "eval_samples_per_second": 8.784,
429
- "eval_steps_per_second": 1.171,
430
  "step": 28
431
  },
432
  {
433
  "epoch": 1.526315789473684,
434
- "grad_norm": 0.36378970742225647,
435
- "learning_rate": 0.00040391536883141455,
436
- "loss": 1.0175,
437
  "step": 29
438
  },
439
  {
440
  "epoch": 1.526315789473684,
441
- "eval_loss": 0.9069837331771851,
442
- "eval_runtime": 3.4214,
443
- "eval_samples_per_second": 8.768,
444
- "eval_steps_per_second": 1.169,
445
  "step": 29
446
  },
447
  {
448
  "epoch": 1.5789473684210527,
449
- "grad_norm": 0.3729051947593689,
450
- "learning_rate": 0.0003897982258676867,
451
- "loss": 0.9851,
452
  "step": 30
453
  },
454
  {
455
  "epoch": 1.5789473684210527,
456
- "eval_loss": 0.8988735675811768,
457
- "eval_runtime": 3.4109,
458
- "eval_samples_per_second": 8.795,
459
- "eval_steps_per_second": 1.173,
460
  "step": 30
461
  },
462
  {
463
  "epoch": 1.631578947368421,
464
- "grad_norm": 0.3581544756889343,
465
- "learning_rate": 0.000375,
466
- "loss": 0.9229,
467
  "step": 31
468
  },
469
  {
470
  "epoch": 1.631578947368421,
471
- "eval_loss": 0.8822915554046631,
472
- "eval_runtime": 3.3783,
473
- "eval_samples_per_second": 8.88,
474
- "eval_steps_per_second": 1.184,
475
  "step": 31
476
  },
477
  {
478
  "epoch": 1.6842105263157894,
479
- "grad_norm": 0.28150516748428345,
480
- "learning_rate": 0.00035959278669726934,
481
- "loss": 0.94,
482
  "step": 32
483
  },
484
  {
485
  "epoch": 1.6842105263157894,
486
- "eval_loss": 0.8713746666908264,
487
- "eval_runtime": 3.4041,
488
- "eval_samples_per_second": 8.813,
489
- "eval_steps_per_second": 1.175,
490
  "step": 32
491
  },
492
  {
493
  "epoch": 1.736842105263158,
494
- "grad_norm": 0.30831000208854675,
495
- "learning_rate": 0.00034365164835397803,
496
- "loss": 1.0407,
497
  "step": 33
498
  },
499
  {
500
  "epoch": 1.736842105263158,
501
- "eval_loss": 0.8603693842887878,
502
- "eval_runtime": 3.417,
503
- "eval_samples_per_second": 8.78,
504
- "eval_steps_per_second": 1.171,
505
  "step": 33
506
  },
507
  {
508
  "epoch": 1.7894736842105263,
509
- "grad_norm": 0.31896907091140747,
510
- "learning_rate": 0.00032725424859373687,
511
- "loss": 0.9185,
512
  "step": 34
513
  },
514
  {
515
  "epoch": 1.7894736842105263,
516
- "eval_loss": 0.849823534488678,
517
- "eval_runtime": 3.4154,
518
- "eval_samples_per_second": 8.784,
519
- "eval_steps_per_second": 1.171,
520
  "step": 34
521
  },
522
  {
523
  "epoch": 1.8421052631578947,
524
- "grad_norm": 0.29725414514541626,
525
- "learning_rate": 0.0003104804738999169,
526
- "loss": 0.978,
527
  "step": 35
528
  },
529
  {
530
  "epoch": 1.8421052631578947,
531
- "eval_loss": 0.8390634655952454,
532
- "eval_runtime": 3.4119,
533
- "eval_samples_per_second": 8.793,
534
- "eval_steps_per_second": 1.172,
535
  "step": 35
536
  },
537
  {
538
  "epoch": 1.8947368421052633,
539
- "grad_norm": 0.3137111961841583,
540
- "learning_rate": 0.00029341204441673266,
541
- "loss": 0.9221,
542
  "step": 36
543
  },
544
  {
545
  "epoch": 1.8947368421052633,
546
- "eval_loss": 0.8293085098266602,
547
- "eval_runtime": 3.3951,
548
- "eval_samples_per_second": 8.836,
549
- "eval_steps_per_second": 1.178,
550
  "step": 36
551
  },
552
  {
553
  "epoch": 1.9473684210526314,
554
- "grad_norm": 0.267716646194458,
555
- "learning_rate": 0.0002761321158169134,
556
- "loss": 1.0078,
557
  "step": 37
558
  },
559
  {
560
  "epoch": 1.9473684210526314,
561
- "eval_loss": 0.8227899670600891,
562
- "eval_runtime": 3.3926,
563
- "eval_samples_per_second": 8.843,
564
- "eval_steps_per_second": 1.179,
565
  "step": 37
566
  },
567
  {
568
  "epoch": 2.0,
569
- "grad_norm": 0.3097141683101654,
570
- "learning_rate": 0.0002587248741756253,
571
- "loss": 1.0386,
572
  "step": 38
573
  },
574
  {
575
  "epoch": 2.0,
576
- "eval_loss": 0.8196889758110046,
577
- "eval_runtime": 3.3913,
578
- "eval_samples_per_second": 8.846,
579
- "eval_steps_per_second": 1.179,
580
  "step": 38
581
  },
582
  {
583
  "epoch": 2.0526315789473686,
584
- "grad_norm": 0.29532116651535034,
585
- "learning_rate": 0.00024127512582437484,
586
- "loss": 0.9046,
587
  "step": 39
588
  },
589
  {
590
  "epoch": 2.0526315789473686,
591
- "eval_loss": 0.8109915852546692,
592
- "eval_runtime": 3.3856,
593
- "eval_samples_per_second": 8.861,
594
- "eval_steps_per_second": 1.181,
595
  "step": 39
596
  },
597
  {
598
  "epoch": 2.1052631578947367,
599
- "grad_norm": 0.3160407245159149,
600
- "learning_rate": 0.00022386788418308668,
601
- "loss": 0.8684,
602
  "step": 40
603
  },
604
  {
605
  "epoch": 2.1052631578947367,
606
- "eval_loss": 0.799045979976654,
607
- "eval_runtime": 3.3859,
608
- "eval_samples_per_second": 8.86,
609
- "eval_steps_per_second": 1.181,
610
  "step": 40
611
  },
612
  {
613
  "epoch": 2.1578947368421053,
614
- "grad_norm": 0.2594124674797058,
615
- "learning_rate": 0.00020658795558326743,
616
- "loss": 0.8051,
617
  "step": 41
618
  },
619
  {
620
  "epoch": 2.1578947368421053,
621
- "eval_loss": 0.7873298525810242,
622
- "eval_runtime": 3.3873,
623
- "eval_samples_per_second": 8.857,
624
- "eval_steps_per_second": 1.181,
625
  "step": 41
626
  },
627
  {
628
  "epoch": 2.2105263157894735,
629
- "grad_norm": 0.2573184370994568,
630
- "learning_rate": 0.0001895195261000831,
631
- "loss": 0.7542,
632
  "step": 42
633
  },
634
  {
635
  "epoch": 2.2105263157894735,
636
- "eval_loss": 0.7783879637718201,
637
- "eval_runtime": 3.3897,
638
- "eval_samples_per_second": 8.85,
639
- "eval_steps_per_second": 1.18,
640
  "step": 42
641
  },
642
  {
643
  "epoch": 2.263157894736842,
644
- "grad_norm": 0.3050247132778168,
645
- "learning_rate": 0.00017274575140626317,
646
- "loss": 0.8833,
647
  "step": 43
648
  },
649
  {
650
  "epoch": 2.263157894736842,
651
- "eval_loss": 0.7714616060256958,
652
- "eval_runtime": 3.4031,
653
- "eval_samples_per_second": 8.815,
654
- "eval_steps_per_second": 1.175,
655
  "step": 43
656
  },
657
  {
658
  "epoch": 2.3157894736842106,
659
- "grad_norm": 0.27206432819366455,
660
- "learning_rate": 0.00015634835164602198,
661
- "loss": 0.8176,
662
  "step": 44
663
  },
664
  {
665
  "epoch": 2.3157894736842106,
666
- "eval_loss": 0.7637041807174683,
667
- "eval_runtime": 3.4006,
668
- "eval_samples_per_second": 8.822,
669
- "eval_steps_per_second": 1.176,
670
  "step": 44
671
  },
672
  {
673
  "epoch": 2.3684210526315788,
674
- "grad_norm": 0.24384012818336487,
675
- "learning_rate": 0.00014040721330273062,
676
- "loss": 0.7616,
677
  "step": 45
678
  },
679
  {
680
  "epoch": 2.3684210526315788,
681
- "eval_loss": 0.7560217380523682,
682
- "eval_runtime": 3.4005,
683
- "eval_samples_per_second": 8.822,
684
- "eval_steps_per_second": 1.176,
685
  "step": 45
686
  },
687
  {
688
  "epoch": 2.4210526315789473,
689
- "grad_norm": 0.25645551085472107,
690
- "learning_rate": 0.00012500000000000006,
691
- "loss": 0.7888,
692
  "step": 46
693
  },
694
  {
695
  "epoch": 2.4210526315789473,
696
- "eval_loss": 0.7505295872688293,
697
- "eval_runtime": 3.3925,
698
- "eval_samples_per_second": 8.843,
699
- "eval_steps_per_second": 1.179,
700
  "step": 46
701
  },
702
  {
703
  "epoch": 2.473684210526316,
704
- "grad_norm": 0.27820125222206116,
705
- "learning_rate": 0.00011020177413231333,
706
- "loss": 0.7584,
707
  "step": 47
708
  },
709
  {
710
  "epoch": 2.473684210526316,
711
- "eval_loss": 0.7445800304412842,
712
- "eval_runtime": 3.3928,
713
- "eval_samples_per_second": 8.842,
714
- "eval_steps_per_second": 1.179,
715
  "step": 47
716
  },
717
  {
718
  "epoch": 2.526315789473684,
719
- "grad_norm": 0.23925091326236725,
720
- "learning_rate": 9.608463116858542e-05,
721
- "loss": 0.7504,
722
  "step": 48
723
  },
724
  {
725
  "epoch": 2.526315789473684,
726
- "eval_loss": 0.7403488755226135,
727
- "eval_runtime": 3.4026,
728
- "eval_samples_per_second": 8.817,
729
- "eval_steps_per_second": 1.176,
730
  "step": 48
731
  },
732
  {
733
  "epoch": 2.5789473684210527,
734
- "grad_norm": 0.32143712043762207,
735
- "learning_rate": 8.271734841028553e-05,
736
- "loss": 0.8269,
737
  "step": 49
738
  },
739
  {
740
  "epoch": 2.5789473684210527,
741
- "eval_loss": 0.7371814250946045,
742
- "eval_runtime": 3.3997,
743
- "eval_samples_per_second": 8.824,
744
- "eval_steps_per_second": 1.177,
745
  "step": 49
746
  },
747
  {
748
  "epoch": 2.6315789473684212,
749
- "grad_norm": 0.2628876864910126,
750
- "learning_rate": 7.016504991533726e-05,
751
- "loss": 0.7076,
752
  "step": 50
753
  },
754
  {
755
  "epoch": 2.6315789473684212,
756
- "eval_loss": 0.7335822582244873,
757
- "eval_runtime": 3.4029,
758
- "eval_samples_per_second": 8.816,
759
- "eval_steps_per_second": 1.175,
760
  "step": 50
761
  },
762
  {
763
  "epoch": 2.6842105263157894,
764
- "grad_norm": 0.30318617820739746,
765
- "learning_rate": 5.848888922025553e-05,
766
- "loss": 0.7792,
767
  "step": 51
768
  },
769
  {
770
  "epoch": 2.6842105263157894,
771
- "eval_loss": 0.7297669053077698,
772
- "eval_runtime": 3.3726,
773
- "eval_samples_per_second": 8.895,
774
- "eval_steps_per_second": 1.186,
775
  "step": 51
776
  },
777
  {
778
  "epoch": 2.736842105263158,
779
- "grad_norm": 0.3162338435649872,
780
- "learning_rate": 4.7745751406263163e-05,
781
- "loss": 0.7217,
782
  "step": 52
783
  },
784
  {
785
  "epoch": 2.736842105263158,
786
- "eval_loss": 0.728228747844696,
787
- "eval_runtime": 3.3989,
788
- "eval_samples_per_second": 8.827,
789
- "eval_steps_per_second": 1.177,
790
  "step": 52
791
  },
792
  {
793
  "epoch": 2.7894736842105265,
794
- "grad_norm": 0.2733875513076782,
795
- "learning_rate": 3.798797596089351e-05,
796
- "loss": 0.8098,
797
  "step": 53
798
  },
799
  {
800
  "epoch": 2.7894736842105265,
801
- "eval_loss": 0.7270908355712891,
802
- "eval_runtime": 3.4122,
803
- "eval_samples_per_second": 8.792,
804
- "eval_steps_per_second": 1.172,
805
  "step": 53
806
  },
807
  {
808
  "epoch": 2.8421052631578947,
809
- "grad_norm": 0.26100900769233704,
810
- "learning_rate": 2.9263101785268254e-05,
811
- "loss": 0.7631,
812
  "step": 54
813
  },
814
  {
815
  "epoch": 2.8421052631578947,
816
- "eval_loss": 0.7254647016525269,
817
- "eval_runtime": 3.4244,
818
- "eval_samples_per_second": 8.761,
819
- "eval_steps_per_second": 1.168,
820
  "step": 54
821
  },
822
  {
823
  "epoch": 2.8947368421052633,
824
- "grad_norm": 0.2827248275279999,
825
- "learning_rate": 2.1613635589349755e-05,
826
- "loss": 0.7716,
827
  "step": 55
828
  },
829
  {
830
  "epoch": 2.8947368421052633,
831
- "eval_loss": 0.7241045236587524,
832
- "eval_runtime": 3.4133,
833
- "eval_samples_per_second": 8.789,
834
- "eval_steps_per_second": 1.172,
835
  "step": 55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
836
  }
837
  ],
838
  "logging_steps": 1,
839
- "max_steps": 60,
840
  "num_input_tokens_seen": 0,
841
- "num_train_epochs": 4,
842
  "save_steps": 5,
843
  "stateful_callbacks": {
844
  "TrainerControl": {
@@ -852,7 +3027,7 @@
852
  "attributes": {}
853
  }
854
  },
855
- "total_flos": 2315465393725440.0,
856
  "train_batch_size": 1,
857
  "trial_name": null,
858
  "trial_params": null
 
1
  {
2
+ "best_global_step": 200,
3
+ "best_metric": 0.0016098986379802227,
4
+ "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-200",
5
+ "epoch": 10.526315789473685,
6
  "eval_steps": 1,
7
+ "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.05263157894736842,
14
+ "grad_norm": 9.795289039611816,
15
  "learning_rate": 0.0,
16
+ "loss": 3.2204,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.05263157894736842,
21
+ "eval_loss": 3.1565215587615967,
22
+ "eval_runtime": 0.9831,
23
+ "eval_samples_per_second": 30.517,
24
+ "eval_steps_per_second": 4.069,
25
  "step": 1
26
  },
27
  {
28
  "epoch": 0.10526315789473684,
29
+ "grad_norm": 10.048436164855957,
30
  "learning_rate": 3.3333333333333335e-05,
31
+ "loss": 3.1604,
32
  "step": 2
33
  },
34
  {
35
  "epoch": 0.10526315789473684,
36
+ "eval_loss": 2.4775681495666504,
37
+ "eval_runtime": 0.8971,
38
+ "eval_samples_per_second": 33.44,
39
+ "eval_steps_per_second": 4.459,
40
  "step": 2
41
  },
42
  {
43
  "epoch": 0.15789473684210525,
44
+ "grad_norm": 5.148971080780029,
45
  "learning_rate": 6.666666666666667e-05,
46
+ "loss": 2.3511,
47
  "step": 3
48
  },
49
  {
50
  "epoch": 0.15789473684210525,
51
+ "eval_loss": 2.0030856132507324,
52
+ "eval_runtime": 0.8926,
53
+ "eval_samples_per_second": 33.611,
54
+ "eval_steps_per_second": 4.481,
55
  "step": 3
56
  },
57
  {
58
  "epoch": 0.21052631578947367,
59
+ "grad_norm": 4.8437819480896,
60
  "learning_rate": 0.0001,
61
+ "loss": 2.0198,
62
  "step": 4
63
  },
64
  {
65
  "epoch": 0.21052631578947367,
66
+ "eval_loss": 1.6053706407546997,
67
+ "eval_runtime": 0.8924,
68
+ "eval_samples_per_second": 33.618,
69
+ "eval_steps_per_second": 4.482,
70
  "step": 4
71
  },
72
  {
73
  "epoch": 0.2631578947368421,
74
+ "grad_norm": 4.386927604675293,
75
  "learning_rate": 0.00013333333333333334,
76
+ "loss": 1.6969,
77
  "step": 5
78
  },
79
  {
80
  "epoch": 0.2631578947368421,
81
+ "eval_loss": 1.4053733348846436,
82
+ "eval_runtime": 0.8951,
83
+ "eval_samples_per_second": 33.517,
84
+ "eval_steps_per_second": 4.469,
85
  "step": 5
86
  },
87
  {
88
  "epoch": 0.3157894736842105,
89
+ "grad_norm": 3.955519676208496,
90
  "learning_rate": 0.00016666666666666666,
91
+ "loss": 1.4825,
92
  "step": 6
93
  },
94
  {
95
  "epoch": 0.3157894736842105,
96
+ "eval_loss": 1.3105080127716064,
97
+ "eval_runtime": 0.893,
98
+ "eval_samples_per_second": 33.593,
99
+ "eval_steps_per_second": 4.479,
100
  "step": 6
101
  },
102
  {
103
  "epoch": 0.3684210526315789,
104
+ "grad_norm": 3.6086604595184326,
105
  "learning_rate": 0.0002,
106
+ "loss": 1.3404,
107
  "step": 7
108
  },
109
  {
110
  "epoch": 0.3684210526315789,
111
+ "eval_loss": 1.2445138692855835,
112
+ "eval_runtime": 0.8942,
113
+ "eval_samples_per_second": 33.549,
114
+ "eval_steps_per_second": 4.473,
115
  "step": 7
116
  },
117
  {
118
  "epoch": 0.42105263157894735,
119
+ "grad_norm": 2.691216230392456,
120
  "learning_rate": 0.00023333333333333333,
121
+ "loss": 1.2627,
122
  "step": 8
123
  },
124
  {
125
  "epoch": 0.42105263157894735,
126
+ "eval_loss": 1.1471664905548096,
127
+ "eval_runtime": 0.8927,
128
+ "eval_samples_per_second": 33.606,
129
+ "eval_steps_per_second": 4.481,
130
  "step": 8
131
  },
132
  {
133
  "epoch": 0.47368421052631576,
134
+ "grad_norm": 2.5174126625061035,
135
  "learning_rate": 0.0002666666666666667,
136
+ "loss": 1.2037,
137
  "step": 9
138
  },
139
  {
140
  "epoch": 0.47368421052631576,
141
+ "eval_loss": 1.1372406482696533,
142
+ "eval_runtime": 0.8947,
143
+ "eval_samples_per_second": 33.529,
144
+ "eval_steps_per_second": 4.471,
145
  "step": 9
146
  },
147
  {
148
  "epoch": 0.5263157894736842,
149
+ "grad_norm": 2.893831253051758,
150
  "learning_rate": 0.0003,
151
+ "loss": 1.1793,
152
  "step": 10
153
  },
154
  {
155
  "epoch": 0.5263157894736842,
156
+ "eval_loss": 1.0686627626419067,
157
+ "eval_runtime": 0.8921,
158
+ "eval_samples_per_second": 33.628,
159
+ "eval_steps_per_second": 4.484,
160
  "step": 10
161
  },
162
  {
163
  "epoch": 0.5789473684210527,
164
+ "grad_norm": 2.5055713653564453,
165
  "learning_rate": 0.0003333333333333333,
166
+ "loss": 1.201,
167
  "step": 11
168
  },
169
  {
170
  "epoch": 0.5789473684210527,
171
+ "eval_loss": 1.0994912385940552,
172
+ "eval_runtime": 0.8951,
173
+ "eval_samples_per_second": 33.517,
174
+ "eval_steps_per_second": 4.469,
175
  "step": 11
176
  },
177
  {
178
  "epoch": 0.631578947368421,
179
+ "grad_norm": 2.297982931137085,
180
  "learning_rate": 0.00036666666666666667,
181
+ "loss": 1.177,
182
  "step": 12
183
  },
184
  {
185
  "epoch": 0.631578947368421,
186
+ "eval_loss": 1.0981471538543701,
187
+ "eval_runtime": 0.8926,
188
+ "eval_samples_per_second": 33.608,
189
+ "eval_steps_per_second": 4.481,
190
  "step": 12
191
  },
192
  {
193
  "epoch": 0.6842105263157895,
194
+ "grad_norm": 2.8536081314086914,
195
  "learning_rate": 0.0004,
196
+ "loss": 1.2106,
197
  "step": 13
198
  },
199
  {
200
  "epoch": 0.6842105263157895,
201
+ "eval_loss": 1.0119823217391968,
202
+ "eval_runtime": 0.8936,
203
+ "eval_samples_per_second": 33.574,
204
+ "eval_steps_per_second": 4.477,
205
  "step": 13
206
  },
207
  {
208
  "epoch": 0.7368421052631579,
209
+ "grad_norm": 1.8637670278549194,
210
  "learning_rate": 0.00043333333333333337,
211
+ "loss": 1.0688,
212
  "step": 14
213
  },
214
  {
215
  "epoch": 0.7368421052631579,
216
+ "eval_loss": 1.0545283555984497,
217
+ "eval_runtime": 0.893,
218
+ "eval_samples_per_second": 33.596,
219
+ "eval_steps_per_second": 4.479,
220
  "step": 14
221
  },
222
  {
223
  "epoch": 0.7894736842105263,
224
+ "grad_norm": 2.6284332275390625,
225
  "learning_rate": 0.00046666666666666666,
226
+ "loss": 1.1661,
227
  "step": 15
228
  },
229
  {
230
  "epoch": 0.7894736842105263,
231
+ "eval_loss": 1.031855821609497,
232
+ "eval_runtime": 0.8928,
233
+ "eval_samples_per_second": 33.603,
234
+ "eval_steps_per_second": 4.48,
235
  "step": 15
236
  },
237
  {
238
  "epoch": 0.8421052631578947,
239
+ "grad_norm": 1.9439812898635864,
240
  "learning_rate": 0.0005,
241
+ "loss": 1.1859,
242
  "step": 16
243
  },
244
  {
245
  "epoch": 0.8421052631578947,
246
+ "eval_loss": 0.9958587884902954,
247
+ "eval_runtime": 0.8982,
248
+ "eval_samples_per_second": 33.401,
249
+ "eval_steps_per_second": 4.453,
250
  "step": 16
251
  },
252
  {
253
  "epoch": 0.8947368421052632,
254
+ "grad_norm": 1.7199311256408691,
255
+ "learning_rate": 0.0004999776608025946,
256
+ "loss": 1.1636,
257
  "step": 17
258
  },
259
  {
260
  "epoch": 0.8947368421052632,
261
+ "eval_loss": 0.9928242564201355,
262
+ "eval_runtime": 0.8982,
263
+ "eval_samples_per_second": 33.398,
264
+ "eval_steps_per_second": 4.453,
265
  "step": 17
266
  },
267
  {
268
  "epoch": 0.9473684210526315,
269
+ "grad_norm": 5.788880825042725,
270
+ "learning_rate": 0.000499910647202696,
271
+ "loss": 1.0348,
272
  "step": 18
273
  },
274
  {
275
  "epoch": 0.9473684210526315,
276
+ "eval_loss": 1.0362129211425781,
277
+ "eval_runtime": 0.8954,
278
+ "eval_samples_per_second": 33.504,
279
+ "eval_steps_per_second": 4.467,
280
  "step": 18
281
  },
282
  {
283
  "epoch": 1.0,
284
+ "grad_norm": 2.156782388687134,
285
+ "learning_rate": 0.0004997989711765446,
286
+ "loss": 1.201,
287
  "step": 19
288
  },
289
  {
290
  "epoch": 1.0,
291
+ "eval_loss": 0.9807829260826111,
292
+ "eval_runtime": 0.8926,
293
+ "eval_samples_per_second": 33.611,
294
+ "eval_steps_per_second": 4.481,
295
  "step": 19
296
  },
297
  {
298
  "epoch": 1.0526315789473684,
299
+ "grad_norm": 1.9558554887771606,
300
+ "learning_rate": 0.0004996426526821629,
301
+ "loss": 0.8535,
302
  "step": 20
303
  },
304
  {
305
  "epoch": 1.0526315789473684,
306
+ "eval_loss": 0.9379722476005554,
307
+ "eval_runtime": 0.8935,
308
+ "eval_samples_per_second": 33.576,
309
+ "eval_steps_per_second": 4.477,
310
  "step": 20
311
  },
312
  {
313
  "epoch": 1.1052631578947367,
314
+ "grad_norm": 1.772550106048584,
315
+ "learning_rate": 0.0004994417196557883,
316
+ "loss": 0.968,
317
  "step": 21
318
  },
319
  {
320
  "epoch": 1.1052631578947367,
321
+ "eval_loss": 0.9845291376113892,
322
+ "eval_runtime": 0.8928,
323
+ "eval_samples_per_second": 33.603,
324
+ "eval_steps_per_second": 4.48,
325
  "step": 21
326
  },
327
  {
328
  "epoch": 1.1578947368421053,
329
+ "grad_norm": 2.108396291732788,
330
+ "learning_rate": 0.0004991962080068813,
331
+ "loss": 1.0552,
332
  "step": 22
333
  },
334
  {
335
  "epoch": 1.1578947368421053,
336
+ "eval_loss": 0.9239175319671631,
337
+ "eval_runtime": 0.893,
338
+ "eval_samples_per_second": 33.594,
339
+ "eval_steps_per_second": 4.479,
340
  "step": 22
341
  },
342
  {
343
  "epoch": 1.2105263157894737,
344
+ "grad_norm": 1.8215439319610596,
345
+ "learning_rate": 0.0004989061616117073,
346
+ "loss": 0.9825,
347
  "step": 23
348
  },
349
  {
350
  "epoch": 1.2105263157894737,
351
+ "eval_loss": 0.980516791343689,
352
+ "eval_runtime": 0.8952,
353
+ "eval_samples_per_second": 33.513,
354
+ "eval_steps_per_second": 4.468,
355
  "step": 23
356
  },
357
  {
358
  "epoch": 1.263157894736842,
359
+ "grad_norm": 20.904949188232422,
360
+ "learning_rate": 0.0004985716323054959,
361
+ "loss": 0.9563,
362
  "step": 24
363
  },
364
  {
365
  "epoch": 1.263157894736842,
366
+ "eval_loss": 1.112138271331787,
367
+ "eval_runtime": 0.8954,
368
+ "eval_samples_per_second": 33.505,
369
+ "eval_steps_per_second": 4.467,
370
  "step": 24
371
  },
372
  {
373
  "epoch": 1.3157894736842106,
374
+ "grad_norm": 2.785473585128784,
375
+ "learning_rate": 0.0004981926798731766,
376
+ "loss": 1.048,
377
  "step": 25
378
  },
379
  {
380
  "epoch": 1.3157894736842106,
381
+ "eval_loss": 0.9919915795326233,
382
+ "eval_runtime": 0.8932,
383
+ "eval_samples_per_second": 33.586,
384
+ "eval_steps_per_second": 4.478,
385
  "step": 25
386
  },
387
  {
388
  "epoch": 1.368421052631579,
389
+ "grad_norm": 1.8656138181686401,
390
+ "learning_rate": 0.000497769372038695,
391
+ "loss": 1.0315,
392
  "step": 26
393
  },
394
  {
395
  "epoch": 1.368421052631579,
396
+ "eval_loss": 0.9384483098983765,
397
+ "eval_runtime": 0.8931,
398
+ "eval_samples_per_second": 33.589,
399
+ "eval_steps_per_second": 4.479,
400
  "step": 26
401
  },
402
  {
403
  "epoch": 1.4210526315789473,
404
+ "grad_norm": 1.697496771812439,
405
+ "learning_rate": 0.0004973017844529094,
406
+ "loss": 1.0063,
407
  "step": 27
408
  },
409
  {
410
  "epoch": 1.4210526315789473,
411
+ "eval_loss": 0.904453694820404,
412
+ "eval_runtime": 0.8918,
413
+ "eval_samples_per_second": 33.64,
414
+ "eval_steps_per_second": 4.485,
415
  "step": 27
416
  },
417
  {
418
  "epoch": 1.4736842105263157,
419
+ "grad_norm": 1.7305934429168701,
420
+ "learning_rate": 0.0004967900006800708,
421
+ "loss": 0.8483,
422
  "step": 28
423
  },
424
  {
425
  "epoch": 1.4736842105263157,
426
+ "eval_loss": 0.876754879951477,
427
+ "eval_runtime": 0.8933,
428
+ "eval_samples_per_second": 33.584,
429
+ "eval_steps_per_second": 4.478,
430
  "step": 28
431
  },
432
  {
433
  "epoch": 1.526315789473684,
434
+ "grad_norm": 1.7766728401184082,
435
+ "learning_rate": 0.000496234112182889,
436
+ "loss": 1.0118,
437
  "step": 29
438
  },
439
  {
440
  "epoch": 1.526315789473684,
441
+ "eval_loss": 0.9041274785995483,
442
+ "eval_runtime": 0.8949,
443
+ "eval_samples_per_second": 33.524,
444
+ "eval_steps_per_second": 4.47,
445
  "step": 29
446
  },
447
  {
448
  "epoch": 1.5789473684210527,
449
+ "grad_norm": 1.9015165567398071,
450
+ "learning_rate": 0.000495634218306187,
451
+ "loss": 0.8917,
452
  "step": 30
453
  },
454
  {
455
  "epoch": 1.5789473684210527,
456
+ "eval_loss": 0.8897702693939209,
457
+ "eval_runtime": 0.8926,
458
+ "eval_samples_per_second": 33.611,
459
+ "eval_steps_per_second": 4.481,
460
  "step": 30
461
  },
462
  {
463
  "epoch": 1.631578947368421,
464
+ "grad_norm": 1.4804080724716187,
465
+ "learning_rate": 0.0004949904262591467,
466
+ "loss": 1.0084,
467
  "step": 31
468
  },
469
  {
470
  "epoch": 1.631578947368421,
471
+ "eval_loss": 0.885962188243866,
472
+ "eval_runtime": 0.8988,
473
+ "eval_samples_per_second": 33.378,
474
+ "eval_steps_per_second": 4.45,
475
  "step": 31
476
  },
477
  {
478
  "epoch": 1.6842105263157894,
479
+ "grad_norm": 1.819899320602417,
480
+ "learning_rate": 0.0004943028510961491,
481
+ "loss": 0.969,
482
  "step": 32
483
  },
484
  {
485
  "epoch": 1.6842105263157894,
486
+ "eval_loss": 0.8608292937278748,
487
+ "eval_runtime": 0.8958,
488
+ "eval_samples_per_second": 33.489,
489
+ "eval_steps_per_second": 4.465,
490
  "step": 32
491
  },
492
  {
493
  "epoch": 1.736842105263158,
494
+ "grad_norm": 2.8180196285247803,
495
+ "learning_rate": 0.0004935716156962127,
496
+ "loss": 1.1318,
497
  "step": 33
498
  },
499
  {
500
  "epoch": 1.736842105263158,
501
+ "eval_loss": 0.875141978263855,
502
+ "eval_runtime": 0.8971,
503
+ "eval_samples_per_second": 33.441,
504
+ "eval_steps_per_second": 4.459,
505
  "step": 33
506
  },
507
  {
508
  "epoch": 1.7894736842105263,
509
+ "grad_norm": 1.8047230243682861,
510
+ "learning_rate": 0.000492796850741033,
511
+ "loss": 1.0002,
512
  "step": 34
513
  },
514
  {
515
  "epoch": 1.7894736842105263,
516
+ "eval_loss": 0.89467453956604,
517
+ "eval_runtime": 0.8966,
518
+ "eval_samples_per_second": 33.46,
519
+ "eval_steps_per_second": 4.461,
520
  "step": 34
521
  },
522
  {
523
  "epoch": 1.8421052631578947,
524
+ "grad_norm": 2.6305246353149414,
525
+ "learning_rate": 0.0004919786946916281,
526
+ "loss": 1.1024,
527
  "step": 35
528
  },
529
  {
530
  "epoch": 1.8421052631578947,
531
+ "eval_loss": 0.8359136581420898,
532
+ "eval_runtime": 0.8971,
533
+ "eval_samples_per_second": 33.44,
534
+ "eval_steps_per_second": 4.459,
535
  "step": 35
536
  },
537
  {
538
  "epoch": 1.8947368421052633,
539
+ "grad_norm": 2.4130873680114746,
540
+ "learning_rate": 0.0004911172937635942,
541
+ "loss": 0.9314,
542
  "step": 36
543
  },
544
  {
545
  "epoch": 1.8947368421052633,
546
+ "eval_loss": 0.8058050274848938,
547
+ "eval_runtime": 0.8959,
548
+ "eval_samples_per_second": 33.487,
549
+ "eval_steps_per_second": 4.465,
550
  "step": 36
551
  },
552
  {
553
  "epoch": 1.9473684210526314,
554
+ "grad_norm": 1.580320119857788,
555
+ "learning_rate": 0.0004902128019009741,
556
+ "loss": 1.0036,
557
  "step": 37
558
  },
559
  {
560
  "epoch": 1.9473684210526314,
561
+ "eval_loss": 0.7546663880348206,
562
+ "eval_runtime": 0.8967,
563
+ "eval_samples_per_second": 33.457,
564
+ "eval_steps_per_second": 4.461,
565
  "step": 37
566
  },
567
  {
568
  "epoch": 2.0,
569
+ "grad_norm": 1.6066155433654785,
570
+ "learning_rate": 0.000489265380748746,
571
+ "loss": 1.094,
572
  "step": 38
573
  },
574
  {
575
  "epoch": 2.0,
576
+ "eval_loss": 0.8417730331420898,
577
+ "eval_runtime": 0.895,
578
+ "eval_samples_per_second": 33.519,
579
+ "eval_steps_per_second": 4.469,
580
  "step": 38
581
  },
582
  {
583
  "epoch": 2.0526315789473686,
584
+ "grad_norm": 2.4847571849823,
585
+ "learning_rate": 0.0004882751996239352,
586
+ "loss": 0.9106,
587
  "step": 39
588
  },
589
  {
590
  "epoch": 2.0526315789473686,
591
+ "eval_loss": 0.805930495262146,
592
+ "eval_runtime": 0.8985,
593
+ "eval_samples_per_second": 33.388,
594
+ "eval_steps_per_second": 4.452,
595
  "step": 39
596
  },
597
  {
598
  "epoch": 2.1052631578947367,
599
+ "grad_norm": 2.144543409347534,
600
+ "learning_rate": 0.0004872424354853545,
601
+ "loss": 0.8542,
602
  "step": 40
603
  },
604
  {
605
  "epoch": 2.1052631578947367,
606
+ "eval_loss": 0.7550076842308044,
607
+ "eval_runtime": 0.8977,
608
+ "eval_samples_per_second": 33.42,
609
+ "eval_steps_per_second": 4.456,
610
  "step": 40
611
  },
612
  {
613
  "epoch": 2.1578947368421053,
614
+ "grad_norm": 1.2767819166183472,
615
+ "learning_rate": 0.0004861672729019797,
616
+ "loss": 0.7569,
617
  "step": 41
618
  },
619
  {
620
  "epoch": 2.1578947368421053,
621
+ "eval_loss": 0.720465362071991,
622
+ "eval_runtime": 0.9013,
623
+ "eval_samples_per_second": 33.285,
624
+ "eval_steps_per_second": 4.438,
625
  "step": 41
626
  },
627
  {
628
  "epoch": 2.2105263157894735,
629
+ "grad_norm": 1.4606373310089111,
630
+ "learning_rate": 0.0004850499040199643,
631
+ "loss": 0.6198,
632
  "step": 42
633
  },
634
  {
635
  "epoch": 2.2105263157894735,
636
+ "eval_loss": 0.7800072431564331,
637
+ "eval_runtime": 0.8938,
638
+ "eval_samples_per_second": 33.564,
639
+ "eval_steps_per_second": 4.475,
640
  "step": 42
641
  },
642
  {
643
  "epoch": 2.263157894736842,
644
+ "grad_norm": 4.208314895629883,
645
+ "learning_rate": 0.0004838905285283005,
646
+ "loss": 0.8454,
647
  "step": 43
648
  },
649
  {
650
  "epoch": 2.263157894736842,
651
+ "eval_loss": 0.7882384657859802,
652
+ "eval_runtime": 0.8955,
653
+ "eval_samples_per_second": 33.502,
654
+ "eval_steps_per_second": 4.467,
655
  "step": 43
656
  },
657
  {
658
  "epoch": 2.3157894736842106,
659
+ "grad_norm": 2.8906519412994385,
660
+ "learning_rate": 0.00048268935362313215,
661
+ "loss": 0.8786,
662
  "step": 44
663
  },
664
  {
665
  "epoch": 2.3157894736842106,
666
+ "eval_loss": 0.7504675388336182,
667
+ "eval_runtime": 0.8973,
668
+ "eval_samples_per_second": 33.435,
669
+ "eval_steps_per_second": 4.458,
670
  "step": 44
671
  },
672
  {
673
  "epoch": 2.3684210526315788,
674
+ "grad_norm": 2.5608749389648438,
675
+ "learning_rate": 0.00048144659397072586,
676
+ "loss": 0.7165,
677
  "step": 45
678
  },
679
  {
680
  "epoch": 2.3684210526315788,
681
+ "eval_loss": 0.7160356640815735,
682
+ "eval_runtime": 0.8985,
683
+ "eval_samples_per_second": 33.389,
684
+ "eval_steps_per_second": 4.452,
685
  "step": 45
686
  },
687
  {
688
  "epoch": 2.4210526315789473,
689
+ "grad_norm": 2.237501621246338,
690
+ "learning_rate": 0.0004801624716691072,
691
+ "loss": 0.9232,
692
  "step": 46
693
  },
694
  {
695
  "epoch": 2.4210526315789473,
696
+ "eval_loss": 0.7007637619972229,
697
+ "eval_runtime": 0.8986,
698
+ "eval_samples_per_second": 33.387,
699
+ "eval_steps_per_second": 4.452,
700
  "step": 46
701
  },
702
  {
703
  "epoch": 2.473684210526316,
704
+ "grad_norm": 2.166039228439331,
705
+ "learning_rate": 0.00047883721620836894,
706
+ "loss": 0.782,
707
  "step": 47
708
  },
709
  {
710
  "epoch": 2.473684210526316,
711
+ "eval_loss": 0.6951841711997986,
712
+ "eval_runtime": 0.9007,
713
+ "eval_samples_per_second": 33.308,
714
+ "eval_steps_per_second": 4.441,
715
  "step": 47
716
  },
717
  {
718
  "epoch": 2.526315789473684,
719
+ "grad_norm": 1.6499485969543457,
720
+ "learning_rate": 0.0004774710644296578,
721
+ "loss": 0.7387,
722
  "step": 48
723
  },
724
  {
725
  "epoch": 2.526315789473684,
726
+ "eval_loss": 0.7041357755661011,
727
+ "eval_runtime": 0.8999,
728
+ "eval_samples_per_second": 33.337,
729
+ "eval_steps_per_second": 4.445,
730
  "step": 48
731
  },
732
  {
733
  "epoch": 2.5789473684210527,
734
+ "grad_norm": 2.833061456680298,
735
+ "learning_rate": 0.00047606426048284813,
736
+ "loss": 0.8343,
737
  "step": 49
738
  },
739
  {
740
  "epoch": 2.5789473684210527,
741
+ "eval_loss": 0.6822550296783447,
742
+ "eval_runtime": 0.9005,
743
+ "eval_samples_per_second": 33.316,
744
+ "eval_steps_per_second": 4.442,
745
  "step": 49
746
  },
747
  {
748
  "epoch": 2.6315789473684212,
749
+ "grad_norm": 2.0135650634765625,
750
+ "learning_rate": 0.00047461705578290833,
751
+ "loss": 0.7768,
752
  "step": 50
753
  },
754
  {
755
  "epoch": 2.6315789473684212,
756
+ "eval_loss": 0.6283606886863708,
757
+ "eval_runtime": 0.8974,
758
+ "eval_samples_per_second": 33.428,
759
+ "eval_steps_per_second": 4.457,
760
  "step": 50
761
  },
762
  {
763
  "epoch": 2.6842105263157894,
764
+ "grad_norm": 1.5658601522445679,
765
+ "learning_rate": 0.0004731297089649703,
766
+ "loss": 0.7418,
767
  "step": 51
768
  },
769
  {
770
  "epoch": 2.6842105263157894,
771
+ "eval_loss": 0.6374291181564331,
772
+ "eval_runtime": 0.8918,
773
+ "eval_samples_per_second": 33.641,
774
+ "eval_steps_per_second": 4.485,
775
  "step": 51
776
  },
777
  {
778
  "epoch": 2.736842105263158,
779
+ "grad_norm": 1.7403415441513062,
780
+ "learning_rate": 0.0004716024858381075,
781
+ "loss": 0.7866,
782
  "step": 52
783
  },
784
  {
785
  "epoch": 2.736842105263158,
786
+ "eval_loss": 0.6586597561836243,
787
+ "eval_runtime": 0.8957,
788
+ "eval_samples_per_second": 33.495,
789
+ "eval_steps_per_second": 4.466,
790
  "step": 52
791
  },
792
  {
793
  "epoch": 2.7894736842105265,
794
+ "grad_norm": 1.519404411315918,
795
+ "learning_rate": 0.00047003565933783123,
796
+ "loss": 0.8354,
797
  "step": 53
798
  },
799
  {
800
  "epoch": 2.7894736842105265,
801
+ "eval_loss": 0.691727340221405,
802
+ "eval_runtime": 0.8923,
803
+ "eval_samples_per_second": 33.62,
804
+ "eval_steps_per_second": 4.483,
805
  "step": 53
806
  },
807
  {
808
  "epoch": 2.8421052631578947,
809
+ "grad_norm": 1.5139788389205933,
810
+ "learning_rate": 0.0004684295094773134,
811
+ "loss": 0.7804,
812
  "step": 54
813
  },
814
  {
815
  "epoch": 2.8421052631578947,
816
+ "eval_loss": 0.6508743762969971,
817
+ "eval_runtime": 0.8929,
818
+ "eval_samples_per_second": 33.598,
819
+ "eval_steps_per_second": 4.48,
820
  "step": 54
821
  },
822
  {
823
  "epoch": 2.8947368421052633,
824
+ "grad_norm": 1.5480479001998901,
825
+ "learning_rate": 0.00046678432329734434,
826
+ "loss": 0.7253,
827
  "step": 55
828
  },
829
  {
830
  "epoch": 2.8947368421052633,
831
+ "eval_loss": 0.6439611911773682,
832
+ "eval_runtime": 0.894,
833
+ "eval_samples_per_second": 33.557,
834
+ "eval_steps_per_second": 4.474,
835
  "step": 55
836
+ },
837
+ {
838
+ "epoch": 2.9473684210526314,
839
+ "grad_norm": 1.5994068384170532,
840
+ "learning_rate": 0.00046510039481503486,
841
+ "loss": 0.842,
842
+ "step": 56
843
+ },
844
+ {
845
+ "epoch": 2.9473684210526314,
846
+ "eval_loss": 0.6327024698257446,
847
+ "eval_runtime": 0.9041,
848
+ "eval_samples_per_second": 33.184,
849
+ "eval_steps_per_second": 4.424,
850
+ "step": 56
851
+ },
852
+ {
853
+ "epoch": 3.0,
854
+ "grad_norm": 1.6054733991622925,
855
+ "learning_rate": 0.00046337802497127117,
856
+ "loss": 0.8073,
857
+ "step": 57
858
+ },
859
+ {
860
+ "epoch": 3.0,
861
+ "eval_loss": 0.6213096976280212,
862
+ "eval_runtime": 0.8992,
863
+ "eval_samples_per_second": 33.362,
864
+ "eval_steps_per_second": 4.448,
865
+ "step": 57
866
+ },
867
+ {
868
+ "epoch": 3.0526315789473686,
869
+ "grad_norm": 2.5787405967712402,
870
+ "learning_rate": 0.00046161752157693284,
871
+ "loss": 0.6017,
872
+ "step": 58
873
+ },
874
+ {
875
+ "epoch": 3.0526315789473686,
876
+ "eval_loss": 0.5892248749732971,
877
+ "eval_runtime": 0.8922,
878
+ "eval_samples_per_second": 33.624,
879
+ "eval_steps_per_second": 4.483,
880
+ "step": 58
881
+ },
882
+ {
883
+ "epoch": 3.1052631578947367,
884
+ "grad_norm": 1.7601501941680908,
885
+ "learning_rate": 0.0004598191992578828,
886
+ "loss": 0.6071,
887
+ "step": 59
888
+ },
889
+ {
890
+ "epoch": 3.1052631578947367,
891
+ "eval_loss": 0.5735067129135132,
892
+ "eval_runtime": 0.8924,
893
+ "eval_samples_per_second": 33.618,
894
+ "eval_steps_per_second": 4.482,
895
+ "step": 59
896
+ },
897
+ {
898
+ "epoch": 3.1578947368421053,
899
+ "grad_norm": 1.7480543851852417,
900
+ "learning_rate": 0.00045798337939873923,
901
+ "loss": 0.6597,
902
+ "step": 60
903
+ },
904
+ {
905
+ "epoch": 3.1578947368421053,
906
+ "eval_loss": 0.5306870341300964,
907
+ "eval_runtime": 0.8938,
908
+ "eval_samples_per_second": 33.566,
909
+ "eval_steps_per_second": 4.475,
910
+ "step": 60
911
+ },
912
+ {
913
+ "epoch": 3.2105263157894735,
914
+ "grad_norm": 2.3808937072753906,
915
+ "learning_rate": 0.0004561103900854401,
916
+ "loss": 0.5372,
917
+ "step": 61
918
+ },
919
+ {
920
+ "epoch": 3.2105263157894735,
921
+ "eval_loss": 0.535223662853241,
922
+ "eval_runtime": 0.8966,
923
+ "eval_samples_per_second": 33.459,
924
+ "eval_steps_per_second": 4.461,
925
+ "step": 61
926
+ },
927
+ {
928
+ "epoch": 3.263157894736842,
929
+ "grad_norm": 1.8272178173065186,
930
+ "learning_rate": 0.0004542005660466094,
931
+ "loss": 0.5399,
932
+ "step": 62
933
+ },
934
+ {
935
+ "epoch": 3.263157894736842,
936
+ "eval_loss": 0.5316082239151001,
937
+ "eval_runtime": 0.8994,
938
+ "eval_samples_per_second": 33.354,
939
+ "eval_steps_per_second": 4.447,
940
+ "step": 62
941
+ },
942
+ {
943
+ "epoch": 3.3157894736842106,
944
+ "grad_norm": 2.0635435581207275,
945
+ "learning_rate": 0.0004522542485937369,
946
+ "loss": 0.5531,
947
+ "step": 63
948
+ },
949
+ {
950
+ "epoch": 3.3157894736842106,
951
+ "eval_loss": 0.5134085416793823,
952
+ "eval_runtime": 0.8937,
953
+ "eval_samples_per_second": 33.567,
954
+ "eval_steps_per_second": 4.476,
955
+ "step": 63
956
+ },
957
+ {
958
+ "epoch": 3.3684210526315788,
959
+ "grad_norm": 2.268183708190918,
960
+ "learning_rate": 0.0004502717855601809,
961
+ "loss": 0.5291,
962
+ "step": 64
963
+ },
964
+ {
965
+ "epoch": 3.3684210526315788,
966
+ "eval_loss": 0.5419598817825317,
967
+ "eval_runtime": 0.8959,
968
+ "eval_samples_per_second": 33.486,
969
+ "eval_steps_per_second": 4.465,
970
+ "step": 64
971
+ },
972
+ {
973
+ "epoch": 3.4210526315789473,
974
+ "grad_norm": 1.8800358772277832,
975
+ "learning_rate": 0.0004482535312390058,
976
+ "loss": 0.5501,
977
+ "step": 65
978
+ },
979
+ {
980
+ "epoch": 3.4210526315789473,
981
+ "eval_loss": 0.5209227800369263,
982
+ "eval_runtime": 0.8927,
983
+ "eval_samples_per_second": 33.606,
984
+ "eval_steps_per_second": 4.481,
985
+ "step": 65
986
+ },
987
+ {
988
+ "epoch": 3.473684210526316,
989
+ "grad_norm": 3.1507558822631836,
990
+ "learning_rate": 0.00044619984631966527,
991
+ "loss": 0.5309,
992
+ "step": 66
993
+ },
994
+ {
995
+ "epoch": 3.473684210526316,
996
+ "eval_loss": 0.536996603012085,
997
+ "eval_runtime": 0.8951,
998
+ "eval_samples_per_second": 33.517,
999
+ "eval_steps_per_second": 4.469,
1000
+ "step": 66
1001
+ },
1002
+ {
1003
+ "epoch": 3.526315789473684,
1004
+ "grad_norm": 3.5700478553771973,
1005
+ "learning_rate": 0.0004441110978235418,
1006
+ "loss": 0.7223,
1007
+ "step": 67
1008
+ },
1009
+ {
1010
+ "epoch": 3.526315789473684,
1011
+ "eval_loss": 0.5140640139579773,
1012
+ "eval_runtime": 0.8962,
1013
+ "eval_samples_per_second": 33.474,
1014
+ "eval_steps_per_second": 4.463,
1015
+ "step": 67
1016
+ },
1017
+ {
1018
+ "epoch": 3.5789473684210527,
1019
+ "grad_norm": 1.758971929550171,
1020
+ "learning_rate": 0.0004419876590383554,
1021
+ "loss": 0.6927,
1022
+ "step": 68
1023
+ },
1024
+ {
1025
+ "epoch": 3.5789473684210527,
1026
+ "eval_loss": 0.47072505950927734,
1027
+ "eval_runtime": 0.9127,
1028
+ "eval_samples_per_second": 32.87,
1029
+ "eval_steps_per_second": 4.383,
1030
+ "step": 68
1031
+ },
1032
+ {
1033
+ "epoch": 3.6315789473684212,
1034
+ "grad_norm": 1.5274709463119507,
1035
+ "learning_rate": 0.00043982990945145146,
1036
+ "loss": 0.4762,
1037
+ "step": 69
1038
+ },
1039
+ {
1040
+ "epoch": 3.6315789473684212,
1041
+ "eval_loss": 0.4518219828605652,
1042
+ "eval_runtime": 0.8967,
1043
+ "eval_samples_per_second": 33.456,
1044
+ "eval_steps_per_second": 4.461,
1045
+ "step": 69
1046
+ },
1047
+ {
1048
+ "epoch": 3.6842105263157894,
1049
+ "grad_norm": 1.7685797214508057,
1050
+ "learning_rate": 0.0004376382346819819,
1051
+ "loss": 0.5629,
1052
+ "step": 70
1053
+ },
1054
+ {
1055
+ "epoch": 3.6842105263157894,
1056
+ "eval_loss": 0.40707579255104065,
1057
+ "eval_runtime": 0.8934,
1058
+ "eval_samples_per_second": 33.581,
1059
+ "eval_steps_per_second": 4.478,
1060
+ "step": 70
1061
+ },
1062
+ {
1063
+ "epoch": 3.736842105263158,
1064
+ "grad_norm": 1.6618574857711792,
1065
+ "learning_rate": 0.00043541302641198946,
1066
+ "loss": 0.5877,
1067
+ "step": 71
1068
+ },
1069
+ {
1070
+ "epoch": 3.736842105263158,
1071
+ "eval_loss": 0.3780651390552521,
1072
+ "eval_runtime": 0.9024,
1073
+ "eval_samples_per_second": 33.246,
1074
+ "eval_steps_per_second": 4.433,
1075
+ "step": 71
1076
+ },
1077
+ {
1078
+ "epoch": 3.7894736842105265,
1079
+ "grad_norm": 1.542702317237854,
1080
+ "learning_rate": 0.00043315468231640834,
1081
+ "loss": 0.5222,
1082
+ "step": 72
1083
+ },
1084
+ {
1085
+ "epoch": 3.7894736842105265,
1086
+ "eval_loss": 0.3732970356941223,
1087
+ "eval_runtime": 0.9166,
1088
+ "eval_samples_per_second": 32.73,
1089
+ "eval_steps_per_second": 4.364,
1090
+ "step": 72
1091
+ },
1092
+ {
1093
+ "epoch": 3.8421052631578947,
1094
+ "grad_norm": 1.8039391040802002,
1095
+ "learning_rate": 0.00043086360599199516,
1096
+ "loss": 0.5238,
1097
+ "step": 73
1098
+ },
1099
+ {
1100
+ "epoch": 3.8421052631578947,
1101
+ "eval_loss": 0.3568810820579529,
1102
+ "eval_runtime": 0.9031,
1103
+ "eval_samples_per_second": 33.218,
1104
+ "eval_steps_per_second": 4.429,
1105
+ "step": 73
1106
+ },
1107
+ {
1108
+ "epoch": 3.8947368421052633,
1109
+ "grad_norm": 1.6215863227844238,
1110
+ "learning_rate": 0.0004285402068852002,
1111
+ "loss": 0.6504,
1112
+ "step": 74
1113
+ },
1114
+ {
1115
+ "epoch": 3.8947368421052633,
1116
+ "eval_loss": 0.3885921835899353,
1117
+ "eval_runtime": 0.896,
1118
+ "eval_samples_per_second": 33.483,
1119
+ "eval_steps_per_second": 4.464,
1120
+ "step": 74
1121
+ },
1122
+ {
1123
+ "epoch": 3.9473684210526314,
1124
+ "grad_norm": 1.5152952671051025,
1125
+ "learning_rate": 0.00042618490021899383,
1126
+ "loss": 0.5694,
1127
+ "step": 75
1128
+ },
1129
+ {
1130
+ "epoch": 3.9473684210526314,
1131
+ "eval_loss": 0.38745489716529846,
1132
+ "eval_runtime": 0.8939,
1133
+ "eval_samples_per_second": 33.562,
1134
+ "eval_steps_per_second": 4.475,
1135
+ "step": 75
1136
+ },
1137
+ {
1138
+ "epoch": 4.0,
1139
+ "grad_norm": 2.6989200115203857,
1140
+ "learning_rate": 0.00042379810691866064,
1141
+ "loss": 0.5849,
1142
+ "step": 76
1143
+ },
1144
+ {
1145
+ "epoch": 4.0,
1146
+ "eval_loss": 0.42535698413848877,
1147
+ "eval_runtime": 0.9073,
1148
+ "eval_samples_per_second": 33.066,
1149
+ "eval_steps_per_second": 4.409,
1150
+ "step": 76
1151
+ },
1152
+ {
1153
+ "epoch": 4.052631578947368,
1154
+ "grad_norm": 1.7381691932678223,
1155
+ "learning_rate": 0.00042138025353657407,
1156
+ "loss": 0.3779,
1157
+ "step": 77
1158
+ },
1159
+ {
1160
+ "epoch": 4.052631578947368,
1161
+ "eval_loss": 0.37115439772605896,
1162
+ "eval_runtime": 0.9112,
1163
+ "eval_samples_per_second": 32.922,
1164
+ "eval_steps_per_second": 4.39,
1165
+ "step": 77
1166
+ },
1167
+ {
1168
+ "epoch": 4.105263157894737,
1169
+ "grad_norm": 2.188385248184204,
1170
+ "learning_rate": 0.00041893177217596633,
1171
+ "loss": 0.44,
1172
+ "step": 78
1173
+ },
1174
+ {
1175
+ "epoch": 4.105263157894737,
1176
+ "eval_loss": 0.2926563322544098,
1177
+ "eval_runtime": 0.8982,
1178
+ "eval_samples_per_second": 33.401,
1179
+ "eval_steps_per_second": 4.453,
1180
+ "step": 78
1181
+ },
1182
+ {
1183
+ "epoch": 4.157894736842105,
1184
+ "grad_norm": 2.3652961254119873,
1185
+ "learning_rate": 0.0004164531004137049,
1186
+ "loss": 0.3639,
1187
+ "step": 79
1188
+ },
1189
+ {
1190
+ "epoch": 4.157894736842105,
1191
+ "eval_loss": 0.2751067876815796,
1192
+ "eval_runtime": 0.9146,
1193
+ "eval_samples_per_second": 32.8,
1194
+ "eval_steps_per_second": 4.373,
1195
+ "step": 79
1196
+ },
1197
+ {
1198
+ "epoch": 4.2105263157894735,
1199
+ "grad_norm": 2.165874719619751,
1200
+ "learning_rate": 0.0004139446812220924,
1201
+ "loss": 0.2683,
1202
+ "step": 80
1203
+ },
1204
+ {
1205
+ "epoch": 4.2105263157894735,
1206
+ "eval_loss": 0.2685202360153198,
1207
+ "eval_runtime": 0.9124,
1208
+ "eval_samples_per_second": 32.881,
1209
+ "eval_steps_per_second": 4.384,
1210
+ "step": 80
1211
+ },
1212
+ {
1213
+ "epoch": 4.2631578947368425,
1214
+ "grad_norm": 1.7391912937164307,
1215
+ "learning_rate": 0.0004114069628897006,
1216
+ "loss": 0.2993,
1217
+ "step": 81
1218
+ },
1219
+ {
1220
+ "epoch": 4.2631578947368425,
1221
+ "eval_loss": 0.33646491169929504,
1222
+ "eval_runtime": 0.8952,
1223
+ "eval_samples_per_second": 33.51,
1224
+ "eval_steps_per_second": 4.468,
1225
+ "step": 81
1226
+ },
1227
+ {
1228
+ "epoch": 4.315789473684211,
1229
+ "grad_norm": 3.65714693069458,
1230
+ "learning_rate": 0.0004088403989412559,
1231
+ "loss": 0.4252,
1232
+ "step": 82
1233
+ },
1234
+ {
1235
+ "epoch": 4.315789473684211,
1236
+ "eval_loss": 0.2839888632297516,
1237
+ "eval_runtime": 0.9057,
1238
+ "eval_samples_per_second": 33.123,
1239
+ "eval_steps_per_second": 4.416,
1240
+ "step": 82
1241
+ },
1242
+ {
1243
+ "epoch": 4.368421052631579,
1244
+ "grad_norm": 2.1762771606445312,
1245
+ "learning_rate": 0.00040624544805658794,
1246
+ "loss": 0.3304,
1247
+ "step": 83
1248
+ },
1249
+ {
1250
+ "epoch": 4.368421052631579,
1251
+ "eval_loss": 0.27002134919166565,
1252
+ "eval_runtime": 0.8939,
1253
+ "eval_samples_per_second": 33.562,
1254
+ "eval_steps_per_second": 4.475,
1255
+ "step": 83
1256
+ },
1257
+ {
1258
+ "epoch": 4.421052631578947,
1259
+ "grad_norm": 2.1018354892730713,
1260
+ "learning_rate": 0.00040362257398865713,
1261
+ "loss": 0.4506,
1262
+ "step": 84
1263
+ },
1264
+ {
1265
+ "epoch": 4.421052631578947,
1266
+ "eval_loss": 0.2557659149169922,
1267
+ "eval_runtime": 0.8969,
1268
+ "eval_samples_per_second": 33.45,
1269
+ "eval_steps_per_second": 4.46,
1270
+ "step": 84
1271
+ },
1272
+ {
1273
+ "epoch": 4.473684210526316,
1274
+ "grad_norm": 1.7509180307388306,
1275
+ "learning_rate": 0.00040097224548067613,
1276
+ "loss": 0.3731,
1277
+ "step": 85
1278
+ },
1279
+ {
1280
+ "epoch": 4.473684210526316,
1281
+ "eval_loss": 0.26859304308891296,
1282
+ "eval_runtime": 0.9009,
1283
+ "eval_samples_per_second": 33.299,
1284
+ "eval_steps_per_second": 4.44,
1285
+ "step": 85
1286
+ },
1287
+ {
1288
+ "epoch": 4.526315789473684,
1289
+ "grad_norm": 1.971816897392273,
1290
+ "learning_rate": 0.0003982949361823388,
1291
+ "loss": 0.38,
1292
+ "step": 86
1293
+ },
1294
+ {
1295
+ "epoch": 4.526315789473684,
1296
+ "eval_loss": 0.2624681293964386,
1297
+ "eval_runtime": 0.8949,
1298
+ "eval_samples_per_second": 33.524,
1299
+ "eval_steps_per_second": 4.47,
1300
+ "step": 86
1301
+ },
1302
+ {
1303
+ "epoch": 4.578947368421053,
1304
+ "grad_norm": 1.4714068174362183,
1305
+ "learning_rate": 0.0003955911245651726,
1306
+ "loss": 0.3944,
1307
+ "step": 87
1308
+ },
1309
+ {
1310
+ "epoch": 4.578947368421053,
1311
+ "eval_loss": 0.23652420938014984,
1312
+ "eval_runtime": 0.8952,
1313
+ "eval_samples_per_second": 33.511,
1314
+ "eval_steps_per_second": 4.468,
1315
+ "step": 87
1316
+ },
1317
+ {
1318
+ "epoch": 4.631578947368421,
1319
+ "grad_norm": 2.6970834732055664,
1320
+ "learning_rate": 0.0003928612938370292,
1321
+ "loss": 0.3374,
1322
+ "step": 88
1323
+ },
1324
+ {
1325
+ "epoch": 4.631578947368421,
1326
+ "eval_loss": 0.2716277241706848,
1327
+ "eval_runtime": 0.8932,
1328
+ "eval_samples_per_second": 33.588,
1329
+ "eval_steps_per_second": 4.478,
1330
+ "step": 88
1331
+ },
1332
+ {
1333
+ "epoch": 4.684210526315789,
1334
+ "grad_norm": 1.9066615104675293,
1335
+ "learning_rate": 0.00039010593185572867,
1336
+ "loss": 0.2442,
1337
+ "step": 89
1338
+ },
1339
+ {
1340
+ "epoch": 4.684210526315789,
1341
+ "eval_loss": 0.2999991476535797,
1342
+ "eval_runtime": 0.8939,
1343
+ "eval_samples_per_second": 33.559,
1344
+ "eval_steps_per_second": 4.475,
1345
+ "step": 89
1346
+ },
1347
+ {
1348
+ "epoch": 4.7368421052631575,
1349
+ "grad_norm": 2.6232354640960693,
1350
+ "learning_rate": 0.00038732553104187296,
1351
+ "loss": 0.2857,
1352
+ "step": 90
1353
+ },
1354
+ {
1355
+ "epoch": 4.7368421052631575,
1356
+ "eval_loss": 0.2302989959716797,
1357
+ "eval_runtime": 0.8938,
1358
+ "eval_samples_per_second": 33.564,
1359
+ "eval_steps_per_second": 4.475,
1360
+ "step": 90
1361
+ },
1362
+ {
1363
+ "epoch": 4.7894736842105265,
1364
+ "grad_norm": 2.0710129737854004,
1365
+ "learning_rate": 0.0003845205882908432,
1366
+ "loss": 0.4195,
1367
+ "step": 91
1368
+ },
1369
+ {
1370
+ "epoch": 4.7894736842105265,
1371
+ "eval_loss": 0.21816590428352356,
1372
+ "eval_runtime": 0.9251,
1373
+ "eval_samples_per_second": 32.429,
1374
+ "eval_steps_per_second": 4.324,
1375
+ "step": 91
1376
+ },
1377
+ {
1378
+ "epoch": 4.842105263157895,
1379
+ "grad_norm": 1.8006062507629395,
1380
+ "learning_rate": 0.0003816916048839979,
1381
+ "loss": 0.2859,
1382
+ "step": 92
1383
+ },
1384
+ {
1385
+ "epoch": 4.842105263157895,
1386
+ "eval_loss": 0.21071405708789825,
1387
+ "eval_runtime": 0.8965,
1388
+ "eval_samples_per_second": 33.462,
1389
+ "eval_steps_per_second": 4.462,
1390
+ "step": 92
1391
+ },
1392
+ {
1393
+ "epoch": 4.894736842105263,
1394
+ "grad_norm": 1.6352888345718384,
1395
+ "learning_rate": 0.0003788390863990875,
1396
+ "loss": 0.4275,
1397
+ "step": 93
1398
+ },
1399
+ {
1400
+ "epoch": 4.894736842105263,
1401
+ "eval_loss": 0.20206846296787262,
1402
+ "eval_runtime": 0.9052,
1403
+ "eval_samples_per_second": 33.144,
1404
+ "eval_steps_per_second": 4.419,
1405
+ "step": 93
1406
+ },
1407
+ {
1408
+ "epoch": 4.947368421052632,
1409
+ "grad_norm": 1.6399378776550293,
1410
+ "learning_rate": 0.00037596354261990007,
1411
+ "loss": 0.389,
1412
+ "step": 94
1413
+ },
1414
+ {
1415
+ "epoch": 4.947368421052632,
1416
+ "eval_loss": 0.19467315077781677,
1417
+ "eval_runtime": 0.8973,
1418
+ "eval_samples_per_second": 33.435,
1419
+ "eval_steps_per_second": 4.458,
1420
+ "step": 94
1421
+ },
1422
+ {
1423
+ "epoch": 5.0,
1424
+ "grad_norm": 1.5680173635482788,
1425
+ "learning_rate": 0.0003730654874451569,
1426
+ "loss": 0.395,
1427
+ "step": 95
1428
+ },
1429
+ {
1430
+ "epoch": 5.0,
1431
+ "eval_loss": 0.19546455144882202,
1432
+ "eval_runtime": 0.91,
1433
+ "eval_samples_per_second": 32.968,
1434
+ "eval_steps_per_second": 4.396,
1435
+ "step": 95
1436
+ },
1437
+ {
1438
+ "epoch": 5.052631578947368,
1439
+ "grad_norm": 1.0308386087417603,
1440
+ "learning_rate": 0.00037014543879667093,
1441
+ "loss": 0.1384,
1442
+ "step": 96
1443
+ },
1444
+ {
1445
+ "epoch": 5.052631578947368,
1446
+ "eval_loss": 0.18969732522964478,
1447
+ "eval_runtime": 0.9021,
1448
+ "eval_samples_per_second": 33.258,
1449
+ "eval_steps_per_second": 4.434,
1450
+ "step": 96
1451
+ },
1452
+ {
1453
+ "epoch": 5.105263157894737,
1454
+ "grad_norm": 1.4042502641677856,
1455
+ "learning_rate": 0.0003672039185267878,
1456
+ "loss": 0.2291,
1457
+ "step": 97
1458
+ },
1459
+ {
1460
+ "epoch": 5.105263157894737,
1461
+ "eval_loss": 0.16800740361213684,
1462
+ "eval_runtime": 0.8938,
1463
+ "eval_samples_per_second": 33.563,
1464
+ "eval_steps_per_second": 4.475,
1465
+ "step": 97
1466
+ },
1467
+ {
1468
+ "epoch": 5.157894736842105,
1469
+ "grad_norm": 1.6313552856445312,
1470
+ "learning_rate": 0.00036424145232512333,
1471
+ "loss": 0.1736,
1472
+ "step": 98
1473
+ },
1474
+ {
1475
+ "epoch": 5.157894736842105,
1476
+ "eval_loss": 0.16714099049568176,
1477
+ "eval_runtime": 0.9009,
1478
+ "eval_samples_per_second": 33.301,
1479
+ "eval_steps_per_second": 4.44,
1480
+ "step": 98
1481
+ },
1482
+ {
1483
+ "epoch": 5.2105263157894735,
1484
+ "grad_norm": 1.8922698497772217,
1485
+ "learning_rate": 0.0003612585696246158,
1486
+ "loss": 0.1677,
1487
+ "step": 99
1488
+ },
1489
+ {
1490
+ "epoch": 5.2105263157894735,
1491
+ "eval_loss": 0.179762065410614,
1492
+ "eval_runtime": 0.9039,
1493
+ "eval_samples_per_second": 33.188,
1494
+ "eval_steps_per_second": 4.425,
1495
+ "step": 99
1496
+ },
1497
+ {
1498
+ "epoch": 5.2631578947368425,
1499
+ "grad_norm": 2.409526824951172,
1500
+ "learning_rate": 0.0003582558035069091,
1501
+ "loss": 0.2379,
1502
+ "step": 100
1503
+ },
1504
+ {
1505
+ "epoch": 5.2631578947368425,
1506
+ "eval_loss": 0.1902371197938919,
1507
+ "eval_runtime": 0.9097,
1508
+ "eval_samples_per_second": 32.98,
1509
+ "eval_steps_per_second": 4.397,
1510
+ "step": 100
1511
+ },
1512
+ {
1513
+ "epoch": 5.315789473684211,
1514
+ "grad_norm": 2.084869146347046,
1515
+ "learning_rate": 0.0003552336906070838,
1516
+ "loss": 0.2165,
1517
+ "step": 101
1518
+ },
1519
+ {
1520
+ "epoch": 5.315789473684211,
1521
+ "eval_loss": 0.17252177000045776,
1522
+ "eval_runtime": 0.8948,
1523
+ "eval_samples_per_second": 33.528,
1524
+ "eval_steps_per_second": 4.47,
1525
+ "step": 101
1526
+ },
1527
+ {
1528
+ "epoch": 5.368421052631579,
1529
+ "grad_norm": 1.655718207359314,
1530
+ "learning_rate": 0.000352192771017753,
1531
+ "loss": 0.223,
1532
+ "step": 102
1533
+ },
1534
+ {
1535
+ "epoch": 5.368421052631579,
1536
+ "eval_loss": 0.18867380917072296,
1537
+ "eval_runtime": 0.8956,
1538
+ "eval_samples_per_second": 33.495,
1539
+ "eval_steps_per_second": 4.466,
1540
+ "step": 102
1541
+ },
1542
+ {
1543
+ "epoch": 5.421052631578947,
1544
+ "grad_norm": 2.672633409500122,
1545
+ "learning_rate": 0.0003491335881925407,
1546
+ "loss": 0.161,
1547
+ "step": 103
1548
+ },
1549
+ {
1550
+ "epoch": 5.421052631578947,
1551
+ "eval_loss": 0.1944020837545395,
1552
+ "eval_runtime": 0.8924,
1553
+ "eval_samples_per_second": 33.616,
1554
+ "eval_steps_per_second": 4.482,
1555
+ "step": 103
1556
+ },
1557
+ {
1558
+ "epoch": 5.473684210526316,
1559
+ "grad_norm": 1.9712008237838745,
1560
+ "learning_rate": 0.0003460566888489593,
1561
+ "loss": 0.2525,
1562
+ "step": 104
1563
+ },
1564
+ {
1565
+ "epoch": 5.473684210526316,
1566
+ "eval_loss": 0.17671068012714386,
1567
+ "eval_runtime": 0.897,
1568
+ "eval_samples_per_second": 33.446,
1569
+ "eval_steps_per_second": 4.459,
1570
+ "step": 104
1571
+ },
1572
+ {
1573
+ "epoch": 5.526315789473684,
1574
+ "grad_norm": 2.2153072357177734,
1575
+ "learning_rate": 0.00034296262287070335,
1576
+ "loss": 0.2105,
1577
+ "step": 105
1578
+ },
1579
+ {
1580
+ "epoch": 5.526315789473684,
1581
+ "eval_loss": 0.1715732216835022,
1582
+ "eval_runtime": 0.8951,
1583
+ "eval_samples_per_second": 33.514,
1584
+ "eval_steps_per_second": 4.469,
1585
+ "step": 105
1586
+ },
1587
+ {
1588
+ "epoch": 5.578947368421053,
1589
+ "grad_norm": 1.8106168508529663,
1590
+ "learning_rate": 0.0003398519432093782,
1591
+ "loss": 0.259,
1592
+ "step": 106
1593
+ },
1594
+ {
1595
+ "epoch": 5.578947368421053,
1596
+ "eval_loss": 0.1465868353843689,
1597
+ "eval_runtime": 0.9077,
1598
+ "eval_samples_per_second": 33.051,
1599
+ "eval_steps_per_second": 4.407,
1600
+ "step": 106
1601
+ },
1602
+ {
1603
+ "epoch": 5.631578947368421,
1604
+ "grad_norm": 2.1159439086914062,
1605
+ "learning_rate": 0.0003367252057856802,
1606
+ "loss": 0.2065,
1607
+ "step": 107
1608
+ },
1609
+ {
1610
+ "epoch": 5.631578947368421,
1611
+ "eval_loss": 0.14219093322753906,
1612
+ "eval_runtime": 0.9049,
1613
+ "eval_samples_per_second": 33.154,
1614
+ "eval_steps_per_second": 4.42,
1615
+ "step": 107
1616
+ },
1617
+ {
1618
+ "epoch": 5.684210526315789,
1619
+ "grad_norm": 1.4467761516571045,
1620
+ "learning_rate": 0.00033358296939004547,
1621
+ "loss": 0.2083,
1622
+ "step": 108
1623
+ },
1624
+ {
1625
+ "epoch": 5.684210526315789,
1626
+ "eval_loss": 0.1406753957271576,
1627
+ "eval_runtime": 0.8954,
1628
+ "eval_samples_per_second": 33.505,
1629
+ "eval_steps_per_second": 4.467,
1630
+ "step": 108
1631
+ },
1632
+ {
1633
+ "epoch": 5.7368421052631575,
1634
+ "grad_norm": 1.3671239614486694,
1635
+ "learning_rate": 0.00033042579558278717,
1636
+ "loss": 0.1825,
1637
+ "step": 109
1638
+ },
1639
+ {
1640
+ "epoch": 5.7368421052631575,
1641
+ "eval_loss": 0.13007155060768127,
1642
+ "eval_runtime": 0.8998,
1643
+ "eval_samples_per_second": 33.342,
1644
+ "eval_steps_per_second": 4.446,
1645
+ "step": 109
1646
+ },
1647
+ {
1648
+ "epoch": 5.7894736842105265,
1649
+ "grad_norm": 1.479944109916687,
1650
+ "learning_rate": 0.00032725424859373687,
1651
+ "loss": 0.2244,
1652
+ "step": 110
1653
+ },
1654
+ {
1655
+ "epoch": 5.7894736842105265,
1656
+ "eval_loss": 0.12692232429981232,
1657
+ "eval_runtime": 0.901,
1658
+ "eval_samples_per_second": 33.298,
1659
+ "eval_steps_per_second": 4.44,
1660
+ "step": 110
1661
+ },
1662
+ {
1663
+ "epoch": 5.842105263157895,
1664
+ "grad_norm": 1.5173969268798828,
1665
+ "learning_rate": 0.0003240688952214085,
1666
+ "loss": 0.2273,
1667
+ "step": 111
1668
+ },
1669
+ {
1670
+ "epoch": 5.842105263157895,
1671
+ "eval_loss": 0.12454597651958466,
1672
+ "eval_runtime": 0.8987,
1673
+ "eval_samples_per_second": 33.382,
1674
+ "eval_steps_per_second": 4.451,
1675
+ "step": 111
1676
+ },
1677
+ {
1678
+ "epoch": 5.894736842105263,
1679
+ "grad_norm": 2.7870988845825195,
1680
+ "learning_rate": 0.00032087030473170445,
1681
+ "loss": 0.2101,
1682
+ "step": 112
1683
+ },
1684
+ {
1685
+ "epoch": 5.894736842105263,
1686
+ "eval_loss": 0.12002909928560257,
1687
+ "eval_runtime": 0.893,
1688
+ "eval_samples_per_second": 33.593,
1689
+ "eval_steps_per_second": 4.479,
1690
+ "step": 112
1691
+ },
1692
+ {
1693
+ "epoch": 5.947368421052632,
1694
+ "grad_norm": 1.3659342527389526,
1695
+ "learning_rate": 0.00031765904875617973,
1696
+ "loss": 0.1882,
1697
+ "step": 113
1698
+ },
1699
+ {
1700
+ "epoch": 5.947368421052632,
1701
+ "eval_loss": 0.10573837906122208,
1702
+ "eval_runtime": 0.8956,
1703
+ "eval_samples_per_second": 33.496,
1704
+ "eval_steps_per_second": 4.466,
1705
+ "step": 113
1706
+ },
1707
+ {
1708
+ "epoch": 6.0,
1709
+ "grad_norm": 1.8464044332504272,
1710
+ "learning_rate": 0.00031443570118988356,
1711
+ "loss": 0.2285,
1712
+ "step": 114
1713
+ },
1714
+ {
1715
+ "epoch": 6.0,
1716
+ "eval_loss": 0.10221625119447708,
1717
+ "eval_runtime": 0.8955,
1718
+ "eval_samples_per_second": 33.501,
1719
+ "eval_steps_per_second": 4.467,
1720
+ "step": 114
1721
+ },
1722
+ {
1723
+ "epoch": 6.052631578947368,
1724
+ "grad_norm": 1.3894392251968384,
1725
+ "learning_rate": 0.00031120083808879663,
1726
+ "loss": 0.1115,
1727
+ "step": 115
1728
+ },
1729
+ {
1730
+ "epoch": 6.052631578947368,
1731
+ "eval_loss": 0.09458151459693909,
1732
+ "eval_runtime": 0.8981,
1733
+ "eval_samples_per_second": 33.405,
1734
+ "eval_steps_per_second": 4.454,
1735
+ "step": 115
1736
+ },
1737
+ {
1738
+ "epoch": 6.105263157894737,
1739
+ "grad_norm": 0.933142364025116,
1740
+ "learning_rate": 0.0003079550375668821,
1741
+ "loss": 0.0888,
1742
+ "step": 116
1743
+ },
1744
+ {
1745
+ "epoch": 6.105263157894737,
1746
+ "eval_loss": 0.09364737570285797,
1747
+ "eval_runtime": 0.9403,
1748
+ "eval_samples_per_second": 31.905,
1749
+ "eval_steps_per_second": 4.254,
1750
+ "step": 116
1751
+ },
1752
+ {
1753
+ "epoch": 6.157894736842105,
1754
+ "grad_norm": 0.9676756262779236,
1755
+ "learning_rate": 0.00030469887969276877,
1756
+ "loss": 0.0785,
1757
+ "step": 117
1758
+ },
1759
+ {
1760
+ "epoch": 6.157894736842105,
1761
+ "eval_loss": 0.10635325312614441,
1762
+ "eval_runtime": 0.903,
1763
+ "eval_samples_per_second": 33.224,
1764
+ "eval_steps_per_second": 4.43,
1765
+ "step": 117
1766
+ },
1767
+ {
1768
+ "epoch": 6.2105263157894735,
1769
+ "grad_norm": 1.249068260192871,
1770
+ "learning_rate": 0.00030143294638608487,
1771
+ "loss": 0.0938,
1772
+ "step": 118
1773
+ },
1774
+ {
1775
+ "epoch": 6.2105263157894735,
1776
+ "eval_loss": 0.10837359726428986,
1777
+ "eval_runtime": 0.9003,
1778
+ "eval_samples_per_second": 33.323,
1779
+ "eval_steps_per_second": 4.443,
1780
+ "step": 118
1781
+ },
1782
+ {
1783
+ "epoch": 6.2631578947368425,
1784
+ "grad_norm": 2.1446304321289062,
1785
+ "learning_rate": 0.00029815782131346137,
1786
+ "loss": 0.1436,
1787
+ "step": 119
1788
+ },
1789
+ {
1790
+ "epoch": 6.2631578947368425,
1791
+ "eval_loss": 0.1047668606042862,
1792
+ "eval_runtime": 0.9066,
1793
+ "eval_samples_per_second": 33.09,
1794
+ "eval_steps_per_second": 4.412,
1795
+ "step": 119
1796
+ },
1797
+ {
1798
+ "epoch": 6.315789473684211,
1799
+ "grad_norm": 1.329365611076355,
1800
+ "learning_rate": 0.0002948740897842223,
1801
+ "loss": 0.1319,
1802
+ "step": 120
1803
+ },
1804
+ {
1805
+ "epoch": 6.315789473684211,
1806
+ "eval_loss": 0.10011889785528183,
1807
+ "eval_runtime": 0.9034,
1808
+ "eval_samples_per_second": 33.206,
1809
+ "eval_steps_per_second": 4.428,
1810
+ "step": 120
1811
+ },
1812
+ {
1813
+ "epoch": 6.368421052631579,
1814
+ "grad_norm": 1.4938923120498657,
1815
+ "learning_rate": 0.00029158233864578256,
1816
+ "loss": 0.1027,
1817
+ "step": 121
1818
+ },
1819
+ {
1820
+ "epoch": 6.368421052631579,
1821
+ "eval_loss": 0.09962069243192673,
1822
+ "eval_runtime": 0.8983,
1823
+ "eval_samples_per_second": 33.396,
1824
+ "eval_steps_per_second": 4.453,
1825
+ "step": 121
1826
+ },
1827
+ {
1828
+ "epoch": 6.421052631578947,
1829
+ "grad_norm": 1.295058250427246,
1830
+ "learning_rate": 0.00028828315617877,
1831
+ "loss": 0.0763,
1832
+ "step": 122
1833
+ },
1834
+ {
1835
+ "epoch": 6.421052631578947,
1836
+ "eval_loss": 0.10031073540449142,
1837
+ "eval_runtime": 0.9155,
1838
+ "eval_samples_per_second": 32.768,
1839
+ "eval_steps_per_second": 4.369,
1840
+ "step": 122
1841
+ },
1842
+ {
1843
+ "epoch": 6.473684210526316,
1844
+ "grad_norm": 1.8959721326828003,
1845
+ "learning_rate": 0.0002849771319918922,
1846
+ "loss": 0.1292,
1847
+ "step": 123
1848
+ },
1849
+ {
1850
+ "epoch": 6.473684210526316,
1851
+ "eval_loss": 0.11003147065639496,
1852
+ "eval_runtime": 0.919,
1853
+ "eval_samples_per_second": 32.644,
1854
+ "eval_steps_per_second": 4.353,
1855
+ "step": 123
1856
+ },
1857
+ {
1858
+ "epoch": 6.526315789473684,
1859
+ "grad_norm": 1.3598809242248535,
1860
+ "learning_rate": 0.00028166485691656423,
1861
+ "loss": 0.1272,
1862
+ "step": 124
1863
+ },
1864
+ {
1865
+ "epoch": 6.526315789473684,
1866
+ "eval_loss": 0.10435277968645096,
1867
+ "eval_runtime": 0.8989,
1868
+ "eval_samples_per_second": 33.374,
1869
+ "eval_steps_per_second": 4.45,
1870
+ "step": 124
1871
+ },
1872
+ {
1873
+ "epoch": 6.578947368421053,
1874
+ "grad_norm": 1.4015425443649292,
1875
+ "learning_rate": 0.00027834692290132053,
1876
+ "loss": 0.1348,
1877
+ "step": 125
1878
+ },
1879
+ {
1880
+ "epoch": 6.578947368421053,
1881
+ "eval_loss": 0.10004603117704391,
1882
+ "eval_runtime": 0.9016,
1883
+ "eval_samples_per_second": 33.273,
1884
+ "eval_steps_per_second": 4.436,
1885
+ "step": 125
1886
+ },
1887
+ {
1888
+ "epoch": 6.631578947368421,
1889
+ "grad_norm": 1.1642837524414062,
1890
+ "learning_rate": 0.0002750239229060246,
1891
+ "loss": 0.1121,
1892
+ "step": 126
1893
+ },
1894
+ {
1895
+ "epoch": 6.631578947368421,
1896
+ "eval_loss": 0.09635353088378906,
1897
+ "eval_runtime": 0.9239,
1898
+ "eval_samples_per_second": 32.47,
1899
+ "eval_steps_per_second": 4.329,
1900
+ "step": 126
1901
+ },
1902
+ {
1903
+ "epoch": 6.684210526315789,
1904
+ "grad_norm": 1.363749384880066,
1905
+ "learning_rate": 0.0002716964507958994,
1906
+ "loss": 0.1415,
1907
+ "step": 127
1908
+ },
1909
+ {
1910
+ "epoch": 6.684210526315789,
1911
+ "eval_loss": 0.07641066610813141,
1912
+ "eval_runtime": 0.912,
1913
+ "eval_samples_per_second": 32.894,
1914
+ "eval_steps_per_second": 4.386,
1915
+ "step": 127
1916
+ },
1917
+ {
1918
+ "epoch": 6.7368421052631575,
1919
+ "grad_norm": 1.292934775352478,
1920
+ "learning_rate": 0.0002683651012353955,
1921
+ "loss": 0.1513,
1922
+ "step": 128
1923
+ },
1924
+ {
1925
+ "epoch": 6.7368421052631575,
1926
+ "eval_loss": 0.07172319293022156,
1927
+ "eval_runtime": 0.8976,
1928
+ "eval_samples_per_second": 33.423,
1929
+ "eval_steps_per_second": 4.456,
1930
+ "step": 128
1931
+ },
1932
+ {
1933
+ "epoch": 6.7894736842105265,
1934
+ "grad_norm": 1.0078333616256714,
1935
+ "learning_rate": 0.0002650304695819168,
1936
+ "loss": 0.1185,
1937
+ "step": 129
1938
+ },
1939
+ {
1940
+ "epoch": 6.7894736842105265,
1941
+ "eval_loss": 0.06412829458713531,
1942
+ "eval_runtime": 0.8976,
1943
+ "eval_samples_per_second": 33.424,
1944
+ "eval_steps_per_second": 4.456,
1945
+ "step": 129
1946
+ },
1947
+ {
1948
+ "epoch": 6.842105263157895,
1949
+ "grad_norm": 1.173531413078308,
1950
+ "learning_rate": 0.00026169315177942135,
1951
+ "loss": 0.158,
1952
+ "step": 130
1953
+ },
1954
+ {
1955
+ "epoch": 6.842105263157895,
1956
+ "eval_loss": 0.05317940190434456,
1957
+ "eval_runtime": 0.895,
1958
+ "eval_samples_per_second": 33.52,
1959
+ "eval_steps_per_second": 4.469,
1960
+ "step": 130
1961
+ },
1962
+ {
1963
+ "epoch": 6.894736842105263,
1964
+ "grad_norm": 1.5312238931655884,
1965
+ "learning_rate": 0.0002583537442519187,
1966
+ "loss": 0.1335,
1967
+ "step": 131
1968
+ },
1969
+ {
1970
+ "epoch": 6.894736842105263,
1971
+ "eval_loss": 0.04491396248340607,
1972
+ "eval_runtime": 0.9089,
1973
+ "eval_samples_per_second": 33.006,
1974
+ "eval_steps_per_second": 4.401,
1975
+ "step": 131
1976
+ },
1977
+ {
1978
+ "epoch": 6.947368421052632,
1979
+ "grad_norm": 1.399732232093811,
1980
+ "learning_rate": 0.00025501284379688067,
1981
+ "loss": 0.1462,
1982
+ "step": 132
1983
+ },
1984
+ {
1985
+ "epoch": 6.947368421052632,
1986
+ "eval_loss": 0.050081584602594376,
1987
+ "eval_runtime": 0.9127,
1988
+ "eval_samples_per_second": 32.871,
1989
+ "eval_steps_per_second": 4.383,
1990
+ "step": 132
1991
+ },
1992
+ {
1993
+ "epoch": 7.0,
1994
+ "grad_norm": 1.5357416868209839,
1995
+ "learning_rate": 0.0002516710474785856,
1996
+ "loss": 0.1136,
1997
+ "step": 133
1998
+ },
1999
+ {
2000
+ "epoch": 7.0,
2001
+ "eval_loss": 0.05439286679029465,
2002
+ "eval_runtime": 0.8974,
2003
+ "eval_samples_per_second": 33.429,
2004
+ "eval_steps_per_second": 4.457,
2005
+ "step": 133
2006
+ },
2007
+ {
2008
+ "epoch": 7.052631578947368,
2009
+ "grad_norm": 0.9891072511672974,
2010
+ "learning_rate": 0.0002483289525214145,
2011
+ "loss": 0.0741,
2012
+ "step": 134
2013
+ },
2014
+ {
2015
+ "epoch": 7.052631578947368,
2016
+ "eval_loss": 0.05073266103863716,
2017
+ "eval_runtime": 0.8953,
2018
+ "eval_samples_per_second": 33.508,
2019
+ "eval_steps_per_second": 4.468,
2020
+ "step": 134
2021
+ },
2022
+ {
2023
+ "epoch": 7.105263157894737,
2024
+ "grad_norm": 0.9686666131019592,
2025
+ "learning_rate": 0.00024498715620311935,
2026
+ "loss": 0.0518,
2027
+ "step": 135
2028
+ },
2029
+ {
2030
+ "epoch": 7.105263157894737,
2031
+ "eval_loss": 0.05563385412096977,
2032
+ "eval_runtime": 0.9008,
2033
+ "eval_samples_per_second": 33.305,
2034
+ "eval_steps_per_second": 4.441,
2035
+ "step": 135
2036
+ },
2037
+ {
2038
+ "epoch": 7.157894736842105,
2039
+ "grad_norm": 1.2277772426605225,
2040
+ "learning_rate": 0.00024164625574808144,
2041
+ "loss": 0.0436,
2042
+ "step": 136
2043
+ },
2044
+ {
2045
+ "epoch": 7.157894736842105,
2046
+ "eval_loss": 0.058188486844301224,
2047
+ "eval_runtime": 0.9014,
2048
+ "eval_samples_per_second": 33.283,
2049
+ "eval_steps_per_second": 4.438,
2050
+ "step": 136
2051
+ },
2052
+ {
2053
+ "epoch": 7.2105263157894735,
2054
+ "grad_norm": 1.3704907894134521,
2055
+ "learning_rate": 0.00023830684822057877,
2056
+ "loss": 0.1041,
2057
+ "step": 137
2058
+ },
2059
+ {
2060
+ "epoch": 7.2105263157894735,
2061
+ "eval_loss": 0.06476210802793503,
2062
+ "eval_runtime": 0.9007,
2063
+ "eval_samples_per_second": 33.306,
2064
+ "eval_steps_per_second": 4.441,
2065
+ "step": 137
2066
+ },
2067
+ {
2068
+ "epoch": 7.2631578947368425,
2069
+ "grad_norm": 16.16583251953125,
2070
+ "learning_rate": 0.00023496953041808325,
2071
+ "loss": 0.0492,
2072
+ "step": 138
2073
+ },
2074
+ {
2075
+ "epoch": 7.2631578947368425,
2076
+ "eval_loss": 0.07608657330274582,
2077
+ "eval_runtime": 0.8935,
2078
+ "eval_samples_per_second": 33.576,
2079
+ "eval_steps_per_second": 4.477,
2080
+ "step": 138
2081
+ },
2082
+ {
2083
+ "epoch": 7.315789473684211,
2084
+ "grad_norm": 1.200278639793396,
2085
+ "learning_rate": 0.0002316348987646045,
2086
+ "loss": 0.0716,
2087
+ "step": 139
2088
+ },
2089
+ {
2090
+ "epoch": 7.315789473684211,
2091
+ "eval_loss": 0.07832919806241989,
2092
+ "eval_runtime": 0.8933,
2093
+ "eval_samples_per_second": 33.584,
2094
+ "eval_steps_per_second": 4.478,
2095
+ "step": 139
2096
+ },
2097
+ {
2098
+ "epoch": 7.368421052631579,
2099
+ "grad_norm": 1.10837984085083,
2100
+ "learning_rate": 0.00022830354920410064,
2101
+ "loss": 0.0867,
2102
+ "step": 140
2103
+ },
2104
+ {
2105
+ "epoch": 7.368421052631579,
2106
+ "eval_loss": 0.06757114827632904,
2107
+ "eval_runtime": 0.8916,
2108
+ "eval_samples_per_second": 33.649,
2109
+ "eval_steps_per_second": 4.487,
2110
+ "step": 140
2111
+ },
2112
+ {
2113
+ "epoch": 7.421052631578947,
2114
+ "grad_norm": 0.8154372572898865,
2115
+ "learning_rate": 0.0002249760770939754,
2116
+ "loss": 0.0596,
2117
+ "step": 141
2118
+ },
2119
+ {
2120
+ "epoch": 7.421052631578947,
2121
+ "eval_loss": 0.06439080089330673,
2122
+ "eval_runtime": 0.8926,
2123
+ "eval_samples_per_second": 33.61,
2124
+ "eval_steps_per_second": 4.481,
2125
+ "step": 141
2126
+ },
2127
+ {
2128
+ "epoch": 7.473684210526316,
2129
+ "grad_norm": 1.3668967485427856,
2130
+ "learning_rate": 0.0002216530770986795,
2131
+ "loss": 0.0742,
2132
+ "step": 142
2133
+ },
2134
+ {
2135
+ "epoch": 7.473684210526316,
2136
+ "eval_loss": 0.05956079065799713,
2137
+ "eval_runtime": 0.9129,
2138
+ "eval_samples_per_second": 32.861,
2139
+ "eval_steps_per_second": 4.381,
2140
+ "step": 142
2141
+ },
2142
+ {
2143
+ "epoch": 7.526315789473684,
2144
+ "grad_norm": 1.1893479824066162,
2145
+ "learning_rate": 0.0002183351430834358,
2146
+ "loss": 0.0885,
2147
+ "step": 143
2148
+ },
2149
+ {
2150
+ "epoch": 7.526315789473684,
2151
+ "eval_loss": 0.05919176712632179,
2152
+ "eval_runtime": 0.9013,
2153
+ "eval_samples_per_second": 33.285,
2154
+ "eval_steps_per_second": 4.438,
2155
+ "step": 143
2156
+ },
2157
+ {
2158
+ "epoch": 7.578947368421053,
2159
+ "grad_norm": 0.9393155574798584,
2160
+ "learning_rate": 0.0002150228680081079,
2161
+ "loss": 0.069,
2162
+ "step": 144
2163
+ },
2164
+ {
2165
+ "epoch": 7.578947368421053,
2166
+ "eval_loss": 0.055469710379838943,
2167
+ "eval_runtime": 0.8928,
2168
+ "eval_samples_per_second": 33.603,
2169
+ "eval_steps_per_second": 4.48,
2170
+ "step": 144
2171
+ },
2172
+ {
2173
+ "epoch": 7.631578947368421,
2174
+ "grad_norm": 1.1879485845565796,
2175
+ "learning_rate": 0.00021171684382123,
2176
+ "loss": 0.0636,
2177
+ "step": 145
2178
+ },
2179
+ {
2180
+ "epoch": 7.631578947368421,
2181
+ "eval_loss": 0.048830099403858185,
2182
+ "eval_runtime": 0.8962,
2183
+ "eval_samples_per_second": 33.476,
2184
+ "eval_steps_per_second": 4.463,
2185
+ "step": 145
2186
+ },
2187
+ {
2188
+ "epoch": 7.684210526315789,
2189
+ "grad_norm": 1.3696624040603638,
2190
+ "learning_rate": 0.0002084176613542175,
2191
+ "loss": 0.0769,
2192
+ "step": 146
2193
+ },
2194
+ {
2195
+ "epoch": 7.684210526315789,
2196
+ "eval_loss": 0.04780884087085724,
2197
+ "eval_runtime": 0.8944,
2198
+ "eval_samples_per_second": 33.543,
2199
+ "eval_steps_per_second": 4.472,
2200
+ "step": 146
2201
+ },
2202
+ {
2203
+ "epoch": 7.7368421052631575,
2204
+ "grad_norm": 0.8504798412322998,
2205
+ "learning_rate": 0.00020512591021577773,
2206
+ "loss": 0.0452,
2207
+ "step": 147
2208
+ },
2209
+ {
2210
+ "epoch": 7.7368421052631575,
2211
+ "eval_loss": 0.05237739533185959,
2212
+ "eval_runtime": 0.8936,
2213
+ "eval_samples_per_second": 33.572,
2214
+ "eval_steps_per_second": 4.476,
2215
+ "step": 147
2216
+ },
2217
+ {
2218
+ "epoch": 7.7894736842105265,
2219
+ "grad_norm": 1.4475505352020264,
2220
+ "learning_rate": 0.00020184217868653867,
2221
+ "loss": 0.0855,
2222
+ "step": 148
2223
+ },
2224
+ {
2225
+ "epoch": 7.7894736842105265,
2226
+ "eval_loss": 0.04543802887201309,
2227
+ "eval_runtime": 0.896,
2228
+ "eval_samples_per_second": 33.484,
2229
+ "eval_steps_per_second": 4.465,
2230
+ "step": 148
2231
+ },
2232
+ {
2233
+ "epoch": 7.842105263157895,
2234
+ "grad_norm": 1.5789515972137451,
2235
+ "learning_rate": 0.0001985670536139151,
2236
+ "loss": 0.0874,
2237
+ "step": 149
2238
+ },
2239
+ {
2240
+ "epoch": 7.842105263157895,
2241
+ "eval_loss": 0.0420089028775692,
2242
+ "eval_runtime": 0.9085,
2243
+ "eval_samples_per_second": 33.022,
2244
+ "eval_steps_per_second": 4.403,
2245
+ "step": 149
2246
+ },
2247
+ {
2248
+ "epoch": 7.894736842105263,
2249
+ "grad_norm": 0.9716910719871521,
2250
+ "learning_rate": 0.0001953011203072312,
2251
+ "loss": 0.0741,
2252
+ "step": 150
2253
+ },
2254
+ {
2255
+ "epoch": 7.894736842105263,
2256
+ "eval_loss": 0.053930822759866714,
2257
+ "eval_runtime": 0.8925,
2258
+ "eval_samples_per_second": 33.612,
2259
+ "eval_steps_per_second": 4.482,
2260
+ "step": 150
2261
+ },
2262
+ {
2263
+ "epoch": 7.947368421052632,
2264
+ "grad_norm": 1.258216142654419,
2265
+ "learning_rate": 0.00019204496243311792,
2266
+ "loss": 0.0988,
2267
+ "step": 151
2268
+ },
2269
+ {
2270
+ "epoch": 7.947368421052632,
2271
+ "eval_loss": 0.050727710127830505,
2272
+ "eval_runtime": 0.9129,
2273
+ "eval_samples_per_second": 32.861,
2274
+ "eval_steps_per_second": 4.381,
2275
+ "step": 151
2276
+ },
2277
+ {
2278
+ "epoch": 8.0,
2279
+ "grad_norm": 1.6167078018188477,
2280
+ "learning_rate": 0.00018879916191120349,
2281
+ "loss": 0.1526,
2282
+ "step": 152
2283
+ },
2284
+ {
2285
+ "epoch": 8.0,
2286
+ "eval_loss": 0.044940169900655746,
2287
+ "eval_runtime": 0.9149,
2288
+ "eval_samples_per_second": 32.792,
2289
+ "eval_steps_per_second": 4.372,
2290
+ "step": 152
2291
+ },
2292
+ {
2293
+ "epoch": 8.052631578947368,
2294
+ "grad_norm": 0.7703630328178406,
2295
+ "learning_rate": 0.00018556429881011656,
2296
+ "loss": 0.029,
2297
+ "step": 153
2298
+ },
2299
+ {
2300
+ "epoch": 8.052631578947368,
2301
+ "eval_loss": 0.04256557673215866,
2302
+ "eval_runtime": 0.8993,
2303
+ "eval_samples_per_second": 33.359,
2304
+ "eval_steps_per_second": 4.448,
2305
+ "step": 153
2306
+ },
2307
+ {
2308
+ "epoch": 8.105263157894736,
2309
+ "grad_norm": 0.7948728799819946,
2310
+ "learning_rate": 0.0001823409512438203,
2311
+ "loss": 0.0294,
2312
+ "step": 154
2313
+ },
2314
+ {
2315
+ "epoch": 8.105263157894736,
2316
+ "eval_loss": 0.0398668609559536,
2317
+ "eval_runtime": 0.9221,
2318
+ "eval_samples_per_second": 32.536,
2319
+ "eval_steps_per_second": 4.338,
2320
+ "step": 154
2321
+ },
2322
+ {
2323
+ "epoch": 8.157894736842104,
2324
+ "grad_norm": 0.5918542146682739,
2325
+ "learning_rate": 0.00017912969526829559,
2326
+ "loss": 0.0219,
2327
+ "step": 155
2328
+ },
2329
+ {
2330
+ "epoch": 8.157894736842104,
2331
+ "eval_loss": 0.03863578289747238,
2332
+ "eval_runtime": 0.8936,
2333
+ "eval_samples_per_second": 33.573,
2334
+ "eval_steps_per_second": 4.476,
2335
+ "step": 155
2336
+ },
2337
+ {
2338
+ "epoch": 8.210526315789474,
2339
+ "grad_norm": 0.5533296465873718,
2340
+ "learning_rate": 0.00017593110477859153,
2341
+ "loss": 0.0238,
2342
+ "step": 156
2343
+ },
2344
+ {
2345
+ "epoch": 8.210526315789474,
2346
+ "eval_loss": 0.03713521733880043,
2347
+ "eval_runtime": 0.8943,
2348
+ "eval_samples_per_second": 33.547,
2349
+ "eval_steps_per_second": 4.473,
2350
+ "step": 156
2351
+ },
2352
+ {
2353
+ "epoch": 8.263157894736842,
2354
+ "grad_norm": 0.5387775897979736,
2355
+ "learning_rate": 0.00017274575140626317,
2356
+ "loss": 0.0332,
2357
+ "step": 157
2358
+ },
2359
+ {
2360
+ "epoch": 8.263157894736842,
2361
+ "eval_loss": 0.0393383763730526,
2362
+ "eval_runtime": 0.8986,
2363
+ "eval_samples_per_second": 33.384,
2364
+ "eval_steps_per_second": 4.451,
2365
+ "step": 157
2366
+ },
2367
+ {
2368
+ "epoch": 8.31578947368421,
2369
+ "grad_norm": 2.716648578643799,
2370
+ "learning_rate": 0.00016957420441721284,
2371
+ "loss": 0.0508,
2372
+ "step": 158
2373
+ },
2374
+ {
2375
+ "epoch": 8.31578947368421,
2376
+ "eval_loss": 0.039231013506650925,
2377
+ "eval_runtime": 0.911,
2378
+ "eval_samples_per_second": 32.932,
2379
+ "eval_steps_per_second": 4.391,
2380
+ "step": 158
2381
+ },
2382
+ {
2383
+ "epoch": 8.368421052631579,
2384
+ "grad_norm": 0.6262527704238892,
2385
+ "learning_rate": 0.00016641703060995457,
2386
+ "loss": 0.0376,
2387
+ "step": 159
2388
+ },
2389
+ {
2390
+ "epoch": 8.368421052631579,
2391
+ "eval_loss": 0.03573182597756386,
2392
+ "eval_runtime": 0.9178,
2393
+ "eval_samples_per_second": 32.685,
2394
+ "eval_steps_per_second": 4.358,
2395
+ "step": 159
2396
+ },
2397
+ {
2398
+ "epoch": 8.421052631578947,
2399
+ "grad_norm": 1.1656262874603271,
2400
+ "learning_rate": 0.00016327479421431983,
2401
+ "loss": 0.0613,
2402
+ "step": 160
2403
+ },
2404
+ {
2405
+ "epoch": 8.421052631578947,
2406
+ "eval_loss": 0.029768355190753937,
2407
+ "eval_runtime": 0.8926,
2408
+ "eval_samples_per_second": 33.61,
2409
+ "eval_steps_per_second": 4.481,
2410
+ "step": 160
2411
+ },
2412
+ {
2413
+ "epoch": 8.473684210526315,
2414
+ "grad_norm": 0.9731020927429199,
2415
+ "learning_rate": 0.00016014805679062183,
2416
+ "loss": 0.0755,
2417
+ "step": 161
2418
+ },
2419
+ {
2420
+ "epoch": 8.473684210526315,
2421
+ "eval_loss": 0.022336162626743317,
2422
+ "eval_runtime": 0.9064,
2423
+ "eval_samples_per_second": 33.097,
2424
+ "eval_steps_per_second": 4.413,
2425
+ "step": 161
2426
+ },
2427
+ {
2428
+ "epoch": 8.526315789473685,
2429
+ "grad_norm": 0.9505934119224548,
2430
+ "learning_rate": 0.0001570373771292967,
2431
+ "loss": 0.0592,
2432
+ "step": 162
2433
+ },
2434
+ {
2435
+ "epoch": 8.526315789473685,
2436
+ "eval_loss": 0.019842755049467087,
2437
+ "eval_runtime": 0.9099,
2438
+ "eval_samples_per_second": 32.97,
2439
+ "eval_steps_per_second": 4.396,
2440
+ "step": 162
2441
+ },
2442
+ {
2443
+ "epoch": 8.578947368421053,
2444
+ "grad_norm": 0.709037184715271,
2445
+ "learning_rate": 0.00015394331115104075,
2446
+ "loss": 0.0386,
2447
+ "step": 163
2448
+ },
2449
+ {
2450
+ "epoch": 8.578947368421053,
2451
+ "eval_loss": 0.019390322268009186,
2452
+ "eval_runtime": 0.8955,
2453
+ "eval_samples_per_second": 33.501,
2454
+ "eval_steps_per_second": 4.467,
2455
+ "step": 163
2456
+ },
2457
+ {
2458
+ "epoch": 8.631578947368421,
2459
+ "grad_norm": 1.0144383907318115,
2460
+ "learning_rate": 0.00015086641180745932,
2461
+ "loss": 0.0392,
2462
+ "step": 164
2463
+ },
2464
+ {
2465
+ "epoch": 8.631578947368421,
2466
+ "eval_loss": 0.018627820536494255,
2467
+ "eval_runtime": 0.8942,
2468
+ "eval_samples_per_second": 33.549,
2469
+ "eval_steps_per_second": 4.473,
2470
+ "step": 164
2471
+ },
2472
+ {
2473
+ "epoch": 8.68421052631579,
2474
+ "grad_norm": 1.3723385334014893,
2475
+ "learning_rate": 0.00014780722898224708,
2476
+ "loss": 0.0286,
2477
+ "step": 165
2478
+ },
2479
+ {
2480
+ "epoch": 8.68421052631579,
2481
+ "eval_loss": 0.015356449410319328,
2482
+ "eval_runtime": 0.8932,
2483
+ "eval_samples_per_second": 33.586,
2484
+ "eval_steps_per_second": 4.478,
2485
+ "step": 165
2486
+ },
2487
+ {
2488
+ "epoch": 8.736842105263158,
2489
+ "grad_norm": 0.6372384428977966,
2490
+ "learning_rate": 0.0001447663093929163,
2491
+ "loss": 0.0425,
2492
+ "step": 166
2493
+ },
2494
+ {
2495
+ "epoch": 8.736842105263158,
2496
+ "eval_loss": 0.015127343125641346,
2497
+ "eval_runtime": 0.9004,
2498
+ "eval_samples_per_second": 33.319,
2499
+ "eval_steps_per_second": 4.443,
2500
+ "step": 166
2501
+ },
2502
+ {
2503
+ "epoch": 8.789473684210526,
2504
+ "grad_norm": 0.7628927826881409,
2505
+ "learning_rate": 0.00014174419649309089,
2506
+ "loss": 0.0218,
2507
+ "step": 167
2508
+ },
2509
+ {
2510
+ "epoch": 8.789473684210526,
2511
+ "eval_loss": 0.015446596778929234,
2512
+ "eval_runtime": 0.8932,
2513
+ "eval_samples_per_second": 33.586,
2514
+ "eval_steps_per_second": 4.478,
2515
+ "step": 167
2516
+ },
2517
+ {
2518
+ "epoch": 8.842105263157894,
2519
+ "grad_norm": 0.7694376111030579,
2520
+ "learning_rate": 0.00013874143037538418,
2521
+ "loss": 0.0251,
2522
+ "step": 168
2523
+ },
2524
+ {
2525
+ "epoch": 8.842105263157894,
2526
+ "eval_loss": 0.01555707585066557,
2527
+ "eval_runtime": 0.9268,
2528
+ "eval_samples_per_second": 32.368,
2529
+ "eval_steps_per_second": 4.316,
2530
+ "step": 168
2531
+ },
2532
+ {
2533
+ "epoch": 8.894736842105264,
2534
+ "grad_norm": 0.7292389869689941,
2535
+ "learning_rate": 0.0001357585476748766,
2536
+ "loss": 0.0345,
2537
+ "step": 169
2538
+ },
2539
+ {
2540
+ "epoch": 8.894736842105264,
2541
+ "eval_loss": 0.014117183163762093,
2542
+ "eval_runtime": 0.8989,
2543
+ "eval_samples_per_second": 33.374,
2544
+ "eval_steps_per_second": 4.45,
2545
+ "step": 169
2546
+ },
2547
+ {
2548
+ "epoch": 8.947368421052632,
2549
+ "grad_norm": 0.7417434453964233,
2550
+ "learning_rate": 0.00013279608147321223,
2551
+ "loss": 0.0355,
2552
+ "step": 170
2553
+ },
2554
+ {
2555
+ "epoch": 8.947368421052632,
2556
+ "eval_loss": 0.01502351462841034,
2557
+ "eval_runtime": 0.8919,
2558
+ "eval_samples_per_second": 33.634,
2559
+ "eval_steps_per_second": 4.485,
2560
+ "step": 170
2561
+ },
2562
+ {
2563
+ "epoch": 9.0,
2564
+ "grad_norm": 1.0023473501205444,
2565
+ "learning_rate": 0.00012985456120332905,
2566
+ "loss": 0.0463,
2567
+ "step": 171
2568
+ },
2569
+ {
2570
+ "epoch": 9.0,
2571
+ "eval_loss": 0.015432776883244514,
2572
+ "eval_runtime": 0.8928,
2573
+ "eval_samples_per_second": 33.604,
2574
+ "eval_steps_per_second": 4.481,
2575
+ "step": 171
2576
+ },
2577
+ {
2578
+ "epoch": 9.052631578947368,
2579
+ "grad_norm": 0.9472024440765381,
2580
+ "learning_rate": 0.00012693451255484312,
2581
+ "loss": 0.0164,
2582
+ "step": 172
2583
+ },
2584
+ {
2585
+ "epoch": 9.052631578947368,
2586
+ "eval_loss": 0.015417199581861496,
2587
+ "eval_runtime": 0.9209,
2588
+ "eval_samples_per_second": 32.576,
2589
+ "eval_steps_per_second": 4.344,
2590
+ "step": 172
2591
+ },
2592
+ {
2593
+ "epoch": 9.105263157894736,
2594
+ "grad_norm": 0.48799633979797363,
2595
+ "learning_rate": 0.00012403645738009997,
2596
+ "loss": 0.0112,
2597
+ "step": 173
2598
+ },
2599
+ {
2600
+ "epoch": 9.105263157894736,
2601
+ "eval_loss": 0.015746938064694405,
2602
+ "eval_runtime": 0.903,
2603
+ "eval_samples_per_second": 33.221,
2604
+ "eval_steps_per_second": 4.43,
2605
+ "step": 173
2606
+ },
2607
+ {
2608
+ "epoch": 9.157894736842104,
2609
+ "grad_norm": 0.38101622462272644,
2610
+ "learning_rate": 0.00012116091360091261,
2611
+ "loss": 0.0107,
2612
+ "step": 174
2613
+ },
2614
+ {
2615
+ "epoch": 9.157894736842104,
2616
+ "eval_loss": 0.016201062127947807,
2617
+ "eval_runtime": 0.9098,
2618
+ "eval_samples_per_second": 32.973,
2619
+ "eval_steps_per_second": 4.396,
2620
+ "step": 174
2621
+ },
2622
+ {
2623
+ "epoch": 9.210526315789474,
2624
+ "grad_norm": 0.5602852702140808,
2625
+ "learning_rate": 0.00011830839511600211,
2626
+ "loss": 0.0214,
2627
+ "step": 175
2628
+ },
2629
+ {
2630
+ "epoch": 9.210526315789474,
2631
+ "eval_loss": 0.01637989468872547,
2632
+ "eval_runtime": 0.8978,
2633
+ "eval_samples_per_second": 33.416,
2634
+ "eval_steps_per_second": 4.455,
2635
+ "step": 175
2636
+ },
2637
+ {
2638
+ "epoch": 9.263157894736842,
2639
+ "grad_norm": 0.48026910424232483,
2640
+ "learning_rate": 0.00011547941170915685,
2641
+ "loss": 0.0159,
2642
+ "step": 176
2643
+ },
2644
+ {
2645
+ "epoch": 9.263157894736842,
2646
+ "eval_loss": 0.01590169034898281,
2647
+ "eval_runtime": 0.8929,
2648
+ "eval_samples_per_second": 33.599,
2649
+ "eval_steps_per_second": 4.48,
2650
+ "step": 176
2651
+ },
2652
+ {
2653
+ "epoch": 9.31578947368421,
2654
+ "grad_norm": 0.42628395557403564,
2655
+ "learning_rate": 0.00011267446895812702,
2656
+ "loss": 0.0103,
2657
+ "step": 177
2658
+ },
2659
+ {
2660
+ "epoch": 9.31578947368421,
2661
+ "eval_loss": 0.016489733010530472,
2662
+ "eval_runtime": 0.9067,
2663
+ "eval_samples_per_second": 33.087,
2664
+ "eval_steps_per_second": 4.412,
2665
+ "step": 177
2666
+ },
2667
+ {
2668
+ "epoch": 9.368421052631579,
2669
+ "grad_norm": 0.31815841794013977,
2670
+ "learning_rate": 0.0001098940681442713,
2671
+ "loss": 0.0127,
2672
+ "step": 178
2673
+ },
2674
+ {
2675
+ "epoch": 9.368421052631579,
2676
+ "eval_loss": 0.016672790050506592,
2677
+ "eval_runtime": 0.9121,
2678
+ "eval_samples_per_second": 32.892,
2679
+ "eval_steps_per_second": 4.386,
2680
+ "step": 178
2681
+ },
2682
+ {
2683
+ "epoch": 9.421052631578947,
2684
+ "grad_norm": 0.9146761894226074,
2685
+ "learning_rate": 0.00010713870616297092,
2686
+ "loss": 0.0272,
2687
+ "step": 179
2688
+ },
2689
+ {
2690
+ "epoch": 9.421052631578947,
2691
+ "eval_loss": 0.016623031347990036,
2692
+ "eval_runtime": 0.8922,
2693
+ "eval_samples_per_second": 33.624,
2694
+ "eval_steps_per_second": 4.483,
2695
+ "step": 179
2696
+ },
2697
+ {
2698
+ "epoch": 9.473684210526315,
2699
+ "grad_norm": 0.7005583643913269,
2700
+ "learning_rate": 0.00010440887543482746,
2701
+ "loss": 0.0316,
2702
+ "step": 180
2703
+ },
2704
+ {
2705
+ "epoch": 9.473684210526315,
2706
+ "eval_loss": 0.01275827456265688,
2707
+ "eval_runtime": 0.8927,
2708
+ "eval_samples_per_second": 33.606,
2709
+ "eval_steps_per_second": 4.481,
2710
+ "step": 180
2711
+ },
2712
+ {
2713
+ "epoch": 9.526315789473685,
2714
+ "grad_norm": 0.650211751461029,
2715
+ "learning_rate": 0.0001017050638176612,
2716
+ "loss": 0.026,
2717
+ "step": 181
2718
+ },
2719
+ {
2720
+ "epoch": 9.526315789473685,
2721
+ "eval_loss": 0.00972173921763897,
2722
+ "eval_runtime": 0.8929,
2723
+ "eval_samples_per_second": 33.597,
2724
+ "eval_steps_per_second": 4.48,
2725
+ "step": 181
2726
+ },
2727
+ {
2728
+ "epoch": 9.578947368421053,
2729
+ "grad_norm": 0.6491077542304993,
2730
+ "learning_rate": 9.902775451932386e-05,
2731
+ "loss": 0.0118,
2732
+ "step": 182
2733
+ },
2734
+ {
2735
+ "epoch": 9.578947368421053,
2736
+ "eval_loss": 0.008100698702037334,
2737
+ "eval_runtime": 0.8982,
2738
+ "eval_samples_per_second": 33.399,
2739
+ "eval_steps_per_second": 4.453,
2740
+ "step": 182
2741
+ },
2742
+ {
2743
+ "epoch": 9.631578947368421,
2744
+ "grad_norm": 0.5663555264472961,
2745
+ "learning_rate": 9.637742601134286e-05,
2746
+ "loss": 0.0179,
2747
+ "step": 183
2748
+ },
2749
+ {
2750
+ "epoch": 9.631578947368421,
2751
+ "eval_loss": 0.007195114623755217,
2752
+ "eval_runtime": 0.8943,
2753
+ "eval_samples_per_second": 33.547,
2754
+ "eval_steps_per_second": 4.473,
2755
+ "step": 183
2756
+ },
2757
+ {
2758
+ "epoch": 9.68421052631579,
2759
+ "grad_norm": 0.45350518822669983,
2760
+ "learning_rate": 9.375455194341214e-05,
2761
+ "loss": 0.0133,
2762
+ "step": 184
2763
+ },
2764
+ {
2765
+ "epoch": 9.68421052631579,
2766
+ "eval_loss": 0.005673492327332497,
2767
+ "eval_runtime": 0.8957,
2768
+ "eval_samples_per_second": 33.493,
2769
+ "eval_steps_per_second": 4.466,
2770
+ "step": 184
2771
+ },
2772
+ {
2773
+ "epoch": 9.736842105263158,
2774
+ "grad_norm": 0.4562082886695862,
2775
+ "learning_rate": 9.11596010587441e-05,
2776
+ "loss": 0.0116,
2777
+ "step": 185
2778
+ },
2779
+ {
2780
+ "epoch": 9.736842105263158,
2781
+ "eval_loss": 0.005512699484825134,
2782
+ "eval_runtime": 0.8942,
2783
+ "eval_samples_per_second": 33.551,
2784
+ "eval_steps_per_second": 4.473,
2785
+ "step": 185
2786
+ },
2787
+ {
2788
+ "epoch": 9.789473684210526,
2789
+ "grad_norm": 0.4965287446975708,
2790
+ "learning_rate": 8.85930371102994e-05,
2791
+ "loss": 0.0175,
2792
+ "step": 186
2793
+ },
2794
+ {
2795
+ "epoch": 9.789473684210526,
2796
+ "eval_loss": 0.005058939103037119,
2797
+ "eval_runtime": 0.8924,
2798
+ "eval_samples_per_second": 33.619,
2799
+ "eval_steps_per_second": 4.482,
2800
+ "step": 186
2801
+ },
2802
+ {
2803
+ "epoch": 9.842105263157894,
2804
+ "grad_norm": 0.4823167324066162,
2805
+ "learning_rate": 8.605531877790762e-05,
2806
+ "loss": 0.0156,
2807
+ "step": 187
2808
+ },
2809
+ {
2810
+ "epoch": 9.842105263157894,
2811
+ "eval_loss": 0.004006177186965942,
2812
+ "eval_runtime": 0.8937,
2813
+ "eval_samples_per_second": 33.568,
2814
+ "eval_steps_per_second": 4.476,
2815
+ "step": 187
2816
+ },
2817
+ {
2818
+ "epoch": 9.894736842105264,
2819
+ "grad_norm": 0.5879040360450745,
2820
+ "learning_rate": 8.354689958629513e-05,
2821
+ "loss": 0.0147,
2822
+ "step": 188
2823
+ },
2824
+ {
2825
+ "epoch": 9.894736842105264,
2826
+ "eval_loss": 0.003014415269717574,
2827
+ "eval_runtime": 0.8965,
2828
+ "eval_samples_per_second": 33.465,
2829
+ "eval_steps_per_second": 4.462,
2830
+ "step": 188
2831
+ },
2832
+ {
2833
+ "epoch": 9.947368421052632,
2834
+ "grad_norm": 0.4576377868652344,
2835
+ "learning_rate": 8.106822782403376e-05,
2836
+ "loss": 0.0095,
2837
+ "step": 189
2838
+ },
2839
+ {
2840
+ "epoch": 9.947368421052632,
2841
+ "eval_loss": 0.002746094949543476,
2842
+ "eval_runtime": 0.8942,
2843
+ "eval_samples_per_second": 33.55,
2844
+ "eval_steps_per_second": 4.473,
2845
+ "step": 189
2846
+ },
2847
+ {
2848
+ "epoch": 10.0,
2849
+ "grad_norm": 0.3874748647212982,
2850
+ "learning_rate": 7.861974646342596e-05,
2851
+ "loss": 0.0065,
2852
+ "step": 190
2853
+ },
2854
+ {
2855
+ "epoch": 10.0,
2856
+ "eval_loss": 0.0022730662021785975,
2857
+ "eval_runtime": 0.8918,
2858
+ "eval_samples_per_second": 33.639,
2859
+ "eval_steps_per_second": 4.485,
2860
+ "step": 190
2861
+ },
2862
+ {
2863
+ "epoch": 10.052631578947368,
2864
+ "grad_norm": 0.19529208540916443,
2865
+ "learning_rate": 7.620189308133943e-05,
2866
+ "loss": 0.0038,
2867
+ "step": 191
2868
+ },
2869
+ {
2870
+ "epoch": 10.052631578947368,
2871
+ "eval_loss": 0.0020791899878531694,
2872
+ "eval_runtime": 0.8957,
2873
+ "eval_samples_per_second": 33.495,
2874
+ "eval_steps_per_second": 4.466,
2875
+ "step": 191
2876
+ },
2877
+ {
2878
+ "epoch": 10.105263157894736,
2879
+ "grad_norm": 0.11527393758296967,
2880
+ "learning_rate": 7.381509978100626e-05,
2881
+ "loss": 0.0022,
2882
+ "step": 192
2883
+ },
2884
+ {
2885
+ "epoch": 10.105263157894736,
2886
+ "eval_loss": 0.002016394166275859,
2887
+ "eval_runtime": 0.8996,
2888
+ "eval_samples_per_second": 33.347,
2889
+ "eval_steps_per_second": 4.446,
2890
+ "step": 192
2891
+ },
2892
+ {
2893
+ "epoch": 10.157894736842104,
2894
+ "grad_norm": 0.15251131355762482,
2895
+ "learning_rate": 7.145979311479986e-05,
2896
+ "loss": 0.003,
2897
+ "step": 193
2898
+ },
2899
+ {
2900
+ "epoch": 10.157894736842104,
2901
+ "eval_loss": 0.0021317771170288324,
2902
+ "eval_runtime": 0.8932,
2903
+ "eval_samples_per_second": 33.585,
2904
+ "eval_steps_per_second": 4.478,
2905
+ "step": 193
2906
+ },
2907
+ {
2908
+ "epoch": 10.210526315789474,
2909
+ "grad_norm": 0.16482071578502655,
2910
+ "learning_rate": 6.913639400800489e-05,
2911
+ "loss": 0.0024,
2912
+ "step": 194
2913
+ },
2914
+ {
2915
+ "epoch": 10.210526315789474,
2916
+ "eval_loss": 0.0021966167259961367,
2917
+ "eval_runtime": 0.8945,
2918
+ "eval_samples_per_second": 33.537,
2919
+ "eval_steps_per_second": 4.472,
2920
+ "step": 194
2921
+ },
2922
+ {
2923
+ "epoch": 10.263157894736842,
2924
+ "grad_norm": 0.14208117127418518,
2925
+ "learning_rate": 6.684531768359173e-05,
2926
+ "loss": 0.002,
2927
+ "step": 195
2928
+ },
2929
+ {
2930
+ "epoch": 10.263157894736842,
2931
+ "eval_loss": 0.0022034423891454935,
2932
+ "eval_runtime": 0.8952,
2933
+ "eval_samples_per_second": 33.511,
2934
+ "eval_steps_per_second": 4.468,
2935
+ "step": 195
2936
+ },
2937
+ {
2938
+ "epoch": 10.31578947368421,
2939
+ "grad_norm": 0.11844911426305771,
2940
+ "learning_rate": 6.458697358801061e-05,
2941
+ "loss": 0.0018,
2942
+ "step": 196
2943
+ },
2944
+ {
2945
+ "epoch": 10.31578947368421,
2946
+ "eval_loss": 0.002191495383158326,
2947
+ "eval_runtime": 0.8926,
2948
+ "eval_samples_per_second": 33.611,
2949
+ "eval_steps_per_second": 4.481,
2950
+ "step": 196
2951
+ },
2952
+ {
2953
+ "epoch": 10.368421052631579,
2954
+ "grad_norm": 0.25322437286376953,
2955
+ "learning_rate": 6.236176531801813e-05,
2956
+ "loss": 0.0049,
2957
+ "step": 197
2958
+ },
2959
+ {
2960
+ "epoch": 10.368421052631579,
2961
+ "eval_loss": 0.0022686992306262255,
2962
+ "eval_runtime": 0.8949,
2963
+ "eval_samples_per_second": 33.525,
2964
+ "eval_steps_per_second": 4.47,
2965
+ "step": 197
2966
+ },
2967
+ {
2968
+ "epoch": 10.421052631578947,
2969
+ "grad_norm": 0.29156965017318726,
2970
+ "learning_rate": 6.017009054854858e-05,
2971
+ "loss": 0.0045,
2972
+ "step": 198
2973
+ },
2974
+ {
2975
+ "epoch": 10.421052631578947,
2976
+ "eval_loss": 0.002286201808601618,
2977
+ "eval_runtime": 0.8929,
2978
+ "eval_samples_per_second": 33.597,
2979
+ "eval_steps_per_second": 4.48,
2980
+ "step": 198
2981
+ },
2982
+ {
2983
+ "epoch": 10.473684210526315,
2984
+ "grad_norm": 0.3855668306350708,
2985
+ "learning_rate": 5.801234096164468e-05,
2986
+ "loss": 0.0034,
2987
+ "step": 199
2988
+ },
2989
+ {
2990
+ "epoch": 10.473684210526315,
2991
+ "eval_loss": 0.0018616730812937021,
2992
+ "eval_runtime": 0.894,
2993
+ "eval_samples_per_second": 33.558,
2994
+ "eval_steps_per_second": 4.474,
2995
+ "step": 199
2996
+ },
2997
+ {
2998
+ "epoch": 10.526315789473685,
2999
+ "grad_norm": 0.2883719205856323,
3000
+ "learning_rate": 5.58889021764582e-05,
3001
+ "loss": 0.0044,
3002
+ "step": 200
3003
+ },
3004
+ {
3005
+ "epoch": 10.526315789473685,
3006
+ "eval_loss": 0.0016098986379802227,
3007
+ "eval_runtime": 0.8994,
3008
+ "eval_samples_per_second": 33.357,
3009
+ "eval_steps_per_second": 4.448,
3010
+ "step": 200
3011
  }
3012
  ],
3013
  "logging_steps": 1,
3014
+ "max_steps": 250,
3015
  "num_input_tokens_seen": 0,
3016
+ "num_train_epochs": 14,
3017
  "save_steps": 5,
3018
  "stateful_callbacks": {
3019
  "TrainerControl": {
 
3027
  "attributes": {}
3028
  }
3029
  },
3030
+ "total_flos": 8525733259253760.0,
3031
  "train_batch_size": 1,
3032
  "trial_name": null,
3033
  "trial_params": null