irodkin commited on
Commit
55f86c9
·
verified ·
1 Parent(s): 497ebe0

Training checkpoint at step 8000

Browse files
Files changed (1) hide show
  1. trainer_state.json +1446 -726
trainer_state.json CHANGED
@@ -1,2173 +1,2893 @@
1
  {
2
- "best_global_step": 2700,
3
- "best_metric": 2.4390792846679688,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-2000",
5
- "epoch": 0.12,
6
  "eval_steps": 100,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
- "grad_norm": 50.656509992459746,
15
  "learning_rate": 4.8e-08,
16
- "loss": 4.0214,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
- "grad_norm": 38.99252800700454,
22
  "learning_rate": 9.8e-08,
23
- "loss": 3.935,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
- "grad_norm": 21.472407776297636,
29
  "learning_rate": 1.4800000000000003e-07,
30
- "loss": 3.6524,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
- "grad_norm": 10.352839141449,
36
  "learning_rate": 1.9800000000000003e-07,
37
- "loss": 3.3517,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
- "eval_loss": 3.0817508697509766,
43
- "eval_runtime": 266.4152,
44
- "eval_samples_per_second": 3.085,
45
- "eval_steps_per_second": 1.543,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
- "grad_norm": 4.30904423901761,
51
  "learning_rate": 2.48e-07,
52
- "loss": 3.036,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
- "grad_norm": 2.264577383947395,
58
  "learning_rate": 2.9800000000000005e-07,
59
- "loss": 2.8341,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
- "grad_norm": 1.4155513897039314,
65
  "learning_rate": 3.48e-07,
66
- "loss": 2.73,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
- "grad_norm": 1.2078696095638184,
72
  "learning_rate": 3.9800000000000004e-07,
73
- "loss": 2.6731,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
- "eval_loss": 2.635507345199585,
79
- "eval_runtime": 266.4141,
80
- "eval_samples_per_second": 3.085,
81
- "eval_steps_per_second": 1.543,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
- "grad_norm": 1.0846849213120675,
87
  "learning_rate": 4.4800000000000004e-07,
88
- "loss": 2.6351,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
- "grad_norm": 0.9559789554521467,
94
  "learning_rate": 4.98e-07,
95
- "loss": 2.601,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
- "grad_norm": 1.0192810065788216,
101
  "learning_rate": 5.480000000000001e-07,
102
- "loss": 2.5828,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
- "grad_norm": 1.1431990209968765,
108
  "learning_rate": 5.98e-07,
109
- "loss": 2.5634,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
- "eval_loss": 2.550046682357788,
115
- "eval_runtime": 267.4638,
116
- "eval_samples_per_second": 3.073,
117
- "eval_steps_per_second": 1.537,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
- "grad_norm": 2.767331212303632,
123
  "learning_rate": 6.48e-07,
124
- "loss": 2.5495,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
- "grad_norm": 2.515653095645461,
130
  "learning_rate": 6.98e-07,
131
- "loss": 2.5327,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
- "grad_norm": 3.1526237978853615,
137
  "learning_rate": 7.480000000000001e-07,
138
- "loss": 2.5176,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
- "grad_norm": 3.62440809116865,
144
  "learning_rate": 7.98e-07,
145
- "loss": 2.5127,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
- "eval_loss": 2.5152335166931152,
151
- "eval_runtime": 266.9874,
152
- "eval_samples_per_second": 3.079,
153
- "eval_steps_per_second": 1.539,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
- "grad_norm": 7.135146349888136,
159
  "learning_rate": 8.480000000000001e-07,
160
- "loss": 2.5162,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
- "grad_norm": 2.648120897857043,
166
  "learning_rate": 8.980000000000001e-07,
167
- "loss": 2.4957,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
- "grad_norm": 1.8002381671303913,
173
  "learning_rate": 9.480000000000001e-07,
174
- "loss": 2.4903,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
- "grad_norm": 4.371053421441641,
180
  "learning_rate": 9.98e-07,
181
- "loss": 2.4832,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
- "eval_loss": 2.497570514678955,
187
- "eval_runtime": 267.4804,
188
- "eval_samples_per_second": 3.073,
189
- "eval_steps_per_second": 1.537,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
- "grad_norm": 2.7367881121941138,
195
  "learning_rate": 1.0480000000000002e-06,
196
- "loss": 2.4764,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
- "grad_norm": 2.6059320858334547,
202
  "learning_rate": 1.0980000000000001e-06,
203
- "loss": 2.4817,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
- "grad_norm": 1.4561576361319408,
209
  "learning_rate": 1.148e-06,
210
- "loss": 2.4699,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
- "grad_norm": 1.6446545702256075,
216
  "learning_rate": 1.1980000000000002e-06,
217
- "loss": 2.4625,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
- "eval_loss": 2.480875253677368,
223
- "eval_runtime": 267.1762,
224
- "eval_samples_per_second": 3.077,
225
- "eval_steps_per_second": 1.538,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
- "grad_norm": 1.8219267735301914,
231
  "learning_rate": 1.248e-06,
232
- "loss": 2.4611,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
- "grad_norm": 1.1370020634221731,
238
  "learning_rate": 1.2980000000000001e-06,
239
- "loss": 2.466,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
- "grad_norm": 2.1094427938442104,
245
  "learning_rate": 1.348e-06,
246
- "loss": 2.4612,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
- "grad_norm": 2.68194405349932,
252
  "learning_rate": 1.3980000000000002e-06,
253
- "loss": 2.4628,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
- "eval_loss": 2.4722726345062256,
259
- "eval_runtime": 268.4962,
260
- "eval_samples_per_second": 3.061,
261
- "eval_steps_per_second": 1.531,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
- "grad_norm": 2.035476406407236,
267
  "learning_rate": 1.4480000000000002e-06,
268
- "loss": 2.4472,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
- "grad_norm": 3.271849266938815,
274
  "learning_rate": 1.498e-06,
275
- "loss": 2.449,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
- "grad_norm": 0.9814118372932493,
281
  "learning_rate": 1.548e-06,
282
- "loss": 2.453,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
- "grad_norm": 2.6810297902336244,
288
  "learning_rate": 1.5980000000000002e-06,
289
- "loss": 2.4463,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
- "eval_loss": 2.4666495323181152,
295
- "eval_runtime": 414.1548,
296
- "eval_samples_per_second": 1.985,
297
- "eval_steps_per_second": 0.992,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
- "grad_norm": 1.2790805248924313,
303
  "learning_rate": 1.6480000000000001e-06,
304
- "loss": 2.4472,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
- "grad_norm": 1.0740073529506808,
310
  "learning_rate": 1.6980000000000003e-06,
311
- "loss": 2.452,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
- "grad_norm": 1.551336448361318,
317
  "learning_rate": 1.7480000000000002e-06,
318
- "loss": 2.4468,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
- "grad_norm": 2.3542378163349933,
324
  "learning_rate": 1.798e-06,
325
- "loss": 2.4436,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
- "eval_loss": 2.461561441421509,
331
- "eval_runtime": 580.7751,
332
- "eval_samples_per_second": 1.415,
333
- "eval_steps_per_second": 0.708,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
- "grad_norm": 2.0862642215782854,
339
  "learning_rate": 1.8480000000000001e-06,
340
- "loss": 2.4469,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
- "grad_norm": 1.1621391054674521,
346
  "learning_rate": 1.898e-06,
347
- "loss": 2.4419,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
- "grad_norm": 1.1717546484062549,
353
  "learning_rate": 1.9480000000000002e-06,
354
- "loss": 2.4337,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
- "grad_norm": 1.7485677588723867,
360
  "learning_rate": 1.998e-06,
361
- "loss": 2.4448,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
- "eval_loss": 2.4590463638305664,
367
- "eval_runtime": 271.0126,
368
- "eval_samples_per_second": 3.033,
369
- "eval_steps_per_second": 1.517,
370
  "step": 1000
371
  },
372
  {
373
  "epoch": 0.0205,
374
- "grad_norm": 1.2350950769146356,
375
  "learning_rate": 2.048e-06,
376
- "loss": 2.4331,
377
  "step": 1025
378
  },
379
  {
380
  "epoch": 0.021,
381
- "grad_norm": 1.4443797116679313,
382
  "learning_rate": 2.098e-06,
383
- "loss": 2.4292,
384
  "step": 1050
385
  },
386
  {
387
  "epoch": 0.0215,
388
- "grad_norm": 1.8443466972633604,
389
  "learning_rate": 2.148e-06,
390
- "loss": 2.4354,
391
  "step": 1075
392
  },
393
  {
394
  "epoch": 0.022,
395
- "grad_norm": 0.9668323863754179,
396
  "learning_rate": 2.198e-06,
397
- "loss": 2.4351,
398
  "step": 1100
399
  },
400
  {
401
  "epoch": 0.022,
402
- "eval_loss": 2.4556779861450195,
403
- "eval_runtime": 272.0439,
404
- "eval_samples_per_second": 3.022,
405
- "eval_steps_per_second": 1.511,
406
  "step": 1100
407
  },
408
  {
409
  "epoch": 0.0225,
410
- "grad_norm": 1.7207654971672084,
411
  "learning_rate": 2.2480000000000003e-06,
412
- "loss": 2.4313,
413
  "step": 1125
414
  },
415
  {
416
  "epoch": 0.023,
417
- "grad_norm": 1.0609444308653502,
418
  "learning_rate": 2.2980000000000003e-06,
419
- "loss": 2.4268,
420
  "step": 1150
421
  },
422
  {
423
  "epoch": 0.0235,
424
- "grad_norm": 2.632073425850519,
425
  "learning_rate": 2.3480000000000002e-06,
426
- "loss": 2.4279,
427
  "step": 1175
428
  },
429
  {
430
  "epoch": 0.024,
431
- "grad_norm": 0.8736598209263914,
432
  "learning_rate": 2.398e-06,
433
- "loss": 2.4224,
434
  "step": 1200
435
  },
436
  {
437
  "epoch": 0.024,
438
- "eval_loss": 2.452965021133423,
439
- "eval_runtime": 279.2551,
440
- "eval_samples_per_second": 2.944,
441
- "eval_steps_per_second": 1.472,
442
  "step": 1200
443
  },
444
  {
445
  "epoch": 0.0245,
446
- "grad_norm": 1.1010146380315569,
447
  "learning_rate": 2.448e-06,
448
- "loss": 2.4347,
449
  "step": 1225
450
  },
451
  {
452
  "epoch": 0.025,
453
- "grad_norm": 1.1289837790122792,
454
  "learning_rate": 2.498e-06,
455
- "loss": 2.4359,
456
  "step": 1250
457
  },
458
  {
459
  "epoch": 0.0255,
460
- "grad_norm": 1.332600172456898,
461
  "learning_rate": 2.5480000000000004e-06,
462
- "loss": 2.4233,
463
  "step": 1275
464
  },
465
  {
466
  "epoch": 0.026,
467
- "grad_norm": 0.918676647582182,
468
  "learning_rate": 2.598e-06,
469
- "loss": 2.4257,
470
  "step": 1300
471
  },
472
  {
473
  "epoch": 0.026,
474
- "eval_loss": 2.45108962059021,
475
- "eval_runtime": 273.2611,
476
- "eval_samples_per_second": 3.008,
477
- "eval_steps_per_second": 1.504,
478
  "step": 1300
479
  },
480
  {
481
  "epoch": 0.0265,
482
- "grad_norm": 1.4029272839399918,
483
  "learning_rate": 2.648e-06,
484
- "loss": 2.4289,
485
  "step": 1325
486
  },
487
  {
488
  "epoch": 0.027,
489
- "grad_norm": 1.2160130720202662,
490
  "learning_rate": 2.6980000000000003e-06,
491
- "loss": 2.4265,
492
  "step": 1350
493
  },
494
  {
495
  "epoch": 0.0275,
496
- "grad_norm": 0.9695113084191667,
497
  "learning_rate": 2.748e-06,
498
- "loss": 2.4195,
499
  "step": 1375
500
  },
501
  {
502
  "epoch": 0.028,
503
- "grad_norm": 1.7441288123872434,
504
  "learning_rate": 2.798e-06,
505
- "loss": 2.4298,
506
  "step": 1400
507
  },
508
  {
509
  "epoch": 0.028,
510
- "eval_loss": 2.4491610527038574,
511
- "eval_runtime": 583.0171,
512
- "eval_samples_per_second": 1.41,
513
- "eval_steps_per_second": 0.705,
514
  "step": 1400
515
  },
516
  {
517
  "epoch": 0.0285,
518
- "grad_norm": 1.0715504482160458,
519
  "learning_rate": 2.848e-06,
520
- "loss": 2.4262,
521
  "step": 1425
522
  },
523
  {
524
  "epoch": 0.029,
525
- "grad_norm": 1.572598515142917,
526
  "learning_rate": 2.8980000000000005e-06,
527
- "loss": 2.4247,
528
  "step": 1450
529
  },
530
  {
531
  "epoch": 0.0295,
532
- "grad_norm": 0.8769894814820209,
533
  "learning_rate": 2.9480000000000004e-06,
534
- "loss": 2.4203,
535
  "step": 1475
536
  },
537
  {
538
  "epoch": 0.03,
539
- "grad_norm": 1.15218078551874,
540
  "learning_rate": 2.9980000000000003e-06,
541
- "loss": 2.4132,
542
  "step": 1500
543
  },
544
  {
545
  "epoch": 0.03,
546
- "eval_loss": 2.4474868774414062,
547
- "eval_runtime": 269.8322,
548
- "eval_samples_per_second": 3.046,
549
- "eval_steps_per_second": 1.523,
550
  "step": 1500
551
  },
552
  {
553
  "epoch": 0.0305,
554
- "grad_norm": 0.9189527146104203,
555
  "learning_rate": 3.0480000000000003e-06,
556
- "loss": 2.4178,
557
  "step": 1525
558
  },
559
  {
560
  "epoch": 0.031,
561
- "grad_norm": 0.7625325255575713,
562
  "learning_rate": 3.0980000000000007e-06,
563
- "loss": 2.4177,
564
  "step": 1550
565
  },
566
  {
567
  "epoch": 0.0315,
568
- "grad_norm": 0.6754417969129354,
569
  "learning_rate": 3.1480000000000006e-06,
570
- "loss": 2.4231,
571
  "step": 1575
572
  },
573
  {
574
  "epoch": 0.032,
575
- "grad_norm": 0.7801492001481019,
576
  "learning_rate": 3.198e-06,
577
- "loss": 2.4211,
578
  "step": 1600
579
  },
580
  {
581
  "epoch": 0.032,
582
- "eval_loss": 2.4462578296661377,
583
- "eval_runtime": 284.7909,
584
- "eval_samples_per_second": 2.886,
585
- "eval_steps_per_second": 1.443,
586
  "step": 1600
587
  },
588
  {
589
  "epoch": 0.0325,
590
- "grad_norm": 1.0689417566940471,
591
  "learning_rate": 3.248e-06,
592
- "loss": 2.413,
593
  "step": 1625
594
  },
595
  {
596
  "epoch": 0.033,
597
- "grad_norm": 0.7787232312537056,
598
  "learning_rate": 3.298e-06,
599
- "loss": 2.4174,
600
  "step": 1650
601
  },
602
  {
603
  "epoch": 0.0335,
604
- "grad_norm": 0.781184938396322,
605
  "learning_rate": 3.348e-06,
606
- "loss": 2.4172,
607
  "step": 1675
608
  },
609
  {
610
  "epoch": 0.034,
611
- "grad_norm": 0.7747298764544236,
612
  "learning_rate": 3.3980000000000003e-06,
613
- "loss": 2.413,
614
  "step": 1700
615
  },
616
  {
617
  "epoch": 0.034,
618
- "eval_loss": 2.4448819160461426,
619
- "eval_runtime": 271.9561,
620
- "eval_samples_per_second": 3.023,
621
- "eval_steps_per_second": 1.511,
622
  "step": 1700
623
  },
624
  {
625
  "epoch": 0.0345,
626
- "grad_norm": 0.7757798466912066,
627
  "learning_rate": 3.4480000000000003e-06,
628
- "loss": 2.4098,
629
  "step": 1725
630
  },
631
  {
632
  "epoch": 0.035,
633
- "grad_norm": 0.7310107987133747,
634
  "learning_rate": 3.4980000000000002e-06,
635
- "loss": 2.4128,
636
  "step": 1750
637
  },
638
  {
639
  "epoch": 0.0355,
640
- "grad_norm": 0.7001013929335956,
641
  "learning_rate": 3.548e-06,
642
- "loss": 2.4186,
643
  "step": 1775
644
  },
645
  {
646
  "epoch": 0.036,
647
- "grad_norm": 0.8450545164626939,
648
  "learning_rate": 3.5980000000000005e-06,
649
- "loss": 2.4023,
650
  "step": 1800
651
  },
652
  {
653
  "epoch": 0.036,
654
- "eval_loss": 2.4438183307647705,
655
- "eval_runtime": 270.9746,
656
- "eval_samples_per_second": 3.033,
657
- "eval_steps_per_second": 1.517,
658
  "step": 1800
659
  },
660
  {
661
  "epoch": 0.0365,
662
- "grad_norm": 0.699119117469249,
663
  "learning_rate": 3.6480000000000005e-06,
664
- "loss": 2.4147,
665
  "step": 1825
666
  },
667
  {
668
  "epoch": 0.037,
669
- "grad_norm": 1.4648900237948448,
670
  "learning_rate": 3.6980000000000004e-06,
671
- "loss": 2.4057,
672
  "step": 1850
673
  },
674
  {
675
  "epoch": 0.0375,
676
- "grad_norm": 0.8100762412810776,
677
  "learning_rate": 3.7480000000000004e-06,
678
- "loss": 2.4,
679
  "step": 1875
680
  },
681
  {
682
  "epoch": 0.038,
683
- "grad_norm": 1.7107392169339468,
684
  "learning_rate": 3.7980000000000007e-06,
685
- "loss": 2.4064,
686
  "step": 1900
687
  },
688
  {
689
  "epoch": 0.038,
690
- "eval_loss": 2.4430434703826904,
691
- "eval_runtime": 271.5411,
692
- "eval_samples_per_second": 3.027,
693
- "eval_steps_per_second": 1.514,
694
  "step": 1900
695
  },
696
  {
697
  "epoch": 0.0385,
698
- "grad_norm": 0.7400885595875731,
699
  "learning_rate": 3.848e-06,
700
- "loss": 2.4045,
701
  "step": 1925
702
  },
703
  {
704
  "epoch": 0.039,
705
- "grad_norm": 0.6549639602828458,
706
  "learning_rate": 3.898e-06,
707
- "loss": 2.4013,
708
  "step": 1950
709
  },
710
  {
711
  "epoch": 0.0395,
712
- "grad_norm": 0.900275875384622,
713
  "learning_rate": 3.948e-06,
714
- "loss": 2.4023,
715
  "step": 1975
716
  },
717
  {
718
  "epoch": 0.04,
719
- "grad_norm": 0.6557480580465296,
720
  "learning_rate": 3.9980000000000005e-06,
721
- "loss": 2.4067,
722
  "step": 2000
723
  },
724
  {
725
  "epoch": 0.04,
726
- "eval_loss": 2.4421370029449463,
727
- "eval_runtime": 271.9467,
728
- "eval_samples_per_second": 3.023,
729
- "eval_steps_per_second": 1.511,
730
  "step": 2000
731
  },
732
  {
733
  "epoch": 0.0405,
734
- "grad_norm": 0.693045116257973,
735
  "learning_rate": 4.048e-06,
736
- "loss": 2.4039,
737
  "step": 2025
738
  },
739
  {
740
  "epoch": 0.041,
741
- "grad_norm": 0.7610891579847459,
742
  "learning_rate": 4.098e-06,
743
- "loss": 2.4078,
744
  "step": 2050
745
  },
746
  {
747
  "epoch": 0.0415,
748
- "grad_norm": 1.3834425728285105,
749
  "learning_rate": 4.148000000000001e-06,
750
- "loss": 2.4147,
751
  "step": 2075
752
  },
753
  {
754
  "epoch": 0.042,
755
- "grad_norm": 1.1248195017181577,
756
  "learning_rate": 4.198e-06,
757
- "loss": 2.4081,
758
  "step": 2100
759
  },
760
  {
761
  "epoch": 0.042,
762
- "eval_loss": 2.441387891769409,
763
- "eval_runtime": 273.2847,
764
- "eval_samples_per_second": 3.008,
765
- "eval_steps_per_second": 1.504,
766
  "step": 2100
767
  },
768
  {
769
  "epoch": 0.0425,
770
- "grad_norm": 0.9505394679390867,
771
  "learning_rate": 4.248000000000001e-06,
772
- "loss": 2.3997,
773
  "step": 2125
774
  },
775
  {
776
  "epoch": 0.043,
777
- "grad_norm": 0.6617161262323777,
778
  "learning_rate": 4.298e-06,
779
- "loss": 2.4092,
780
  "step": 2150
781
  },
782
  {
783
  "epoch": 0.0435,
784
- "grad_norm": 0.8041142918976389,
785
  "learning_rate": 4.3480000000000006e-06,
786
- "loss": 2.3911,
787
  "step": 2175
788
  },
789
  {
790
  "epoch": 0.044,
791
- "grad_norm": 0.6836064943747875,
792
  "learning_rate": 4.398000000000001e-06,
793
- "loss": 2.4009,
794
  "step": 2200
795
  },
796
  {
797
  "epoch": 0.044,
798
- "eval_loss": 2.4408862590789795,
799
- "eval_runtime": 272.2045,
800
- "eval_samples_per_second": 3.02,
801
- "eval_steps_per_second": 1.51,
802
  "step": 2200
803
  },
804
  {
805
  "epoch": 0.0445,
806
- "grad_norm": 0.8459080804967606,
807
  "learning_rate": 4.4480000000000004e-06,
808
- "loss": 2.3956,
809
  "step": 2225
810
  },
811
  {
812
  "epoch": 0.045,
813
- "grad_norm": 0.7291051916367819,
814
  "learning_rate": 4.498e-06,
815
- "loss": 2.3987,
816
  "step": 2250
817
  },
818
  {
819
  "epoch": 0.0455,
820
- "grad_norm": 0.8931508487954566,
821
  "learning_rate": 4.548e-06,
822
- "loss": 2.3998,
823
  "step": 2275
824
  },
825
  {
826
  "epoch": 0.046,
827
- "grad_norm": 0.7182312121808919,
828
  "learning_rate": 4.598e-06,
829
- "loss": 2.3986,
830
  "step": 2300
831
  },
832
  {
833
  "epoch": 0.046,
834
- "eval_loss": 2.4408977031707764,
835
- "eval_runtime": 270.8718,
836
- "eval_samples_per_second": 3.035,
837
- "eval_steps_per_second": 1.517,
838
  "step": 2300
839
  },
840
  {
841
  "epoch": 0.0465,
842
- "grad_norm": 0.6845849871247104,
843
  "learning_rate": 4.648e-06,
844
- "loss": 2.3993,
845
  "step": 2325
846
  },
847
  {
848
  "epoch": 0.047,
849
- "grad_norm": 0.6454402382658997,
850
  "learning_rate": 4.698000000000001e-06,
851
- "loss": 2.3919,
852
  "step": 2350
853
  },
854
  {
855
  "epoch": 0.0475,
856
- "grad_norm": 0.7059991305508788,
857
  "learning_rate": 4.748e-06,
858
- "loss": 2.3932,
859
  "step": 2375
860
  },
861
  {
862
  "epoch": 0.048,
863
- "grad_norm": 0.7912136028957374,
864
  "learning_rate": 4.7980000000000005e-06,
865
- "loss": 2.3981,
866
  "step": 2400
867
  },
868
  {
869
  "epoch": 0.048,
870
- "eval_loss": 2.4403159618377686,
871
- "eval_runtime": 268.2333,
872
- "eval_samples_per_second": 3.064,
873
- "eval_steps_per_second": 1.532,
874
  "step": 2400
875
  },
876
  {
877
  "epoch": 0.0485,
878
- "grad_norm": 0.8023695737551224,
879
  "learning_rate": 4.848000000000001e-06,
880
- "loss": 2.3989,
881
  "step": 2425
882
  },
883
  {
884
  "epoch": 0.049,
885
- "grad_norm": 0.6698221863733745,
886
  "learning_rate": 4.898e-06,
887
- "loss": 2.3846,
888
  "step": 2450
889
  },
890
  {
891
  "epoch": 0.0495,
892
- "grad_norm": 0.6888793258344078,
893
  "learning_rate": 4.948000000000001e-06,
894
- "loss": 2.3895,
895
  "step": 2475
896
  },
897
  {
898
  "epoch": 0.05,
899
- "grad_norm": 1.0427086572633348,
900
  "learning_rate": 4.998e-06,
901
- "loss": 2.3909,
902
  "step": 2500
903
  },
904
  {
905
  "epoch": 0.05,
906
- "eval_loss": 2.4396488666534424,
907
- "eval_runtime": 269.5843,
908
- "eval_samples_per_second": 3.049,
909
- "eval_steps_per_second": 1.525,
910
  "step": 2500
911
  },
912
  {
913
  "epoch": 0.0505,
914
- "grad_norm": 0.7651432866063412,
915
  "learning_rate": 5.048000000000001e-06,
916
- "loss": 2.3868,
917
  "step": 2525
918
  },
919
  {
920
  "epoch": 0.051,
921
- "grad_norm": 0.9011841495487002,
922
  "learning_rate": 5.098000000000001e-06,
923
- "loss": 2.3831,
924
  "step": 2550
925
  },
926
  {
927
  "epoch": 0.0515,
928
- "grad_norm": 0.7564083994967769,
929
  "learning_rate": 5.1480000000000005e-06,
930
- "loss": 2.3847,
931
  "step": 2575
932
  },
933
  {
934
  "epoch": 0.052,
935
- "grad_norm": 1.2435018934793685,
936
  "learning_rate": 5.198000000000001e-06,
937
- "loss": 2.3819,
938
  "step": 2600
939
  },
940
  {
941
  "epoch": 0.052,
942
- "eval_loss": 2.439671039581299,
943
- "eval_runtime": 268.9479,
944
- "eval_samples_per_second": 3.056,
945
- "eval_steps_per_second": 1.528,
946
  "step": 2600
947
  },
948
  {
949
  "epoch": 0.0525,
950
- "grad_norm": 0.6889931430781852,
951
  "learning_rate": 5.248000000000001e-06,
952
- "loss": 2.3866,
953
  "step": 2625
954
  },
955
  {
956
  "epoch": 0.053,
957
- "grad_norm": 0.6691285553697948,
958
  "learning_rate": 5.298000000000001e-06,
959
- "loss": 2.3847,
960
  "step": 2650
961
  },
962
  {
963
  "epoch": 0.0535,
964
- "grad_norm": 0.763818492118123,
965
  "learning_rate": 5.348000000000001e-06,
966
- "loss": 2.3869,
967
  "step": 2675
968
  },
969
  {
970
  "epoch": 0.054,
971
- "grad_norm": 0.751625809368973,
972
  "learning_rate": 5.398e-06,
973
- "loss": 2.3849,
974
  "step": 2700
975
  },
976
  {
977
  "epoch": 0.054,
978
- "eval_loss": 2.4390792846679688,
979
- "eval_runtime": 269.1474,
980
- "eval_samples_per_second": 3.054,
981
- "eval_steps_per_second": 1.527,
982
  "step": 2700
983
  },
984
  {
985
  "epoch": 0.0545,
986
- "grad_norm": 0.7089053263326198,
987
  "learning_rate": 5.448e-06,
988
- "loss": 2.3888,
989
  "step": 2725
990
  },
991
  {
992
  "epoch": 0.055,
993
- "grad_norm": 0.7882614052121117,
994
  "learning_rate": 5.498e-06,
995
- "loss": 2.3867,
996
  "step": 2750
997
  },
998
  {
999
  "epoch": 0.0555,
1000
- "grad_norm": 0.834757676919121,
1001
  "learning_rate": 5.548e-06,
1002
- "loss": 2.3934,
1003
  "step": 2775
1004
  },
1005
  {
1006
  "epoch": 0.056,
1007
- "grad_norm": 0.8944452545678075,
1008
  "learning_rate": 5.5980000000000004e-06,
1009
- "loss": 2.3806,
1010
  "step": 2800
1011
  },
1012
  {
1013
  "epoch": 0.056,
1014
- "eval_loss": 2.4394161701202393,
1015
- "eval_runtime": 267.9021,
1016
- "eval_samples_per_second": 3.068,
1017
- "eval_steps_per_second": 1.534,
1018
  "step": 2800
1019
  },
1020
  {
1021
  "epoch": 0.0565,
1022
- "grad_norm": 0.7299925105583852,
1023
  "learning_rate": 5.648e-06,
1024
- "loss": 2.3838,
1025
  "step": 2825
1026
  },
1027
  {
1028
  "epoch": 0.057,
1029
- "grad_norm": 0.8737328950825439,
1030
  "learning_rate": 5.698e-06,
1031
- "loss": 2.3845,
1032
  "step": 2850
1033
  },
1034
  {
1035
  "epoch": 0.0575,
1036
- "grad_norm": 0.7907442941301994,
1037
  "learning_rate": 5.748e-06,
1038
- "loss": 2.3834,
1039
  "step": 2875
1040
  },
1041
  {
1042
  "epoch": 0.058,
1043
- "grad_norm": 0.6846463994155354,
1044
  "learning_rate": 5.798e-06,
1045
- "loss": 2.3776,
1046
  "step": 2900
1047
  },
1048
  {
1049
  "epoch": 0.058,
1050
- "eval_loss": 2.4393930435180664,
1051
- "eval_runtime": 266.9434,
1052
- "eval_samples_per_second": 3.079,
1053
- "eval_steps_per_second": 1.54,
1054
  "step": 2900
1055
  },
1056
  {
1057
  "epoch": 0.0585,
1058
- "grad_norm": 0.62540280272208,
1059
  "learning_rate": 5.848000000000001e-06,
1060
- "loss": 2.3808,
1061
  "step": 2925
1062
  },
1063
  {
1064
  "epoch": 0.059,
1065
- "grad_norm": 0.8774049191705039,
1066
  "learning_rate": 5.898e-06,
1067
- "loss": 2.3873,
1068
  "step": 2950
1069
  },
1070
  {
1071
  "epoch": 0.0595,
1072
- "grad_norm": 0.6485716892578811,
1073
  "learning_rate": 5.9480000000000005e-06,
1074
- "loss": 2.3768,
1075
  "step": 2975
1076
  },
1077
  {
1078
  "epoch": 0.06,
1079
- "grad_norm": 0.730163203087036,
1080
  "learning_rate": 5.998000000000001e-06,
1081
- "loss": 2.3713,
1082
  "step": 3000
1083
  },
1084
  {
1085
  "epoch": 0.06,
1086
- "eval_loss": 2.4393091201782227,
1087
- "eval_runtime": 267.1212,
1088
- "eval_samples_per_second": 3.077,
1089
- "eval_steps_per_second": 1.539,
1090
  "step": 3000
1091
  },
1092
  {
1093
  "epoch": 0.0605,
1094
- "grad_norm": 0.6879314967282564,
1095
  "learning_rate": 6.048e-06,
1096
- "loss": 2.3783,
1097
  "step": 3025
1098
  },
1099
  {
1100
  "epoch": 0.061,
1101
- "grad_norm": 0.8144222671191256,
1102
  "learning_rate": 6.098000000000001e-06,
1103
- "loss": 2.3675,
1104
  "step": 3050
1105
  },
1106
  {
1107
  "epoch": 0.0615,
1108
- "grad_norm": 0.7831396066479988,
1109
  "learning_rate": 6.148e-06,
1110
- "loss": 2.3749,
1111
  "step": 3075
1112
  },
1113
  {
1114
  "epoch": 0.062,
1115
- "grad_norm": 0.8428793923955709,
1116
  "learning_rate": 6.198000000000001e-06,
1117
- "loss": 2.3815,
1118
  "step": 3100
1119
  },
1120
  {
1121
  "epoch": 0.062,
1122
- "eval_loss": 2.4399077892303467,
1123
- "eval_runtime": 267.544,
1124
- "eval_samples_per_second": 3.072,
1125
- "eval_steps_per_second": 1.536,
1126
  "step": 3100
1127
  },
1128
  {
1129
  "epoch": 0.0625,
1130
- "grad_norm": 0.6828002417283633,
1131
  "learning_rate": 6.248000000000001e-06,
1132
- "loss": 2.3824,
1133
  "step": 3125
1134
  },
1135
  {
1136
  "epoch": 0.063,
1137
- "grad_norm": 0.6281277470124486,
1138
  "learning_rate": 6.2980000000000005e-06,
1139
- "loss": 2.3721,
1140
  "step": 3150
1141
  },
1142
  {
1143
  "epoch": 0.0635,
1144
- "grad_norm": 0.6760775775841215,
1145
  "learning_rate": 6.348000000000001e-06,
1146
- "loss": 2.3691,
1147
  "step": 3175
1148
  },
1149
  {
1150
  "epoch": 0.064,
1151
- "grad_norm": 1.5950387898495975,
1152
  "learning_rate": 6.398000000000001e-06,
1153
- "loss": 2.3752,
1154
  "step": 3200
1155
  },
1156
  {
1157
  "epoch": 0.064,
1158
- "eval_loss": 2.439392328262329,
1159
- "eval_runtime": 266.9401,
1160
- "eval_samples_per_second": 3.079,
1161
- "eval_steps_per_second": 1.54,
1162
  "step": 3200
1163
  },
1164
  {
1165
  "epoch": 0.0645,
1166
- "grad_norm": 0.8656925185501568,
1167
  "learning_rate": 6.448000000000001e-06,
1168
- "loss": 2.3756,
1169
  "step": 3225
1170
  },
1171
  {
1172
  "epoch": 0.065,
1173
- "grad_norm": 1.0689072967386066,
1174
  "learning_rate": 6.498000000000001e-06,
1175
- "loss": 2.374,
1176
  "step": 3250
1177
  },
1178
  {
1179
  "epoch": 0.0655,
1180
- "grad_norm": 0.642689784270949,
1181
  "learning_rate": 6.548000000000001e-06,
1182
- "loss": 2.3675,
1183
  "step": 3275
1184
  },
1185
  {
1186
  "epoch": 0.066,
1187
- "grad_norm": 0.7931015535140951,
1188
  "learning_rate": 6.598000000000001e-06,
1189
- "loss": 2.3723,
1190
  "step": 3300
1191
  },
1192
  {
1193
  "epoch": 0.066,
1194
- "eval_loss": 2.4396414756774902,
1195
- "eval_runtime": 266.3301,
1196
- "eval_samples_per_second": 3.086,
1197
- "eval_steps_per_second": 1.543,
1198
  "step": 3300
1199
  },
1200
  {
1201
  "epoch": 0.0665,
1202
- "grad_norm": 0.7478812929813007,
1203
  "learning_rate": 6.648e-06,
1204
- "loss": 2.3728,
1205
  "step": 3325
1206
  },
1207
  {
1208
  "epoch": 0.067,
1209
- "grad_norm": 0.7286462484545718,
1210
  "learning_rate": 6.698e-06,
1211
- "loss": 2.3718,
1212
  "step": 3350
1213
  },
1214
  {
1215
  "epoch": 0.0675,
1216
- "grad_norm": 0.7188647675429949,
1217
  "learning_rate": 6.7480000000000004e-06,
1218
- "loss": 2.3744,
1219
  "step": 3375
1220
  },
1221
  {
1222
  "epoch": 0.068,
1223
- "grad_norm": 0.6869274776133854,
1224
  "learning_rate": 6.798e-06,
1225
- "loss": 2.3625,
1226
  "step": 3400
1227
  },
1228
  {
1229
  "epoch": 0.068,
1230
- "eval_loss": 2.43986177444458,
1231
- "eval_runtime": 266.556,
1232
- "eval_samples_per_second": 3.084,
1233
- "eval_steps_per_second": 1.542,
1234
  "step": 3400
1235
  },
1236
  {
1237
  "epoch": 0.0685,
1238
- "grad_norm": 0.7666004938241721,
1239
  "learning_rate": 6.848e-06,
1240
- "loss": 2.3686,
1241
  "step": 3425
1242
  },
1243
  {
1244
  "epoch": 0.069,
1245
- "grad_norm": 0.864757734602374,
1246
  "learning_rate": 6.898e-06,
1247
- "loss": 2.3596,
1248
  "step": 3450
1249
  },
1250
  {
1251
  "epoch": 0.0695,
1252
- "grad_norm": 0.7715710742116183,
1253
  "learning_rate": 6.948e-06,
1254
- "loss": 2.3576,
1255
  "step": 3475
1256
  },
1257
  {
1258
  "epoch": 0.07,
1259
- "grad_norm": 0.6372061584106886,
1260
  "learning_rate": 6.998000000000001e-06,
1261
- "loss": 2.3729,
1262
  "step": 3500
1263
  },
1264
  {
1265
  "epoch": 0.07,
1266
- "eval_loss": 2.4403369426727295,
1267
- "eval_runtime": 266.4426,
1268
- "eval_samples_per_second": 3.085,
1269
- "eval_steps_per_second": 1.543,
1270
  "step": 3500
1271
  },
1272
  {
1273
  "epoch": 0.0705,
1274
- "grad_norm": 1.176729441259039,
1275
  "learning_rate": 7.048e-06,
1276
- "loss": 2.3655,
1277
  "step": 3525
1278
  },
1279
  {
1280
  "epoch": 0.071,
1281
- "grad_norm": 0.6115287728562199,
1282
  "learning_rate": 7.0980000000000005e-06,
1283
- "loss": 2.3625,
1284
  "step": 3550
1285
  },
1286
  {
1287
  "epoch": 0.0715,
1288
- "grad_norm": 0.6246990242430912,
1289
  "learning_rate": 7.148000000000001e-06,
1290
- "loss": 2.3618,
1291
  "step": 3575
1292
  },
1293
  {
1294
  "epoch": 0.072,
1295
- "grad_norm": 0.7221703846470798,
1296
  "learning_rate": 7.198e-06,
1297
- "loss": 2.3628,
1298
  "step": 3600
1299
  },
1300
  {
1301
  "epoch": 0.072,
1302
- "eval_loss": 2.440523862838745,
1303
- "eval_runtime": 266.5015,
1304
- "eval_samples_per_second": 3.084,
1305
- "eval_steps_per_second": 1.542,
1306
  "step": 3600
1307
  },
1308
  {
1309
  "epoch": 0.0725,
1310
- "grad_norm": 0.6655580980887033,
1311
  "learning_rate": 7.248000000000001e-06,
1312
- "loss": 2.3606,
1313
  "step": 3625
1314
  },
1315
  {
1316
  "epoch": 0.073,
1317
- "grad_norm": 0.8105564047038534,
1318
  "learning_rate": 7.298e-06,
1319
- "loss": 2.3633,
1320
  "step": 3650
1321
  },
1322
  {
1323
  "epoch": 0.0735,
1324
- "grad_norm": 0.6850595144504594,
1325
  "learning_rate": 7.348000000000001e-06,
1326
- "loss": 2.3599,
1327
  "step": 3675
1328
  },
1329
  {
1330
  "epoch": 0.074,
1331
- "grad_norm": 0.6572561289961083,
1332
  "learning_rate": 7.398000000000001e-06,
1333
- "loss": 2.356,
1334
  "step": 3700
1335
  },
1336
  {
1337
  "epoch": 0.074,
1338
- "eval_loss": 2.440370798110962,
1339
- "eval_runtime": 266.1237,
1340
- "eval_samples_per_second": 3.089,
1341
- "eval_steps_per_second": 1.544,
1342
  "step": 3700
1343
  },
1344
  {
1345
  "epoch": 0.0745,
1346
- "grad_norm": 0.6568528045310038,
1347
  "learning_rate": 7.4480000000000005e-06,
1348
- "loss": 2.364,
1349
  "step": 3725
1350
  },
1351
  {
1352
  "epoch": 0.075,
1353
- "grad_norm": 0.7767463454261476,
1354
  "learning_rate": 7.498000000000001e-06,
1355
- "loss": 2.3544,
1356
  "step": 3750
1357
  },
1358
  {
1359
  "epoch": 0.0755,
1360
- "grad_norm": 0.6432402218022537,
1361
  "learning_rate": 7.548000000000001e-06,
1362
- "loss": 2.3667,
1363
  "step": 3775
1364
  },
1365
  {
1366
  "epoch": 0.076,
1367
- "grad_norm": 0.6601750907381865,
1368
  "learning_rate": 7.598000000000001e-06,
1369
- "loss": 2.3538,
1370
  "step": 3800
1371
  },
1372
  {
1373
  "epoch": 0.076,
1374
- "eval_loss": 2.440768241882324,
1375
- "eval_runtime": 266.5088,
1376
- "eval_samples_per_second": 3.084,
1377
- "eval_steps_per_second": 1.542,
1378
  "step": 3800
1379
  },
1380
  {
1381
  "epoch": 0.0765,
1382
- "grad_norm": 0.6698754690123959,
1383
  "learning_rate": 7.648e-06,
1384
- "loss": 2.3606,
1385
  "step": 3825
1386
  },
1387
  {
1388
  "epoch": 0.077,
1389
- "grad_norm": 0.6469410659344654,
1390
  "learning_rate": 7.698000000000002e-06,
1391
- "loss": 2.362,
1392
  "step": 3850
1393
  },
1394
  {
1395
  "epoch": 0.0775,
1396
- "grad_norm": 1.0013339216690909,
1397
  "learning_rate": 7.748000000000001e-06,
1398
- "loss": 2.3507,
1399
  "step": 3875
1400
  },
1401
  {
1402
  "epoch": 0.078,
1403
- "grad_norm": 0.7506371440780338,
1404
  "learning_rate": 7.798e-06,
1405
- "loss": 2.3452,
1406
  "step": 3900
1407
  },
1408
  {
1409
  "epoch": 0.078,
1410
- "eval_loss": 2.4411609172821045,
1411
- "eval_runtime": 266.382,
1412
- "eval_samples_per_second": 3.086,
1413
- "eval_steps_per_second": 1.543,
1414
  "step": 3900
1415
  },
1416
  {
1417
  "epoch": 0.0785,
1418
- "grad_norm": 0.8222668670513549,
1419
  "learning_rate": 7.848000000000002e-06,
1420
- "loss": 2.3492,
1421
  "step": 3925
1422
  },
1423
  {
1424
  "epoch": 0.079,
1425
- "grad_norm": 0.7348741963305673,
1426
  "learning_rate": 7.898e-06,
1427
- "loss": 2.3514,
1428
  "step": 3950
1429
  },
1430
  {
1431
  "epoch": 0.0795,
1432
- "grad_norm": 0.6659497394839384,
1433
  "learning_rate": 7.948e-06,
1434
- "loss": 2.3555,
1435
  "step": 3975
1436
  },
1437
  {
1438
  "epoch": 0.08,
1439
- "grad_norm": 0.6450727740951402,
1440
  "learning_rate": 7.998e-06,
1441
- "loss": 2.3473,
1442
  "step": 4000
1443
  },
1444
  {
1445
  "epoch": 0.08,
1446
- "eval_loss": 2.4414608478546143,
1447
- "eval_runtime": 266.9648,
1448
- "eval_samples_per_second": 3.079,
1449
- "eval_steps_per_second": 1.54,
1450
  "step": 4000
1451
  },
1452
  {
1453
  "epoch": 0.0805,
1454
- "grad_norm": 0.7546326612640164,
1455
  "learning_rate": 8.048e-06,
1456
- "loss": 2.3539,
1457
  "step": 4025
1458
  },
1459
  {
1460
  "epoch": 0.081,
1461
- "grad_norm": 0.6761905448705341,
1462
  "learning_rate": 8.098000000000001e-06,
1463
- "loss": 2.3556,
1464
  "step": 4050
1465
  },
1466
  {
1467
  "epoch": 0.0815,
1468
- "grad_norm": 0.7318094477510495,
1469
  "learning_rate": 8.148e-06,
1470
- "loss": 2.3423,
1471
  "step": 4075
1472
  },
1473
  {
1474
  "epoch": 0.082,
1475
- "grad_norm": 0.6821593855329929,
1476
  "learning_rate": 8.198e-06,
1477
- "loss": 2.3502,
1478
  "step": 4100
1479
  },
1480
  {
1481
  "epoch": 0.082,
1482
- "eval_loss": 2.4423110485076904,
1483
- "eval_runtime": 279.3363,
1484
- "eval_samples_per_second": 2.943,
1485
- "eval_steps_per_second": 1.471,
1486
  "step": 4100
1487
  },
1488
  {
1489
  "epoch": 0.0825,
1490
- "grad_norm": 0.6574698437748202,
1491
  "learning_rate": 8.248e-06,
1492
- "loss": 2.3409,
1493
  "step": 4125
1494
  },
1495
  {
1496
  "epoch": 0.083,
1497
- "grad_norm": 0.6906095764967443,
1498
  "learning_rate": 8.298000000000001e-06,
1499
- "loss": 2.3492,
1500
  "step": 4150
1501
  },
1502
  {
1503
  "epoch": 0.0835,
1504
- "grad_norm": 0.6442365789950762,
1505
  "learning_rate": 8.348e-06,
1506
- "loss": 2.3488,
1507
  "step": 4175
1508
  },
1509
  {
1510
  "epoch": 0.084,
1511
- "grad_norm": 0.6795131481296484,
1512
  "learning_rate": 8.398e-06,
1513
- "loss": 2.3467,
1514
  "step": 4200
1515
  },
1516
  {
1517
  "epoch": 0.084,
1518
- "eval_loss": 2.442260980606079,
1519
- "eval_runtime": 266.2484,
1520
- "eval_samples_per_second": 3.087,
1521
- "eval_steps_per_second": 1.544,
1522
  "step": 4200
1523
  },
1524
  {
1525
  "epoch": 0.0845,
1526
- "grad_norm": 0.698061792277122,
1527
  "learning_rate": 8.448000000000001e-06,
1528
- "loss": 2.3436,
1529
  "step": 4225
1530
  },
1531
  {
1532
  "epoch": 0.085,
1533
- "grad_norm": 0.7442835229840722,
1534
  "learning_rate": 8.498e-06,
1535
- "loss": 2.3381,
1536
  "step": 4250
1537
  },
1538
  {
1539
  "epoch": 0.0855,
1540
- "grad_norm": 0.7362824833247893,
1541
  "learning_rate": 8.548e-06,
1542
- "loss": 2.3431,
1543
  "step": 4275
1544
  },
1545
  {
1546
  "epoch": 0.086,
1547
- "grad_norm": 0.7137715109018318,
1548
  "learning_rate": 8.598000000000001e-06,
1549
- "loss": 2.3295,
1550
  "step": 4300
1551
  },
1552
  {
1553
  "epoch": 0.086,
1554
- "eval_loss": 2.443796157836914,
1555
- "eval_runtime": 266.8087,
1556
- "eval_samples_per_second": 3.081,
1557
- "eval_steps_per_second": 1.54,
1558
  "step": 4300
1559
  },
1560
  {
1561
  "epoch": 0.0865,
1562
- "grad_norm": 0.7098595503905674,
1563
  "learning_rate": 8.648000000000001e-06,
1564
- "loss": 2.3459,
1565
  "step": 4325
1566
  },
1567
  {
1568
  "epoch": 0.087,
1569
- "grad_norm": 0.7996135906080958,
1570
  "learning_rate": 8.698e-06,
1571
- "loss": 2.3466,
1572
  "step": 4350
1573
  },
1574
  {
1575
  "epoch": 0.0875,
1576
- "grad_norm": 0.842668515468917,
1577
  "learning_rate": 8.748000000000002e-06,
1578
- "loss": 2.3384,
1579
  "step": 4375
1580
  },
1581
  {
1582
  "epoch": 0.088,
1583
- "grad_norm": 0.6713693442490306,
1584
  "learning_rate": 8.798000000000001e-06,
1585
- "loss": 2.3421,
1586
  "step": 4400
1587
  },
1588
  {
1589
  "epoch": 0.088,
1590
- "eval_loss": 2.4439103603363037,
1591
- "eval_runtime": 266.7578,
1592
- "eval_samples_per_second": 3.081,
1593
- "eval_steps_per_second": 1.541,
1594
  "step": 4400
1595
  },
1596
  {
1597
  "epoch": 0.0885,
1598
- "grad_norm": 0.7003512380037604,
1599
  "learning_rate": 8.848e-06,
1600
- "loss": 2.3346,
1601
  "step": 4425
1602
  },
1603
  {
1604
  "epoch": 0.089,
1605
- "grad_norm": 0.6607605926628429,
1606
  "learning_rate": 8.898000000000002e-06,
1607
- "loss": 2.3406,
1608
  "step": 4450
1609
  },
1610
  {
1611
  "epoch": 0.0895,
1612
- "grad_norm": 0.6803671624212732,
1613
  "learning_rate": 8.948000000000001e-06,
1614
- "loss": 2.336,
1615
  "step": 4475
1616
  },
1617
  {
1618
  "epoch": 0.09,
1619
- "grad_norm": 0.8922530007640126,
1620
  "learning_rate": 8.998000000000001e-06,
1621
- "loss": 2.3436,
1622
  "step": 4500
1623
  },
1624
  {
1625
  "epoch": 0.09,
1626
- "eval_loss": 2.444617509841919,
1627
- "eval_runtime": 266.6078,
1628
- "eval_samples_per_second": 3.083,
1629
- "eval_steps_per_second": 1.542,
1630
  "step": 4500
1631
  },
1632
  {
1633
  "epoch": 0.0905,
1634
- "grad_norm": 0.7083372956919765,
1635
  "learning_rate": 9.048e-06,
1636
- "loss": 2.3381,
1637
  "step": 4525
1638
  },
1639
  {
1640
  "epoch": 0.091,
1641
- "grad_norm": 0.7428013956928052,
1642
  "learning_rate": 9.098000000000002e-06,
1643
- "loss": 2.3296,
1644
  "step": 4550
1645
  },
1646
  {
1647
  "epoch": 0.0915,
1648
- "grad_norm": 0.7273252330071415,
1649
  "learning_rate": 9.148e-06,
1650
- "loss": 2.3316,
1651
  "step": 4575
1652
  },
1653
  {
1654
  "epoch": 0.092,
1655
- "grad_norm": 0.7554687577503647,
1656
  "learning_rate": 9.198e-06,
1657
- "loss": 2.3319,
1658
  "step": 4600
1659
  },
1660
  {
1661
  "epoch": 0.092,
1662
- "eval_loss": 2.4464893341064453,
1663
- "eval_runtime": 266.7269,
1664
- "eval_samples_per_second": 3.082,
1665
- "eval_steps_per_second": 1.541,
1666
  "step": 4600
1667
  },
1668
  {
1669
  "epoch": 0.0925,
1670
- "grad_norm": 0.6871461093264853,
1671
  "learning_rate": 9.248e-06,
1672
- "loss": 2.3326,
1673
  "step": 4625
1674
  },
1675
  {
1676
  "epoch": 0.093,
1677
- "grad_norm": 0.7005189512561462,
1678
  "learning_rate": 9.298e-06,
1679
- "loss": 2.3292,
1680
  "step": 4650
1681
  },
1682
  {
1683
  "epoch": 0.0935,
1684
- "grad_norm": 0.7009511384899544,
1685
  "learning_rate": 9.348000000000001e-06,
1686
- "loss": 2.329,
1687
  "step": 4675
1688
  },
1689
  {
1690
  "epoch": 0.094,
1691
- "grad_norm": 0.6627629301918981,
1692
  "learning_rate": 9.398e-06,
1693
- "loss": 2.3255,
1694
  "step": 4700
1695
  },
1696
  {
1697
  "epoch": 0.094,
1698
- "eval_loss": 2.4467787742614746,
1699
- "eval_runtime": 266.7555,
1700
- "eval_samples_per_second": 3.081,
1701
- "eval_steps_per_second": 1.541,
1702
  "step": 4700
1703
  },
1704
  {
1705
  "epoch": 0.0945,
1706
- "grad_norm": 0.6653742483339516,
1707
  "learning_rate": 9.448e-06,
1708
- "loss": 2.3361,
1709
  "step": 4725
1710
  },
1711
  {
1712
  "epoch": 0.095,
1713
- "grad_norm": 0.6648535177091066,
1714
  "learning_rate": 9.498000000000001e-06,
1715
- "loss": 2.3383,
1716
  "step": 4750
1717
  },
1718
  {
1719
  "epoch": 0.0955,
1720
- "grad_norm": 0.8353528219476875,
1721
  "learning_rate": 9.548e-06,
1722
- "loss": 2.3232,
1723
  "step": 4775
1724
  },
1725
  {
1726
  "epoch": 0.096,
1727
- "grad_norm": 0.6908965943260156,
1728
  "learning_rate": 9.598e-06,
1729
- "loss": 2.3281,
1730
  "step": 4800
1731
  },
1732
  {
1733
  "epoch": 0.096,
1734
- "eval_loss": 2.4486067295074463,
1735
- "eval_runtime": 266.8511,
1736
- "eval_samples_per_second": 3.08,
1737
- "eval_steps_per_second": 1.54,
1738
  "step": 4800
1739
  },
1740
  {
1741
  "epoch": 0.0965,
1742
- "grad_norm": 0.6826328022309806,
1743
  "learning_rate": 9.648000000000001e-06,
1744
- "loss": 2.3272,
1745
  "step": 4825
1746
  },
1747
  {
1748
  "epoch": 0.097,
1749
- "grad_norm": 0.6699367425641755,
1750
  "learning_rate": 9.698000000000001e-06,
1751
- "loss": 2.3259,
1752
  "step": 4850
1753
  },
1754
  {
1755
  "epoch": 0.0975,
1756
- "grad_norm": 0.7187871648191224,
1757
  "learning_rate": 9.748e-06,
1758
- "loss": 2.3283,
1759
  "step": 4875
1760
  },
1761
  {
1762
  "epoch": 0.098,
1763
- "grad_norm": 0.7364128547560979,
1764
  "learning_rate": 9.798e-06,
1765
- "loss": 2.3176,
1766
  "step": 4900
1767
  },
1768
  {
1769
  "epoch": 0.098,
1770
- "eval_loss": 2.4474828243255615,
1771
- "eval_runtime": 266.4312,
1772
- "eval_samples_per_second": 3.085,
1773
- "eval_steps_per_second": 1.543,
1774
  "step": 4900
1775
  },
1776
  {
1777
  "epoch": 0.0985,
1778
- "grad_norm": 1.2246499149220575,
1779
  "learning_rate": 9.848000000000001e-06,
1780
- "loss": 2.327,
1781
  "step": 4925
1782
  },
1783
  {
1784
  "epoch": 0.099,
1785
- "grad_norm": 0.7443136433045754,
1786
  "learning_rate": 9.898e-06,
1787
- "loss": 2.3258,
1788
  "step": 4950
1789
  },
1790
  {
1791
  "epoch": 0.0995,
1792
- "grad_norm": 0.7376813348732968,
1793
  "learning_rate": 9.948e-06,
1794
- "loss": 2.3179,
1795
  "step": 4975
1796
  },
1797
  {
1798
  "epoch": 0.1,
1799
- "grad_norm": 0.6982272728278753,
1800
  "learning_rate": 9.998000000000002e-06,
1801
- "loss": 2.3257,
1802
  "step": 5000
1803
  },
1804
  {
1805
  "epoch": 0.1,
1806
- "eval_loss": 2.4501407146453857,
1807
- "eval_runtime": 266.6865,
1808
- "eval_samples_per_second": 3.082,
1809
- "eval_steps_per_second": 1.541,
1810
  "step": 5000
1811
  },
1812
  {
1813
  "epoch": 0.1005,
1814
- "grad_norm": 0.6681721665421178,
1815
  "learning_rate": 9.994666666666668e-06,
1816
- "loss": 2.3213,
1817
  "step": 5025
1818
  },
1819
  {
1820
  "epoch": 0.101,
1821
- "grad_norm": 0.6575702354718237,
1822
  "learning_rate": 9.989111111111111e-06,
1823
- "loss": 2.3273,
1824
  "step": 5050
1825
  },
1826
  {
1827
  "epoch": 0.1015,
1828
- "grad_norm": 0.6971113354828473,
1829
  "learning_rate": 9.983555555555556e-06,
1830
- "loss": 2.3177,
1831
  "step": 5075
1832
  },
1833
  {
1834
  "epoch": 0.102,
1835
- "grad_norm": 0.6998740128971044,
1836
  "learning_rate": 9.978000000000002e-06,
1837
- "loss": 2.3216,
1838
  "step": 5100
1839
  },
1840
  {
1841
  "epoch": 0.102,
1842
- "eval_loss": 2.4517312049865723,
1843
- "eval_runtime": 266.5502,
1844
- "eval_samples_per_second": 3.084,
1845
- "eval_steps_per_second": 1.542,
1846
  "step": 5100
1847
  },
1848
  {
1849
  "epoch": 0.1025,
1850
- "grad_norm": 0.7760241850843234,
1851
  "learning_rate": 9.972444444444445e-06,
1852
- "loss": 2.3207,
1853
  "step": 5125
1854
  },
1855
  {
1856
  "epoch": 0.103,
1857
- "grad_norm": 0.6558847315179642,
1858
  "learning_rate": 9.966888888888889e-06,
1859
- "loss": 2.3099,
1860
  "step": 5150
1861
  },
1862
  {
1863
  "epoch": 0.1035,
1864
- "grad_norm": 0.9064918476165184,
1865
  "learning_rate": 9.961333333333334e-06,
1866
- "loss": 2.311,
1867
  "step": 5175
1868
  },
1869
  {
1870
  "epoch": 0.104,
1871
- "grad_norm": 0.9360076872893354,
1872
  "learning_rate": 9.95577777777778e-06,
1873
- "loss": 2.3182,
1874
  "step": 5200
1875
  },
1876
  {
1877
  "epoch": 0.104,
1878
- "eval_loss": 2.4505715370178223,
1879
- "eval_runtime": 266.5823,
1880
- "eval_samples_per_second": 3.083,
1881
- "eval_steps_per_second": 1.542,
1882
  "step": 5200
1883
  },
1884
  {
1885
  "epoch": 0.1045,
1886
- "grad_norm": 0.674748709590867,
1887
  "learning_rate": 9.950222222222223e-06,
1888
- "loss": 2.3177,
1889
  "step": 5225
1890
  },
1891
  {
1892
  "epoch": 0.105,
1893
- "grad_norm": 0.6815938545709628,
1894
  "learning_rate": 9.944666666666668e-06,
1895
- "loss": 2.3117,
1896
  "step": 5250
1897
  },
1898
  {
1899
  "epoch": 0.1055,
1900
- "grad_norm": 0.7208564899692077,
1901
  "learning_rate": 9.939111111111112e-06,
1902
- "loss": 2.3129,
1903
  "step": 5275
1904
  },
1905
  {
1906
  "epoch": 0.106,
1907
- "grad_norm": 0.6517762207461896,
1908
  "learning_rate": 9.933555555555557e-06,
1909
- "loss": 2.3083,
1910
  "step": 5300
1911
  },
1912
  {
1913
  "epoch": 0.106,
1914
- "eval_loss": 2.451033115386963,
1915
- "eval_runtime": 266.739,
1916
- "eval_samples_per_second": 3.082,
1917
- "eval_steps_per_second": 1.541,
1918
  "step": 5300
1919
  },
1920
  {
1921
  "epoch": 0.1065,
1922
- "grad_norm": 0.7209332286785523,
1923
  "learning_rate": 9.928e-06,
1924
- "loss": 2.3215,
1925
  "step": 5325
1926
  },
1927
  {
1928
  "epoch": 0.107,
1929
- "grad_norm": 0.6530480515190894,
1930
  "learning_rate": 9.922444444444446e-06,
1931
- "loss": 2.3087,
1932
  "step": 5350
1933
  },
1934
  {
1935
  "epoch": 0.1075,
1936
- "grad_norm": 0.6611961097877365,
1937
  "learning_rate": 9.91688888888889e-06,
1938
- "loss": 2.3004,
1939
  "step": 5375
1940
  },
1941
  {
1942
  "epoch": 0.108,
1943
- "grad_norm": 0.6667421121158212,
1944
  "learning_rate": 9.911333333333335e-06,
1945
- "loss": 2.3097,
1946
  "step": 5400
1947
  },
1948
  {
1949
  "epoch": 0.108,
1950
- "eval_loss": 2.4530234336853027,
1951
- "eval_runtime": 266.6052,
1952
- "eval_samples_per_second": 3.083,
1953
- "eval_steps_per_second": 1.542,
1954
  "step": 5400
1955
  },
1956
  {
1957
  "epoch": 0.1085,
1958
- "grad_norm": 0.6941489468595143,
1959
  "learning_rate": 9.905777777777778e-06,
1960
- "loss": 2.3097,
1961
  "step": 5425
1962
  },
1963
  {
1964
  "epoch": 0.109,
1965
- "grad_norm": 0.6857832561703359,
1966
  "learning_rate": 9.900222222222223e-06,
1967
- "loss": 2.3139,
1968
  "step": 5450
1969
  },
1970
  {
1971
  "epoch": 0.1095,
1972
- "grad_norm": 0.6840488369499558,
1973
  "learning_rate": 9.894666666666669e-06,
1974
- "loss": 2.303,
1975
  "step": 5475
1976
  },
1977
  {
1978
  "epoch": 0.11,
1979
- "grad_norm": 0.6802796535790764,
1980
  "learning_rate": 9.889111111111112e-06,
1981
- "loss": 2.3149,
1982
  "step": 5500
1983
  },
1984
  {
1985
  "epoch": 0.11,
1986
- "eval_loss": 2.452944040298462,
1987
- "eval_runtime": 267.9917,
1988
- "eval_samples_per_second": 3.067,
1989
- "eval_steps_per_second": 1.534,
1990
  "step": 5500
1991
  },
1992
  {
1993
  "epoch": 0.1105,
1994
- "grad_norm": 0.6483920466336737,
1995
  "learning_rate": 9.883555555555556e-06,
1996
- "loss": 2.3111,
1997
  "step": 5525
1998
  },
1999
  {
2000
  "epoch": 0.111,
2001
- "grad_norm": 0.6695881633643047,
2002
  "learning_rate": 9.878000000000001e-06,
2003
- "loss": 2.31,
2004
  "step": 5550
2005
  },
2006
  {
2007
  "epoch": 0.1115,
2008
- "grad_norm": 0.6964827034411643,
2009
  "learning_rate": 9.872444444444446e-06,
2010
- "loss": 2.3111,
2011
  "step": 5575
2012
  },
2013
  {
2014
  "epoch": 0.112,
2015
- "grad_norm": 0.7096404846524412,
2016
  "learning_rate": 9.86688888888889e-06,
2017
- "loss": 2.2995,
2018
  "step": 5600
2019
  },
2020
  {
2021
  "epoch": 0.112,
2022
- "eval_loss": 2.4539694786071777,
2023
- "eval_runtime": 268.5912,
2024
- "eval_samples_per_second": 3.06,
2025
- "eval_steps_per_second": 1.53,
2026
  "step": 5600
2027
  },
2028
  {
2029
  "epoch": 0.1125,
2030
- "grad_norm": 0.778866892002705,
2031
  "learning_rate": 9.861333333333333e-06,
2032
- "loss": 2.2951,
2033
  "step": 5625
2034
  },
2035
  {
2036
  "epoch": 0.113,
2037
- "grad_norm": 0.6534696913167488,
2038
  "learning_rate": 9.855777777777779e-06,
2039
- "loss": 2.2999,
2040
  "step": 5650
2041
  },
2042
  {
2043
  "epoch": 0.1135,
2044
- "grad_norm": 0.6378284092785932,
2045
  "learning_rate": 9.850222222222224e-06,
2046
- "loss": 2.3005,
2047
  "step": 5675
2048
  },
2049
  {
2050
  "epoch": 0.114,
2051
- "grad_norm": 0.6797038144836399,
2052
  "learning_rate": 9.844666666666667e-06,
2053
- "loss": 2.3043,
2054
  "step": 5700
2055
  },
2056
  {
2057
  "epoch": 0.114,
2058
- "eval_loss": 2.4557044506073,
2059
- "eval_runtime": 268.6823,
2060
- "eval_samples_per_second": 3.059,
2061
- "eval_steps_per_second": 1.53,
2062
  "step": 5700
2063
  },
2064
  {
2065
  "epoch": 0.1145,
2066
- "grad_norm": 1.365902512042536,
2067
  "learning_rate": 9.839111111111111e-06,
2068
- "loss": 2.2916,
2069
  "step": 5725
2070
  },
2071
  {
2072
  "epoch": 0.115,
2073
- "grad_norm": 0.7361097107736868,
2074
  "learning_rate": 9.833555555555556e-06,
2075
- "loss": 2.3111,
2076
  "step": 5750
2077
  },
2078
  {
2079
  "epoch": 0.1155,
2080
- "grad_norm": 0.7329661017576566,
2081
  "learning_rate": 9.828000000000001e-06,
2082
- "loss": 2.2864,
2083
  "step": 5775
2084
  },
2085
  {
2086
  "epoch": 0.116,
2087
- "grad_norm": 0.7330550696150586,
2088
  "learning_rate": 9.822444444444445e-06,
2089
- "loss": 2.2953,
2090
  "step": 5800
2091
  },
2092
  {
2093
  "epoch": 0.116,
2094
- "eval_loss": 2.456087589263916,
2095
- "eval_runtime": 268.8338,
2096
- "eval_samples_per_second": 3.058,
2097
- "eval_steps_per_second": 1.529,
2098
  "step": 5800
2099
  },
2100
  {
2101
  "epoch": 0.1165,
2102
- "grad_norm": 0.7270812590805177,
2103
  "learning_rate": 9.81688888888889e-06,
2104
- "loss": 2.3084,
2105
  "step": 5825
2106
  },
2107
  {
2108
  "epoch": 0.117,
2109
- "grad_norm": 0.7391337123301003,
2110
  "learning_rate": 9.811333333333334e-06,
2111
- "loss": 2.2892,
2112
  "step": 5850
2113
  },
2114
  {
2115
  "epoch": 0.1175,
2116
- "grad_norm": 0.7460062590261749,
2117
  "learning_rate": 9.805777777777779e-06,
2118
- "loss": 2.2902,
2119
  "step": 5875
2120
  },
2121
  {
2122
  "epoch": 0.118,
2123
- "grad_norm": 0.7111372338674795,
2124
  "learning_rate": 9.800222222222223e-06,
2125
- "loss": 2.3026,
2126
  "step": 5900
2127
  },
2128
  {
2129
  "epoch": 0.118,
2130
- "eval_loss": 2.4564828872680664,
2131
- "eval_runtime": 267.0789,
2132
- "eval_samples_per_second": 3.078,
2133
- "eval_steps_per_second": 1.539,
2134
  "step": 5900
2135
  },
2136
  {
2137
  "epoch": 0.1185,
2138
- "grad_norm": 0.6928050054331656,
2139
  "learning_rate": 9.794666666666668e-06,
2140
- "loss": 2.2939,
2141
  "step": 5925
2142
  },
2143
  {
2144
  "epoch": 0.119,
2145
- "grad_norm": 1.1910401129331545,
2146
  "learning_rate": 9.789111111111111e-06,
2147
- "loss": 2.295,
2148
  "step": 5950
2149
  },
2150
  {
2151
  "epoch": 0.1195,
2152
- "grad_norm": 0.7117996271252786,
2153
  "learning_rate": 9.783555555555557e-06,
2154
- "loss": 2.2921,
2155
  "step": 5975
2156
  },
2157
  {
2158
  "epoch": 0.12,
2159
- "grad_norm": 0.7331367166809584,
2160
  "learning_rate": 9.778e-06,
2161
- "loss": 2.288,
2162
  "step": 6000
2163
  },
2164
  {
2165
  "epoch": 0.12,
2166
- "eval_loss": 2.457658529281616,
2167
- "eval_runtime": 279.2078,
2168
- "eval_samples_per_second": 2.944,
2169
- "eval_steps_per_second": 1.472,
2170
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2171
  }
2172
  ],
2173
  "logging_steps": 25,
@@ -2187,7 +2907,7 @@
2187
  "attributes": {}
2188
  }
2189
  },
2190
- "total_flos": 1.9099213789963223e+19,
2191
  "train_batch_size": 1,
2192
  "trial_name": null,
2193
  "trial_params": null
 
1
  {
2
+ "best_global_step": 4300,
3
+ "best_metric": 2.432278633117676,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-4000",
5
+ "epoch": 0.16,
6
  "eval_steps": 100,
7
+ "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.0005,
14
+ "grad_norm": 39.75564521032967,
15
  "learning_rate": 4.8e-08,
16
+ "loss": 3.6517,
17
  "step": 25
18
  },
19
  {
20
  "epoch": 0.001,
21
+ "grad_norm": 28.937531835097435,
22
  "learning_rate": 9.8e-08,
23
+ "loss": 3.5931,
24
  "step": 50
25
  },
26
  {
27
  "epoch": 0.0015,
28
+ "grad_norm": 21.922720332659644,
29
  "learning_rate": 1.4800000000000003e-07,
30
+ "loss": 3.3397,
31
  "step": 75
32
  },
33
  {
34
  "epoch": 0.002,
35
+ "grad_norm": 8.739610199908325,
36
  "learning_rate": 1.9800000000000003e-07,
37
+ "loss": 3.1289,
38
  "step": 100
39
  },
40
  {
41
  "epoch": 0.002,
42
+ "eval_loss": 2.9243295192718506,
43
+ "eval_runtime": 264.3302,
44
+ "eval_samples_per_second": 3.11,
45
+ "eval_steps_per_second": 1.555,
46
  "step": 100
47
  },
48
  {
49
  "epoch": 0.0025,
50
+ "grad_norm": 4.433912600039677,
51
  "learning_rate": 2.48e-07,
52
+ "loss": 2.8957,
53
  "step": 125
54
  },
55
  {
56
  "epoch": 0.003,
57
+ "grad_norm": 3.2874790066620303,
58
  "learning_rate": 2.9800000000000005e-07,
59
+ "loss": 2.763,
60
  "step": 150
61
  },
62
  {
63
  "epoch": 0.0035,
64
+ "grad_norm": 1.5203472215469231,
65
  "learning_rate": 3.48e-07,
66
+ "loss": 2.676,
67
  "step": 175
68
  },
69
  {
70
  "epoch": 0.004,
71
+ "grad_norm": 1.1945541683905954,
72
  "learning_rate": 3.9800000000000004e-07,
73
+ "loss": 2.635,
74
  "step": 200
75
  },
76
  {
77
  "epoch": 0.004,
78
+ "eval_loss": 2.6094932556152344,
79
+ "eval_runtime": 265.7702,
80
+ "eval_samples_per_second": 3.093,
81
+ "eval_steps_per_second": 1.546,
82
  "step": 200
83
  },
84
  {
85
  "epoch": 0.0045,
86
+ "grad_norm": 1.0852713304633745,
87
  "learning_rate": 4.4800000000000004e-07,
88
+ "loss": 2.6016,
89
  "step": 225
90
  },
91
  {
92
  "epoch": 0.005,
93
+ "grad_norm": 1.0733940346699529,
94
  "learning_rate": 4.98e-07,
95
+ "loss": 2.5797,
96
  "step": 250
97
  },
98
  {
99
  "epoch": 0.0055,
100
+ "grad_norm": 0.9273949035031271,
101
  "learning_rate": 5.480000000000001e-07,
102
+ "loss": 2.5607,
103
  "step": 275
104
  },
105
  {
106
  "epoch": 0.006,
107
+ "grad_norm": 0.9289300678591714,
108
  "learning_rate": 5.98e-07,
109
+ "loss": 2.552,
110
  "step": 300
111
  },
112
  {
113
  "epoch": 0.006,
114
+ "eval_loss": 2.541522264480591,
115
+ "eval_runtime": 266.7478,
116
+ "eval_samples_per_second": 3.082,
117
+ "eval_steps_per_second": 1.541,
118
  "step": 300
119
  },
120
  {
121
  "epoch": 0.0065,
122
+ "grad_norm": 1.1328584507449984,
123
  "learning_rate": 6.48e-07,
124
+ "loss": 2.5402,
125
  "step": 325
126
  },
127
  {
128
  "epoch": 0.007,
129
+ "grad_norm": 0.8593307029257858,
130
  "learning_rate": 6.98e-07,
131
+ "loss": 2.5286,
132
  "step": 350
133
  },
134
  {
135
  "epoch": 0.0075,
136
+ "grad_norm": 0.895615604067586,
137
  "learning_rate": 7.480000000000001e-07,
138
+ "loss": 2.5311,
139
  "step": 375
140
  },
141
  {
142
  "epoch": 0.008,
143
+ "grad_norm": 0.912306580242149,
144
  "learning_rate": 7.98e-07,
145
+ "loss": 2.5037,
146
  "step": 400
147
  },
148
  {
149
  "epoch": 0.008,
150
+ "eval_loss": 2.514389991760254,
151
+ "eval_runtime": 266.4899,
152
+ "eval_samples_per_second": 3.085,
153
+ "eval_steps_per_second": 1.542,
154
  "step": 400
155
  },
156
  {
157
  "epoch": 0.0085,
158
+ "grad_norm": 1.1866535514670034,
159
  "learning_rate": 8.480000000000001e-07,
160
+ "loss": 2.5011,
161
  "step": 425
162
  },
163
  {
164
  "epoch": 0.009,
165
+ "grad_norm": 1.211342504193914,
166
  "learning_rate": 8.980000000000001e-07,
167
+ "loss": 2.503,
168
  "step": 450
169
  },
170
  {
171
  "epoch": 0.0095,
172
+ "grad_norm": 1.113763817383069,
173
  "learning_rate": 9.480000000000001e-07,
174
+ "loss": 2.4999,
175
  "step": 475
176
  },
177
  {
178
  "epoch": 0.01,
179
+ "grad_norm": 1.2585585589647226,
180
  "learning_rate": 9.98e-07,
181
+ "loss": 2.4872,
182
  "step": 500
183
  },
184
  {
185
  "epoch": 0.01,
186
+ "eval_loss": 2.497868061065674,
187
+ "eval_runtime": 265.7962,
188
+ "eval_samples_per_second": 3.093,
189
+ "eval_steps_per_second": 1.546,
190
  "step": 500
191
  },
192
  {
193
  "epoch": 0.0105,
194
+ "grad_norm": 1.2585825718084245,
195
  "learning_rate": 1.0480000000000002e-06,
196
+ "loss": 2.4852,
197
  "step": 525
198
  },
199
  {
200
  "epoch": 0.011,
201
+ "grad_norm": 1.4101257437846046,
202
  "learning_rate": 1.0980000000000001e-06,
203
+ "loss": 2.4892,
204
  "step": 550
205
  },
206
  {
207
  "epoch": 0.0115,
208
+ "grad_norm": 1.1975234150707363,
209
  "learning_rate": 1.148e-06,
210
+ "loss": 2.4861,
211
  "step": 575
212
  },
213
  {
214
  "epoch": 0.012,
215
+ "grad_norm": 1.3662769225582332,
216
  "learning_rate": 1.1980000000000002e-06,
217
+ "loss": 2.4882,
218
  "step": 600
219
  },
220
  {
221
  "epoch": 0.012,
222
+ "eval_loss": 2.4879231452941895,
223
+ "eval_runtime": 267.0005,
224
+ "eval_samples_per_second": 3.079,
225
+ "eval_steps_per_second": 1.539,
226
  "step": 600
227
  },
228
  {
229
  "epoch": 0.0125,
230
+ "grad_norm": 1.3086724275194024,
231
  "learning_rate": 1.248e-06,
232
+ "loss": 2.4745,
233
  "step": 625
234
  },
235
  {
236
  "epoch": 0.013,
237
+ "grad_norm": 1.317023206802888,
238
  "learning_rate": 1.2980000000000001e-06,
239
+ "loss": 2.4727,
240
  "step": 650
241
  },
242
  {
243
  "epoch": 0.0135,
244
+ "grad_norm": 1.5284967544483212,
245
  "learning_rate": 1.348e-06,
246
+ "loss": 2.469,
247
  "step": 675
248
  },
249
  {
250
  "epoch": 0.014,
251
+ "grad_norm": 1.1047595217316941,
252
  "learning_rate": 1.3980000000000002e-06,
253
+ "loss": 2.4695,
254
  "step": 700
255
  },
256
  {
257
  "epoch": 0.014,
258
+ "eval_loss": 2.480103015899658,
259
+ "eval_runtime": 263.5022,
260
+ "eval_samples_per_second": 3.12,
261
+ "eval_steps_per_second": 1.56,
262
  "step": 700
263
  },
264
  {
265
  "epoch": 0.0145,
266
+ "grad_norm": 1.2077328209863791,
267
  "learning_rate": 1.4480000000000002e-06,
268
+ "loss": 2.4654,
269
  "step": 725
270
  },
271
  {
272
  "epoch": 0.015,
273
+ "grad_norm": 1.209220841771836,
274
  "learning_rate": 1.498e-06,
275
+ "loss": 2.4663,
276
  "step": 750
277
  },
278
  {
279
  "epoch": 0.0155,
280
+ "grad_norm": 1.3063169829879686,
281
  "learning_rate": 1.548e-06,
282
+ "loss": 2.4704,
283
  "step": 775
284
  },
285
  {
286
  "epoch": 0.016,
287
+ "grad_norm": 1.3180183352683195,
288
  "learning_rate": 1.5980000000000002e-06,
289
+ "loss": 2.4583,
290
  "step": 800
291
  },
292
  {
293
  "epoch": 0.016,
294
+ "eval_loss": 2.473590850830078,
295
+ "eval_runtime": 305.9875,
296
+ "eval_samples_per_second": 2.686,
297
+ "eval_steps_per_second": 1.343,
298
  "step": 800
299
  },
300
  {
301
  "epoch": 0.0165,
302
+ "grad_norm": 1.1674852380778837,
303
  "learning_rate": 1.6480000000000001e-06,
304
+ "loss": 2.467,
305
  "step": 825
306
  },
307
  {
308
  "epoch": 0.017,
309
+ "grad_norm": 1.2497656349941002,
310
  "learning_rate": 1.6980000000000003e-06,
311
+ "loss": 2.4612,
312
  "step": 850
313
  },
314
  {
315
  "epoch": 0.0175,
316
+ "grad_norm": 1.3358614980967494,
317
  "learning_rate": 1.7480000000000002e-06,
318
+ "loss": 2.4636,
319
  "step": 875
320
  },
321
  {
322
  "epoch": 0.018,
323
+ "grad_norm": 1.252489857653356,
324
  "learning_rate": 1.798e-06,
325
+ "loss": 2.454,
326
  "step": 900
327
  },
328
  {
329
  "epoch": 0.018,
330
+ "eval_loss": 2.4681763648986816,
331
+ "eval_runtime": 264.702,
332
+ "eval_samples_per_second": 3.105,
333
+ "eval_steps_per_second": 1.553,
334
  "step": 900
335
  },
336
  {
337
  "epoch": 0.0185,
338
+ "grad_norm": 1.2815437998994337,
339
  "learning_rate": 1.8480000000000001e-06,
340
+ "loss": 2.4571,
341
  "step": 925
342
  },
343
  {
344
  "epoch": 0.019,
345
+ "grad_norm": 1.0902475329451575,
346
  "learning_rate": 1.898e-06,
347
+ "loss": 2.451,
348
  "step": 950
349
  },
350
  {
351
  "epoch": 0.0195,
352
+ "grad_norm": 1.1502696024965324,
353
  "learning_rate": 1.9480000000000002e-06,
354
+ "loss": 2.4527,
355
  "step": 975
356
  },
357
  {
358
  "epoch": 0.02,
359
+ "grad_norm": 1.2336661855806117,
360
  "learning_rate": 1.998e-06,
361
+ "loss": 2.4496,
362
  "step": 1000
363
  },
364
  {
365
  "epoch": 0.02,
366
+ "eval_loss": 2.463880777359009,
367
+ "eval_runtime": 275.7426,
368
+ "eval_samples_per_second": 2.981,
369
+ "eval_steps_per_second": 1.491,
370
  "step": 1000
371
  },
372
  {
373
  "epoch": 0.0205,
374
+ "grad_norm": 1.2680742209094296,
375
  "learning_rate": 2.048e-06,
376
+ "loss": 2.4494,
377
  "step": 1025
378
  },
379
  {
380
  "epoch": 0.021,
381
+ "grad_norm": 1.0341778808278126,
382
  "learning_rate": 2.098e-06,
383
+ "loss": 2.4467,
384
  "step": 1050
385
  },
386
  {
387
  "epoch": 0.0215,
388
+ "grad_norm": 0.9860490736001175,
389
  "learning_rate": 2.148e-06,
390
+ "loss": 2.4473,
391
  "step": 1075
392
  },
393
  {
394
  "epoch": 0.022,
395
+ "grad_norm": 0.9419267295275278,
396
  "learning_rate": 2.198e-06,
397
+ "loss": 2.443,
398
  "step": 1100
399
  },
400
  {
401
  "epoch": 0.022,
402
+ "eval_loss": 2.4598941802978516,
403
+ "eval_runtime": 265.0502,
404
+ "eval_samples_per_second": 3.101,
405
+ "eval_steps_per_second": 1.551,
406
  "step": 1100
407
  },
408
  {
409
  "epoch": 0.0225,
410
+ "grad_norm": 1.3280720471027394,
411
  "learning_rate": 2.2480000000000003e-06,
412
+ "loss": 2.4515,
413
  "step": 1125
414
  },
415
  {
416
  "epoch": 0.023,
417
+ "grad_norm": 1.053570785582915,
418
  "learning_rate": 2.2980000000000003e-06,
419
+ "loss": 2.4396,
420
  "step": 1150
421
  },
422
  {
423
  "epoch": 0.0235,
424
+ "grad_norm": 0.9108119839585552,
425
  "learning_rate": 2.3480000000000002e-06,
426
+ "loss": 2.4442,
427
  "step": 1175
428
  },
429
  {
430
  "epoch": 0.024,
431
+ "grad_norm": 1.0062346367900277,
432
  "learning_rate": 2.398e-06,
433
+ "loss": 2.4443,
434
  "step": 1200
435
  },
436
  {
437
  "epoch": 0.024,
438
+ "eval_loss": 2.456455945968628,
439
+ "eval_runtime": 264.5888,
440
+ "eval_samples_per_second": 3.107,
441
+ "eval_steps_per_second": 1.553,
442
  "step": 1200
443
  },
444
  {
445
  "epoch": 0.0245,
446
+ "grad_norm": 1.0264127705426926,
447
  "learning_rate": 2.448e-06,
448
+ "loss": 2.4351,
449
  "step": 1225
450
  },
451
  {
452
  "epoch": 0.025,
453
+ "grad_norm": 0.8015249588347212,
454
  "learning_rate": 2.498e-06,
455
+ "loss": 2.4406,
456
  "step": 1250
457
  },
458
  {
459
  "epoch": 0.0255,
460
+ "grad_norm": 1.1105649485540114,
461
  "learning_rate": 2.5480000000000004e-06,
462
+ "loss": 2.4377,
463
  "step": 1275
464
  },
465
  {
466
  "epoch": 0.026,
467
+ "grad_norm": 0.9701758426012801,
468
  "learning_rate": 2.598e-06,
469
+ "loss": 2.4341,
470
  "step": 1300
471
  },
472
  {
473
  "epoch": 0.026,
474
+ "eval_loss": 2.453026056289673,
475
+ "eval_runtime": 264.7653,
476
+ "eval_samples_per_second": 3.105,
477
+ "eval_steps_per_second": 1.552,
478
  "step": 1300
479
  },
480
  {
481
  "epoch": 0.0265,
482
+ "grad_norm": 0.9587254891845429,
483
  "learning_rate": 2.648e-06,
484
+ "loss": 2.4303,
485
  "step": 1325
486
  },
487
  {
488
  "epoch": 0.027,
489
+ "grad_norm": 0.8135883960763247,
490
  "learning_rate": 2.6980000000000003e-06,
491
+ "loss": 2.4363,
492
  "step": 1350
493
  },
494
  {
495
  "epoch": 0.0275,
496
+ "grad_norm": 0.9192860127847176,
497
  "learning_rate": 2.748e-06,
498
+ "loss": 2.4257,
499
  "step": 1375
500
  },
501
  {
502
  "epoch": 0.028,
503
+ "grad_norm": 0.947465928893444,
504
  "learning_rate": 2.798e-06,
505
+ "loss": 2.4353,
506
  "step": 1400
507
  },
508
  {
509
  "epoch": 0.028,
510
+ "eval_loss": 2.450345993041992,
511
+ "eval_runtime": 265.6266,
512
+ "eval_samples_per_second": 3.095,
513
+ "eval_steps_per_second": 1.547,
514
  "step": 1400
515
  },
516
  {
517
  "epoch": 0.0285,
518
+ "grad_norm": 0.9270137901066681,
519
  "learning_rate": 2.848e-06,
520
+ "loss": 2.4347,
521
  "step": 1425
522
  },
523
  {
524
  "epoch": 0.029,
525
+ "grad_norm": 0.8839980710491563,
526
  "learning_rate": 2.8980000000000005e-06,
527
+ "loss": 2.4213,
528
  "step": 1450
529
  },
530
  {
531
  "epoch": 0.0295,
532
+ "grad_norm": 0.913196005454606,
533
  "learning_rate": 2.9480000000000004e-06,
534
+ "loss": 2.4232,
535
  "step": 1475
536
  },
537
  {
538
  "epoch": 0.03,
539
+ "grad_norm": 0.8139623858623861,
540
  "learning_rate": 2.9980000000000003e-06,
541
+ "loss": 2.4254,
542
  "step": 1500
543
  },
544
  {
545
  "epoch": 0.03,
546
+ "eval_loss": 2.447662830352783,
547
+ "eval_runtime": 263.4353,
548
+ "eval_samples_per_second": 3.12,
549
+ "eval_steps_per_second": 1.56,
550
  "step": 1500
551
  },
552
  {
553
  "epoch": 0.0305,
554
+ "grad_norm": 0.8422198221554755,
555
  "learning_rate": 3.0480000000000003e-06,
556
+ "loss": 2.4196,
557
  "step": 1525
558
  },
559
  {
560
  "epoch": 0.031,
561
+ "grad_norm": 0.8542957579365906,
562
  "learning_rate": 3.0980000000000007e-06,
563
+ "loss": 2.4294,
564
  "step": 1550
565
  },
566
  {
567
  "epoch": 0.0315,
568
+ "grad_norm": 1.149263137594797,
569
  "learning_rate": 3.1480000000000006e-06,
570
+ "loss": 2.4265,
571
  "step": 1575
572
  },
573
  {
574
  "epoch": 0.032,
575
+ "grad_norm": 0.811470126240392,
576
  "learning_rate": 3.198e-06,
577
+ "loss": 2.4105,
578
  "step": 1600
579
  },
580
  {
581
  "epoch": 0.032,
582
+ "eval_loss": 2.4456679821014404,
583
+ "eval_runtime": 264.056,
584
+ "eval_samples_per_second": 3.113,
585
+ "eval_steps_per_second": 1.556,
586
  "step": 1600
587
  },
588
  {
589
  "epoch": 0.0325,
590
+ "grad_norm": 2.3928975221881434,
591
  "learning_rate": 3.248e-06,
592
+ "loss": 2.4208,
593
  "step": 1625
594
  },
595
  {
596
  "epoch": 0.033,
597
+ "grad_norm": 0.8031315125360012,
598
  "learning_rate": 3.298e-06,
599
+ "loss": 2.4224,
600
  "step": 1650
601
  },
602
  {
603
  "epoch": 0.0335,
604
+ "grad_norm": 0.835567276692195,
605
  "learning_rate": 3.348e-06,
606
+ "loss": 2.4188,
607
  "step": 1675
608
  },
609
  {
610
  "epoch": 0.034,
611
+ "grad_norm": 0.8894325175719718,
612
  "learning_rate": 3.3980000000000003e-06,
613
+ "loss": 2.4206,
614
  "step": 1700
615
  },
616
  {
617
  "epoch": 0.034,
618
+ "eval_loss": 2.4437851905822754,
619
+ "eval_runtime": 264.6455,
620
+ "eval_samples_per_second": 3.106,
621
+ "eval_steps_per_second": 1.553,
622
  "step": 1700
623
  },
624
  {
625
  "epoch": 0.0345,
626
+ "grad_norm": 0.802724390649243,
627
  "learning_rate": 3.4480000000000003e-06,
628
+ "loss": 2.4241,
629
  "step": 1725
630
  },
631
  {
632
  "epoch": 0.035,
633
+ "grad_norm": 0.8206312612014312,
634
  "learning_rate": 3.4980000000000002e-06,
635
+ "loss": 2.4157,
636
  "step": 1750
637
  },
638
  {
639
  "epoch": 0.0355,
640
+ "grad_norm": 0.8653789917535344,
641
  "learning_rate": 3.548e-06,
642
+ "loss": 2.412,
643
  "step": 1775
644
  },
645
  {
646
  "epoch": 0.036,
647
+ "grad_norm": 0.7816319078215015,
648
  "learning_rate": 3.5980000000000005e-06,
649
+ "loss": 2.4179,
650
  "step": 1800
651
  },
652
  {
653
  "epoch": 0.036,
654
+ "eval_loss": 2.4423036575317383,
655
+ "eval_runtime": 264.5578,
656
+ "eval_samples_per_second": 3.107,
657
+ "eval_steps_per_second": 1.554,
658
  "step": 1800
659
  },
660
  {
661
  "epoch": 0.0365,
662
+ "grad_norm": 0.707594544466941,
663
  "learning_rate": 3.6480000000000005e-06,
664
+ "loss": 2.416,
665
  "step": 1825
666
  },
667
  {
668
  "epoch": 0.037,
669
+ "grad_norm": 0.7481066913011816,
670
  "learning_rate": 3.6980000000000004e-06,
671
+ "loss": 2.4242,
672
  "step": 1850
673
  },
674
  {
675
  "epoch": 0.0375,
676
+ "grad_norm": 0.7612014979445353,
677
  "learning_rate": 3.7480000000000004e-06,
678
+ "loss": 2.4173,
679
  "step": 1875
680
  },
681
  {
682
  "epoch": 0.038,
683
+ "grad_norm": 0.772750918048857,
684
  "learning_rate": 3.7980000000000007e-06,
685
+ "loss": 2.4134,
686
  "step": 1900
687
  },
688
  {
689
  "epoch": 0.038,
690
+ "eval_loss": 2.440969228744507,
691
+ "eval_runtime": 274.3624,
692
+ "eval_samples_per_second": 2.996,
693
+ "eval_steps_per_second": 1.498,
694
  "step": 1900
695
  },
696
  {
697
  "epoch": 0.0385,
698
+ "grad_norm": 0.7927966042188935,
699
  "learning_rate": 3.848e-06,
700
+ "loss": 2.4131,
701
  "step": 1925
702
  },
703
  {
704
  "epoch": 0.039,
705
+ "grad_norm": 0.7664274167276341,
706
  "learning_rate": 3.898e-06,
707
+ "loss": 2.4133,
708
  "step": 1950
709
  },
710
  {
711
  "epoch": 0.0395,
712
+ "grad_norm": 0.7038638213491795,
713
  "learning_rate": 3.948e-06,
714
+ "loss": 2.4135,
715
  "step": 1975
716
  },
717
  {
718
  "epoch": 0.04,
719
+ "grad_norm": 0.7231696877425319,
720
  "learning_rate": 3.9980000000000005e-06,
721
+ "loss": 2.4169,
722
  "step": 2000
723
  },
724
  {
725
  "epoch": 0.04,
726
+ "eval_loss": 2.439641237258911,
727
+ "eval_runtime": 282.4449,
728
+ "eval_samples_per_second": 2.91,
729
+ "eval_steps_per_second": 1.455,
730
  "step": 2000
731
  },
732
  {
733
  "epoch": 0.0405,
734
+ "grad_norm": 0.7184393791203537,
735
  "learning_rate": 4.048e-06,
736
+ "loss": 2.4071,
737
  "step": 2025
738
  },
739
  {
740
  "epoch": 0.041,
741
+ "grad_norm": 0.7366813467336683,
742
  "learning_rate": 4.098e-06,
743
+ "loss": 2.4113,
744
  "step": 2050
745
  },
746
  {
747
  "epoch": 0.0415,
748
+ "grad_norm": 0.7081408763220511,
749
  "learning_rate": 4.148000000000001e-06,
750
+ "loss": 2.4168,
751
  "step": 2075
752
  },
753
  {
754
  "epoch": 0.042,
755
+ "grad_norm": 0.6912835983850483,
756
  "learning_rate": 4.198e-06,
757
+ "loss": 2.4105,
758
  "step": 2100
759
  },
760
  {
761
  "epoch": 0.042,
762
+ "eval_loss": 2.438904047012329,
763
+ "eval_runtime": 277.7481,
764
+ "eval_samples_per_second": 2.96,
765
+ "eval_steps_per_second": 1.48,
766
  "step": 2100
767
  },
768
  {
769
  "epoch": 0.0425,
770
+ "grad_norm": 0.7745538733736145,
771
  "learning_rate": 4.248000000000001e-06,
772
+ "loss": 2.4131,
773
  "step": 2125
774
  },
775
  {
776
  "epoch": 0.043,
777
+ "grad_norm": 0.6897576190091962,
778
  "learning_rate": 4.298e-06,
779
+ "loss": 2.4084,
780
  "step": 2150
781
  },
782
  {
783
  "epoch": 0.0435,
784
+ "grad_norm": 0.7020994032566351,
785
  "learning_rate": 4.3480000000000006e-06,
786
+ "loss": 2.4125,
787
  "step": 2175
788
  },
789
  {
790
  "epoch": 0.044,
791
+ "grad_norm": 0.6668651869738377,
792
  "learning_rate": 4.398000000000001e-06,
793
+ "loss": 2.4034,
794
  "step": 2200
795
  },
796
  {
797
  "epoch": 0.044,
798
+ "eval_loss": 2.4380908012390137,
799
+ "eval_runtime": 268.2252,
800
+ "eval_samples_per_second": 3.065,
801
+ "eval_steps_per_second": 1.532,
802
  "step": 2200
803
  },
804
  {
805
  "epoch": 0.0445,
806
+ "grad_norm": 0.6547759047620061,
807
  "learning_rate": 4.4480000000000004e-06,
808
+ "loss": 2.4099,
809
  "step": 2225
810
  },
811
  {
812
  "epoch": 0.045,
813
+ "grad_norm": 0.6865815945777785,
814
  "learning_rate": 4.498e-06,
815
+ "loss": 2.412,
816
  "step": 2250
817
  },
818
  {
819
  "epoch": 0.0455,
820
+ "grad_norm": 0.6878267781655092,
821
  "learning_rate": 4.548e-06,
822
+ "loss": 2.4137,
823
  "step": 2275
824
  },
825
  {
826
  "epoch": 0.046,
827
+ "grad_norm": 0.8314813616644483,
828
  "learning_rate": 4.598e-06,
829
+ "loss": 2.4097,
830
  "step": 2300
831
  },
832
  {
833
  "epoch": 0.046,
834
+ "eval_loss": 2.4374496936798096,
835
+ "eval_runtime": 263.1701,
836
+ "eval_samples_per_second": 3.123,
837
+ "eval_steps_per_second": 1.562,
838
  "step": 2300
839
  },
840
  {
841
  "epoch": 0.0465,
842
+ "grad_norm": 0.6723966792931375,
843
  "learning_rate": 4.648e-06,
844
+ "loss": 2.4051,
845
  "step": 2325
846
  },
847
  {
848
  "epoch": 0.047,
849
+ "grad_norm": 0.7003756914046538,
850
  "learning_rate": 4.698000000000001e-06,
851
+ "loss": 2.4032,
852
  "step": 2350
853
  },
854
  {
855
  "epoch": 0.0475,
856
+ "grad_norm": 0.6747085415631567,
857
  "learning_rate": 4.748e-06,
858
+ "loss": 2.4096,
859
  "step": 2375
860
  },
861
  {
862
  "epoch": 0.048,
863
+ "grad_norm": 0.6571218540079207,
864
  "learning_rate": 4.7980000000000005e-06,
865
+ "loss": 2.4165,
866
  "step": 2400
867
  },
868
  {
869
  "epoch": 0.048,
870
+ "eval_loss": 2.4365923404693604,
871
+ "eval_runtime": 264.2268,
872
+ "eval_samples_per_second": 3.111,
873
+ "eval_steps_per_second": 1.555,
874
  "step": 2400
875
  },
876
  {
877
  "epoch": 0.0485,
878
+ "grad_norm": 0.7464314980483315,
879
  "learning_rate": 4.848000000000001e-06,
880
+ "loss": 2.4098,
881
  "step": 2425
882
  },
883
  {
884
  "epoch": 0.049,
885
+ "grad_norm": 0.6267266619200393,
886
  "learning_rate": 4.898e-06,
887
+ "loss": 2.4019,
888
  "step": 2450
889
  },
890
  {
891
  "epoch": 0.0495,
892
+ "grad_norm": 0.6650772680412506,
893
  "learning_rate": 4.948000000000001e-06,
894
+ "loss": 2.405,
895
  "step": 2475
896
  },
897
  {
898
  "epoch": 0.05,
899
+ "grad_norm": 0.7197173899674899,
900
  "learning_rate": 4.998e-06,
901
+ "loss": 2.4095,
902
  "step": 2500
903
  },
904
  {
905
  "epoch": 0.05,
906
+ "eval_loss": 2.4358348846435547,
907
+ "eval_runtime": 266.7682,
908
+ "eval_samples_per_second": 3.081,
909
+ "eval_steps_per_second": 1.541,
910
  "step": 2500
911
  },
912
  {
913
  "epoch": 0.0505,
914
+ "grad_norm": 0.6249572472256157,
915
  "learning_rate": 5.048000000000001e-06,
916
+ "loss": 2.4058,
917
  "step": 2525
918
  },
919
  {
920
  "epoch": 0.051,
921
+ "grad_norm": 0.7429228032719255,
922
  "learning_rate": 5.098000000000001e-06,
923
+ "loss": 2.4084,
924
  "step": 2550
925
  },
926
  {
927
  "epoch": 0.0515,
928
+ "grad_norm": 0.6320325962693778,
929
  "learning_rate": 5.1480000000000005e-06,
930
+ "loss": 2.4015,
931
  "step": 2575
932
  },
933
  {
934
  "epoch": 0.052,
935
+ "grad_norm": 0.672581755106835,
936
  "learning_rate": 5.198000000000001e-06,
937
+ "loss": 2.4051,
938
  "step": 2600
939
  },
940
  {
941
  "epoch": 0.052,
942
+ "eval_loss": 2.4351842403411865,
943
+ "eval_runtime": 264.9149,
944
+ "eval_samples_per_second": 3.103,
945
+ "eval_steps_per_second": 1.551,
946
  "step": 2600
947
  },
948
  {
949
  "epoch": 0.0525,
950
+ "grad_norm": 0.7086480776921088,
951
  "learning_rate": 5.248000000000001e-06,
952
+ "loss": 2.3988,
953
  "step": 2625
954
  },
955
  {
956
  "epoch": 0.053,
957
+ "grad_norm": 0.6774201154936552,
958
  "learning_rate": 5.298000000000001e-06,
959
+ "loss": 2.394,
960
  "step": 2650
961
  },
962
  {
963
  "epoch": 0.0535,
964
+ "grad_norm": 0.6661104910300973,
965
  "learning_rate": 5.348000000000001e-06,
966
+ "loss": 2.4034,
967
  "step": 2675
968
  },
969
  {
970
  "epoch": 0.054,
971
+ "grad_norm": 0.6224421593448741,
972
  "learning_rate": 5.398e-06,
973
+ "loss": 2.3939,
974
  "step": 2700
975
  },
976
  {
977
  "epoch": 0.054,
978
+ "eval_loss": 2.434826374053955,
979
+ "eval_runtime": 264.1641,
980
+ "eval_samples_per_second": 3.112,
981
+ "eval_steps_per_second": 1.556,
982
  "step": 2700
983
  },
984
  {
985
  "epoch": 0.0545,
986
+ "grad_norm": 0.6944661408419767,
987
  "learning_rate": 5.448e-06,
988
+ "loss": 2.4064,
989
  "step": 2725
990
  },
991
  {
992
  "epoch": 0.055,
993
+ "grad_norm": 0.6597297955298902,
994
  "learning_rate": 5.498e-06,
995
+ "loss": 2.4051,
996
  "step": 2750
997
  },
998
  {
999
  "epoch": 0.0555,
1000
+ "grad_norm": 0.6526109506522182,
1001
  "learning_rate": 5.548e-06,
1002
+ "loss": 2.4124,
1003
  "step": 2775
1004
  },
1005
  {
1006
  "epoch": 0.056,
1007
+ "grad_norm": 0.6528041780055424,
1008
  "learning_rate": 5.5980000000000004e-06,
1009
+ "loss": 2.3979,
1010
  "step": 2800
1011
  },
1012
  {
1013
  "epoch": 0.056,
1014
+ "eval_loss": 2.4344167709350586,
1015
+ "eval_runtime": 264.2924,
1016
+ "eval_samples_per_second": 3.11,
1017
+ "eval_steps_per_second": 1.555,
1018
  "step": 2800
1019
  },
1020
  {
1021
  "epoch": 0.0565,
1022
+ "grad_norm": 0.7067565611523313,
1023
  "learning_rate": 5.648e-06,
1024
+ "loss": 2.398,
1025
  "step": 2825
1026
  },
1027
  {
1028
  "epoch": 0.057,
1029
+ "grad_norm": 0.6416666495903947,
1030
  "learning_rate": 5.698e-06,
1031
+ "loss": 2.3991,
1032
  "step": 2850
1033
  },
1034
  {
1035
  "epoch": 0.0575,
1036
+ "grad_norm": 0.6605105424774851,
1037
  "learning_rate": 5.748e-06,
1038
+ "loss": 2.3962,
1039
  "step": 2875
1040
  },
1041
  {
1042
  "epoch": 0.058,
1043
+ "grad_norm": 0.6308761264530915,
1044
  "learning_rate": 5.798e-06,
1045
+ "loss": 2.4058,
1046
  "step": 2900
1047
  },
1048
  {
1049
  "epoch": 0.058,
1050
+ "eval_loss": 2.434436082839966,
1051
+ "eval_runtime": 265.0112,
1052
+ "eval_samples_per_second": 3.102,
1053
+ "eval_steps_per_second": 1.551,
1054
  "step": 2900
1055
  },
1056
  {
1057
  "epoch": 0.0585,
1058
+ "grad_norm": 0.6363649329289001,
1059
  "learning_rate": 5.848000000000001e-06,
1060
+ "loss": 2.3943,
1061
  "step": 2925
1062
  },
1063
  {
1064
  "epoch": 0.059,
1065
+ "grad_norm": 0.6147983139117156,
1066
  "learning_rate": 5.898e-06,
1067
+ "loss": 2.3982,
1068
  "step": 2950
1069
  },
1070
  {
1071
  "epoch": 0.0595,
1072
+ "grad_norm": 0.611354772141602,
1073
  "learning_rate": 5.9480000000000005e-06,
1074
+ "loss": 2.3921,
1075
  "step": 2975
1076
  },
1077
  {
1078
  "epoch": 0.06,
1079
+ "grad_norm": 0.6269054680170398,
1080
  "learning_rate": 5.998000000000001e-06,
1081
+ "loss": 2.392,
1082
  "step": 3000
1083
  },
1084
  {
1085
  "epoch": 0.06,
1086
+ "eval_loss": 2.433990955352783,
1087
+ "eval_runtime": 264.2169,
1088
+ "eval_samples_per_second": 3.111,
1089
+ "eval_steps_per_second": 1.556,
1090
  "step": 3000
1091
  },
1092
  {
1093
  "epoch": 0.0605,
1094
+ "grad_norm": 0.6248207448228328,
1095
  "learning_rate": 6.048e-06,
1096
+ "loss": 2.3858,
1097
  "step": 3025
1098
  },
1099
  {
1100
  "epoch": 0.061,
1101
+ "grad_norm": 0.6275258656299642,
1102
  "learning_rate": 6.098000000000001e-06,
1103
+ "loss": 2.4015,
1104
  "step": 3050
1105
  },
1106
  {
1107
  "epoch": 0.0615,
1108
+ "grad_norm": 1.0457401571274152,
1109
  "learning_rate": 6.148e-06,
1110
+ "loss": 2.3909,
1111
  "step": 3075
1112
  },
1113
  {
1114
  "epoch": 0.062,
1115
+ "grad_norm": 0.6551230863319748,
1116
  "learning_rate": 6.198000000000001e-06,
1117
+ "loss": 2.3983,
1118
  "step": 3100
1119
  },
1120
  {
1121
  "epoch": 0.062,
1122
+ "eval_loss": 2.433279275894165,
1123
+ "eval_runtime": 264.1521,
1124
+ "eval_samples_per_second": 3.112,
1125
+ "eval_steps_per_second": 1.556,
1126
  "step": 3100
1127
  },
1128
  {
1129
  "epoch": 0.0625,
1130
+ "grad_norm": 0.6306746226297937,
1131
  "learning_rate": 6.248000000000001e-06,
1132
+ "loss": 2.397,
1133
  "step": 3125
1134
  },
1135
  {
1136
  "epoch": 0.063,
1137
+ "grad_norm": 0.6299802316587856,
1138
  "learning_rate": 6.2980000000000005e-06,
1139
+ "loss": 2.4018,
1140
  "step": 3150
1141
  },
1142
  {
1143
  "epoch": 0.0635,
1144
+ "grad_norm": 0.6265424590222634,
1145
  "learning_rate": 6.348000000000001e-06,
1146
+ "loss": 2.4065,
1147
  "step": 3175
1148
  },
1149
  {
1150
  "epoch": 0.064,
1151
+ "grad_norm": 0.6717273211615455,
1152
  "learning_rate": 6.398000000000001e-06,
1153
+ "loss": 2.3906,
1154
  "step": 3200
1155
  },
1156
  {
1157
  "epoch": 0.064,
1158
+ "eval_loss": 2.4333276748657227,
1159
+ "eval_runtime": 263.9592,
1160
+ "eval_samples_per_second": 3.114,
1161
+ "eval_steps_per_second": 1.557,
1162
  "step": 3200
1163
  },
1164
  {
1165
  "epoch": 0.0645,
1166
+ "grad_norm": 0.6159924635031793,
1167
  "learning_rate": 6.448000000000001e-06,
1168
+ "loss": 2.3947,
1169
  "step": 3225
1170
  },
1171
  {
1172
  "epoch": 0.065,
1173
+ "grad_norm": 0.6124462043712093,
1174
  "learning_rate": 6.498000000000001e-06,
1175
+ "loss": 2.3963,
1176
  "step": 3250
1177
  },
1178
  {
1179
  "epoch": 0.0655,
1180
+ "grad_norm": 0.6144378183602921,
1181
  "learning_rate": 6.548000000000001e-06,
1182
+ "loss": 2.402,
1183
  "step": 3275
1184
  },
1185
  {
1186
  "epoch": 0.066,
1187
+ "grad_norm": 0.6295732934678283,
1188
  "learning_rate": 6.598000000000001e-06,
1189
+ "loss": 2.3877,
1190
  "step": 3300
1191
  },
1192
  {
1193
  "epoch": 0.066,
1194
+ "eval_loss": 2.4331116676330566,
1195
+ "eval_runtime": 263.4524,
1196
+ "eval_samples_per_second": 3.12,
1197
+ "eval_steps_per_second": 1.56,
1198
  "step": 3300
1199
  },
1200
  {
1201
  "epoch": 0.0665,
1202
+ "grad_norm": 0.5938287129149346,
1203
  "learning_rate": 6.648e-06,
1204
+ "loss": 2.389,
1205
  "step": 3325
1206
  },
1207
  {
1208
  "epoch": 0.067,
1209
+ "grad_norm": 0.6194783667871923,
1210
  "learning_rate": 6.698e-06,
1211
+ "loss": 2.39,
1212
  "step": 3350
1213
  },
1214
  {
1215
  "epoch": 0.0675,
1216
+ "grad_norm": 0.60927231594853,
1217
  "learning_rate": 6.7480000000000004e-06,
1218
+ "loss": 2.3968,
1219
  "step": 3375
1220
  },
1221
  {
1222
  "epoch": 0.068,
1223
+ "grad_norm": 0.6386175333576501,
1224
  "learning_rate": 6.798e-06,
1225
+ "loss": 2.3861,
1226
  "step": 3400
1227
  },
1228
  {
1229
  "epoch": 0.068,
1230
+ "eval_loss": 2.4328911304473877,
1231
+ "eval_runtime": 264.2923,
1232
+ "eval_samples_per_second": 3.11,
1233
+ "eval_steps_per_second": 1.555,
1234
  "step": 3400
1235
  },
1236
  {
1237
  "epoch": 0.0685,
1238
+ "grad_norm": 0.6092295027577579,
1239
  "learning_rate": 6.848e-06,
1240
+ "loss": 2.3827,
1241
  "step": 3425
1242
  },
1243
  {
1244
  "epoch": 0.069,
1245
+ "grad_norm": 0.5914846449422462,
1246
  "learning_rate": 6.898e-06,
1247
+ "loss": 2.3894,
1248
  "step": 3450
1249
  },
1250
  {
1251
  "epoch": 0.0695,
1252
+ "grad_norm": 0.5927461214526666,
1253
  "learning_rate": 6.948e-06,
1254
+ "loss": 2.3858,
1255
  "step": 3475
1256
  },
1257
  {
1258
  "epoch": 0.07,
1259
+ "grad_norm": 0.5992194088197265,
1260
  "learning_rate": 6.998000000000001e-06,
1261
+ "loss": 2.3941,
1262
  "step": 3500
1263
  },
1264
  {
1265
  "epoch": 0.07,
1266
+ "eval_loss": 2.432774543762207,
1267
+ "eval_runtime": 263.8546,
1268
+ "eval_samples_per_second": 3.115,
1269
+ "eval_steps_per_second": 1.558,
1270
  "step": 3500
1271
  },
1272
  {
1273
  "epoch": 0.0705,
1274
+ "grad_norm": 0.6119297158568089,
1275
  "learning_rate": 7.048e-06,
1276
+ "loss": 2.3897,
1277
  "step": 3525
1278
  },
1279
  {
1280
  "epoch": 0.071,
1281
+ "grad_norm": 0.6040666217758901,
1282
  "learning_rate": 7.0980000000000005e-06,
1283
+ "loss": 2.3966,
1284
  "step": 3550
1285
  },
1286
  {
1287
  "epoch": 0.0715,
1288
+ "grad_norm": 0.6142925813030266,
1289
  "learning_rate": 7.148000000000001e-06,
1290
+ "loss": 2.3953,
1291
  "step": 3575
1292
  },
1293
  {
1294
  "epoch": 0.072,
1295
+ "grad_norm": 0.5857079248330344,
1296
  "learning_rate": 7.198e-06,
1297
+ "loss": 2.3854,
1298
  "step": 3600
1299
  },
1300
  {
1301
  "epoch": 0.072,
1302
+ "eval_loss": 2.432868719100952,
1303
+ "eval_runtime": 264.1849,
1304
+ "eval_samples_per_second": 3.111,
1305
+ "eval_steps_per_second": 1.556,
1306
  "step": 3600
1307
  },
1308
  {
1309
  "epoch": 0.0725,
1310
+ "grad_norm": 0.6075613052530382,
1311
  "learning_rate": 7.248000000000001e-06,
1312
+ "loss": 2.3798,
1313
  "step": 3625
1314
  },
1315
  {
1316
  "epoch": 0.073,
1317
+ "grad_norm": 0.6146043204282547,
1318
  "learning_rate": 7.298e-06,
1319
+ "loss": 2.3894,
1320
  "step": 3650
1321
  },
1322
  {
1323
  "epoch": 0.0735,
1324
+ "grad_norm": 0.613284002341936,
1325
  "learning_rate": 7.348000000000001e-06,
1326
+ "loss": 2.3897,
1327
  "step": 3675
1328
  },
1329
  {
1330
  "epoch": 0.074,
1331
+ "grad_norm": 0.6694404263159593,
1332
  "learning_rate": 7.398000000000001e-06,
1333
+ "loss": 2.3925,
1334
  "step": 3700
1335
  },
1336
  {
1337
  "epoch": 0.074,
1338
+ "eval_loss": 2.4324021339416504,
1339
+ "eval_runtime": 263.3107,
1340
+ "eval_samples_per_second": 3.122,
1341
+ "eval_steps_per_second": 1.561,
1342
  "step": 3700
1343
  },
1344
  {
1345
  "epoch": 0.0745,
1346
+ "grad_norm": 0.5756401973694445,
1347
  "learning_rate": 7.4480000000000005e-06,
1348
+ "loss": 2.3894,
1349
  "step": 3725
1350
  },
1351
  {
1352
  "epoch": 0.075,
1353
+ "grad_norm": 0.5945783703417461,
1354
  "learning_rate": 7.498000000000001e-06,
1355
+ "loss": 2.3928,
1356
  "step": 3750
1357
  },
1358
  {
1359
  "epoch": 0.0755,
1360
+ "grad_norm": 0.5935750222986942,
1361
  "learning_rate": 7.548000000000001e-06,
1362
+ "loss": 2.3774,
1363
  "step": 3775
1364
  },
1365
  {
1366
  "epoch": 0.076,
1367
+ "grad_norm": 0.5938734543073783,
1368
  "learning_rate": 7.598000000000001e-06,
1369
+ "loss": 2.3776,
1370
  "step": 3800
1371
  },
1372
  {
1373
  "epoch": 0.076,
1374
+ "eval_loss": 2.432751178741455,
1375
+ "eval_runtime": 263.8929,
1376
+ "eval_samples_per_second": 3.115,
1377
+ "eval_steps_per_second": 1.557,
1378
  "step": 3800
1379
  },
1380
  {
1381
  "epoch": 0.0765,
1382
+ "grad_norm": 0.595820899700728,
1383
  "learning_rate": 7.648e-06,
1384
+ "loss": 2.3804,
1385
  "step": 3825
1386
  },
1387
  {
1388
  "epoch": 0.077,
1389
+ "grad_norm": 0.6079304106413467,
1390
  "learning_rate": 7.698000000000002e-06,
1391
+ "loss": 2.3917,
1392
  "step": 3850
1393
  },
1394
  {
1395
  "epoch": 0.0775,
1396
+ "grad_norm": 0.6083448146618482,
1397
  "learning_rate": 7.748000000000001e-06,
1398
+ "loss": 2.3842,
1399
  "step": 3875
1400
  },
1401
  {
1402
  "epoch": 0.078,
1403
+ "grad_norm": 0.6128893415605828,
1404
  "learning_rate": 7.798e-06,
1405
+ "loss": 2.3806,
1406
  "step": 3900
1407
  },
1408
  {
1409
  "epoch": 0.078,
1410
+ "eval_loss": 2.4325239658355713,
1411
+ "eval_runtime": 263.6693,
1412
+ "eval_samples_per_second": 3.118,
1413
+ "eval_steps_per_second": 1.559,
1414
  "step": 3900
1415
  },
1416
  {
1417
  "epoch": 0.0785,
1418
+ "grad_norm": 0.6079041195191952,
1419
  "learning_rate": 7.848000000000002e-06,
1420
+ "loss": 2.3801,
1421
  "step": 3925
1422
  },
1423
  {
1424
  "epoch": 0.079,
1425
+ "grad_norm": 0.6075689821557235,
1426
  "learning_rate": 7.898e-06,
1427
+ "loss": 2.3797,
1428
  "step": 3950
1429
  },
1430
  {
1431
  "epoch": 0.0795,
1432
+ "grad_norm": 0.5882326737716994,
1433
  "learning_rate": 7.948e-06,
1434
+ "loss": 2.3905,
1435
  "step": 3975
1436
  },
1437
  {
1438
  "epoch": 0.08,
1439
+ "grad_norm": 0.5828476462223788,
1440
  "learning_rate": 7.998e-06,
1441
+ "loss": 2.3806,
1442
  "step": 4000
1443
  },
1444
  {
1445
  "epoch": 0.08,
1446
+ "eval_loss": 2.4323527812957764,
1447
+ "eval_runtime": 263.9786,
1448
+ "eval_samples_per_second": 3.114,
1449
+ "eval_steps_per_second": 1.557,
1450
  "step": 4000
1451
  },
1452
  {
1453
  "epoch": 0.0805,
1454
+ "grad_norm": 0.5907927035367586,
1455
  "learning_rate": 8.048e-06,
1456
+ "loss": 2.3739,
1457
  "step": 4025
1458
  },
1459
  {
1460
  "epoch": 0.081,
1461
+ "grad_norm": 0.608189189988593,
1462
  "learning_rate": 8.098000000000001e-06,
1463
+ "loss": 2.3837,
1464
  "step": 4050
1465
  },
1466
  {
1467
  "epoch": 0.0815,
1468
+ "grad_norm": 0.5933025642280234,
1469
  "learning_rate": 8.148e-06,
1470
+ "loss": 2.3814,
1471
  "step": 4075
1472
  },
1473
  {
1474
  "epoch": 0.082,
1475
+ "grad_norm": 0.5898305070270532,
1476
  "learning_rate": 8.198e-06,
1477
+ "loss": 2.3854,
1478
  "step": 4100
1479
  },
1480
  {
1481
  "epoch": 0.082,
1482
+ "eval_loss": 2.432577610015869,
1483
+ "eval_runtime": 264.0972,
1484
+ "eval_samples_per_second": 3.112,
1485
+ "eval_steps_per_second": 1.556,
1486
  "step": 4100
1487
  },
1488
  {
1489
  "epoch": 0.0825,
1490
+ "grad_norm": 0.5673002921483621,
1491
  "learning_rate": 8.248e-06,
1492
+ "loss": 2.3827,
1493
  "step": 4125
1494
  },
1495
  {
1496
  "epoch": 0.083,
1497
+ "grad_norm": 0.5859186364996516,
1498
  "learning_rate": 8.298000000000001e-06,
1499
+ "loss": 2.3859,
1500
  "step": 4150
1501
  },
1502
  {
1503
  "epoch": 0.0835,
1504
+ "grad_norm": 0.5852893491639726,
1505
  "learning_rate": 8.348e-06,
1506
+ "loss": 2.3711,
1507
  "step": 4175
1508
  },
1509
  {
1510
  "epoch": 0.084,
1511
+ "grad_norm": 0.5704807601233864,
1512
  "learning_rate": 8.398e-06,
1513
+ "loss": 2.3682,
1514
  "step": 4200
1515
  },
1516
  {
1517
  "epoch": 0.084,
1518
+ "eval_loss": 2.4325780868530273,
1519
+ "eval_runtime": 264.0677,
1520
+ "eval_samples_per_second": 3.113,
1521
+ "eval_steps_per_second": 1.556,
1522
  "step": 4200
1523
  },
1524
  {
1525
  "epoch": 0.0845,
1526
+ "grad_norm": 0.565873049775094,
1527
  "learning_rate": 8.448000000000001e-06,
1528
+ "loss": 2.3894,
1529
  "step": 4225
1530
  },
1531
  {
1532
  "epoch": 0.085,
1533
+ "grad_norm": 0.6594348238393681,
1534
  "learning_rate": 8.498e-06,
1535
+ "loss": 2.3736,
1536
  "step": 4250
1537
  },
1538
  {
1539
  "epoch": 0.0855,
1540
+ "grad_norm": 0.6114416993962639,
1541
  "learning_rate": 8.548e-06,
1542
+ "loss": 2.3768,
1543
  "step": 4275
1544
  },
1545
  {
1546
  "epoch": 0.086,
1547
+ "grad_norm": 0.613007148558132,
1548
  "learning_rate": 8.598000000000001e-06,
1549
+ "loss": 2.3841,
1550
  "step": 4300
1551
  },
1552
  {
1553
  "epoch": 0.086,
1554
+ "eval_loss": 2.432278633117676,
1555
+ "eval_runtime": 264.5455,
1556
+ "eval_samples_per_second": 3.107,
1557
+ "eval_steps_per_second": 1.554,
1558
  "step": 4300
1559
  },
1560
  {
1561
  "epoch": 0.0865,
1562
+ "grad_norm": 0.6316113111159283,
1563
  "learning_rate": 8.648000000000001e-06,
1564
+ "loss": 2.3853,
1565
  "step": 4325
1566
  },
1567
  {
1568
  "epoch": 0.087,
1569
+ "grad_norm": 0.578758909498954,
1570
  "learning_rate": 8.698e-06,
1571
+ "loss": 2.3838,
1572
  "step": 4350
1573
  },
1574
  {
1575
  "epoch": 0.0875,
1576
+ "grad_norm": 0.5663796780744771,
1577
  "learning_rate": 8.748000000000002e-06,
1578
+ "loss": 2.3744,
1579
  "step": 4375
1580
  },
1581
  {
1582
  "epoch": 0.088,
1583
+ "grad_norm": 0.5996723194508057,
1584
  "learning_rate": 8.798000000000001e-06,
1585
+ "loss": 2.3741,
1586
  "step": 4400
1587
  },
1588
  {
1589
  "epoch": 0.088,
1590
+ "eval_loss": 2.4327504634857178,
1591
+ "eval_runtime": 264.3839,
1592
+ "eval_samples_per_second": 3.109,
1593
+ "eval_steps_per_second": 1.555,
1594
  "step": 4400
1595
  },
1596
  {
1597
  "epoch": 0.0885,
1598
+ "grad_norm": 0.5903185672805589,
1599
  "learning_rate": 8.848e-06,
1600
+ "loss": 2.3789,
1601
  "step": 4425
1602
  },
1603
  {
1604
  "epoch": 0.089,
1605
+ "grad_norm": 0.5683354037993711,
1606
  "learning_rate": 8.898000000000002e-06,
1607
+ "loss": 2.3739,
1608
  "step": 4450
1609
  },
1610
  {
1611
  "epoch": 0.0895,
1612
+ "grad_norm": 0.5992802333814672,
1613
  "learning_rate": 8.948000000000001e-06,
1614
+ "loss": 2.3805,
1615
  "step": 4475
1616
  },
1617
  {
1618
  "epoch": 0.09,
1619
+ "grad_norm": 0.5951158771681028,
1620
  "learning_rate": 8.998000000000001e-06,
1621
+ "loss": 2.3702,
1622
  "step": 4500
1623
  },
1624
  {
1625
  "epoch": 0.09,
1626
+ "eval_loss": 2.432904005050659,
1627
+ "eval_runtime": 264.0927,
1628
+ "eval_samples_per_second": 3.113,
1629
+ "eval_steps_per_second": 1.556,
1630
  "step": 4500
1631
  },
1632
  {
1633
  "epoch": 0.0905,
1634
+ "grad_norm": 0.628437176595306,
1635
  "learning_rate": 9.048e-06,
1636
+ "loss": 2.3705,
1637
  "step": 4525
1638
  },
1639
  {
1640
  "epoch": 0.091,
1641
+ "grad_norm": 0.5852194468933433,
1642
  "learning_rate": 9.098000000000002e-06,
1643
+ "loss": 2.3726,
1644
  "step": 4550
1645
  },
1646
  {
1647
  "epoch": 0.0915,
1648
+ "grad_norm": 0.5832814461503186,
1649
  "learning_rate": 9.148e-06,
1650
+ "loss": 2.3709,
1651
  "step": 4575
1652
  },
1653
  {
1654
  "epoch": 0.092,
1655
+ "grad_norm": 0.6235298544634128,
1656
  "learning_rate": 9.198e-06,
1657
+ "loss": 2.3823,
1658
  "step": 4600
1659
  },
1660
  {
1661
  "epoch": 0.092,
1662
+ "eval_loss": 2.433288335800171,
1663
+ "eval_runtime": 264.0394,
1664
+ "eval_samples_per_second": 3.113,
1665
+ "eval_steps_per_second": 1.557,
1666
  "step": 4600
1667
  },
1668
  {
1669
  "epoch": 0.0925,
1670
+ "grad_norm": 0.6097464410099737,
1671
  "learning_rate": 9.248e-06,
1672
+ "loss": 2.3715,
1673
  "step": 4625
1674
  },
1675
  {
1676
  "epoch": 0.093,
1677
+ "grad_norm": 0.5830918527201829,
1678
  "learning_rate": 9.298e-06,
1679
+ "loss": 2.3694,
1680
  "step": 4650
1681
  },
1682
  {
1683
  "epoch": 0.0935,
1684
+ "grad_norm": 0.6195865573807103,
1685
  "learning_rate": 9.348000000000001e-06,
1686
+ "loss": 2.3711,
1687
  "step": 4675
1688
  },
1689
  {
1690
  "epoch": 0.094,
1691
+ "grad_norm": 0.5922485886549429,
1692
  "learning_rate": 9.398e-06,
1693
+ "loss": 2.3764,
1694
  "step": 4700
1695
  },
1696
  {
1697
  "epoch": 0.094,
1698
+ "eval_loss": 2.4330477714538574,
1699
+ "eval_runtime": 263.7501,
1700
+ "eval_samples_per_second": 3.117,
1701
+ "eval_steps_per_second": 1.558,
1702
  "step": 4700
1703
  },
1704
  {
1705
  "epoch": 0.0945,
1706
+ "grad_norm": 0.5909566806378528,
1707
  "learning_rate": 9.448e-06,
1708
+ "loss": 2.3799,
1709
  "step": 4725
1710
  },
1711
  {
1712
  "epoch": 0.095,
1713
+ "grad_norm": 0.5872189964007283,
1714
  "learning_rate": 9.498000000000001e-06,
1715
+ "loss": 2.3737,
1716
  "step": 4750
1717
  },
1718
  {
1719
  "epoch": 0.0955,
1720
+ "grad_norm": 0.6071714619656263,
1721
  "learning_rate": 9.548e-06,
1722
+ "loss": 2.3789,
1723
  "step": 4775
1724
  },
1725
  {
1726
  "epoch": 0.096,
1727
+ "grad_norm": 0.5631342344537085,
1728
  "learning_rate": 9.598e-06,
1729
+ "loss": 2.3641,
1730
  "step": 4800
1731
  },
1732
  {
1733
  "epoch": 0.096,
1734
+ "eval_loss": 2.4332797527313232,
1735
+ "eval_runtime": 264.5164,
1736
+ "eval_samples_per_second": 3.108,
1737
+ "eval_steps_per_second": 1.554,
1738
  "step": 4800
1739
  },
1740
  {
1741
  "epoch": 0.0965,
1742
+ "grad_norm": 0.600707218384485,
1743
  "learning_rate": 9.648000000000001e-06,
1744
+ "loss": 2.3715,
1745
  "step": 4825
1746
  },
1747
  {
1748
  "epoch": 0.097,
1749
+ "grad_norm": 0.5705494762785608,
1750
  "learning_rate": 9.698000000000001e-06,
1751
+ "loss": 2.3741,
1752
  "step": 4850
1753
  },
1754
  {
1755
  "epoch": 0.0975,
1756
+ "grad_norm": 0.5891811727113021,
1757
  "learning_rate": 9.748e-06,
1758
+ "loss": 2.3738,
1759
  "step": 4875
1760
  },
1761
  {
1762
  "epoch": 0.098,
1763
+ "grad_norm": 0.5947555260131183,
1764
  "learning_rate": 9.798e-06,
1765
+ "loss": 2.365,
1766
  "step": 4900
1767
  },
1768
  {
1769
  "epoch": 0.098,
1770
+ "eval_loss": 2.433032751083374,
1771
+ "eval_runtime": 264.6355,
1772
+ "eval_samples_per_second": 3.106,
1773
+ "eval_steps_per_second": 1.553,
1774
  "step": 4900
1775
  },
1776
  {
1777
  "epoch": 0.0985,
1778
+ "grad_norm": 0.6055417663185935,
1779
  "learning_rate": 9.848000000000001e-06,
1780
+ "loss": 2.3677,
1781
  "step": 4925
1782
  },
1783
  {
1784
  "epoch": 0.099,
1785
+ "grad_norm": 0.5803464068069174,
1786
  "learning_rate": 9.898e-06,
1787
+ "loss": 2.3699,
1788
  "step": 4950
1789
  },
1790
  {
1791
  "epoch": 0.0995,
1792
+ "grad_norm": 0.5899201870269601,
1793
  "learning_rate": 9.948e-06,
1794
+ "loss": 2.3685,
1795
  "step": 4975
1796
  },
1797
  {
1798
  "epoch": 0.1,
1799
+ "grad_norm": 0.6226759838202708,
1800
  "learning_rate": 9.998000000000002e-06,
1801
+ "loss": 2.3599,
1802
  "step": 5000
1803
  },
1804
  {
1805
  "epoch": 0.1,
1806
+ "eval_loss": 2.433412551879883,
1807
+ "eval_runtime": 279.6783,
1808
+ "eval_samples_per_second": 2.939,
1809
+ "eval_steps_per_second": 1.47,
1810
  "step": 5000
1811
  },
1812
  {
1813
  "epoch": 0.1005,
1814
+ "grad_norm": 0.6129345554278736,
1815
  "learning_rate": 9.994666666666668e-06,
1816
+ "loss": 2.3651,
1817
  "step": 5025
1818
  },
1819
  {
1820
  "epoch": 0.101,
1821
+ "grad_norm": 0.5783687106202524,
1822
  "learning_rate": 9.989111111111111e-06,
1823
+ "loss": 2.3635,
1824
  "step": 5050
1825
  },
1826
  {
1827
  "epoch": 0.1015,
1828
+ "grad_norm": 0.7886759246703615,
1829
  "learning_rate": 9.983555555555556e-06,
1830
+ "loss": 2.3688,
1831
  "step": 5075
1832
  },
1833
  {
1834
  "epoch": 0.102,
1835
+ "grad_norm": 0.5496276670344779,
1836
  "learning_rate": 9.978000000000002e-06,
1837
+ "loss": 2.3718,
1838
  "step": 5100
1839
  },
1840
  {
1841
  "epoch": 0.102,
1842
+ "eval_loss": 2.4336636066436768,
1843
+ "eval_runtime": 264.0531,
1844
+ "eval_samples_per_second": 3.113,
1845
+ "eval_steps_per_second": 1.557,
1846
  "step": 5100
1847
  },
1848
  {
1849
  "epoch": 0.1025,
1850
+ "grad_norm": 0.596488402670124,
1851
  "learning_rate": 9.972444444444445e-06,
1852
+ "loss": 2.3654,
1853
  "step": 5125
1854
  },
1855
  {
1856
  "epoch": 0.103,
1857
+ "grad_norm": 0.5758952191659142,
1858
  "learning_rate": 9.966888888888889e-06,
1859
+ "loss": 2.3662,
1860
  "step": 5150
1861
  },
1862
  {
1863
  "epoch": 0.1035,
1864
+ "grad_norm": 0.5714325894660194,
1865
  "learning_rate": 9.961333333333334e-06,
1866
+ "loss": 2.3671,
1867
  "step": 5175
1868
  },
1869
  {
1870
  "epoch": 0.104,
1871
+ "grad_norm": 0.5826964477363549,
1872
  "learning_rate": 9.95577777777778e-06,
1873
+ "loss": 2.3621,
1874
  "step": 5200
1875
  },
1876
  {
1877
  "epoch": 0.104,
1878
+ "eval_loss": 2.433170795440674,
1879
+ "eval_runtime": 263.4913,
1880
+ "eval_samples_per_second": 3.12,
1881
+ "eval_steps_per_second": 1.56,
1882
  "step": 5200
1883
  },
1884
  {
1885
  "epoch": 0.1045,
1886
+ "grad_norm": 0.5939017286545814,
1887
  "learning_rate": 9.950222222222223e-06,
1888
+ "loss": 2.3704,
1889
  "step": 5225
1890
  },
1891
  {
1892
  "epoch": 0.105,
1893
+ "grad_norm": 0.5916137818576529,
1894
  "learning_rate": 9.944666666666668e-06,
1895
+ "loss": 2.3662,
1896
  "step": 5250
1897
  },
1898
  {
1899
  "epoch": 0.1055,
1900
+ "grad_norm": 0.6105360548349205,
1901
  "learning_rate": 9.939111111111112e-06,
1902
+ "loss": 2.3646,
1903
  "step": 5275
1904
  },
1905
  {
1906
  "epoch": 0.106,
1907
+ "grad_norm": 0.5821955662592928,
1908
  "learning_rate": 9.933555555555557e-06,
1909
+ "loss": 2.365,
1910
  "step": 5300
1911
  },
1912
  {
1913
  "epoch": 0.106,
1914
+ "eval_loss": 2.4327642917633057,
1915
+ "eval_runtime": 263.745,
1916
+ "eval_samples_per_second": 3.117,
1917
+ "eval_steps_per_second": 1.558,
1918
  "step": 5300
1919
  },
1920
  {
1921
  "epoch": 0.1065,
1922
+ "grad_norm": 0.5805717889494187,
1923
  "learning_rate": 9.928e-06,
1924
+ "loss": 2.364,
1925
  "step": 5325
1926
  },
1927
  {
1928
  "epoch": 0.107,
1929
+ "grad_norm": 0.5876895049794754,
1930
  "learning_rate": 9.922444444444446e-06,
1931
+ "loss": 2.362,
1932
  "step": 5350
1933
  },
1934
  {
1935
  "epoch": 0.1075,
1936
+ "grad_norm": 0.6258383766876349,
1937
  "learning_rate": 9.91688888888889e-06,
1938
+ "loss": 2.3654,
1939
  "step": 5375
1940
  },
1941
  {
1942
  "epoch": 0.108,
1943
+ "grad_norm": 0.5963835367877209,
1944
  "learning_rate": 9.911333333333335e-06,
1945
+ "loss": 2.3627,
1946
  "step": 5400
1947
  },
1948
  {
1949
  "epoch": 0.108,
1950
+ "eval_loss": 2.4326930046081543,
1951
+ "eval_runtime": 263.2366,
1952
+ "eval_samples_per_second": 3.123,
1953
+ "eval_steps_per_second": 1.561,
1954
  "step": 5400
1955
  },
1956
  {
1957
  "epoch": 0.1085,
1958
+ "grad_norm": 0.5827253994353866,
1959
  "learning_rate": 9.905777777777778e-06,
1960
+ "loss": 2.3703,
1961
  "step": 5425
1962
  },
1963
  {
1964
  "epoch": 0.109,
1965
+ "grad_norm": 0.571031920084426,
1966
  "learning_rate": 9.900222222222223e-06,
1967
+ "loss": 2.3671,
1968
  "step": 5450
1969
  },
1970
  {
1971
  "epoch": 0.1095,
1972
+ "grad_norm": 0.599548806743577,
1973
  "learning_rate": 9.894666666666669e-06,
1974
+ "loss": 2.362,
1975
  "step": 5475
1976
  },
1977
  {
1978
  "epoch": 0.11,
1979
+ "grad_norm": 0.5736311725646083,
1980
  "learning_rate": 9.889111111111112e-06,
1981
+ "loss": 2.3622,
1982
  "step": 5500
1983
  },
1984
  {
1985
  "epoch": 0.11,
1986
+ "eval_loss": 2.4330084323883057,
1987
+ "eval_runtime": 264.1044,
1988
+ "eval_samples_per_second": 3.112,
1989
+ "eval_steps_per_second": 1.556,
1990
  "step": 5500
1991
  },
1992
  {
1993
  "epoch": 0.1105,
1994
+ "grad_norm": 0.6098672058792028,
1995
  "learning_rate": 9.883555555555556e-06,
1996
+ "loss": 2.3705,
1997
  "step": 5525
1998
  },
1999
  {
2000
  "epoch": 0.111,
2001
+ "grad_norm": 0.5761728375832208,
2002
  "learning_rate": 9.878000000000001e-06,
2003
+ "loss": 2.3608,
2004
  "step": 5550
2005
  },
2006
  {
2007
  "epoch": 0.1115,
2008
+ "grad_norm": 0.5922504560114277,
2009
  "learning_rate": 9.872444444444446e-06,
2010
+ "loss": 2.3542,
2011
  "step": 5575
2012
  },
2013
  {
2014
  "epoch": 0.112,
2015
+ "grad_norm": 0.5668795024079605,
2016
  "learning_rate": 9.86688888888889e-06,
2017
+ "loss": 2.3623,
2018
  "step": 5600
2019
  },
2020
  {
2021
  "epoch": 0.112,
2022
+ "eval_loss": 2.432955503463745,
2023
+ "eval_runtime": 263.8097,
2024
+ "eval_samples_per_second": 3.116,
2025
+ "eval_steps_per_second": 1.558,
2026
  "step": 5600
2027
  },
2028
  {
2029
  "epoch": 0.1125,
2030
+ "grad_norm": 0.5697809034851604,
2031
  "learning_rate": 9.861333333333333e-06,
2032
+ "loss": 2.3541,
2033
  "step": 5625
2034
  },
2035
  {
2036
  "epoch": 0.113,
2037
+ "grad_norm": 0.5740407982821335,
2038
  "learning_rate": 9.855777777777779e-06,
2039
+ "loss": 2.3594,
2040
  "step": 5650
2041
  },
2042
  {
2043
  "epoch": 0.1135,
2044
+ "grad_norm": 0.5697372211616294,
2045
  "learning_rate": 9.850222222222224e-06,
2046
+ "loss": 2.3592,
2047
  "step": 5675
2048
  },
2049
  {
2050
  "epoch": 0.114,
2051
+ "grad_norm": 0.5845230307189324,
2052
  "learning_rate": 9.844666666666667e-06,
2053
+ "loss": 2.3456,
2054
  "step": 5700
2055
  },
2056
  {
2057
  "epoch": 0.114,
2058
+ "eval_loss": 2.432389974594116,
2059
+ "eval_runtime": 263.8043,
2060
+ "eval_samples_per_second": 3.116,
2061
+ "eval_steps_per_second": 1.558,
2062
  "step": 5700
2063
  },
2064
  {
2065
  "epoch": 0.1145,
2066
+ "grad_norm": 0.5677067211464538,
2067
  "learning_rate": 9.839111111111111e-06,
2068
+ "loss": 2.3581,
2069
  "step": 5725
2070
  },
2071
  {
2072
  "epoch": 0.115,
2073
+ "grad_norm": 0.6024564908699644,
2074
  "learning_rate": 9.833555555555556e-06,
2075
+ "loss": 2.359,
2076
  "step": 5750
2077
  },
2078
  {
2079
  "epoch": 0.1155,
2080
+ "grad_norm": 0.5789830837760237,
2081
  "learning_rate": 9.828000000000001e-06,
2082
+ "loss": 2.36,
2083
  "step": 5775
2084
  },
2085
  {
2086
  "epoch": 0.116,
2087
+ "grad_norm": 0.5912805339254935,
2088
  "learning_rate": 9.822444444444445e-06,
2089
+ "loss": 2.3588,
2090
  "step": 5800
2091
  },
2092
  {
2093
  "epoch": 0.116,
2094
+ "eval_loss": 2.432565689086914,
2095
+ "eval_runtime": 263.3515,
2096
+ "eval_samples_per_second": 3.121,
2097
+ "eval_steps_per_second": 1.561,
2098
  "step": 5800
2099
  },
2100
  {
2101
  "epoch": 0.1165,
2102
+ "grad_norm": 0.5647440650976697,
2103
  "learning_rate": 9.81688888888889e-06,
2104
+ "loss": 2.3576,
2105
  "step": 5825
2106
  },
2107
  {
2108
  "epoch": 0.117,
2109
+ "grad_norm": 0.5673458673735715,
2110
  "learning_rate": 9.811333333333334e-06,
2111
+ "loss": 2.3616,
2112
  "step": 5850
2113
  },
2114
  {
2115
  "epoch": 0.1175,
2116
+ "grad_norm": 0.6030082642745155,
2117
  "learning_rate": 9.805777777777779e-06,
2118
+ "loss": 2.3556,
2119
  "step": 5875
2120
  },
2121
  {
2122
  "epoch": 0.118,
2123
+ "grad_norm": 0.5571893163840321,
2124
  "learning_rate": 9.800222222222223e-06,
2125
+ "loss": 2.3557,
2126
  "step": 5900
2127
  },
2128
  {
2129
  "epoch": 0.118,
2130
+ "eval_loss": 2.4327075481414795,
2131
+ "eval_runtime": 263.2657,
2132
+ "eval_samples_per_second": 3.122,
2133
+ "eval_steps_per_second": 1.561,
2134
  "step": 5900
2135
  },
2136
  {
2137
  "epoch": 0.1185,
2138
+ "grad_norm": 0.5716010515949606,
2139
  "learning_rate": 9.794666666666668e-06,
2140
+ "loss": 2.3616,
2141
  "step": 5925
2142
  },
2143
  {
2144
  "epoch": 0.119,
2145
+ "grad_norm": 0.6245053681878497,
2146
  "learning_rate": 9.789111111111111e-06,
2147
+ "loss": 2.358,
2148
  "step": 5950
2149
  },
2150
  {
2151
  "epoch": 0.1195,
2152
+ "grad_norm": 0.5896528100704728,
2153
  "learning_rate": 9.783555555555557e-06,
2154
+ "loss": 2.355,
2155
  "step": 5975
2156
  },
2157
  {
2158
  "epoch": 0.12,
2159
+ "grad_norm": 0.5534590488643797,
2160
  "learning_rate": 9.778e-06,
2161
+ "loss": 2.3567,
2162
  "step": 6000
2163
  },
2164
  {
2165
  "epoch": 0.12,
2166
+ "eval_loss": 2.4327354431152344,
2167
+ "eval_runtime": 263.9156,
2168
+ "eval_samples_per_second": 3.115,
2169
+ "eval_steps_per_second": 1.557,
2170
  "step": 6000
2171
+ },
2172
+ {
2173
+ "epoch": 0.1205,
2174
+ "grad_norm": 0.5779403883996491,
2175
+ "learning_rate": 9.772444444444445e-06,
2176
+ "loss": 2.3487,
2177
+ "step": 6025
2178
+ },
2179
+ {
2180
+ "epoch": 0.121,
2181
+ "grad_norm": 0.5693494880188505,
2182
+ "learning_rate": 9.76688888888889e-06,
2183
+ "loss": 2.3506,
2184
+ "step": 6050
2185
+ },
2186
+ {
2187
+ "epoch": 0.1215,
2188
+ "grad_norm": 0.5864069751838692,
2189
+ "learning_rate": 9.761333333333334e-06,
2190
+ "loss": 2.3498,
2191
+ "step": 6075
2192
+ },
2193
+ {
2194
+ "epoch": 0.122,
2195
+ "grad_norm": 0.5930208676954954,
2196
+ "learning_rate": 9.755777777777778e-06,
2197
+ "loss": 2.3508,
2198
+ "step": 6100
2199
+ },
2200
+ {
2201
+ "epoch": 0.122,
2202
+ "eval_loss": 2.432914972305298,
2203
+ "eval_runtime": 263.746,
2204
+ "eval_samples_per_second": 3.117,
2205
+ "eval_steps_per_second": 1.558,
2206
+ "step": 6100
2207
+ },
2208
+ {
2209
+ "epoch": 0.1225,
2210
+ "grad_norm": 0.5967532601446782,
2211
+ "learning_rate": 9.750222222222223e-06,
2212
+ "loss": 2.3584,
2213
+ "step": 6125
2214
+ },
2215
+ {
2216
+ "epoch": 0.123,
2217
+ "grad_norm": 0.5670429310236035,
2218
+ "learning_rate": 9.744666666666668e-06,
2219
+ "loss": 2.3584,
2220
+ "step": 6150
2221
+ },
2222
+ {
2223
+ "epoch": 0.1235,
2224
+ "grad_norm": 0.5744482242457726,
2225
+ "learning_rate": 9.739111111111112e-06,
2226
+ "loss": 2.351,
2227
+ "step": 6175
2228
+ },
2229
+ {
2230
+ "epoch": 0.124,
2231
+ "grad_norm": 0.6029007635970692,
2232
+ "learning_rate": 9.733555555555555e-06,
2233
+ "loss": 2.3494,
2234
+ "step": 6200
2235
+ },
2236
+ {
2237
+ "epoch": 0.124,
2238
+ "eval_loss": 2.432878255844116,
2239
+ "eval_runtime": 263.5842,
2240
+ "eval_samples_per_second": 3.119,
2241
+ "eval_steps_per_second": 1.559,
2242
+ "step": 6200
2243
+ },
2244
+ {
2245
+ "epoch": 0.1245,
2246
+ "grad_norm": 0.564399310279196,
2247
+ "learning_rate": 9.728e-06,
2248
+ "loss": 2.3595,
2249
+ "step": 6225
2250
+ },
2251
+ {
2252
+ "epoch": 0.125,
2253
+ "grad_norm": 0.6065670221926927,
2254
+ "learning_rate": 9.722444444444446e-06,
2255
+ "loss": 2.3547,
2256
+ "step": 6250
2257
+ },
2258
+ {
2259
+ "epoch": 0.1255,
2260
+ "grad_norm": 0.5659801132085207,
2261
+ "learning_rate": 9.71688888888889e-06,
2262
+ "loss": 2.3511,
2263
+ "step": 6275
2264
+ },
2265
+ {
2266
+ "epoch": 0.126,
2267
+ "grad_norm": 0.5837628069797915,
2268
+ "learning_rate": 9.711333333333333e-06,
2269
+ "loss": 2.3575,
2270
+ "step": 6300
2271
+ },
2272
+ {
2273
+ "epoch": 0.126,
2274
+ "eval_loss": 2.4329097270965576,
2275
+ "eval_runtime": 264.6192,
2276
+ "eval_samples_per_second": 3.106,
2277
+ "eval_steps_per_second": 1.553,
2278
+ "step": 6300
2279
+ },
2280
+ {
2281
+ "epoch": 0.1265,
2282
+ "grad_norm": 0.5760319910919499,
2283
+ "learning_rate": 9.705777777777778e-06,
2284
+ "loss": 2.3488,
2285
+ "step": 6325
2286
+ },
2287
+ {
2288
+ "epoch": 0.127,
2289
+ "grad_norm": 0.5761318046315628,
2290
+ "learning_rate": 9.700222222222224e-06,
2291
+ "loss": 2.3435,
2292
+ "step": 6350
2293
+ },
2294
+ {
2295
+ "epoch": 0.1275,
2296
+ "grad_norm": 0.5609369346838009,
2297
+ "learning_rate": 9.694666666666667e-06,
2298
+ "loss": 2.347,
2299
+ "step": 6375
2300
+ },
2301
+ {
2302
+ "epoch": 0.128,
2303
+ "grad_norm": 0.5954461846572633,
2304
+ "learning_rate": 9.68911111111111e-06,
2305
+ "loss": 2.3485,
2306
+ "step": 6400
2307
+ },
2308
+ {
2309
+ "epoch": 0.128,
2310
+ "eval_loss": 2.4333934783935547,
2311
+ "eval_runtime": 263.5903,
2312
+ "eval_samples_per_second": 3.118,
2313
+ "eval_steps_per_second": 1.559,
2314
+ "step": 6400
2315
+ },
2316
+ {
2317
+ "epoch": 0.1285,
2318
+ "grad_norm": 0.5524126786458765,
2319
+ "learning_rate": 9.683555555555556e-06,
2320
+ "loss": 2.3514,
2321
+ "step": 6425
2322
+ },
2323
+ {
2324
+ "epoch": 0.129,
2325
+ "grad_norm": 0.5590067107241867,
2326
+ "learning_rate": 9.678000000000001e-06,
2327
+ "loss": 2.3477,
2328
+ "step": 6450
2329
+ },
2330
+ {
2331
+ "epoch": 0.1295,
2332
+ "grad_norm": 0.5578028236930622,
2333
+ "learning_rate": 9.672444444444445e-06,
2334
+ "loss": 2.3434,
2335
+ "step": 6475
2336
+ },
2337
+ {
2338
+ "epoch": 0.13,
2339
+ "grad_norm": 0.6002389478119885,
2340
+ "learning_rate": 9.66688888888889e-06,
2341
+ "loss": 2.3415,
2342
+ "step": 6500
2343
+ },
2344
+ {
2345
+ "epoch": 0.13,
2346
+ "eval_loss": 2.433302164077759,
2347
+ "eval_runtime": 263.4334,
2348
+ "eval_samples_per_second": 3.12,
2349
+ "eval_steps_per_second": 1.56,
2350
+ "step": 6500
2351
+ },
2352
+ {
2353
+ "epoch": 0.1305,
2354
+ "grad_norm": 0.5868647352323021,
2355
+ "learning_rate": 9.661333333333334e-06,
2356
+ "loss": 2.3532,
2357
+ "step": 6525
2358
+ },
2359
+ {
2360
+ "epoch": 0.131,
2361
+ "grad_norm": 0.5525203092071236,
2362
+ "learning_rate": 9.655777777777779e-06,
2363
+ "loss": 2.3439,
2364
+ "step": 6550
2365
+ },
2366
+ {
2367
+ "epoch": 0.1315,
2368
+ "grad_norm": 0.642282300647443,
2369
+ "learning_rate": 9.650222222222222e-06,
2370
+ "loss": 2.333,
2371
+ "step": 6575
2372
+ },
2373
+ {
2374
+ "epoch": 0.132,
2375
+ "grad_norm": 0.5954691746571129,
2376
+ "learning_rate": 9.644666666666668e-06,
2377
+ "loss": 2.3371,
2378
+ "step": 6600
2379
+ },
2380
+ {
2381
+ "epoch": 0.132,
2382
+ "eval_loss": 2.4332070350646973,
2383
+ "eval_runtime": 263.9928,
2384
+ "eval_samples_per_second": 3.114,
2385
+ "eval_steps_per_second": 1.557,
2386
+ "step": 6600
2387
+ },
2388
+ {
2389
+ "epoch": 0.1325,
2390
+ "grad_norm": 0.5696322215994257,
2391
+ "learning_rate": 9.639111111111113e-06,
2392
+ "loss": 2.3568,
2393
+ "step": 6625
2394
+ },
2395
+ {
2396
+ "epoch": 0.133,
2397
+ "grad_norm": 0.569783318316734,
2398
+ "learning_rate": 9.633555555555556e-06,
2399
+ "loss": 2.3468,
2400
+ "step": 6650
2401
+ },
2402
+ {
2403
+ "epoch": 0.1335,
2404
+ "grad_norm": 0.5974477984803339,
2405
+ "learning_rate": 9.628e-06,
2406
+ "loss": 2.3369,
2407
+ "step": 6675
2408
+ },
2409
+ {
2410
+ "epoch": 0.134,
2411
+ "grad_norm": 0.5850514409957908,
2412
+ "learning_rate": 9.622444444444445e-06,
2413
+ "loss": 2.3328,
2414
+ "step": 6700
2415
+ },
2416
+ {
2417
+ "epoch": 0.134,
2418
+ "eval_loss": 2.4336042404174805,
2419
+ "eval_runtime": 264.1653,
2420
+ "eval_samples_per_second": 3.112,
2421
+ "eval_steps_per_second": 1.556,
2422
+ "step": 6700
2423
+ },
2424
+ {
2425
+ "epoch": 0.1345,
2426
+ "grad_norm": 0.5598567946533984,
2427
+ "learning_rate": 9.61688888888889e-06,
2428
+ "loss": 2.3505,
2429
+ "step": 6725
2430
+ },
2431
+ {
2432
+ "epoch": 0.135,
2433
+ "grad_norm": 0.564538169627995,
2434
+ "learning_rate": 9.611333333333334e-06,
2435
+ "loss": 2.3512,
2436
+ "step": 6750
2437
+ },
2438
+ {
2439
+ "epoch": 0.1355,
2440
+ "grad_norm": 0.555057205811747,
2441
+ "learning_rate": 9.605777777777778e-06,
2442
+ "loss": 2.3441,
2443
+ "step": 6775
2444
+ },
2445
+ {
2446
+ "epoch": 0.136,
2447
+ "grad_norm": 0.5928392878820046,
2448
+ "learning_rate": 9.600222222222223e-06,
2449
+ "loss": 2.342,
2450
+ "step": 6800
2451
+ },
2452
+ {
2453
+ "epoch": 0.136,
2454
+ "eval_loss": 2.4332380294799805,
2455
+ "eval_runtime": 263.6981,
2456
+ "eval_samples_per_second": 3.117,
2457
+ "eval_steps_per_second": 1.559,
2458
+ "step": 6800
2459
+ },
2460
+ {
2461
+ "epoch": 0.1365,
2462
+ "grad_norm": 0.580747535991996,
2463
+ "learning_rate": 9.594666666666668e-06,
2464
+ "loss": 2.3402,
2465
+ "step": 6825
2466
+ },
2467
+ {
2468
+ "epoch": 0.137,
2469
+ "grad_norm": 0.5361093856752921,
2470
+ "learning_rate": 9.589111111111112e-06,
2471
+ "loss": 2.3345,
2472
+ "step": 6850
2473
+ },
2474
+ {
2475
+ "epoch": 0.1375,
2476
+ "grad_norm": 0.5764684974648585,
2477
+ "learning_rate": 9.583555555555555e-06,
2478
+ "loss": 2.3434,
2479
+ "step": 6875
2480
+ },
2481
+ {
2482
+ "epoch": 0.138,
2483
+ "grad_norm": 0.5695437902803252,
2484
+ "learning_rate": 9.578e-06,
2485
+ "loss": 2.3345,
2486
+ "step": 6900
2487
+ },
2488
+ {
2489
+ "epoch": 0.138,
2490
+ "eval_loss": 2.4334897994995117,
2491
+ "eval_runtime": 263.9042,
2492
+ "eval_samples_per_second": 3.115,
2493
+ "eval_steps_per_second": 1.557,
2494
+ "step": 6900
2495
+ },
2496
+ {
2497
+ "epoch": 0.1385,
2498
+ "grad_norm": 0.5856816810807355,
2499
+ "learning_rate": 9.572444444444446e-06,
2500
+ "loss": 2.3344,
2501
+ "step": 6925
2502
+ },
2503
+ {
2504
+ "epoch": 0.139,
2505
+ "grad_norm": 0.5692161417871612,
2506
+ "learning_rate": 9.56688888888889e-06,
2507
+ "loss": 2.3492,
2508
+ "step": 6950
2509
+ },
2510
+ {
2511
+ "epoch": 0.1395,
2512
+ "grad_norm": 0.5782790626699041,
2513
+ "learning_rate": 9.561333333333333e-06,
2514
+ "loss": 2.3343,
2515
+ "step": 6975
2516
+ },
2517
+ {
2518
+ "epoch": 0.14,
2519
+ "grad_norm": 0.5592348825440727,
2520
+ "learning_rate": 9.555777777777778e-06,
2521
+ "loss": 2.3361,
2522
+ "step": 7000
2523
+ },
2524
+ {
2525
+ "epoch": 0.14,
2526
+ "eval_loss": 2.4338128566741943,
2527
+ "eval_runtime": 264.0278,
2528
+ "eval_samples_per_second": 3.113,
2529
+ "eval_steps_per_second": 1.557,
2530
+ "step": 7000
2531
+ },
2532
+ {
2533
+ "epoch": 0.1405,
2534
+ "grad_norm": 0.5810855929853301,
2535
+ "learning_rate": 9.550222222222223e-06,
2536
+ "loss": 2.3397,
2537
+ "step": 7025
2538
+ },
2539
+ {
2540
+ "epoch": 0.141,
2541
+ "grad_norm": 0.5672444444354668,
2542
+ "learning_rate": 9.544666666666667e-06,
2543
+ "loss": 2.3384,
2544
+ "step": 7050
2545
+ },
2546
+ {
2547
+ "epoch": 0.1415,
2548
+ "grad_norm": 0.649461804794621,
2549
+ "learning_rate": 9.539111111111112e-06,
2550
+ "loss": 2.3384,
2551
+ "step": 7075
2552
+ },
2553
+ {
2554
+ "epoch": 0.142,
2555
+ "grad_norm": 0.5697893925017475,
2556
+ "learning_rate": 9.533555555555556e-06,
2557
+ "loss": 2.3415,
2558
+ "step": 7100
2559
+ },
2560
+ {
2561
+ "epoch": 0.142,
2562
+ "eval_loss": 2.4329330921173096,
2563
+ "eval_runtime": 263.8408,
2564
+ "eval_samples_per_second": 3.116,
2565
+ "eval_steps_per_second": 1.558,
2566
+ "step": 7100
2567
+ },
2568
+ {
2569
+ "epoch": 0.1425,
2570
+ "grad_norm": 0.562192662676289,
2571
+ "learning_rate": 9.528000000000001e-06,
2572
+ "loss": 2.3381,
2573
+ "step": 7125
2574
+ },
2575
+ {
2576
+ "epoch": 0.143,
2577
+ "grad_norm": 0.5782927675061864,
2578
+ "learning_rate": 9.522444444444444e-06,
2579
+ "loss": 2.3316,
2580
+ "step": 7150
2581
+ },
2582
+ {
2583
+ "epoch": 0.1435,
2584
+ "grad_norm": 0.5470889439002048,
2585
+ "learning_rate": 9.51688888888889e-06,
2586
+ "loss": 2.3336,
2587
+ "step": 7175
2588
+ },
2589
+ {
2590
+ "epoch": 0.144,
2591
+ "grad_norm": 0.5732687375919955,
2592
+ "learning_rate": 9.511333333333335e-06,
2593
+ "loss": 2.3302,
2594
+ "step": 7200
2595
+ },
2596
+ {
2597
+ "epoch": 0.144,
2598
+ "eval_loss": 2.4339091777801514,
2599
+ "eval_runtime": 265.4685,
2600
+ "eval_samples_per_second": 3.096,
2601
+ "eval_steps_per_second": 1.548,
2602
+ "step": 7200
2603
+ },
2604
+ {
2605
+ "epoch": 0.1445,
2606
+ "grad_norm": 0.5552677779418167,
2607
+ "learning_rate": 9.505777777777779e-06,
2608
+ "loss": 2.3382,
2609
+ "step": 7225
2610
+ },
2611
+ {
2612
+ "epoch": 0.145,
2613
+ "grad_norm": 0.5597695533114173,
2614
+ "learning_rate": 9.500222222222222e-06,
2615
+ "loss": 2.3281,
2616
+ "step": 7250
2617
+ },
2618
+ {
2619
+ "epoch": 0.1455,
2620
+ "grad_norm": 0.586047229250587,
2621
+ "learning_rate": 9.494666666666667e-06,
2622
+ "loss": 2.3365,
2623
+ "step": 7275
2624
+ },
2625
+ {
2626
+ "epoch": 0.146,
2627
+ "grad_norm": 0.5631697021330876,
2628
+ "learning_rate": 9.489111111111113e-06,
2629
+ "loss": 2.3434,
2630
+ "step": 7300
2631
+ },
2632
+ {
2633
+ "epoch": 0.146,
2634
+ "eval_loss": 2.4337289333343506,
2635
+ "eval_runtime": 264.0121,
2636
+ "eval_samples_per_second": 3.113,
2637
+ "eval_steps_per_second": 1.557,
2638
+ "step": 7300
2639
+ },
2640
+ {
2641
+ "epoch": 0.1465,
2642
+ "grad_norm": 0.5787283610065107,
2643
+ "learning_rate": 9.483555555555556e-06,
2644
+ "loss": 2.3385,
2645
+ "step": 7325
2646
+ },
2647
+ {
2648
+ "epoch": 0.147,
2649
+ "grad_norm": 0.5894250508009748,
2650
+ "learning_rate": 9.478e-06,
2651
+ "loss": 2.3289,
2652
+ "step": 7350
2653
+ },
2654
+ {
2655
+ "epoch": 0.1475,
2656
+ "grad_norm": 0.5698558287850775,
2657
+ "learning_rate": 9.472444444444445e-06,
2658
+ "loss": 2.3363,
2659
+ "step": 7375
2660
+ },
2661
+ {
2662
+ "epoch": 0.148,
2663
+ "grad_norm": 0.5704695535231787,
2664
+ "learning_rate": 9.46688888888889e-06,
2665
+ "loss": 2.3245,
2666
+ "step": 7400
2667
+ },
2668
+ {
2669
+ "epoch": 0.148,
2670
+ "eval_loss": 2.4338371753692627,
2671
+ "eval_runtime": 264.1068,
2672
+ "eval_samples_per_second": 3.112,
2673
+ "eval_steps_per_second": 1.556,
2674
+ "step": 7400
2675
+ },
2676
+ {
2677
+ "epoch": 0.1485,
2678
+ "grad_norm": 0.5452782996001769,
2679
+ "learning_rate": 9.461333333333334e-06,
2680
+ "loss": 2.3442,
2681
+ "step": 7425
2682
+ },
2683
+ {
2684
+ "epoch": 0.149,
2685
+ "grad_norm": 0.5741037001956839,
2686
+ "learning_rate": 9.455777777777777e-06,
2687
+ "loss": 2.3349,
2688
+ "step": 7450
2689
+ },
2690
+ {
2691
+ "epoch": 0.1495,
2692
+ "grad_norm": 0.5570524045425876,
2693
+ "learning_rate": 9.450222222222223e-06,
2694
+ "loss": 2.3324,
2695
+ "step": 7475
2696
+ },
2697
+ {
2698
+ "epoch": 0.15,
2699
+ "grad_norm": 0.5701333037498688,
2700
+ "learning_rate": 9.444666666666668e-06,
2701
+ "loss": 2.3268,
2702
+ "step": 7500
2703
+ },
2704
+ {
2705
+ "epoch": 0.15,
2706
+ "eval_loss": 2.4347753524780273,
2707
+ "eval_runtime": 264.1822,
2708
+ "eval_samples_per_second": 3.111,
2709
+ "eval_steps_per_second": 1.556,
2710
+ "step": 7500
2711
+ },
2712
+ {
2713
+ "epoch": 0.1505,
2714
+ "grad_norm": 0.5636194713998469,
2715
+ "learning_rate": 9.439111111111111e-06,
2716
+ "loss": 2.3324,
2717
+ "step": 7525
2718
+ },
2719
+ {
2720
+ "epoch": 0.151,
2721
+ "grad_norm": 0.5745462812172999,
2722
+ "learning_rate": 9.433555555555557e-06,
2723
+ "loss": 2.3438,
2724
+ "step": 7550
2725
+ },
2726
+ {
2727
+ "epoch": 0.1515,
2728
+ "grad_norm": 0.5658180287749817,
2729
+ "learning_rate": 9.428e-06,
2730
+ "loss": 2.3272,
2731
+ "step": 7575
2732
+ },
2733
+ {
2734
+ "epoch": 0.152,
2735
+ "grad_norm": 0.5590021944536283,
2736
+ "learning_rate": 9.422444444444445e-06,
2737
+ "loss": 2.3379,
2738
+ "step": 7600
2739
+ },
2740
+ {
2741
+ "epoch": 0.152,
2742
+ "eval_loss": 2.43342924118042,
2743
+ "eval_runtime": 264.6073,
2744
+ "eval_samples_per_second": 3.106,
2745
+ "eval_steps_per_second": 1.553,
2746
+ "step": 7600
2747
+ },
2748
+ {
2749
+ "epoch": 0.1525,
2750
+ "grad_norm": 0.5756847823781959,
2751
+ "learning_rate": 9.41688888888889e-06,
2752
+ "loss": 2.3291,
2753
+ "step": 7625
2754
+ },
2755
+ {
2756
+ "epoch": 0.153,
2757
+ "grad_norm": 0.5614727649452073,
2758
+ "learning_rate": 9.411333333333334e-06,
2759
+ "loss": 2.3164,
2760
+ "step": 7650
2761
+ },
2762
+ {
2763
+ "epoch": 0.1535,
2764
+ "grad_norm": 0.581410678990456,
2765
+ "learning_rate": 9.405777777777778e-06,
2766
+ "loss": 2.3205,
2767
+ "step": 7675
2768
+ },
2769
+ {
2770
+ "epoch": 0.154,
2771
+ "grad_norm": 0.6063515370764081,
2772
+ "learning_rate": 9.400222222222223e-06,
2773
+ "loss": 2.3331,
2774
+ "step": 7700
2775
+ },
2776
+ {
2777
+ "epoch": 0.154,
2778
+ "eval_loss": 2.435711622238159,
2779
+ "eval_runtime": 283.6724,
2780
+ "eval_samples_per_second": 2.898,
2781
+ "eval_steps_per_second": 1.449,
2782
+ "step": 7700
2783
+ },
2784
+ {
2785
+ "epoch": 0.1545,
2786
+ "grad_norm": 0.5535459156675728,
2787
+ "learning_rate": 9.394666666666668e-06,
2788
+ "loss": 2.3312,
2789
+ "step": 7725
2790
+ },
2791
+ {
2792
+ "epoch": 0.155,
2793
+ "grad_norm": 0.5550223235337549,
2794
+ "learning_rate": 9.389111111111112e-06,
2795
+ "loss": 2.3222,
2796
+ "step": 7750
2797
+ },
2798
+ {
2799
+ "epoch": 0.1555,
2800
+ "grad_norm": 0.5661396564004607,
2801
+ "learning_rate": 9.383555555555557e-06,
2802
+ "loss": 2.329,
2803
+ "step": 7775
2804
+ },
2805
+ {
2806
+ "epoch": 0.156,
2807
+ "grad_norm": 0.5754229466302317,
2808
+ "learning_rate": 9.378e-06,
2809
+ "loss": 2.3375,
2810
+ "step": 7800
2811
+ },
2812
+ {
2813
+ "epoch": 0.156,
2814
+ "eval_loss": 2.4339263439178467,
2815
+ "eval_runtime": 263.7245,
2816
+ "eval_samples_per_second": 3.117,
2817
+ "eval_steps_per_second": 1.558,
2818
+ "step": 7800
2819
+ },
2820
+ {
2821
+ "epoch": 0.1565,
2822
+ "grad_norm": 0.5922113870936093,
2823
+ "learning_rate": 9.372444444444446e-06,
2824
+ "loss": 2.3326,
2825
+ "step": 7825
2826
+ },
2827
+ {
2828
+ "epoch": 0.157,
2829
+ "grad_norm": 0.5802231546249389,
2830
+ "learning_rate": 9.36688888888889e-06,
2831
+ "loss": 2.3313,
2832
+ "step": 7850
2833
+ },
2834
+ {
2835
+ "epoch": 0.1575,
2836
+ "grad_norm": 0.5613750089293277,
2837
+ "learning_rate": 9.361333333333335e-06,
2838
+ "loss": 2.3306,
2839
+ "step": 7875
2840
+ },
2841
+ {
2842
+ "epoch": 0.158,
2843
+ "grad_norm": 0.5554952690049914,
2844
+ "learning_rate": 9.355777777777778e-06,
2845
+ "loss": 2.3307,
2846
+ "step": 7900
2847
+ },
2848
+ {
2849
+ "epoch": 0.158,
2850
+ "eval_loss": 2.435500144958496,
2851
+ "eval_runtime": 268.1064,
2852
+ "eval_samples_per_second": 3.066,
2853
+ "eval_steps_per_second": 1.533,
2854
+ "step": 7900
2855
+ },
2856
+ {
2857
+ "epoch": 0.1585,
2858
+ "grad_norm": 0.5699743157285643,
2859
+ "learning_rate": 9.350222222222224e-06,
2860
+ "loss": 2.3274,
2861
+ "step": 7925
2862
+ },
2863
+ {
2864
+ "epoch": 0.159,
2865
+ "grad_norm": 0.580771514541295,
2866
+ "learning_rate": 9.344666666666667e-06,
2867
+ "loss": 2.3238,
2868
+ "step": 7950
2869
+ },
2870
+ {
2871
+ "epoch": 0.1595,
2872
+ "grad_norm": 0.563419791930312,
2873
+ "learning_rate": 9.339111111111112e-06,
2874
+ "loss": 2.3384,
2875
+ "step": 7975
2876
+ },
2877
+ {
2878
+ "epoch": 0.16,
2879
+ "grad_norm": 0.5793778749938447,
2880
+ "learning_rate": 9.333555555555558e-06,
2881
+ "loss": 2.3291,
2882
+ "step": 8000
2883
+ },
2884
+ {
2885
+ "epoch": 0.16,
2886
+ "eval_loss": 2.4343531131744385,
2887
+ "eval_runtime": 263.9111,
2888
+ "eval_samples_per_second": 3.115,
2889
+ "eval_steps_per_second": 1.557,
2890
+ "step": 8000
2891
  }
2892
  ],
2893
  "logging_steps": 25,
 
2907
  "attributes": {}
2908
  }
2909
  },
2910
+ "total_flos": 2.546561838661763e+19,
2911
  "train_batch_size": 1,
2912
  "trial_name": null,
2913
  "trial_params": null