Safetensors
English
Kirili4ik commited on
Commit
5388de3
·
verified ·
1 Parent(s): 366ddac

Delete trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +0 -3586
trainer_state.json DELETED
@@ -1,3586 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 10.353982300884956,
6
- "eval_steps": 500,
7
- "global_step": 12000,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.021584286639326572,
14
- "grad_norm": 1.5234375,
15
- "learning_rate": 8.000000000000001e-06,
16
- "loss": 3.7923,
17
- "step": 25
18
- },
19
- {
20
- "epoch": 0.043168573278653144,
21
- "grad_norm": 0.6796875,
22
- "learning_rate": 1.6333333333333335e-05,
23
- "loss": 3.8062,
24
- "step": 50
25
- },
26
- {
27
- "epoch": 0.06475285991797972,
28
- "grad_norm": 0.98828125,
29
- "learning_rate": 2.466666666666667e-05,
30
- "loss": 3.5078,
31
- "step": 75
32
- },
33
- {
34
- "epoch": 0.08633714655730629,
35
- "grad_norm": 1.53125,
36
- "learning_rate": 3.3e-05,
37
- "loss": 2.8861,
38
- "step": 100
39
- },
40
- {
41
- "epoch": 0.10792143319663285,
42
- "grad_norm": 0.48046875,
43
- "learning_rate": 4.133333333333333e-05,
44
- "loss": 2.6734,
45
- "step": 125
46
- },
47
- {
48
- "epoch": 0.12950571983595943,
49
- "grad_norm": 0.5625,
50
- "learning_rate": 4.966666666666667e-05,
51
- "loss": 2.5981,
52
- "step": 150
53
- },
54
- {
55
- "epoch": 0.151090006475286,
56
- "grad_norm": 0.2021484375,
57
- "learning_rate": 5.8e-05,
58
- "loss": 2.4867,
59
- "step": 175
60
- },
61
- {
62
- "epoch": 0.17267429311461258,
63
- "grad_norm": 0.455078125,
64
- "learning_rate": 6.633333333333334e-05,
65
- "loss": 2.4344,
66
- "step": 200
67
- },
68
- {
69
- "epoch": 0.19425857975393912,
70
- "grad_norm": 1.3359375,
71
- "learning_rate": 7.466666666666667e-05,
72
- "loss": 2.5441,
73
- "step": 225
74
- },
75
- {
76
- "epoch": 0.2158428663932657,
77
- "grad_norm": 0.1552734375,
78
- "learning_rate": 8.3e-05,
79
- "loss": 2.4667,
80
- "step": 250
81
- },
82
- {
83
- "epoch": 0.23742715303259226,
84
- "grad_norm": 0.357421875,
85
- "learning_rate": 9.133333333333334e-05,
86
- "loss": 2.4935,
87
- "step": 275
88
- },
89
- {
90
- "epoch": 0.25901143967191886,
91
- "grad_norm": 0.19140625,
92
- "learning_rate": 9.966666666666667e-05,
93
- "loss": 2.4301,
94
- "step": 300
95
- },
96
- {
97
- "epoch": 0.2805957263112454,
98
- "grad_norm": 0.1474609375,
99
- "learning_rate": 9.997592778335006e-05,
100
- "loss": 2.526,
101
- "step": 325
102
- },
103
- {
104
- "epoch": 0.302180012950572,
105
- "grad_norm": 0.2041015625,
106
- "learning_rate": 9.995085255767302e-05,
107
- "loss": 2.5079,
108
- "step": 350
109
- },
110
- {
111
- "epoch": 0.32376429958989855,
112
- "grad_norm": 0.25,
113
- "learning_rate": 9.9925777331996e-05,
114
- "loss": 2.5088,
115
- "step": 375
116
- },
117
- {
118
- "epoch": 0.34534858622922515,
119
- "grad_norm": 0.138671875,
120
- "learning_rate": 9.990070210631897e-05,
121
- "loss": 2.5054,
122
- "step": 400
123
- },
124
- {
125
- "epoch": 0.3669328728685517,
126
- "grad_norm": 0.291015625,
127
- "learning_rate": 9.987562688064192e-05,
128
- "loss": 2.5176,
129
- "step": 425
130
- },
131
- {
132
- "epoch": 0.38851715950787824,
133
- "grad_norm": 0.265625,
134
- "learning_rate": 9.98505516549649e-05,
135
- "loss": 2.3518,
136
- "step": 450
137
- },
138
- {
139
- "epoch": 0.41010144614720484,
140
- "grad_norm": 0.6640625,
141
- "learning_rate": 9.982547642928787e-05,
142
- "loss": 2.4348,
143
- "step": 475
144
- },
145
- {
146
- "epoch": 0.4316857327865314,
147
- "grad_norm": 0.1748046875,
148
- "learning_rate": 9.980040120361083e-05,
149
- "loss": 2.4306,
150
- "step": 500
151
- },
152
- {
153
- "epoch": 0.4316857327865314,
154
- "eval_loss": 2.454599618911743,
155
- "eval_runtime": 119.0408,
156
- "eval_samples_per_second": 4.2,
157
- "eval_steps_per_second": 4.2,
158
- "step": 500
159
- },
160
- {
161
- "epoch": 0.453270019425858,
162
- "grad_norm": 0.201171875,
163
- "learning_rate": 9.977532597793381e-05,
164
- "loss": 2.4055,
165
- "step": 525
166
- },
167
- {
168
- "epoch": 0.47485430606518453,
169
- "grad_norm": 0.62109375,
170
- "learning_rate": 9.975025075225678e-05,
171
- "loss": 2.4223,
172
- "step": 550
173
- },
174
- {
175
- "epoch": 0.49643859270451113,
176
- "grad_norm": 0.1474609375,
177
- "learning_rate": 9.972517552657973e-05,
178
- "loss": 2.3973,
179
- "step": 575
180
- },
181
- {
182
- "epoch": 0.5180228793438377,
183
- "grad_norm": 0.1806640625,
184
- "learning_rate": 9.970010030090271e-05,
185
- "loss": 2.477,
186
- "step": 600
187
- },
188
- {
189
- "epoch": 0.5396071659831643,
190
- "grad_norm": 0.166015625,
191
- "learning_rate": 9.967502507522568e-05,
192
- "loss": 2.447,
193
- "step": 625
194
- },
195
- {
196
- "epoch": 0.5611914526224908,
197
- "grad_norm": 0.197265625,
198
- "learning_rate": 9.964994984954865e-05,
199
- "loss": 2.4324,
200
- "step": 650
201
- },
202
- {
203
- "epoch": 0.5827757392618174,
204
- "grad_norm": 0.1845703125,
205
- "learning_rate": 9.962487462387163e-05,
206
- "loss": 2.3273,
207
- "step": 675
208
- },
209
- {
210
- "epoch": 0.604360025901144,
211
- "grad_norm": 0.1630859375,
212
- "learning_rate": 9.959979939819459e-05,
213
- "loss": 2.4169,
214
- "step": 700
215
- },
216
- {
217
- "epoch": 0.6259443125404706,
218
- "grad_norm": 0.166015625,
219
- "learning_rate": 9.957472417251756e-05,
220
- "loss": 2.4924,
221
- "step": 725
222
- },
223
- {
224
- "epoch": 0.6475285991797971,
225
- "grad_norm": 0.1884765625,
226
- "learning_rate": 9.954964894684052e-05,
227
- "loss": 2.5247,
228
- "step": 750
229
- },
230
- {
231
- "epoch": 0.6691128858191236,
232
- "grad_norm": 0.1904296875,
233
- "learning_rate": 9.952457372116349e-05,
234
- "loss": 2.4709,
235
- "step": 775
236
- },
237
- {
238
- "epoch": 0.6906971724584503,
239
- "grad_norm": 0.1845703125,
240
- "learning_rate": 9.949949849548647e-05,
241
- "loss": 2.369,
242
- "step": 800
243
- },
244
- {
245
- "epoch": 0.7122814590977768,
246
- "grad_norm": 0.1630859375,
247
- "learning_rate": 9.947442326980944e-05,
248
- "loss": 2.5355,
249
- "step": 825
250
- },
251
- {
252
- "epoch": 0.7338657457371034,
253
- "grad_norm": 0.2109375,
254
- "learning_rate": 9.94493480441324e-05,
255
- "loss": 2.4896,
256
- "step": 850
257
- },
258
- {
259
- "epoch": 0.7554500323764299,
260
- "grad_norm": 0.1943359375,
261
- "learning_rate": 9.942427281845537e-05,
262
- "loss": 2.4591,
263
- "step": 875
264
- },
265
- {
266
- "epoch": 0.7770343190157565,
267
- "grad_norm": 0.140625,
268
- "learning_rate": 9.939919759277834e-05,
269
- "loss": 2.369,
270
- "step": 900
271
- },
272
- {
273
- "epoch": 0.7986186056550831,
274
- "grad_norm": 0.21484375,
275
- "learning_rate": 9.93741223671013e-05,
276
- "loss": 2.4518,
277
- "step": 925
278
- },
279
- {
280
- "epoch": 0.8202028922944097,
281
- "grad_norm": 0.1962890625,
282
- "learning_rate": 9.934904714142428e-05,
283
- "loss": 2.3972,
284
- "step": 950
285
- },
286
- {
287
- "epoch": 0.8417871789337362,
288
- "grad_norm": 0.26953125,
289
- "learning_rate": 9.932397191574725e-05,
290
- "loss": 2.4437,
291
- "step": 975
292
- },
293
- {
294
- "epoch": 0.8633714655730628,
295
- "grad_norm": 0.255859375,
296
- "learning_rate": 9.929889669007022e-05,
297
- "loss": 2.3575,
298
- "step": 1000
299
- },
300
- {
301
- "epoch": 0.8633714655730628,
302
- "eval_loss": 2.429636240005493,
303
- "eval_runtime": 118.4486,
304
- "eval_samples_per_second": 4.221,
305
- "eval_steps_per_second": 4.221,
306
- "step": 1000
307
- },
308
- {
309
- "epoch": 0.8849557522123894,
310
- "grad_norm": 0.197265625,
311
- "learning_rate": 9.927382146439318e-05,
312
- "loss": 2.3569,
313
- "step": 1025
314
- },
315
- {
316
- "epoch": 0.906540038851716,
317
- "grad_norm": 0.142578125,
318
- "learning_rate": 9.924874623871615e-05,
319
- "loss": 2.4218,
320
- "step": 1050
321
- },
322
- {
323
- "epoch": 0.9281243254910425,
324
- "grad_norm": 0.2373046875,
325
- "learning_rate": 9.922367101303913e-05,
326
- "loss": 2.4717,
327
- "step": 1075
328
- },
329
- {
330
- "epoch": 0.9497086121303691,
331
- "grad_norm": 0.162109375,
332
- "learning_rate": 9.91985957873621e-05,
333
- "loss": 2.3969,
334
- "step": 1100
335
- },
336
- {
337
- "epoch": 0.9712928987696957,
338
- "grad_norm": 0.171875,
339
- "learning_rate": 9.917352056168506e-05,
340
- "loss": 2.462,
341
- "step": 1125
342
- },
343
- {
344
- "epoch": 0.9928771854090223,
345
- "grad_norm": 0.177734375,
346
- "learning_rate": 9.914844533600803e-05,
347
- "loss": 2.4901,
348
- "step": 1150
349
- },
350
- {
351
- "epoch": 1.013813943449169,
352
- "grad_norm": 0.1943359375,
353
- "learning_rate": 9.912337011033099e-05,
354
- "loss": 2.2822,
355
- "step": 1175
356
- },
357
- {
358
- "epoch": 1.0353982300884956,
359
- "grad_norm": 0.2001953125,
360
- "learning_rate": 9.909829488465396e-05,
361
- "loss": 2.3955,
362
- "step": 1200
363
- },
364
- {
365
- "epoch": 1.0569825167278222,
366
- "grad_norm": 0.169921875,
367
- "learning_rate": 9.907321965897694e-05,
368
- "loss": 2.431,
369
- "step": 1225
370
- },
371
- {
372
- "epoch": 1.0785668033671487,
373
- "grad_norm": 0.26171875,
374
- "learning_rate": 9.90481444332999e-05,
375
- "loss": 2.3124,
376
- "step": 1250
377
- },
378
- {
379
- "epoch": 1.1001510900064753,
380
- "grad_norm": 0.201171875,
381
- "learning_rate": 9.902306920762287e-05,
382
- "loss": 2.3362,
383
- "step": 1275
384
- },
385
- {
386
- "epoch": 1.1217353766458018,
387
- "grad_norm": 0.45703125,
388
- "learning_rate": 9.899799398194585e-05,
389
- "loss": 2.2931,
390
- "step": 1300
391
- },
392
- {
393
- "epoch": 1.1433196632851284,
394
- "grad_norm": 1.1640625,
395
- "learning_rate": 9.89729187562688e-05,
396
- "loss": 2.0676,
397
- "step": 1325
398
- },
399
- {
400
- "epoch": 1.164903949924455,
401
- "grad_norm": 0.283203125,
402
- "learning_rate": 9.894784353059177e-05,
403
- "loss": 1.5906,
404
- "step": 1350
405
- },
406
- {
407
- "epoch": 1.1864882365637817,
408
- "grad_norm": 0.291015625,
409
- "learning_rate": 9.892276830491475e-05,
410
- "loss": 1.5273,
411
- "step": 1375
412
- },
413
- {
414
- "epoch": 1.2080725232031082,
415
- "grad_norm": 0.265625,
416
- "learning_rate": 9.889769307923772e-05,
417
- "loss": 1.3603,
418
- "step": 1400
419
- },
420
- {
421
- "epoch": 1.2296568098424348,
422
- "grad_norm": 0.37109375,
423
- "learning_rate": 9.887261785356068e-05,
424
- "loss": 1.4502,
425
- "step": 1425
426
- },
427
- {
428
- "epoch": 1.2512410964817613,
429
- "grad_norm": 0.36328125,
430
- "learning_rate": 9.884754262788366e-05,
431
- "loss": 1.3359,
432
- "step": 1450
433
- },
434
- {
435
- "epoch": 1.2728253831210878,
436
- "grad_norm": 0.2734375,
437
- "learning_rate": 9.882246740220662e-05,
438
- "loss": 1.1317,
439
- "step": 1475
440
- },
441
- {
442
- "epoch": 1.2944096697604144,
443
- "grad_norm": 0.26171875,
444
- "learning_rate": 9.87973921765296e-05,
445
- "loss": 1.4062,
446
- "step": 1500
447
- },
448
- {
449
- "epoch": 1.2944096697604144,
450
- "eval_loss": 1.3137363195419312,
451
- "eval_runtime": 118.9724,
452
- "eval_samples_per_second": 4.203,
453
- "eval_steps_per_second": 4.203,
454
- "step": 1500
455
- },
456
- {
457
- "epoch": 1.315993956399741,
458
- "grad_norm": 0.2470703125,
459
- "learning_rate": 9.877231695085256e-05,
460
- "loss": 1.401,
461
- "step": 1525
462
- },
463
- {
464
- "epoch": 1.3375782430390677,
465
- "grad_norm": 0.365234375,
466
- "learning_rate": 9.874724172517553e-05,
467
- "loss": 1.2082,
468
- "step": 1550
469
- },
470
- {
471
- "epoch": 1.359162529678394,
472
- "grad_norm": 0.283203125,
473
- "learning_rate": 9.872216649949851e-05,
474
- "loss": 1.2937,
475
- "step": 1575
476
- },
477
- {
478
- "epoch": 1.3807468163177208,
479
- "grad_norm": 0.240234375,
480
- "learning_rate": 9.869709127382148e-05,
481
- "loss": 1.3758,
482
- "step": 1600
483
- },
484
- {
485
- "epoch": 1.4023311029570473,
486
- "grad_norm": 0.2490234375,
487
- "learning_rate": 9.867201604814443e-05,
488
- "loss": 1.2571,
489
- "step": 1625
490
- },
491
- {
492
- "epoch": 1.4239153895963739,
493
- "grad_norm": 0.2080078125,
494
- "learning_rate": 9.864694082246741e-05,
495
- "loss": 1.3575,
496
- "step": 1650
497
- },
498
- {
499
- "epoch": 1.4454996762357004,
500
- "grad_norm": 0.2412109375,
501
- "learning_rate": 9.862186559679037e-05,
502
- "loss": 1.1738,
503
- "step": 1675
504
- },
505
- {
506
- "epoch": 1.467083962875027,
507
- "grad_norm": 0.232421875,
508
- "learning_rate": 9.859679037111334e-05,
509
- "loss": 1.3577,
510
- "step": 1700
511
- },
512
- {
513
- "epoch": 1.4886682495143535,
514
- "grad_norm": 0.2255859375,
515
- "learning_rate": 9.857171514543632e-05,
516
- "loss": 1.3506,
517
- "step": 1725
518
- },
519
- {
520
- "epoch": 1.51025253615368,
521
- "grad_norm": 0.27734375,
522
- "learning_rate": 9.854663991975929e-05,
523
- "loss": 1.3018,
524
- "step": 1750
525
- },
526
- {
527
- "epoch": 1.5318368227930068,
528
- "grad_norm": 0.271484375,
529
- "learning_rate": 9.852156469408225e-05,
530
- "loss": 1.2907,
531
- "step": 1775
532
- },
533
- {
534
- "epoch": 1.5534211094323331,
535
- "grad_norm": 0.2255859375,
536
- "learning_rate": 9.849648946840522e-05,
537
- "loss": 1.2517,
538
- "step": 1800
539
- },
540
- {
541
- "epoch": 1.57500539607166,
542
- "grad_norm": 0.2138671875,
543
- "learning_rate": 9.847141424272819e-05,
544
- "loss": 1.2246,
545
- "step": 1825
546
- },
547
- {
548
- "epoch": 1.5965896827109864,
549
- "grad_norm": 0.255859375,
550
- "learning_rate": 9.844633901705115e-05,
551
- "loss": 1.3008,
552
- "step": 1850
553
- },
554
- {
555
- "epoch": 1.618173969350313,
556
- "grad_norm": 0.2158203125,
557
- "learning_rate": 9.842126379137413e-05,
558
- "loss": 1.1651,
559
- "step": 1875
560
- },
561
- {
562
- "epoch": 1.6397582559896395,
563
- "grad_norm": 0.212890625,
564
- "learning_rate": 9.83961885656971e-05,
565
- "loss": 1.1388,
566
- "step": 1900
567
- },
568
- {
569
- "epoch": 1.661342542628966,
570
- "grad_norm": 0.2177734375,
571
- "learning_rate": 9.837111334002006e-05,
572
- "loss": 1.0537,
573
- "step": 1925
574
- },
575
- {
576
- "epoch": 1.6829268292682928,
577
- "grad_norm": 0.2099609375,
578
- "learning_rate": 9.834603811434303e-05,
579
- "loss": 1.2838,
580
- "step": 1950
581
- },
582
- {
583
- "epoch": 1.7045111159076192,
584
- "grad_norm": 0.1943359375,
585
- "learning_rate": 9.8320962888666e-05,
586
- "loss": 1.1597,
587
- "step": 1975
588
- },
589
- {
590
- "epoch": 1.726095402546946,
591
- "grad_norm": 0.23828125,
592
- "learning_rate": 9.829588766298898e-05,
593
- "loss": 1.1234,
594
- "step": 2000
595
- },
596
- {
597
- "epoch": 1.726095402546946,
598
- "eval_loss": 1.2347378730773926,
599
- "eval_runtime": 118.1524,
600
- "eval_samples_per_second": 4.232,
601
- "eval_steps_per_second": 4.232,
602
- "step": 2000
603
- },
604
- {
605
- "epoch": 1.7476796891862723,
606
- "grad_norm": 0.232421875,
607
- "learning_rate": 9.827081243731194e-05,
608
- "loss": 1.1737,
609
- "step": 2025
610
- },
611
- {
612
- "epoch": 1.769263975825599,
613
- "grad_norm": 0.1767578125,
614
- "learning_rate": 9.824573721163491e-05,
615
- "loss": 1.1409,
616
- "step": 2050
617
- },
618
- {
619
- "epoch": 1.7908482624649256,
620
- "grad_norm": 0.1962890625,
621
- "learning_rate": 9.822066198595788e-05,
622
- "loss": 1.3478,
623
- "step": 2075
624
- },
625
- {
626
- "epoch": 1.812432549104252,
627
- "grad_norm": 0.283203125,
628
- "learning_rate": 9.819558676028084e-05,
629
- "loss": 1.226,
630
- "step": 2100
631
- },
632
- {
633
- "epoch": 1.8340168357435787,
634
- "grad_norm": 0.2119140625,
635
- "learning_rate": 9.817051153460381e-05,
636
- "loss": 1.1638,
637
- "step": 2125
638
- },
639
- {
640
- "epoch": 1.8556011223829052,
641
- "grad_norm": 0.271484375,
642
- "learning_rate": 9.814543630892679e-05,
643
- "loss": 1.2201,
644
- "step": 2150
645
- },
646
- {
647
- "epoch": 1.877185409022232,
648
- "grad_norm": 0.1884765625,
649
- "learning_rate": 9.812036108324976e-05,
650
- "loss": 1.2503,
651
- "step": 2175
652
- },
653
- {
654
- "epoch": 1.8987696956615583,
655
- "grad_norm": 0.220703125,
656
- "learning_rate": 9.809528585757272e-05,
657
- "loss": 1.4087,
658
- "step": 2200
659
- },
660
- {
661
- "epoch": 1.920353982300885,
662
- "grad_norm": 0.22265625,
663
- "learning_rate": 9.807021063189569e-05,
664
- "loss": 1.1294,
665
- "step": 2225
666
- },
667
- {
668
- "epoch": 1.9419382689402114,
669
- "grad_norm": 0.220703125,
670
- "learning_rate": 9.804513540621865e-05,
671
- "loss": 1.2414,
672
- "step": 2250
673
- },
674
- {
675
- "epoch": 1.9635225555795381,
676
- "grad_norm": 0.2578125,
677
- "learning_rate": 9.802006018054163e-05,
678
- "loss": 1.151,
679
- "step": 2275
680
- },
681
- {
682
- "epoch": 1.9851068422188647,
683
- "grad_norm": 0.2119140625,
684
- "learning_rate": 9.79949849548646e-05,
685
- "loss": 1.1744,
686
- "step": 2300
687
- },
688
- {
689
- "epoch": 2.0060436002590114,
690
- "grad_norm": 0.2119140625,
691
- "learning_rate": 9.796990972918757e-05,
692
- "loss": 1.1466,
693
- "step": 2325
694
- },
695
- {
696
- "epoch": 2.027627886898338,
697
- "grad_norm": 0.2578125,
698
- "learning_rate": 9.794483450351055e-05,
699
- "loss": 1.285,
700
- "step": 2350
701
- },
702
- {
703
- "epoch": 2.0492121735376645,
704
- "grad_norm": 0.240234375,
705
- "learning_rate": 9.79197592778335e-05,
706
- "loss": 1.2525,
707
- "step": 2375
708
- },
709
- {
710
- "epoch": 2.0707964601769913,
711
- "grad_norm": 0.1806640625,
712
- "learning_rate": 9.789468405215647e-05,
713
- "loss": 1.1043,
714
- "step": 2400
715
- },
716
- {
717
- "epoch": 2.0923807468163176,
718
- "grad_norm": 0.2490234375,
719
- "learning_rate": 9.786960882647945e-05,
720
- "loss": 1.151,
721
- "step": 2425
722
- },
723
- {
724
- "epoch": 2.1139650334556443,
725
- "grad_norm": 0.1943359375,
726
- "learning_rate": 9.784453360080241e-05,
727
- "loss": 1.2291,
728
- "step": 2450
729
- },
730
- {
731
- "epoch": 2.1355493200949707,
732
- "grad_norm": 0.2392578125,
733
- "learning_rate": 9.781945837512538e-05,
734
- "loss": 1.201,
735
- "step": 2475
736
- },
737
- {
738
- "epoch": 2.1571336067342974,
739
- "grad_norm": 0.259765625,
740
- "learning_rate": 9.779438314944836e-05,
741
- "loss": 1.1237,
742
- "step": 2500
743
- },
744
- {
745
- "epoch": 2.1571336067342974,
746
- "eval_loss": 1.2181166410446167,
747
- "eval_runtime": 116.187,
748
- "eval_samples_per_second": 4.303,
749
- "eval_steps_per_second": 4.303,
750
- "step": 2500
751
- },
752
- {
753
- "epoch": 2.178717893373624,
754
- "grad_norm": 0.2138671875,
755
- "learning_rate": 9.776930792377131e-05,
756
- "loss": 1.1879,
757
- "step": 2525
758
- },
759
- {
760
- "epoch": 2.2003021800129505,
761
- "grad_norm": 0.2158203125,
762
- "learning_rate": 9.774423269809428e-05,
763
- "loss": 1.2643,
764
- "step": 2550
765
- },
766
- {
767
- "epoch": 2.2218864666522773,
768
- "grad_norm": 0.1796875,
769
- "learning_rate": 9.771915747241726e-05,
770
- "loss": 1.1984,
771
- "step": 2575
772
- },
773
- {
774
- "epoch": 2.2434707532916036,
775
- "grad_norm": 0.279296875,
776
- "learning_rate": 9.769408224674022e-05,
777
- "loss": 1.2307,
778
- "step": 2600
779
- },
780
- {
781
- "epoch": 2.2650550399309304,
782
- "grad_norm": 0.19921875,
783
- "learning_rate": 9.766900702106319e-05,
784
- "loss": 1.2687,
785
- "step": 2625
786
- },
787
- {
788
- "epoch": 2.2866393265702567,
789
- "grad_norm": 0.232421875,
790
- "learning_rate": 9.764393179538617e-05,
791
- "loss": 1.2413,
792
- "step": 2650
793
- },
794
- {
795
- "epoch": 2.3082236132095835,
796
- "grad_norm": 0.21875,
797
- "learning_rate": 9.761885656970912e-05,
798
- "loss": 1.161,
799
- "step": 2675
800
- },
801
- {
802
- "epoch": 2.32980789984891,
803
- "grad_norm": 0.1875,
804
- "learning_rate": 9.75937813440321e-05,
805
- "loss": 1.3196,
806
- "step": 2700
807
- },
808
- {
809
- "epoch": 2.3513921864882366,
810
- "grad_norm": 0.20703125,
811
- "learning_rate": 9.756870611835507e-05,
812
- "loss": 1.1376,
813
- "step": 2725
814
- },
815
- {
816
- "epoch": 2.3729764731275633,
817
- "grad_norm": 0.29296875,
818
- "learning_rate": 9.754363089267804e-05,
819
- "loss": 1.2897,
820
- "step": 2750
821
- },
822
- {
823
- "epoch": 2.3945607597668896,
824
- "grad_norm": 0.201171875,
825
- "learning_rate": 9.751855566700102e-05,
826
- "loss": 1.1078,
827
- "step": 2775
828
- },
829
- {
830
- "epoch": 2.4161450464062164,
831
- "grad_norm": 0.173828125,
832
- "learning_rate": 9.749348044132398e-05,
833
- "loss": 1.1484,
834
- "step": 2800
835
- },
836
- {
837
- "epoch": 2.4377293330455427,
838
- "grad_norm": 0.220703125,
839
- "learning_rate": 9.746840521564694e-05,
840
- "loss": 1.154,
841
- "step": 2825
842
- },
843
- {
844
- "epoch": 2.4593136196848695,
845
- "grad_norm": 0.20703125,
846
- "learning_rate": 9.744332998996991e-05,
847
- "loss": 1.2163,
848
- "step": 2850
849
- },
850
- {
851
- "epoch": 2.480897906324196,
852
- "grad_norm": 0.1904296875,
853
- "learning_rate": 9.741825476429288e-05,
854
- "loss": 1.2316,
855
- "step": 2875
856
- },
857
- {
858
- "epoch": 2.5024821929635226,
859
- "grad_norm": 0.1845703125,
860
- "learning_rate": 9.739317953861585e-05,
861
- "loss": 1.1632,
862
- "step": 2900
863
- },
864
- {
865
- "epoch": 2.5240664796028494,
866
- "grad_norm": 0.2119140625,
867
- "learning_rate": 9.736810431293883e-05,
868
- "loss": 1.239,
869
- "step": 2925
870
- },
871
- {
872
- "epoch": 2.5456507662421757,
873
- "grad_norm": 0.2216796875,
874
- "learning_rate": 9.73430290872618e-05,
875
- "loss": 1.2671,
876
- "step": 2950
877
- },
878
- {
879
- "epoch": 2.567235052881502,
880
- "grad_norm": 0.1865234375,
881
- "learning_rate": 9.731795386158476e-05,
882
- "loss": 1.2046,
883
- "step": 2975
884
- },
885
- {
886
- "epoch": 2.5888193395208288,
887
- "grad_norm": 0.1826171875,
888
- "learning_rate": 9.729287863590773e-05,
889
- "loss": 1.108,
890
- "step": 3000
891
- },
892
- {
893
- "epoch": 2.5888193395208288,
894
- "eval_loss": 1.2088854312896729,
895
- "eval_runtime": 118.1325,
896
- "eval_samples_per_second": 4.233,
897
- "eval_steps_per_second": 4.233,
898
- "step": 3000
899
- },
900
- {
901
- "epoch": 2.6104036261601555,
902
- "grad_norm": 0.1865234375,
903
- "learning_rate": 9.726780341023069e-05,
904
- "loss": 1.3243,
905
- "step": 3025
906
- },
907
- {
908
- "epoch": 2.631987912799482,
909
- "grad_norm": 0.1865234375,
910
- "learning_rate": 9.724272818455367e-05,
911
- "loss": 1.1085,
912
- "step": 3050
913
- },
914
- {
915
- "epoch": 2.6535721994388086,
916
- "grad_norm": 0.208984375,
917
- "learning_rate": 9.721765295887664e-05,
918
- "loss": 1.1862,
919
- "step": 3075
920
- },
921
- {
922
- "epoch": 2.6751564860781354,
923
- "grad_norm": 0.25390625,
924
- "learning_rate": 9.71925777331996e-05,
925
- "loss": 1.1625,
926
- "step": 3100
927
- },
928
- {
929
- "epoch": 2.6967407727174617,
930
- "grad_norm": 0.177734375,
931
- "learning_rate": 9.716750250752257e-05,
932
- "loss": 1.0797,
933
- "step": 3125
934
- },
935
- {
936
- "epoch": 2.718325059356788,
937
- "grad_norm": 0.2392578125,
938
- "learning_rate": 9.714242728184554e-05,
939
- "loss": 1.0413,
940
- "step": 3150
941
- },
942
- {
943
- "epoch": 2.739909345996115,
944
- "grad_norm": 0.1982421875,
945
- "learning_rate": 9.71173520561685e-05,
946
- "loss": 1.1582,
947
- "step": 3175
948
- },
949
- {
950
- "epoch": 2.7614936326354416,
951
- "grad_norm": 0.232421875,
952
- "learning_rate": 9.709227683049148e-05,
953
- "loss": 1.2064,
954
- "step": 3200
955
- },
956
- {
957
- "epoch": 2.783077919274768,
958
- "grad_norm": 0.333984375,
959
- "learning_rate": 9.706720160481445e-05,
960
- "loss": 1.1679,
961
- "step": 3225
962
- },
963
- {
964
- "epoch": 2.8046622059140947,
965
- "grad_norm": 0.2041015625,
966
- "learning_rate": 9.704212637913742e-05,
967
- "loss": 1.1783,
968
- "step": 3250
969
- },
970
- {
971
- "epoch": 2.826246492553421,
972
- "grad_norm": 0.2216796875,
973
- "learning_rate": 9.701705115346038e-05,
974
- "loss": 1.2746,
975
- "step": 3275
976
- },
977
- {
978
- "epoch": 2.8478307791927477,
979
- "grad_norm": 0.1806640625,
980
- "learning_rate": 9.699197592778335e-05,
981
- "loss": 1.0532,
982
- "step": 3300
983
- },
984
- {
985
- "epoch": 2.869415065832074,
986
- "grad_norm": 0.1943359375,
987
- "learning_rate": 9.696690070210632e-05,
988
- "loss": 0.9242,
989
- "step": 3325
990
- },
991
- {
992
- "epoch": 2.890999352471401,
993
- "grad_norm": 0.228515625,
994
- "learning_rate": 9.69418254764293e-05,
995
- "loss": 1.244,
996
- "step": 3350
997
- },
998
- {
999
- "epoch": 2.9125836391107276,
1000
- "grad_norm": 0.2421875,
1001
- "learning_rate": 9.691675025075226e-05,
1002
- "loss": 1.1166,
1003
- "step": 3375
1004
- },
1005
- {
1006
- "epoch": 2.934167925750054,
1007
- "grad_norm": 0.171875,
1008
- "learning_rate": 9.689167502507523e-05,
1009
- "loss": 1.1511,
1010
- "step": 3400
1011
- },
1012
- {
1013
- "epoch": 2.9557522123893807,
1014
- "grad_norm": 0.208984375,
1015
- "learning_rate": 9.68665997993982e-05,
1016
- "loss": 1.2977,
1017
- "step": 3425
1018
- },
1019
- {
1020
- "epoch": 2.977336499028707,
1021
- "grad_norm": 0.201171875,
1022
- "learning_rate": 9.684152457372116e-05,
1023
- "loss": 1.2038,
1024
- "step": 3450
1025
- },
1026
- {
1027
- "epoch": 2.9989207856680338,
1028
- "grad_norm": 0.1884765625,
1029
- "learning_rate": 9.681644934804414e-05,
1030
- "loss": 1.3379,
1031
- "step": 3475
1032
- },
1033
- {
1034
- "epoch": 3.0198575437081803,
1035
- "grad_norm": 0.232421875,
1036
- "learning_rate": 9.679137412236711e-05,
1037
- "loss": 1.0725,
1038
- "step": 3500
1039
- },
1040
- {
1041
- "epoch": 3.0198575437081803,
1042
- "eval_loss": 1.202246069908142,
1043
- "eval_runtime": 117.3316,
1044
- "eval_samples_per_second": 4.261,
1045
- "eval_steps_per_second": 4.261,
1046
- "step": 3500
1047
- },
1048
- {
1049
- "epoch": 3.041441830347507,
1050
- "grad_norm": 0.25390625,
1051
- "learning_rate": 9.676629889669007e-05,
1052
- "loss": 1.1343,
1053
- "step": 3525
1054
- },
1055
- {
1056
- "epoch": 3.0630261169868334,
1057
- "grad_norm": 0.19140625,
1058
- "learning_rate": 9.674122367101305e-05,
1059
- "loss": 1.2788,
1060
- "step": 3550
1061
- },
1062
- {
1063
- "epoch": 3.08461040362616,
1064
- "grad_norm": 0.203125,
1065
- "learning_rate": 9.671614844533601e-05,
1066
- "loss": 1.2856,
1067
- "step": 3575
1068
- },
1069
- {
1070
- "epoch": 3.106194690265487,
1071
- "grad_norm": 0.224609375,
1072
- "learning_rate": 9.669107321965897e-05,
1073
- "loss": 1.1778,
1074
- "step": 3600
1075
- },
1076
- {
1077
- "epoch": 3.127778976904813,
1078
- "grad_norm": 0.2451171875,
1079
- "learning_rate": 9.666599799398195e-05,
1080
- "loss": 1.1496,
1081
- "step": 3625
1082
- },
1083
- {
1084
- "epoch": 3.14936326354414,
1085
- "grad_norm": 0.279296875,
1086
- "learning_rate": 9.664092276830492e-05,
1087
- "loss": 1.1717,
1088
- "step": 3650
1089
- },
1090
- {
1091
- "epoch": 3.1709475501834663,
1092
- "grad_norm": 0.2275390625,
1093
- "learning_rate": 9.661584754262789e-05,
1094
- "loss": 1.2103,
1095
- "step": 3675
1096
- },
1097
- {
1098
- "epoch": 3.192531836822793,
1099
- "grad_norm": 0.197265625,
1100
- "learning_rate": 9.659077231695087e-05,
1101
- "loss": 1.1198,
1102
- "step": 3700
1103
- },
1104
- {
1105
- "epoch": 3.2141161234621194,
1106
- "grad_norm": 0.1884765625,
1107
- "learning_rate": 9.656569709127382e-05,
1108
- "loss": 1.1266,
1109
- "step": 3725
1110
- },
1111
- {
1112
- "epoch": 3.235700410101446,
1113
- "grad_norm": 0.1982421875,
1114
- "learning_rate": 9.65406218655968e-05,
1115
- "loss": 1.1171,
1116
- "step": 3750
1117
- },
1118
- {
1119
- "epoch": 3.257284696740773,
1120
- "grad_norm": 0.1943359375,
1121
- "learning_rate": 9.651554663991976e-05,
1122
- "loss": 1.2812,
1123
- "step": 3775
1124
- },
1125
- {
1126
- "epoch": 3.2788689833800992,
1127
- "grad_norm": 0.2451171875,
1128
- "learning_rate": 9.649047141424273e-05,
1129
- "loss": 1.0618,
1130
- "step": 3800
1131
- },
1132
- {
1133
- "epoch": 3.300453270019426,
1134
- "grad_norm": 0.20703125,
1135
- "learning_rate": 9.64653961885657e-05,
1136
- "loss": 1.1916,
1137
- "step": 3825
1138
- },
1139
- {
1140
- "epoch": 3.3220375566587523,
1141
- "grad_norm": 0.26171875,
1142
- "learning_rate": 9.644032096288868e-05,
1143
- "loss": 1.1048,
1144
- "step": 3850
1145
- },
1146
- {
1147
- "epoch": 3.343621843298079,
1148
- "grad_norm": 0.208984375,
1149
- "learning_rate": 9.641524573721163e-05,
1150
- "loss": 1.1324,
1151
- "step": 3875
1152
- },
1153
- {
1154
- "epoch": 3.3652061299374054,
1155
- "grad_norm": 0.20703125,
1156
- "learning_rate": 9.639017051153461e-05,
1157
- "loss": 1.2728,
1158
- "step": 3900
1159
- },
1160
- {
1161
- "epoch": 3.386790416576732,
1162
- "grad_norm": 0.1923828125,
1163
- "learning_rate": 9.636509528585758e-05,
1164
- "loss": 1.2898,
1165
- "step": 3925
1166
- },
1167
- {
1168
- "epoch": 3.408374703216059,
1169
- "grad_norm": 0.232421875,
1170
- "learning_rate": 9.634002006018054e-05,
1171
- "loss": 1.1316,
1172
- "step": 3950
1173
- },
1174
- {
1175
- "epoch": 3.4299589898553853,
1176
- "grad_norm": 0.197265625,
1177
- "learning_rate": 9.631494483450352e-05,
1178
- "loss": 1.2914,
1179
- "step": 3975
1180
- },
1181
- {
1182
- "epoch": 3.451543276494712,
1183
- "grad_norm": 0.197265625,
1184
- "learning_rate": 9.628986960882649e-05,
1185
- "loss": 1.2218,
1186
- "step": 4000
1187
- },
1188
- {
1189
- "epoch": 3.451543276494712,
1190
- "eval_loss": 1.202254295349121,
1191
- "eval_runtime": 118.0884,
1192
- "eval_samples_per_second": 4.234,
1193
- "eval_steps_per_second": 4.234,
1194
- "step": 4000
1195
- },
1196
- {
1197
- "epoch": 3.4731275631340384,
1198
- "grad_norm": 0.251953125,
1199
- "learning_rate": 9.626479438314944e-05,
1200
- "loss": 1.1132,
1201
- "step": 4025
1202
- },
1203
- {
1204
- "epoch": 3.494711849773365,
1205
- "grad_norm": 0.1630859375,
1206
- "learning_rate": 9.623971915747242e-05,
1207
- "loss": 1.1148,
1208
- "step": 4050
1209
- },
1210
- {
1211
- "epoch": 3.5162961364126915,
1212
- "grad_norm": 0.20703125,
1213
- "learning_rate": 9.621464393179539e-05,
1214
- "loss": 1.1045,
1215
- "step": 4075
1216
- },
1217
- {
1218
- "epoch": 3.537880423052018,
1219
- "grad_norm": 0.2021484375,
1220
- "learning_rate": 9.618956870611835e-05,
1221
- "loss": 1.3047,
1222
- "step": 4100
1223
- },
1224
- {
1225
- "epoch": 3.5594647096913445,
1226
- "grad_norm": 0.162109375,
1227
- "learning_rate": 9.616449348044133e-05,
1228
- "loss": 1.0816,
1229
- "step": 4125
1230
- },
1231
- {
1232
- "epoch": 3.5810489963306713,
1233
- "grad_norm": 0.2001953125,
1234
- "learning_rate": 9.61394182547643e-05,
1235
- "loss": 1.3246,
1236
- "step": 4150
1237
- },
1238
- {
1239
- "epoch": 3.6026332829699976,
1240
- "grad_norm": 0.19921875,
1241
- "learning_rate": 9.611434302908727e-05,
1242
- "loss": 1.1461,
1243
- "step": 4175
1244
- },
1245
- {
1246
- "epoch": 3.6242175696093244,
1247
- "grad_norm": 0.259765625,
1248
- "learning_rate": 9.608926780341023e-05,
1249
- "loss": 1.0712,
1250
- "step": 4200
1251
- },
1252
- {
1253
- "epoch": 3.645801856248651,
1254
- "grad_norm": 0.2314453125,
1255
- "learning_rate": 9.60641925777332e-05,
1256
- "loss": 1.0729,
1257
- "step": 4225
1258
- },
1259
- {
1260
- "epoch": 3.6673861428879775,
1261
- "grad_norm": 0.2578125,
1262
- "learning_rate": 9.603911735205618e-05,
1263
- "loss": 1.1162,
1264
- "step": 4250
1265
- },
1266
- {
1267
- "epoch": 3.6889704295273043,
1268
- "grad_norm": 0.19921875,
1269
- "learning_rate": 9.601404212637915e-05,
1270
- "loss": 1.2936,
1271
- "step": 4275
1272
- },
1273
- {
1274
- "epoch": 3.7105547161666306,
1275
- "grad_norm": 0.248046875,
1276
- "learning_rate": 9.598896690070211e-05,
1277
- "loss": 1.2288,
1278
- "step": 4300
1279
- },
1280
- {
1281
- "epoch": 3.7321390028059573,
1282
- "grad_norm": 0.1953125,
1283
- "learning_rate": 9.596389167502508e-05,
1284
- "loss": 1.2297,
1285
- "step": 4325
1286
- },
1287
- {
1288
- "epoch": 3.7537232894452837,
1289
- "grad_norm": 0.1767578125,
1290
- "learning_rate": 9.593881644934805e-05,
1291
- "loss": 1.1267,
1292
- "step": 4350
1293
- },
1294
- {
1295
- "epoch": 3.7753075760846104,
1296
- "grad_norm": 0.2099609375,
1297
- "learning_rate": 9.591374122367101e-05,
1298
- "loss": 1.0324,
1299
- "step": 4375
1300
- },
1301
- {
1302
- "epoch": 3.796891862723937,
1303
- "grad_norm": 0.166015625,
1304
- "learning_rate": 9.588866599799399e-05,
1305
- "loss": 1.1171,
1306
- "step": 4400
1307
- },
1308
- {
1309
- "epoch": 3.8184761493632635,
1310
- "grad_norm": 0.224609375,
1311
- "learning_rate": 9.586359077231696e-05,
1312
- "loss": 1.292,
1313
- "step": 4425
1314
- },
1315
- {
1316
- "epoch": 3.84006043600259,
1317
- "grad_norm": 0.21875,
1318
- "learning_rate": 9.583851554663992e-05,
1319
- "loss": 1.0742,
1320
- "step": 4450
1321
- },
1322
- {
1323
- "epoch": 3.8616447226419166,
1324
- "grad_norm": 0.6171875,
1325
- "learning_rate": 9.581344032096289e-05,
1326
- "loss": 1.2224,
1327
- "step": 4475
1328
- },
1329
- {
1330
- "epoch": 3.8832290092812434,
1331
- "grad_norm": 0.1943359375,
1332
- "learning_rate": 9.578836509528586e-05,
1333
- "loss": 1.089,
1334
- "step": 4500
1335
- },
1336
- {
1337
- "epoch": 3.8832290092812434,
1338
- "eval_loss": 1.1965827941894531,
1339
- "eval_runtime": 118.9387,
1340
- "eval_samples_per_second": 4.204,
1341
- "eval_steps_per_second": 4.204,
1342
- "step": 4500
1343
- },
1344
- {
1345
- "epoch": 3.9048132959205697,
1346
- "grad_norm": 0.1943359375,
1347
- "learning_rate": 9.576328986960882e-05,
1348
- "loss": 1.0507,
1349
- "step": 4525
1350
- },
1351
- {
1352
- "epoch": 3.9263975825598965,
1353
- "grad_norm": 0.20703125,
1354
- "learning_rate": 9.57382146439318e-05,
1355
- "loss": 1.2054,
1356
- "step": 4550
1357
- },
1358
- {
1359
- "epoch": 3.9479818691992232,
1360
- "grad_norm": 0.1611328125,
1361
- "learning_rate": 9.571313941825477e-05,
1362
- "loss": 1.2096,
1363
- "step": 4575
1364
- },
1365
- {
1366
- "epoch": 3.9695661558385495,
1367
- "grad_norm": 0.22265625,
1368
- "learning_rate": 9.568806419257774e-05,
1369
- "loss": 1.2408,
1370
- "step": 4600
1371
- },
1372
- {
1373
- "epoch": 3.991150442477876,
1374
- "grad_norm": 0.2060546875,
1375
- "learning_rate": 9.56629889669007e-05,
1376
- "loss": 1.3001,
1377
- "step": 4625
1378
- },
1379
- {
1380
- "epoch": 4.012087200518023,
1381
- "grad_norm": 0.2578125,
1382
- "learning_rate": 9.563791374122367e-05,
1383
- "loss": 1.1829,
1384
- "step": 4650
1385
- },
1386
- {
1387
- "epoch": 4.033671487157349,
1388
- "grad_norm": 0.181640625,
1389
- "learning_rate": 9.561283851554665e-05,
1390
- "loss": 1.1251,
1391
- "step": 4675
1392
- },
1393
- {
1394
- "epoch": 4.055255773796676,
1395
- "grad_norm": 0.205078125,
1396
- "learning_rate": 9.558776328986961e-05,
1397
- "loss": 1.1453,
1398
- "step": 4700
1399
- },
1400
- {
1401
- "epoch": 4.076840060436003,
1402
- "grad_norm": 0.267578125,
1403
- "learning_rate": 9.556268806419258e-05,
1404
- "loss": 0.9924,
1405
- "step": 4725
1406
- },
1407
- {
1408
- "epoch": 4.098424347075329,
1409
- "grad_norm": 0.2138671875,
1410
- "learning_rate": 9.553761283851556e-05,
1411
- "loss": 1.1183,
1412
- "step": 4750
1413
- },
1414
- {
1415
- "epoch": 4.120008633714655,
1416
- "grad_norm": 0.212890625,
1417
- "learning_rate": 9.551253761283851e-05,
1418
- "loss": 1.1264,
1419
- "step": 4775
1420
- },
1421
- {
1422
- "epoch": 4.1415929203539825,
1423
- "grad_norm": 0.2451171875,
1424
- "learning_rate": 9.548746238716148e-05,
1425
- "loss": 1.083,
1426
- "step": 4800
1427
- },
1428
- {
1429
- "epoch": 4.163177206993309,
1430
- "grad_norm": 0.2001953125,
1431
- "learning_rate": 9.546238716148446e-05,
1432
- "loss": 1.2387,
1433
- "step": 4825
1434
- },
1435
- {
1436
- "epoch": 4.184761493632635,
1437
- "grad_norm": 0.21484375,
1438
- "learning_rate": 9.543731193580743e-05,
1439
- "loss": 1.1978,
1440
- "step": 4850
1441
- },
1442
- {
1443
- "epoch": 4.206345780271962,
1444
- "grad_norm": 0.205078125,
1445
- "learning_rate": 9.541223671013039e-05,
1446
- "loss": 1.2543,
1447
- "step": 4875
1448
- },
1449
- {
1450
- "epoch": 4.227930066911289,
1451
- "grad_norm": 0.208984375,
1452
- "learning_rate": 9.538716148445337e-05,
1453
- "loss": 1.1928,
1454
- "step": 4900
1455
- },
1456
- {
1457
- "epoch": 4.249514353550615,
1458
- "grad_norm": 0.1875,
1459
- "learning_rate": 9.536208625877633e-05,
1460
- "loss": 1.1346,
1461
- "step": 4925
1462
- },
1463
- {
1464
- "epoch": 4.271098640189941,
1465
- "grad_norm": 0.2021484375,
1466
- "learning_rate": 9.53370110330993e-05,
1467
- "loss": 1.3266,
1468
- "step": 4950
1469
- },
1470
- {
1471
- "epoch": 4.2926829268292686,
1472
- "grad_norm": 0.224609375,
1473
- "learning_rate": 9.531193580742227e-05,
1474
- "loss": 1.2258,
1475
- "step": 4975
1476
- },
1477
- {
1478
- "epoch": 4.314267213468595,
1479
- "grad_norm": 0.2431640625,
1480
- "learning_rate": 9.528686058174524e-05,
1481
- "loss": 1.0399,
1482
- "step": 5000
1483
- },
1484
- {
1485
- "epoch": 4.314267213468595,
1486
- "eval_loss": 1.194091796875,
1487
- "eval_runtime": 118.5046,
1488
- "eval_samples_per_second": 4.219,
1489
- "eval_steps_per_second": 4.219,
1490
- "step": 5000
1491
- },
1492
- {
1493
- "epoch": 4.335851500107921,
1494
- "grad_norm": 0.19921875,
1495
- "learning_rate": 9.526178535606822e-05,
1496
- "loss": 1.1494,
1497
- "step": 5025
1498
- },
1499
- {
1500
- "epoch": 4.357435786747248,
1501
- "grad_norm": 0.2119140625,
1502
- "learning_rate": 9.523671013039118e-05,
1503
- "loss": 1.0299,
1504
- "step": 5050
1505
- },
1506
- {
1507
- "epoch": 4.379020073386575,
1508
- "grad_norm": 0.2451171875,
1509
- "learning_rate": 9.521163490471414e-05,
1510
- "loss": 1.1657,
1511
- "step": 5075
1512
- },
1513
- {
1514
- "epoch": 4.400604360025901,
1515
- "grad_norm": 0.265625,
1516
- "learning_rate": 9.518655967903712e-05,
1517
- "loss": 1.0644,
1518
- "step": 5100
1519
- },
1520
- {
1521
- "epoch": 4.422188646665227,
1522
- "grad_norm": 0.1826171875,
1523
- "learning_rate": 9.516148445336008e-05,
1524
- "loss": 1.2457,
1525
- "step": 5125
1526
- },
1527
- {
1528
- "epoch": 4.443772933304555,
1529
- "grad_norm": 0.2578125,
1530
- "learning_rate": 9.513640922768305e-05,
1531
- "loss": 1.054,
1532
- "step": 5150
1533
- },
1534
- {
1535
- "epoch": 4.465357219943881,
1536
- "grad_norm": 0.18359375,
1537
- "learning_rate": 9.511133400200603e-05,
1538
- "loss": 1.2129,
1539
- "step": 5175
1540
- },
1541
- {
1542
- "epoch": 4.486941506583207,
1543
- "grad_norm": 0.173828125,
1544
- "learning_rate": 9.5086258776329e-05,
1545
- "loss": 0.9745,
1546
- "step": 5200
1547
- },
1548
- {
1549
- "epoch": 4.5085257932225336,
1550
- "grad_norm": 0.2119140625,
1551
- "learning_rate": 9.506118355065195e-05,
1552
- "loss": 1.1331,
1553
- "step": 5225
1554
- },
1555
- {
1556
- "epoch": 4.530110079861861,
1557
- "grad_norm": 0.201171875,
1558
- "learning_rate": 9.503610832497493e-05,
1559
- "loss": 1.1554,
1560
- "step": 5250
1561
- },
1562
- {
1563
- "epoch": 4.551694366501187,
1564
- "grad_norm": 0.369140625,
1565
- "learning_rate": 9.50110330992979e-05,
1566
- "loss": 1.3741,
1567
- "step": 5275
1568
- },
1569
- {
1570
- "epoch": 4.573278653140513,
1571
- "grad_norm": 0.31640625,
1572
- "learning_rate": 9.498595787362086e-05,
1573
- "loss": 1.2421,
1574
- "step": 5300
1575
- },
1576
- {
1577
- "epoch": 4.594862939779841,
1578
- "grad_norm": 0.16796875,
1579
- "learning_rate": 9.496088264794384e-05,
1580
- "loss": 1.1185,
1581
- "step": 5325
1582
- },
1583
- {
1584
- "epoch": 4.616447226419167,
1585
- "grad_norm": 0.423828125,
1586
- "learning_rate": 9.493580742226681e-05,
1587
- "loss": 1.1028,
1588
- "step": 5350
1589
- },
1590
- {
1591
- "epoch": 4.638031513058493,
1592
- "grad_norm": 0.2099609375,
1593
- "learning_rate": 9.491073219658977e-05,
1594
- "loss": 1.246,
1595
- "step": 5375
1596
- },
1597
- {
1598
- "epoch": 4.65961579969782,
1599
- "grad_norm": 0.212890625,
1600
- "learning_rate": 9.488565697091274e-05,
1601
- "loss": 1.1932,
1602
- "step": 5400
1603
- },
1604
- {
1605
- "epoch": 4.681200086337147,
1606
- "grad_norm": 0.2197265625,
1607
- "learning_rate": 9.486058174523571e-05,
1608
- "loss": 1.1054,
1609
- "step": 5425
1610
- },
1611
- {
1612
- "epoch": 4.702784372976473,
1613
- "grad_norm": 0.1982421875,
1614
- "learning_rate": 9.483550651955869e-05,
1615
- "loss": 1.2252,
1616
- "step": 5450
1617
- },
1618
- {
1619
- "epoch": 4.724368659615799,
1620
- "grad_norm": 0.294921875,
1621
- "learning_rate": 9.481043129388165e-05,
1622
- "loss": 1.3116,
1623
- "step": 5475
1624
- },
1625
- {
1626
- "epoch": 4.745952946255127,
1627
- "grad_norm": 0.205078125,
1628
- "learning_rate": 9.478535606820462e-05,
1629
- "loss": 1.0582,
1630
- "step": 5500
1631
- },
1632
- {
1633
- "epoch": 4.745952946255127,
1634
- "eval_loss": 1.1924177408218384,
1635
- "eval_runtime": 119.0148,
1636
- "eval_samples_per_second": 4.201,
1637
- "eval_steps_per_second": 4.201,
1638
- "step": 5500
1639
- },
1640
- {
1641
- "epoch": 4.767537232894453,
1642
- "grad_norm": 0.1904296875,
1643
- "learning_rate": 9.476028084252759e-05,
1644
- "loss": 1.2304,
1645
- "step": 5525
1646
- },
1647
- {
1648
- "epoch": 4.789121519533779,
1649
- "grad_norm": 0.2353515625,
1650
- "learning_rate": 9.473520561685055e-05,
1651
- "loss": 1.2139,
1652
- "step": 5550
1653
- },
1654
- {
1655
- "epoch": 4.810705806173106,
1656
- "grad_norm": 0.2451171875,
1657
- "learning_rate": 9.471013039117352e-05,
1658
- "loss": 1.2242,
1659
- "step": 5575
1660
- },
1661
- {
1662
- "epoch": 4.832290092812433,
1663
- "grad_norm": 0.1767578125,
1664
- "learning_rate": 9.46850551654965e-05,
1665
- "loss": 1.2189,
1666
- "step": 5600
1667
- },
1668
- {
1669
- "epoch": 4.853874379451759,
1670
- "grad_norm": 0.216796875,
1671
- "learning_rate": 9.465997993981946e-05,
1672
- "loss": 1.2028,
1673
- "step": 5625
1674
- },
1675
- {
1676
- "epoch": 4.8754586660910855,
1677
- "grad_norm": 0.1845703125,
1678
- "learning_rate": 9.463490471414243e-05,
1679
- "loss": 1.3289,
1680
- "step": 5650
1681
- },
1682
- {
1683
- "epoch": 4.897042952730413,
1684
- "grad_norm": 0.158203125,
1685
- "learning_rate": 9.46098294884654e-05,
1686
- "loss": 1.0569,
1687
- "step": 5675
1688
- },
1689
- {
1690
- "epoch": 4.918627239369739,
1691
- "grad_norm": 0.23046875,
1692
- "learning_rate": 9.458475426278836e-05,
1693
- "loss": 1.2749,
1694
- "step": 5700
1695
- },
1696
- {
1697
- "epoch": 4.940211526009065,
1698
- "grad_norm": 0.2216796875,
1699
- "learning_rate": 9.455967903711134e-05,
1700
- "loss": 1.1241,
1701
- "step": 5725
1702
- },
1703
- {
1704
- "epoch": 4.961795812648392,
1705
- "grad_norm": 0.1943359375,
1706
- "learning_rate": 9.453460381143431e-05,
1707
- "loss": 1.228,
1708
- "step": 5750
1709
- },
1710
- {
1711
- "epoch": 4.983380099287719,
1712
- "grad_norm": 0.1953125,
1713
- "learning_rate": 9.450952858575728e-05,
1714
- "loss": 1.1277,
1715
- "step": 5775
1716
- },
1717
- {
1718
- "epoch": 5.004316857327865,
1719
- "grad_norm": 0.1904296875,
1720
- "learning_rate": 9.448445336008024e-05,
1721
- "loss": 0.9851,
1722
- "step": 5800
1723
- },
1724
- {
1725
- "epoch": 5.025901143967192,
1726
- "grad_norm": 0.2490234375,
1727
- "learning_rate": 9.445937813440321e-05,
1728
- "loss": 1.1516,
1729
- "step": 5825
1730
- },
1731
- {
1732
- "epoch": 5.047485430606518,
1733
- "grad_norm": 0.1630859375,
1734
- "learning_rate": 9.443430290872618e-05,
1735
- "loss": 1.1664,
1736
- "step": 5850
1737
- },
1738
- {
1739
- "epoch": 5.069069717245845,
1740
- "grad_norm": 0.2080078125,
1741
- "learning_rate": 9.440922768304916e-05,
1742
- "loss": 1.0938,
1743
- "step": 5875
1744
- },
1745
- {
1746
- "epoch": 5.090654003885172,
1747
- "grad_norm": 0.2216796875,
1748
- "learning_rate": 9.438415245737212e-05,
1749
- "loss": 1.0517,
1750
- "step": 5900
1751
- },
1752
- {
1753
- "epoch": 5.112238290524498,
1754
- "grad_norm": 0.2001953125,
1755
- "learning_rate": 9.435907723169509e-05,
1756
- "loss": 1.1735,
1757
- "step": 5925
1758
- },
1759
- {
1760
- "epoch": 5.133822577163825,
1761
- "grad_norm": 0.25,
1762
- "learning_rate": 9.433400200601807e-05,
1763
- "loss": 0.9135,
1764
- "step": 5950
1765
- },
1766
- {
1767
- "epoch": 5.155406863803151,
1768
- "grad_norm": 0.208984375,
1769
- "learning_rate": 9.430892678034102e-05,
1770
- "loss": 1.0003,
1771
- "step": 5975
1772
- },
1773
- {
1774
- "epoch": 5.176991150442478,
1775
- "grad_norm": 0.228515625,
1776
- "learning_rate": 9.428385155466399e-05,
1777
- "loss": 1.1648,
1778
- "step": 6000
1779
- },
1780
- {
1781
- "epoch": 5.176991150442478,
1782
- "eval_loss": 1.1907110214233398,
1783
- "eval_runtime": 119.0768,
1784
- "eval_samples_per_second": 4.199,
1785
- "eval_steps_per_second": 4.199,
1786
- "step": 6000
1787
- },
1788
- {
1789
- "epoch": 5.1985754370818045,
1790
- "grad_norm": 0.2578125,
1791
- "learning_rate": 9.425877632898697e-05,
1792
- "loss": 1.2177,
1793
- "step": 6025
1794
- },
1795
- {
1796
- "epoch": 5.220159723721131,
1797
- "grad_norm": 0.2373046875,
1798
- "learning_rate": 9.423370110330993e-05,
1799
- "loss": 1.2648,
1800
- "step": 6050
1801
- },
1802
- {
1803
- "epoch": 5.241744010360458,
1804
- "grad_norm": 0.181640625,
1805
- "learning_rate": 9.42086258776329e-05,
1806
- "loss": 1.2015,
1807
- "step": 6075
1808
- },
1809
- {
1810
- "epoch": 5.263328296999784,
1811
- "grad_norm": 0.2265625,
1812
- "learning_rate": 9.418355065195588e-05,
1813
- "loss": 1.1269,
1814
- "step": 6100
1815
- },
1816
- {
1817
- "epoch": 5.284912583639111,
1818
- "grad_norm": 0.154296875,
1819
- "learning_rate": 9.415847542627883e-05,
1820
- "loss": 1.0723,
1821
- "step": 6125
1822
- },
1823
- {
1824
- "epoch": 5.306496870278437,
1825
- "grad_norm": 0.2001953125,
1826
- "learning_rate": 9.413340020060181e-05,
1827
- "loss": 1.0201,
1828
- "step": 6150
1829
- },
1830
- {
1831
- "epoch": 5.328081156917764,
1832
- "grad_norm": 0.212890625,
1833
- "learning_rate": 9.410832497492478e-05,
1834
- "loss": 1.3579,
1835
- "step": 6175
1836
- },
1837
- {
1838
- "epoch": 5.3496654435570905,
1839
- "grad_norm": 0.2333984375,
1840
- "learning_rate": 9.408324974924775e-05,
1841
- "loss": 1.1706,
1842
- "step": 6200
1843
- },
1844
- {
1845
- "epoch": 5.371249730196417,
1846
- "grad_norm": 0.2158203125,
1847
- "learning_rate": 9.405817452357073e-05,
1848
- "loss": 1.1311,
1849
- "step": 6225
1850
- },
1851
- {
1852
- "epoch": 5.392834016835744,
1853
- "grad_norm": 0.2021484375,
1854
- "learning_rate": 9.403309929789369e-05,
1855
- "loss": 1.133,
1856
- "step": 6250
1857
- },
1858
- {
1859
- "epoch": 5.41441830347507,
1860
- "grad_norm": 0.265625,
1861
- "learning_rate": 9.400802407221664e-05,
1862
- "loss": 1.1611,
1863
- "step": 6275
1864
- },
1865
- {
1866
- "epoch": 5.436002590114397,
1867
- "grad_norm": 0.22265625,
1868
- "learning_rate": 9.398294884653962e-05,
1869
- "loss": 1.1361,
1870
- "step": 6300
1871
- },
1872
- {
1873
- "epoch": 5.457586876753723,
1874
- "grad_norm": 0.287109375,
1875
- "learning_rate": 9.395787362086259e-05,
1876
- "loss": 1.1646,
1877
- "step": 6325
1878
- },
1879
- {
1880
- "epoch": 5.47917116339305,
1881
- "grad_norm": 0.310546875,
1882
- "learning_rate": 9.393279839518556e-05,
1883
- "loss": 1.2833,
1884
- "step": 6350
1885
- },
1886
- {
1887
- "epoch": 5.5007554500323765,
1888
- "grad_norm": 0.203125,
1889
- "learning_rate": 9.390772316950854e-05,
1890
- "loss": 1.1654,
1891
- "step": 6375
1892
- },
1893
- {
1894
- "epoch": 5.522339736671703,
1895
- "grad_norm": 0.1767578125,
1896
- "learning_rate": 9.38826479438315e-05,
1897
- "loss": 1.0767,
1898
- "step": 6400
1899
- },
1900
- {
1901
- "epoch": 5.54392402331103,
1902
- "grad_norm": 0.2236328125,
1903
- "learning_rate": 9.385757271815447e-05,
1904
- "loss": 1.2222,
1905
- "step": 6425
1906
- },
1907
- {
1908
- "epoch": 5.565508309950356,
1909
- "grad_norm": 0.1865234375,
1910
- "learning_rate": 9.383249749247744e-05,
1911
- "loss": 1.0511,
1912
- "step": 6450
1913
- },
1914
- {
1915
- "epoch": 5.587092596589683,
1916
- "grad_norm": 0.236328125,
1917
- "learning_rate": 9.38074222668004e-05,
1918
- "loss": 1.0983,
1919
- "step": 6475
1920
- },
1921
- {
1922
- "epoch": 5.608676883229009,
1923
- "grad_norm": 0.2119140625,
1924
- "learning_rate": 9.378234704112337e-05,
1925
- "loss": 1.1242,
1926
- "step": 6500
1927
- },
1928
- {
1929
- "epoch": 5.608676883229009,
1930
- "eval_loss": 1.189411997795105,
1931
- "eval_runtime": 116.8693,
1932
- "eval_samples_per_second": 4.278,
1933
- "eval_steps_per_second": 4.278,
1934
- "step": 6500
1935
- },
1936
- {
1937
- "epoch": 5.630261169868335,
1938
- "grad_norm": 0.1884765625,
1939
- "learning_rate": 9.375727181544635e-05,
1940
- "loss": 1.1597,
1941
- "step": 6525
1942
- },
1943
- {
1944
- "epoch": 5.651845456507663,
1945
- "grad_norm": 0.173828125,
1946
- "learning_rate": 9.373219658976931e-05,
1947
- "loss": 1.1403,
1948
- "step": 6550
1949
- },
1950
- {
1951
- "epoch": 5.673429743146989,
1952
- "grad_norm": 0.1875,
1953
- "learning_rate": 9.370712136409228e-05,
1954
- "loss": 1.0764,
1955
- "step": 6575
1956
- },
1957
- {
1958
- "epoch": 5.695014029786315,
1959
- "grad_norm": 0.2275390625,
1960
- "learning_rate": 9.368204613841525e-05,
1961
- "loss": 1.1857,
1962
- "step": 6600
1963
- },
1964
- {
1965
- "epoch": 5.716598316425642,
1966
- "grad_norm": 0.1943359375,
1967
- "learning_rate": 9.365697091273821e-05,
1968
- "loss": 1.1954,
1969
- "step": 6625
1970
- },
1971
- {
1972
- "epoch": 5.738182603064969,
1973
- "grad_norm": 0.2138671875,
1974
- "learning_rate": 9.36318956870612e-05,
1975
- "loss": 1.1375,
1976
- "step": 6650
1977
- },
1978
- {
1979
- "epoch": 5.759766889704295,
1980
- "grad_norm": 0.240234375,
1981
- "learning_rate": 9.360682046138416e-05,
1982
- "loss": 1.3564,
1983
- "step": 6675
1984
- },
1985
- {
1986
- "epoch": 5.781351176343621,
1987
- "grad_norm": 0.2333984375,
1988
- "learning_rate": 9.358174523570713e-05,
1989
- "loss": 1.1443,
1990
- "step": 6700
1991
- },
1992
- {
1993
- "epoch": 5.802935462982949,
1994
- "grad_norm": 0.263671875,
1995
- "learning_rate": 9.355667001003009e-05,
1996
- "loss": 1.0885,
1997
- "step": 6725
1998
- },
1999
- {
2000
- "epoch": 5.824519749622275,
2001
- "grad_norm": 0.283203125,
2002
- "learning_rate": 9.353159478435306e-05,
2003
- "loss": 1.2816,
2004
- "step": 6750
2005
- },
2006
- {
2007
- "epoch": 5.846104036261601,
2008
- "grad_norm": 0.283203125,
2009
- "learning_rate": 9.350651955867603e-05,
2010
- "loss": 1.3483,
2011
- "step": 6775
2012
- },
2013
- {
2014
- "epoch": 5.8676883229009285,
2015
- "grad_norm": 0.2490234375,
2016
- "learning_rate": 9.3481444332999e-05,
2017
- "loss": 1.2436,
2018
- "step": 6800
2019
- },
2020
- {
2021
- "epoch": 5.889272609540255,
2022
- "grad_norm": 0.29296875,
2023
- "learning_rate": 9.345636910732197e-05,
2024
- "loss": 1.2322,
2025
- "step": 6825
2026
- },
2027
- {
2028
- "epoch": 5.910856896179581,
2029
- "grad_norm": 0.2451171875,
2030
- "learning_rate": 9.343129388164494e-05,
2031
- "loss": 1.1397,
2032
- "step": 6850
2033
- },
2034
- {
2035
- "epoch": 5.932441182818907,
2036
- "grad_norm": 0.2314453125,
2037
- "learning_rate": 9.34062186559679e-05,
2038
- "loss": 1.0838,
2039
- "step": 6875
2040
- },
2041
- {
2042
- "epoch": 5.954025469458235,
2043
- "grad_norm": 0.1953125,
2044
- "learning_rate": 9.338114343029087e-05,
2045
- "loss": 1.235,
2046
- "step": 6900
2047
- },
2048
- {
2049
- "epoch": 5.975609756097561,
2050
- "grad_norm": 0.2373046875,
2051
- "learning_rate": 9.335606820461385e-05,
2052
- "loss": 1.2332,
2053
- "step": 6925
2054
- },
2055
- {
2056
- "epoch": 5.997194042736887,
2057
- "grad_norm": 0.185546875,
2058
- "learning_rate": 9.333099297893682e-05,
2059
- "loss": 1.2239,
2060
- "step": 6950
2061
- },
2062
- {
2063
- "epoch": 6.018130800777034,
2064
- "grad_norm": 0.193359375,
2065
- "learning_rate": 9.330591775325978e-05,
2066
- "loss": 1.1734,
2067
- "step": 6975
2068
- },
2069
- {
2070
- "epoch": 6.0397150874163605,
2071
- "grad_norm": 0.2578125,
2072
- "learning_rate": 9.328084252758276e-05,
2073
- "loss": 1.0798,
2074
- "step": 7000
2075
- },
2076
- {
2077
- "epoch": 6.0397150874163605,
2078
- "eval_loss": 1.1887987852096558,
2079
- "eval_runtime": 118.2372,
2080
- "eval_samples_per_second": 4.229,
2081
- "eval_steps_per_second": 4.229,
2082
- "step": 7000
2083
- },
2084
- {
2085
- "epoch": 6.061299374055688,
2086
- "grad_norm": 0.2353515625,
2087
- "learning_rate": 9.325576730190572e-05,
2088
- "loss": 1.1714,
2089
- "step": 7025
2090
- },
2091
- {
2092
- "epoch": 6.082883660695014,
2093
- "grad_norm": 0.23828125,
2094
- "learning_rate": 9.323069207622868e-05,
2095
- "loss": 1.2273,
2096
- "step": 7050
2097
- },
2098
- {
2099
- "epoch": 6.10446794733434,
2100
- "grad_norm": 0.216796875,
2101
- "learning_rate": 9.320561685055166e-05,
2102
- "loss": 1.1239,
2103
- "step": 7075
2104
- },
2105
- {
2106
- "epoch": 6.126052233973667,
2107
- "grad_norm": 0.228515625,
2108
- "learning_rate": 9.318054162487463e-05,
2109
- "loss": 1.2041,
2110
- "step": 7100
2111
- },
2112
- {
2113
- "epoch": 6.147636520612994,
2114
- "grad_norm": 0.2421875,
2115
- "learning_rate": 9.31554663991976e-05,
2116
- "loss": 1.1536,
2117
- "step": 7125
2118
- },
2119
- {
2120
- "epoch": 6.16922080725232,
2121
- "grad_norm": 0.1796875,
2122
- "learning_rate": 9.313039117352057e-05,
2123
- "loss": 1.1006,
2124
- "step": 7150
2125
- },
2126
- {
2127
- "epoch": 6.190805093891647,
2128
- "grad_norm": 0.259765625,
2129
- "learning_rate": 9.310531594784353e-05,
2130
- "loss": 1.2912,
2131
- "step": 7175
2132
- },
2133
- {
2134
- "epoch": 6.212389380530974,
2135
- "grad_norm": 0.205078125,
2136
- "learning_rate": 9.30802407221665e-05,
2137
- "loss": 1.1036,
2138
- "step": 7200
2139
- },
2140
- {
2141
- "epoch": 6.2339736671703,
2142
- "grad_norm": 0.1982421875,
2143
- "learning_rate": 9.305516549648947e-05,
2144
- "loss": 1.0882,
2145
- "step": 7225
2146
- },
2147
- {
2148
- "epoch": 6.255557953809626,
2149
- "grad_norm": 0.205078125,
2150
- "learning_rate": 9.303009027081244e-05,
2151
- "loss": 1.0551,
2152
- "step": 7250
2153
- },
2154
- {
2155
- "epoch": 6.277142240448953,
2156
- "grad_norm": 0.232421875,
2157
- "learning_rate": 9.30050150451354e-05,
2158
- "loss": 1.0999,
2159
- "step": 7275
2160
- },
2161
- {
2162
- "epoch": 6.29872652708828,
2163
- "grad_norm": 0.6171875,
2164
- "learning_rate": 9.297993981945839e-05,
2165
- "loss": 1.2349,
2166
- "step": 7300
2167
- },
2168
- {
2169
- "epoch": 6.320310813727606,
2170
- "grad_norm": 0.275390625,
2171
- "learning_rate": 9.295486459378134e-05,
2172
- "loss": 1.2692,
2173
- "step": 7325
2174
- },
2175
- {
2176
- "epoch": 6.341895100366933,
2177
- "grad_norm": 0.1982421875,
2178
- "learning_rate": 9.292978936810432e-05,
2179
- "loss": 1.0265,
2180
- "step": 7350
2181
- },
2182
- {
2183
- "epoch": 6.36347938700626,
2184
- "grad_norm": 0.408203125,
2185
- "learning_rate": 9.290471414242729e-05,
2186
- "loss": 1.1694,
2187
- "step": 7375
2188
- },
2189
- {
2190
- "epoch": 6.385063673645586,
2191
- "grad_norm": 0.4609375,
2192
- "learning_rate": 9.287963891675025e-05,
2193
- "loss": 0.9525,
2194
- "step": 7400
2195
- },
2196
- {
2197
- "epoch": 6.4066479602849125,
2198
- "grad_norm": 0.20703125,
2199
- "learning_rate": 9.285456369107323e-05,
2200
- "loss": 1.1041,
2201
- "step": 7425
2202
- },
2203
- {
2204
- "epoch": 6.428232246924239,
2205
- "grad_norm": 0.255859375,
2206
- "learning_rate": 9.28294884653962e-05,
2207
- "loss": 1.2264,
2208
- "step": 7450
2209
- },
2210
- {
2211
- "epoch": 6.449816533563566,
2212
- "grad_norm": 0.1982421875,
2213
- "learning_rate": 9.280441323971915e-05,
2214
- "loss": 1.2127,
2215
- "step": 7475
2216
- },
2217
- {
2218
- "epoch": 6.471400820202892,
2219
- "grad_norm": 0.318359375,
2220
- "learning_rate": 9.277933801404213e-05,
2221
- "loss": 1.1663,
2222
- "step": 7500
2223
- },
2224
- {
2225
- "epoch": 6.471400820202892,
2226
- "eval_loss": 1.1857322454452515,
2227
- "eval_runtime": 118.9404,
2228
- "eval_samples_per_second": 4.204,
2229
- "eval_steps_per_second": 4.204,
2230
- "step": 7500
2231
- },
2232
- {
2233
- "epoch": 6.492985106842219,
2234
- "grad_norm": 0.197265625,
2235
- "learning_rate": 9.27542627883651e-05,
2236
- "loss": 1.1601,
2237
- "step": 7525
2238
- },
2239
- {
2240
- "epoch": 6.514569393481546,
2241
- "grad_norm": 0.2333984375,
2242
- "learning_rate": 9.272918756268806e-05,
2243
- "loss": 1.2307,
2244
- "step": 7550
2245
- },
2246
- {
2247
- "epoch": 6.536153680120872,
2248
- "grad_norm": 0.2109375,
2249
- "learning_rate": 9.270411233701104e-05,
2250
- "loss": 1.2392,
2251
- "step": 7575
2252
- },
2253
- {
2254
- "epoch": 6.5577379667601985,
2255
- "grad_norm": 0.1953125,
2256
- "learning_rate": 9.267903711133401e-05,
2257
- "loss": 1.018,
2258
- "step": 7600
2259
- },
2260
- {
2261
- "epoch": 6.579322253399525,
2262
- "grad_norm": 0.236328125,
2263
- "learning_rate": 9.265396188565698e-05,
2264
- "loss": 1.0398,
2265
- "step": 7625
2266
- },
2267
- {
2268
- "epoch": 6.600906540038852,
2269
- "grad_norm": 0.2021484375,
2270
- "learning_rate": 9.262888665997994e-05,
2271
- "loss": 1.1174,
2272
- "step": 7650
2273
- },
2274
- {
2275
- "epoch": 6.622490826678178,
2276
- "grad_norm": 0.1962890625,
2277
- "learning_rate": 9.260381143430291e-05,
2278
- "loss": 1.2179,
2279
- "step": 7675
2280
- },
2281
- {
2282
- "epoch": 6.644075113317505,
2283
- "grad_norm": 0.2177734375,
2284
- "learning_rate": 9.257873620862589e-05,
2285
- "loss": 1.1747,
2286
- "step": 7700
2287
- },
2288
- {
2289
- "epoch": 6.665659399956832,
2290
- "grad_norm": 0.2421875,
2291
- "learning_rate": 9.255366098294886e-05,
2292
- "loss": 1.2091,
2293
- "step": 7725
2294
- },
2295
- {
2296
- "epoch": 6.687243686596158,
2297
- "grad_norm": 0.224609375,
2298
- "learning_rate": 9.252858575727182e-05,
2299
- "loss": 1.1983,
2300
- "step": 7750
2301
- },
2302
- {
2303
- "epoch": 6.7088279732354845,
2304
- "grad_norm": 0.23828125,
2305
- "learning_rate": 9.250351053159479e-05,
2306
- "loss": 1.2251,
2307
- "step": 7775
2308
- },
2309
- {
2310
- "epoch": 6.730412259874811,
2311
- "grad_norm": 0.33984375,
2312
- "learning_rate": 9.247843530591775e-05,
2313
- "loss": 1.0004,
2314
- "step": 7800
2315
- },
2316
- {
2317
- "epoch": 6.751996546514138,
2318
- "grad_norm": 0.193359375,
2319
- "learning_rate": 9.245336008024072e-05,
2320
- "loss": 1.1761,
2321
- "step": 7825
2322
- },
2323
- {
2324
- "epoch": 6.773580833153464,
2325
- "grad_norm": 0.2353515625,
2326
- "learning_rate": 9.24282848545637e-05,
2327
- "loss": 1.3334,
2328
- "step": 7850
2329
- },
2330
- {
2331
- "epoch": 6.795165119792791,
2332
- "grad_norm": 0.2041015625,
2333
- "learning_rate": 9.240320962888667e-05,
2334
- "loss": 1.1361,
2335
- "step": 7875
2336
- },
2337
- {
2338
- "epoch": 6.816749406432118,
2339
- "grad_norm": 0.1953125,
2340
- "learning_rate": 9.237813440320963e-05,
2341
- "loss": 1.2055,
2342
- "step": 7900
2343
- },
2344
- {
2345
- "epoch": 6.838333693071444,
2346
- "grad_norm": 0.244140625,
2347
- "learning_rate": 9.23530591775326e-05,
2348
- "loss": 1.1913,
2349
- "step": 7925
2350
- },
2351
- {
2352
- "epoch": 6.8599179797107706,
2353
- "grad_norm": 0.193359375,
2354
- "learning_rate": 9.232798395185557e-05,
2355
- "loss": 1.1817,
2356
- "step": 7950
2357
- },
2358
- {
2359
- "epoch": 6.881502266350097,
2360
- "grad_norm": 0.2197265625,
2361
- "learning_rate": 9.230290872617853e-05,
2362
- "loss": 1.1405,
2363
- "step": 7975
2364
- },
2365
- {
2366
- "epoch": 6.903086552989424,
2367
- "grad_norm": 0.255859375,
2368
- "learning_rate": 9.227783350050151e-05,
2369
- "loss": 1.0686,
2370
- "step": 8000
2371
- },
2372
- {
2373
- "epoch": 6.903086552989424,
2374
- "eval_loss": 1.1845334768295288,
2375
- "eval_runtime": 119.1754,
2376
- "eval_samples_per_second": 4.195,
2377
- "eval_steps_per_second": 4.195,
2378
- "step": 8000
2379
- },
2380
- {
2381
- "epoch": 6.92467083962875,
2382
- "grad_norm": 0.2255859375,
2383
- "learning_rate": 9.225275827482448e-05,
2384
- "loss": 1.1093,
2385
- "step": 8025
2386
- },
2387
- {
2388
- "epoch": 6.946255126268077,
2389
- "grad_norm": 0.431640625,
2390
- "learning_rate": 9.222768304914744e-05,
2391
- "loss": 1.1508,
2392
- "step": 8050
2393
- },
2394
- {
2395
- "epoch": 6.967839412907403,
2396
- "grad_norm": 0.2392578125,
2397
- "learning_rate": 9.220260782347041e-05,
2398
- "loss": 1.1785,
2399
- "step": 8075
2400
- },
2401
- {
2402
- "epoch": 6.98942369954673,
2403
- "grad_norm": 0.197265625,
2404
- "learning_rate": 9.217753259779338e-05,
2405
- "loss": 1.2516,
2406
- "step": 8100
2407
- },
2408
- {
2409
- "epoch": 7.010360457586876,
2410
- "grad_norm": 0.1982421875,
2411
- "learning_rate": 9.215245737211636e-05,
2412
- "loss": 1.1401,
2413
- "step": 8125
2414
- },
2415
- {
2416
- "epoch": 7.0319447442262035,
2417
- "grad_norm": 0.2099609375,
2418
- "learning_rate": 9.212738214643932e-05,
2419
- "loss": 0.9673,
2420
- "step": 8150
2421
- },
2422
- {
2423
- "epoch": 7.05352903086553,
2424
- "grad_norm": 0.2333984375,
2425
- "learning_rate": 9.210230692076229e-05,
2426
- "loss": 1.2195,
2427
- "step": 8175
2428
- },
2429
- {
2430
- "epoch": 7.075113317504856,
2431
- "grad_norm": 0.2373046875,
2432
- "learning_rate": 9.207723169508527e-05,
2433
- "loss": 1.1304,
2434
- "step": 8200
2435
- },
2436
- {
2437
- "epoch": 7.096697604144183,
2438
- "grad_norm": 0.169921875,
2439
- "learning_rate": 9.205215646940822e-05,
2440
- "loss": 1.2657,
2441
- "step": 8225
2442
- },
2443
- {
2444
- "epoch": 7.11828189078351,
2445
- "grad_norm": 0.2255859375,
2446
- "learning_rate": 9.202708124373119e-05,
2447
- "loss": 1.1376,
2448
- "step": 8250
2449
- },
2450
- {
2451
- "epoch": 7.139866177422836,
2452
- "grad_norm": 0.2119140625,
2453
- "learning_rate": 9.200200601805417e-05,
2454
- "loss": 1.2825,
2455
- "step": 8275
2456
- },
2457
- {
2458
- "epoch": 7.161450464062162,
2459
- "grad_norm": 0.25390625,
2460
- "learning_rate": 9.197693079237714e-05,
2461
- "loss": 1.1382,
2462
- "step": 8300
2463
- },
2464
- {
2465
- "epoch": 7.18303475070149,
2466
- "grad_norm": 0.20703125,
2467
- "learning_rate": 9.19518555667001e-05,
2468
- "loss": 1.1377,
2469
- "step": 8325
2470
- },
2471
- {
2472
- "epoch": 7.204619037340816,
2473
- "grad_norm": 0.1943359375,
2474
- "learning_rate": 9.192678034102308e-05,
2475
- "loss": 1.1458,
2476
- "step": 8350
2477
- },
2478
- {
2479
- "epoch": 7.226203323980142,
2480
- "grad_norm": 0.2119140625,
2481
- "learning_rate": 9.190170511534603e-05,
2482
- "loss": 0.9699,
2483
- "step": 8375
2484
- },
2485
- {
2486
- "epoch": 7.247787610619469,
2487
- "grad_norm": 0.1943359375,
2488
- "learning_rate": 9.187662988966901e-05,
2489
- "loss": 1.1423,
2490
- "step": 8400
2491
- },
2492
- {
2493
- "epoch": 7.269371897258796,
2494
- "grad_norm": 0.197265625,
2495
- "learning_rate": 9.185155466399198e-05,
2496
- "loss": 1.0721,
2497
- "step": 8425
2498
- },
2499
- {
2500
- "epoch": 7.290956183898122,
2501
- "grad_norm": 0.283203125,
2502
- "learning_rate": 9.182647943831495e-05,
2503
- "loss": 1.1735,
2504
- "step": 8450
2505
- },
2506
- {
2507
- "epoch": 7.312540470537448,
2508
- "grad_norm": 0.2080078125,
2509
- "learning_rate": 9.180140421263791e-05,
2510
- "loss": 1.2096,
2511
- "step": 8475
2512
- },
2513
- {
2514
- "epoch": 7.334124757176776,
2515
- "grad_norm": 0.21875,
2516
- "learning_rate": 9.17763289869609e-05,
2517
- "loss": 1.0578,
2518
- "step": 8500
2519
- },
2520
- {
2521
- "epoch": 7.334124757176776,
2522
- "eval_loss": 1.1841206550598145,
2523
- "eval_runtime": 118.5017,
2524
- "eval_samples_per_second": 4.219,
2525
- "eval_steps_per_second": 4.219,
2526
- "step": 8500
2527
- },
2528
- {
2529
- "epoch": 7.355709043816102,
2530
- "grad_norm": 0.2421875,
2531
- "learning_rate": 9.175125376128385e-05,
2532
- "loss": 1.1644,
2533
- "step": 8525
2534
- },
2535
- {
2536
- "epoch": 7.377293330455428,
2537
- "grad_norm": 0.1962890625,
2538
- "learning_rate": 9.172617853560683e-05,
2539
- "loss": 1.0903,
2540
- "step": 8550
2541
- },
2542
- {
2543
- "epoch": 7.3988776170947546,
2544
- "grad_norm": 0.154296875,
2545
- "learning_rate": 9.170110330992979e-05,
2546
- "loss": 1.1404,
2547
- "step": 8575
2548
- },
2549
- {
2550
- "epoch": 7.420461903734082,
2551
- "grad_norm": 0.2333984375,
2552
- "learning_rate": 9.167602808425276e-05,
2553
- "loss": 1.2855,
2554
- "step": 8600
2555
- },
2556
- {
2557
- "epoch": 7.442046190373408,
2558
- "grad_norm": 0.380859375,
2559
- "learning_rate": 9.165095285857574e-05,
2560
- "loss": 1.0852,
2561
- "step": 8625
2562
- },
2563
- {
2564
- "epoch": 7.463630477012734,
2565
- "grad_norm": 0.2158203125,
2566
- "learning_rate": 9.16258776328987e-05,
2567
- "loss": 1.3138,
2568
- "step": 8650
2569
- },
2570
- {
2571
- "epoch": 7.485214763652062,
2572
- "grad_norm": 0.212890625,
2573
- "learning_rate": 9.160080240722166e-05,
2574
- "loss": 1.1759,
2575
- "step": 8675
2576
- },
2577
- {
2578
- "epoch": 7.506799050291388,
2579
- "grad_norm": 0.2080078125,
2580
- "learning_rate": 9.157572718154464e-05,
2581
- "loss": 1.0517,
2582
- "step": 8700
2583
- },
2584
- {
2585
- "epoch": 7.528383336930714,
2586
- "grad_norm": 0.2236328125,
2587
- "learning_rate": 9.15506519558676e-05,
2588
- "loss": 1.3831,
2589
- "step": 8725
2590
- },
2591
- {
2592
- "epoch": 7.549967623570041,
2593
- "grad_norm": 0.2236328125,
2594
- "learning_rate": 9.152557673019057e-05,
2595
- "loss": 1.0559,
2596
- "step": 8750
2597
- },
2598
- {
2599
- "epoch": 7.571551910209368,
2600
- "grad_norm": 0.197265625,
2601
- "learning_rate": 9.150050150451355e-05,
2602
- "loss": 1.2865,
2603
- "step": 8775
2604
- },
2605
- {
2606
- "epoch": 7.593136196848694,
2607
- "grad_norm": 0.177734375,
2608
- "learning_rate": 9.147542627883652e-05,
2609
- "loss": 1.0873,
2610
- "step": 8800
2611
- },
2612
- {
2613
- "epoch": 7.61472048348802,
2614
- "grad_norm": 0.2392578125,
2615
- "learning_rate": 9.145035105315948e-05,
2616
- "loss": 1.0765,
2617
- "step": 8825
2618
- },
2619
- {
2620
- "epoch": 7.636304770127348,
2621
- "grad_norm": 0.279296875,
2622
- "learning_rate": 9.142527582748245e-05,
2623
- "loss": 1.0939,
2624
- "step": 8850
2625
- },
2626
- {
2627
- "epoch": 7.657889056766674,
2628
- "grad_norm": 0.16015625,
2629
- "learning_rate": 9.140020060180542e-05,
2630
- "loss": 1.2058,
2631
- "step": 8875
2632
- },
2633
- {
2634
- "epoch": 7.679473343406,
2635
- "grad_norm": 0.2021484375,
2636
- "learning_rate": 9.13751253761284e-05,
2637
- "loss": 1.0933,
2638
- "step": 8900
2639
- },
2640
- {
2641
- "epoch": 7.701057630045327,
2642
- "grad_norm": 0.2197265625,
2643
- "learning_rate": 9.135005015045136e-05,
2644
- "loss": 1.1407,
2645
- "step": 8925
2646
- },
2647
- {
2648
- "epoch": 7.722641916684654,
2649
- "grad_norm": 0.265625,
2650
- "learning_rate": 9.132497492477433e-05,
2651
- "loss": 1.1527,
2652
- "step": 8950
2653
- },
2654
- {
2655
- "epoch": 7.74422620332398,
2656
- "grad_norm": 0.2412109375,
2657
- "learning_rate": 9.12998996990973e-05,
2658
- "loss": 1.1988,
2659
- "step": 8975
2660
- },
2661
- {
2662
- "epoch": 7.7658104899633065,
2663
- "grad_norm": 0.1826171875,
2664
- "learning_rate": 9.127482447342026e-05,
2665
- "loss": 1.2294,
2666
- "step": 9000
2667
- },
2668
- {
2669
- "epoch": 7.7658104899633065,
2670
- "eval_loss": 1.1831315755844116,
2671
- "eval_runtime": 118.7044,
2672
- "eval_samples_per_second": 4.212,
2673
- "eval_steps_per_second": 4.212,
2674
- "step": 9000
2675
- },
2676
- {
2677
- "epoch": 7.787394776602634,
2678
- "grad_norm": 0.216796875,
2679
- "learning_rate": 9.124974924774323e-05,
2680
- "loss": 1.0605,
2681
- "step": 9025
2682
- },
2683
- {
2684
- "epoch": 7.80897906324196,
2685
- "grad_norm": 0.208984375,
2686
- "learning_rate": 9.122467402206621e-05,
2687
- "loss": 1.1654,
2688
- "step": 9050
2689
- },
2690
- {
2691
- "epoch": 7.830563349881286,
2692
- "grad_norm": 0.2373046875,
2693
- "learning_rate": 9.119959879638917e-05,
2694
- "loss": 1.0824,
2695
- "step": 9075
2696
- },
2697
- {
2698
- "epoch": 7.852147636520613,
2699
- "grad_norm": 0.21484375,
2700
- "learning_rate": 9.117452357071214e-05,
2701
- "loss": 1.1937,
2702
- "step": 9100
2703
- },
2704
- {
2705
- "epoch": 7.87373192315994,
2706
- "grad_norm": 0.2294921875,
2707
- "learning_rate": 9.11494483450351e-05,
2708
- "loss": 1.1615,
2709
- "step": 9125
2710
- },
2711
- {
2712
- "epoch": 7.895316209799266,
2713
- "grad_norm": 0.234375,
2714
- "learning_rate": 9.112437311935807e-05,
2715
- "loss": 1.151,
2716
- "step": 9150
2717
- },
2718
- {
2719
- "epoch": 7.9169004964385925,
2720
- "grad_norm": 0.197265625,
2721
- "learning_rate": 9.109929789368104e-05,
2722
- "loss": 1.1793,
2723
- "step": 9175
2724
- },
2725
- {
2726
- "epoch": 7.93848478307792,
2727
- "grad_norm": 0.220703125,
2728
- "learning_rate": 9.107422266800402e-05,
2729
- "loss": 1.2839,
2730
- "step": 9200
2731
- },
2732
- {
2733
- "epoch": 7.960069069717246,
2734
- "grad_norm": 0.1796875,
2735
- "learning_rate": 9.104914744232699e-05,
2736
- "loss": 1.1156,
2737
- "step": 9225
2738
- },
2739
- {
2740
- "epoch": 7.981653356356572,
2741
- "grad_norm": 0.1875,
2742
- "learning_rate": 9.102407221664995e-05,
2743
- "loss": 1.0741,
2744
- "step": 9250
2745
- },
2746
- {
2747
- "epoch": 8.00259011439672,
2748
- "grad_norm": 0.1806640625,
2749
- "learning_rate": 9.099899699097292e-05,
2750
- "loss": 1.1194,
2751
- "step": 9275
2752
- },
2753
- {
2754
- "epoch": 8.024174401036046,
2755
- "grad_norm": 0.220703125,
2756
- "learning_rate": 9.097392176529588e-05,
2757
- "loss": 0.9522,
2758
- "step": 9300
2759
- },
2760
- {
2761
- "epoch": 8.045758687675372,
2762
- "grad_norm": 0.2177734375,
2763
- "learning_rate": 9.094884653961886e-05,
2764
- "loss": 1.2578,
2765
- "step": 9325
2766
- },
2767
- {
2768
- "epoch": 8.067342974314698,
2769
- "grad_norm": 0.2177734375,
2770
- "learning_rate": 9.092377131394183e-05,
2771
- "loss": 1.1278,
2772
- "step": 9350
2773
- },
2774
- {
2775
- "epoch": 8.088927260954025,
2776
- "grad_norm": 0.251953125,
2777
- "learning_rate": 9.08986960882648e-05,
2778
- "loss": 1.2299,
2779
- "step": 9375
2780
- },
2781
- {
2782
- "epoch": 8.110511547593353,
2783
- "grad_norm": 0.24609375,
2784
- "learning_rate": 9.087362086258778e-05,
2785
- "loss": 1.2605,
2786
- "step": 9400
2787
- },
2788
- {
2789
- "epoch": 8.132095834232679,
2790
- "grad_norm": 0.2265625,
2791
- "learning_rate": 9.084854563691073e-05,
2792
- "loss": 1.0537,
2793
- "step": 9425
2794
- },
2795
- {
2796
- "epoch": 8.153680120872005,
2797
- "grad_norm": 0.2333984375,
2798
- "learning_rate": 9.08234704112337e-05,
2799
- "loss": 0.986,
2800
- "step": 9450
2801
- },
2802
- {
2803
- "epoch": 8.175264407511332,
2804
- "grad_norm": 0.201171875,
2805
- "learning_rate": 9.079839518555668e-05,
2806
- "loss": 1.1083,
2807
- "step": 9475
2808
- },
2809
- {
2810
- "epoch": 8.196848694150658,
2811
- "grad_norm": 0.1787109375,
2812
- "learning_rate": 9.077331995987964e-05,
2813
- "loss": 1.1064,
2814
- "step": 9500
2815
- },
2816
- {
2817
- "epoch": 8.196848694150658,
2818
- "eval_loss": 1.1820403337478638,
2819
- "eval_runtime": 119.2179,
2820
- "eval_samples_per_second": 4.194,
2821
- "eval_steps_per_second": 4.194,
2822
- "step": 9500
2823
- },
2824
- {
2825
- "epoch": 8.218432980789984,
2826
- "grad_norm": 0.19921875,
2827
- "learning_rate": 9.074824473420261e-05,
2828
- "loss": 1.1408,
2829
- "step": 9525
2830
- },
2831
- {
2832
- "epoch": 8.24001726742931,
2833
- "grad_norm": 0.2041015625,
2834
- "learning_rate": 9.072316950852559e-05,
2835
- "loss": 1.2541,
2836
- "step": 9550
2837
- },
2838
- {
2839
- "epoch": 8.261601554068639,
2840
- "grad_norm": 0.1728515625,
2841
- "learning_rate": 9.069809428284854e-05,
2842
- "loss": 1.1157,
2843
- "step": 9575
2844
- },
2845
- {
2846
- "epoch": 8.283185840707965,
2847
- "grad_norm": 0.16796875,
2848
- "learning_rate": 9.067301905717152e-05,
2849
- "loss": 0.9584,
2850
- "step": 9600
2851
- },
2852
- {
2853
- "epoch": 8.304770127347291,
2854
- "grad_norm": 0.2373046875,
2855
- "learning_rate": 9.064794383149449e-05,
2856
- "loss": 1.1505,
2857
- "step": 9625
2858
- },
2859
- {
2860
- "epoch": 8.326354413986618,
2861
- "grad_norm": 0.2412109375,
2862
- "learning_rate": 9.062286860581745e-05,
2863
- "loss": 1.1527,
2864
- "step": 9650
2865
- },
2866
- {
2867
- "epoch": 8.347938700625944,
2868
- "grad_norm": 0.298828125,
2869
- "learning_rate": 9.059779338014043e-05,
2870
- "loss": 1.151,
2871
- "step": 9675
2872
- },
2873
- {
2874
- "epoch": 8.36952298726527,
2875
- "grad_norm": 0.216796875,
2876
- "learning_rate": 9.05727181544634e-05,
2877
- "loss": 1.2479,
2878
- "step": 9700
2879
- },
2880
- {
2881
- "epoch": 8.391107273904597,
2882
- "grad_norm": 0.283203125,
2883
- "learning_rate": 9.054764292878635e-05,
2884
- "loss": 1.2619,
2885
- "step": 9725
2886
- },
2887
- {
2888
- "epoch": 8.412691560543925,
2889
- "grad_norm": 0.220703125,
2890
- "learning_rate": 9.052256770310933e-05,
2891
- "loss": 1.1102,
2892
- "step": 9750
2893
- },
2894
- {
2895
- "epoch": 8.434275847183251,
2896
- "grad_norm": 0.1708984375,
2897
- "learning_rate": 9.04974924774323e-05,
2898
- "loss": 1.0815,
2899
- "step": 9775
2900
- },
2901
- {
2902
- "epoch": 8.455860133822577,
2903
- "grad_norm": 0.193359375,
2904
- "learning_rate": 9.047241725175527e-05,
2905
- "loss": 1.1504,
2906
- "step": 9800
2907
- },
2908
- {
2909
- "epoch": 8.477444420461904,
2910
- "grad_norm": 0.1923828125,
2911
- "learning_rate": 9.044734202607825e-05,
2912
- "loss": 1.1213,
2913
- "step": 9825
2914
- },
2915
- {
2916
- "epoch": 8.49902870710123,
2917
- "grad_norm": 0.283203125,
2918
- "learning_rate": 9.042226680040121e-05,
2919
- "loss": 1.1919,
2920
- "step": 9850
2921
- },
2922
- {
2923
- "epoch": 8.520612993740556,
2924
- "grad_norm": 0.212890625,
2925
- "learning_rate": 9.039719157472416e-05,
2926
- "loss": 1.1695,
2927
- "step": 9875
2928
- },
2929
- {
2930
- "epoch": 8.542197280379883,
2931
- "grad_norm": 0.212890625,
2932
- "learning_rate": 9.037211634904714e-05,
2933
- "loss": 1.0681,
2934
- "step": 9900
2935
- },
2936
- {
2937
- "epoch": 8.56378156701921,
2938
- "grad_norm": 0.25390625,
2939
- "learning_rate": 9.034704112337011e-05,
2940
- "loss": 1.2605,
2941
- "step": 9925
2942
- },
2943
- {
2944
- "epoch": 8.585365853658537,
2945
- "grad_norm": 0.205078125,
2946
- "learning_rate": 9.032196589769308e-05,
2947
- "loss": 1.0834,
2948
- "step": 9950
2949
- },
2950
- {
2951
- "epoch": 8.606950140297863,
2952
- "grad_norm": 0.2451171875,
2953
- "learning_rate": 9.029689067201606e-05,
2954
- "loss": 1.2253,
2955
- "step": 9975
2956
- },
2957
- {
2958
- "epoch": 8.62853442693719,
2959
- "grad_norm": 0.38671875,
2960
- "learning_rate": 9.027181544633902e-05,
2961
- "loss": 1.1097,
2962
- "step": 10000
2963
- },
2964
- {
2965
- "epoch": 8.62853442693719,
2966
- "eval_loss": 1.1815690994262695,
2967
- "eval_runtime": 117.6085,
2968
- "eval_samples_per_second": 4.251,
2969
- "eval_steps_per_second": 4.251,
2970
- "step": 10000
2971
- },
2972
- {
2973
- "epoch": 8.650118713576516,
2974
- "grad_norm": 0.244140625,
2975
- "learning_rate": 9.024674022066199e-05,
2976
- "loss": 1.1,
2977
- "step": 10025
2978
- },
2979
- {
2980
- "epoch": 8.671703000215842,
2981
- "grad_norm": 0.2373046875,
2982
- "learning_rate": 9.022166499498496e-05,
2983
- "loss": 1.2114,
2984
- "step": 10050
2985
- },
2986
- {
2987
- "epoch": 8.693287286855169,
2988
- "grad_norm": 0.2099609375,
2989
- "learning_rate": 9.019658976930792e-05,
2990
- "loss": 1.1201,
2991
- "step": 10075
2992
- },
2993
- {
2994
- "epoch": 8.714871573494497,
2995
- "grad_norm": 0.181640625,
2996
- "learning_rate": 9.01715145436309e-05,
2997
- "loss": 1.0469,
2998
- "step": 10100
2999
- },
3000
- {
3001
- "epoch": 8.736455860133823,
3002
- "grad_norm": 0.185546875,
3003
- "learning_rate": 9.014643931795387e-05,
3004
- "loss": 1.1822,
3005
- "step": 10125
3006
- },
3007
- {
3008
- "epoch": 8.75804014677315,
3009
- "grad_norm": 0.205078125,
3010
- "learning_rate": 9.012136409227684e-05,
3011
- "loss": 1.059,
3012
- "step": 10150
3013
- },
3014
- {
3015
- "epoch": 8.779624433412476,
3016
- "grad_norm": 0.2216796875,
3017
- "learning_rate": 9.00962888665998e-05,
3018
- "loss": 1.175,
3019
- "step": 10175
3020
- },
3021
- {
3022
- "epoch": 8.801208720051802,
3023
- "grad_norm": 0.166015625,
3024
- "learning_rate": 9.007121364092277e-05,
3025
- "loss": 1.2437,
3026
- "step": 10200
3027
- },
3028
- {
3029
- "epoch": 8.822793006691128,
3030
- "grad_norm": 0.2177734375,
3031
- "learning_rate": 9.004613841524573e-05,
3032
- "loss": 1.2238,
3033
- "step": 10225
3034
- },
3035
- {
3036
- "epoch": 8.844377293330455,
3037
- "grad_norm": 0.2236328125,
3038
- "learning_rate": 9.002106318956871e-05,
3039
- "loss": 1.234,
3040
- "step": 10250
3041
- },
3042
- {
3043
- "epoch": 8.865961579969781,
3044
- "grad_norm": 0.212890625,
3045
- "learning_rate": 8.999598796389168e-05,
3046
- "loss": 1.2763,
3047
- "step": 10275
3048
- },
3049
- {
3050
- "epoch": 8.88754586660911,
3051
- "grad_norm": 0.2333984375,
3052
- "learning_rate": 8.997091273821465e-05,
3053
- "loss": 1.2638,
3054
- "step": 10300
3055
- },
3056
- {
3057
- "epoch": 8.909130153248435,
3058
- "grad_norm": 0.220703125,
3059
- "learning_rate": 8.994583751253761e-05,
3060
- "loss": 1.1188,
3061
- "step": 10325
3062
- },
3063
- {
3064
- "epoch": 8.930714439887762,
3065
- "grad_norm": 0.28125,
3066
- "learning_rate": 8.992076228686058e-05,
3067
- "loss": 1.1901,
3068
- "step": 10350
3069
- },
3070
- {
3071
- "epoch": 8.952298726527088,
3072
- "grad_norm": 0.21484375,
3073
- "learning_rate": 8.989568706118356e-05,
3074
- "loss": 1.0863,
3075
- "step": 10375
3076
- },
3077
- {
3078
- "epoch": 8.973883013166414,
3079
- "grad_norm": 0.2099609375,
3080
- "learning_rate": 8.987061183550653e-05,
3081
- "loss": 1.1654,
3082
- "step": 10400
3083
- },
3084
- {
3085
- "epoch": 8.99546729980574,
3086
- "grad_norm": 0.2294921875,
3087
- "learning_rate": 8.984553660982949e-05,
3088
- "loss": 1.1655,
3089
- "step": 10425
3090
- },
3091
- {
3092
- "epoch": 9.016404057845888,
3093
- "grad_norm": 0.2265625,
3094
- "learning_rate": 8.982046138415246e-05,
3095
- "loss": 1.1653,
3096
- "step": 10450
3097
- },
3098
- {
3099
- "epoch": 9.037988344485214,
3100
- "grad_norm": 0.1904296875,
3101
- "learning_rate": 8.979538615847543e-05,
3102
- "loss": 1.0543,
3103
- "step": 10475
3104
- },
3105
- {
3106
- "epoch": 9.059572631124542,
3107
- "grad_norm": 0.1787109375,
3108
- "learning_rate": 8.977031093279839e-05,
3109
- "loss": 1.0627,
3110
- "step": 10500
3111
- },
3112
- {
3113
- "epoch": 9.059572631124542,
3114
- "eval_loss": 1.181234359741211,
3115
- "eval_runtime": 118.6511,
3116
- "eval_samples_per_second": 4.214,
3117
- "eval_steps_per_second": 4.214,
3118
- "step": 10500
3119
- },
3120
- {
3121
- "epoch": 9.081156917763868,
3122
- "grad_norm": 0.2109375,
3123
- "learning_rate": 8.974523570712137e-05,
3124
- "loss": 1.118,
3125
- "step": 10525
3126
- },
3127
- {
3128
- "epoch": 9.102741204403195,
3129
- "grad_norm": 0.1875,
3130
- "learning_rate": 8.972016048144434e-05,
3131
- "loss": 1.1308,
3132
- "step": 10550
3133
- },
3134
- {
3135
- "epoch": 9.124325491042521,
3136
- "grad_norm": 0.28125,
3137
- "learning_rate": 8.96950852557673e-05,
3138
- "loss": 1.2378,
3139
- "step": 10575
3140
- },
3141
- {
3142
- "epoch": 9.145909777681847,
3143
- "grad_norm": 0.1943359375,
3144
- "learning_rate": 8.967001003009028e-05,
3145
- "loss": 1.0929,
3146
- "step": 10600
3147
- },
3148
- {
3149
- "epoch": 9.167494064321174,
3150
- "grad_norm": 0.2001953125,
3151
- "learning_rate": 8.964493480441324e-05,
3152
- "loss": 1.2355,
3153
- "step": 10625
3154
- },
3155
- {
3156
- "epoch": 9.1890783509605,
3157
- "grad_norm": 0.48046875,
3158
- "learning_rate": 8.96198595787362e-05,
3159
- "loss": 1.3411,
3160
- "step": 10650
3161
- },
3162
- {
3163
- "epoch": 9.210662637599828,
3164
- "grad_norm": 0.2177734375,
3165
- "learning_rate": 8.959478435305918e-05,
3166
- "loss": 1.0872,
3167
- "step": 10675
3168
- },
3169
- {
3170
- "epoch": 9.232246924239154,
3171
- "grad_norm": 0.205078125,
3172
- "learning_rate": 8.956970912738215e-05,
3173
- "loss": 1.1367,
3174
- "step": 10700
3175
- },
3176
- {
3177
- "epoch": 9.25383121087848,
3178
- "grad_norm": 0.24609375,
3179
- "learning_rate": 8.954463390170512e-05,
3180
- "loss": 1.1535,
3181
- "step": 10725
3182
- },
3183
- {
3184
- "epoch": 9.275415497517807,
3185
- "grad_norm": 0.26171875,
3186
- "learning_rate": 8.95195586760281e-05,
3187
- "loss": 1.171,
3188
- "step": 10750
3189
- },
3190
- {
3191
- "epoch": 9.296999784157133,
3192
- "grad_norm": 0.224609375,
3193
- "learning_rate": 8.949448345035105e-05,
3194
- "loss": 1.1827,
3195
- "step": 10775
3196
- },
3197
- {
3198
- "epoch": 9.31858407079646,
3199
- "grad_norm": 0.271484375,
3200
- "learning_rate": 8.946940822467403e-05,
3201
- "loss": 1.2406,
3202
- "step": 10800
3203
- },
3204
- {
3205
- "epoch": 9.340168357435786,
3206
- "grad_norm": 0.2099609375,
3207
- "learning_rate": 8.9444332998997e-05,
3208
- "loss": 1.091,
3209
- "step": 10825
3210
- },
3211
- {
3212
- "epoch": 9.361752644075114,
3213
- "grad_norm": 0.2001953125,
3214
- "learning_rate": 8.941925777331996e-05,
3215
- "loss": 1.2404,
3216
- "step": 10850
3217
- },
3218
- {
3219
- "epoch": 9.38333693071444,
3220
- "grad_norm": 0.240234375,
3221
- "learning_rate": 8.939418254764294e-05,
3222
- "loss": 1.1678,
3223
- "step": 10875
3224
- },
3225
- {
3226
- "epoch": 9.404921217353767,
3227
- "grad_norm": 0.2470703125,
3228
- "learning_rate": 8.936910732196591e-05,
3229
- "loss": 1.0658,
3230
- "step": 10900
3231
- },
3232
- {
3233
- "epoch": 9.426505503993093,
3234
- "grad_norm": 0.2236328125,
3235
- "learning_rate": 8.934403209628886e-05,
3236
- "loss": 1.2934,
3237
- "step": 10925
3238
- },
3239
- {
3240
- "epoch": 9.44808979063242,
3241
- "grad_norm": 0.1884765625,
3242
- "learning_rate": 8.931895687061184e-05,
3243
- "loss": 0.9592,
3244
- "step": 10950
3245
- },
3246
- {
3247
- "epoch": 9.469674077271746,
3248
- "grad_norm": 0.2099609375,
3249
- "learning_rate": 8.92938816449348e-05,
3250
- "loss": 1.211,
3251
- "step": 10975
3252
- },
3253
- {
3254
- "epoch": 9.491258363911072,
3255
- "grad_norm": 0.279296875,
3256
- "learning_rate": 8.926880641925777e-05,
3257
- "loss": 1.1496,
3258
- "step": 11000
3259
- },
3260
- {
3261
- "epoch": 9.491258363911072,
3262
- "eval_loss": 1.180627465248108,
3263
- "eval_runtime": 118.3608,
3264
- "eval_samples_per_second": 4.224,
3265
- "eval_steps_per_second": 4.224,
3266
- "step": 11000
3267
- },
3268
- {
3269
- "epoch": 9.5128426505504,
3270
- "grad_norm": 0.21484375,
3271
- "learning_rate": 8.924373119358075e-05,
3272
- "loss": 1.0245,
3273
- "step": 11025
3274
- },
3275
- {
3276
- "epoch": 9.534426937189727,
3277
- "grad_norm": 0.224609375,
3278
- "learning_rate": 8.921865596790372e-05,
3279
- "loss": 1.1634,
3280
- "step": 11050
3281
- },
3282
- {
3283
- "epoch": 9.556011223829053,
3284
- "grad_norm": 0.2041015625,
3285
- "learning_rate": 8.919358074222669e-05,
3286
- "loss": 1.1779,
3287
- "step": 11075
3288
- },
3289
- {
3290
- "epoch": 9.57759551046838,
3291
- "grad_norm": 0.2041015625,
3292
- "learning_rate": 8.916850551654965e-05,
3293
- "loss": 1.1216,
3294
- "step": 11100
3295
- },
3296
- {
3297
- "epoch": 9.599179797107706,
3298
- "grad_norm": 0.2119140625,
3299
- "learning_rate": 8.914343029087262e-05,
3300
- "loss": 1.1669,
3301
- "step": 11125
3302
- },
3303
- {
3304
- "epoch": 9.620764083747032,
3305
- "grad_norm": 0.2099609375,
3306
- "learning_rate": 8.911835506519558e-05,
3307
- "loss": 1.0702,
3308
- "step": 11150
3309
- },
3310
- {
3311
- "epoch": 9.642348370386358,
3312
- "grad_norm": 0.25,
3313
- "learning_rate": 8.909327983951856e-05,
3314
- "loss": 1.2574,
3315
- "step": 11175
3316
- },
3317
- {
3318
- "epoch": 9.663932657025684,
3319
- "grad_norm": 0.2490234375,
3320
- "learning_rate": 8.906820461384153e-05,
3321
- "loss": 1.2782,
3322
- "step": 11200
3323
- },
3324
- {
3325
- "epoch": 9.685516943665013,
3326
- "grad_norm": 0.228515625,
3327
- "learning_rate": 8.90431293881645e-05,
3328
- "loss": 0.999,
3329
- "step": 11225
3330
- },
3331
- {
3332
- "epoch": 9.707101230304339,
3333
- "grad_norm": 0.1943359375,
3334
- "learning_rate": 8.901805416248746e-05,
3335
- "loss": 1.2004,
3336
- "step": 11250
3337
- },
3338
- {
3339
- "epoch": 9.728685516943665,
3340
- "grad_norm": 0.279296875,
3341
- "learning_rate": 8.899297893681043e-05,
3342
- "loss": 1.2429,
3343
- "step": 11275
3344
- },
3345
- {
3346
- "epoch": 9.750269803582992,
3347
- "grad_norm": 0.2890625,
3348
- "learning_rate": 8.896790371113341e-05,
3349
- "loss": 1.1644,
3350
- "step": 11300
3351
- },
3352
- {
3353
- "epoch": 9.771854090222318,
3354
- "grad_norm": 0.240234375,
3355
- "learning_rate": 8.894282848545638e-05,
3356
- "loss": 1.1337,
3357
- "step": 11325
3358
- },
3359
- {
3360
- "epoch": 9.793438376861644,
3361
- "grad_norm": 0.236328125,
3362
- "learning_rate": 8.891775325977934e-05,
3363
- "loss": 1.0242,
3364
- "step": 11350
3365
- },
3366
- {
3367
- "epoch": 9.81502266350097,
3368
- "grad_norm": 0.2431640625,
3369
- "learning_rate": 8.889267803410231e-05,
3370
- "loss": 1.0413,
3371
- "step": 11375
3372
- },
3373
- {
3374
- "epoch": 9.836606950140299,
3375
- "grad_norm": 0.2275390625,
3376
- "learning_rate": 8.886760280842528e-05,
3377
- "loss": 1.0505,
3378
- "step": 11400
3379
- },
3380
- {
3381
- "epoch": 9.858191236779625,
3382
- "grad_norm": 0.2578125,
3383
- "learning_rate": 8.884252758274824e-05,
3384
- "loss": 1.3221,
3385
- "step": 11425
3386
- },
3387
- {
3388
- "epoch": 9.879775523418951,
3389
- "grad_norm": 0.27734375,
3390
- "learning_rate": 8.881745235707122e-05,
3391
- "loss": 1.0816,
3392
- "step": 11450
3393
- },
3394
- {
3395
- "epoch": 9.901359810058278,
3396
- "grad_norm": 0.1865234375,
3397
- "learning_rate": 8.879237713139419e-05,
3398
- "loss": 1.1502,
3399
- "step": 11475
3400
- },
3401
- {
3402
- "epoch": 9.922944096697604,
3403
- "grad_norm": 0.1865234375,
3404
- "learning_rate": 8.876730190571715e-05,
3405
- "loss": 1.2071,
3406
- "step": 11500
3407
- },
3408
- {
3409
- "epoch": 9.922944096697604,
3410
- "eval_loss": 1.1804978847503662,
3411
- "eval_runtime": 118.4662,
3412
- "eval_samples_per_second": 4.221,
3413
- "eval_steps_per_second": 4.221,
3414
- "step": 11500
3415
- },
3416
- {
3417
- "epoch": 9.94452838333693,
3418
- "grad_norm": 0.19140625,
3419
- "learning_rate": 8.874222668004012e-05,
3420
- "loss": 1.0383,
3421
- "step": 11525
3422
- },
3423
- {
3424
- "epoch": 9.966112669976257,
3425
- "grad_norm": 0.2734375,
3426
- "learning_rate": 8.871715145436309e-05,
3427
- "loss": 1.2457,
3428
- "step": 11550
3429
- },
3430
- {
3431
- "epoch": 9.987696956615585,
3432
- "grad_norm": 0.259765625,
3433
- "learning_rate": 8.869207622868607e-05,
3434
- "loss": 1.1177,
3435
- "step": 11575
3436
- },
3437
- {
3438
- "epoch": 10.00863371465573,
3439
- "grad_norm": 0.212890625,
3440
- "learning_rate": 8.866700100300903e-05,
3441
- "loss": 0.9983,
3442
- "step": 11600
3443
- },
3444
- {
3445
- "epoch": 10.030218001295058,
3446
- "grad_norm": 0.27734375,
3447
- "learning_rate": 8.8641925777332e-05,
3448
- "loss": 1.168,
3449
- "step": 11625
3450
- },
3451
- {
3452
- "epoch": 10.051802287934384,
3453
- "grad_norm": 0.185546875,
3454
- "learning_rate": 8.861685055165498e-05,
3455
- "loss": 1.1315,
3456
- "step": 11650
3457
- },
3458
- {
3459
- "epoch": 10.07338657457371,
3460
- "grad_norm": 0.228515625,
3461
- "learning_rate": 8.859177532597793e-05,
3462
- "loss": 1.0122,
3463
- "step": 11675
3464
- },
3465
- {
3466
- "epoch": 10.094970861213037,
3467
- "grad_norm": 0.244140625,
3468
- "learning_rate": 8.85667001003009e-05,
3469
- "loss": 1.1507,
3470
- "step": 11700
3471
- },
3472
- {
3473
- "epoch": 10.116555147852363,
3474
- "grad_norm": 0.1767578125,
3475
- "learning_rate": 8.854162487462388e-05,
3476
- "loss": 1.2644,
3477
- "step": 11725
3478
- },
3479
- {
3480
- "epoch": 10.13813943449169,
3481
- "grad_norm": 0.171875,
3482
- "learning_rate": 8.851654964894684e-05,
3483
- "loss": 1.0958,
3484
- "step": 11750
3485
- },
3486
- {
3487
- "epoch": 10.159723721131016,
3488
- "grad_norm": 0.18359375,
3489
- "learning_rate": 8.849147442326981e-05,
3490
- "loss": 1.1193,
3491
- "step": 11775
3492
- },
3493
- {
3494
- "epoch": 10.181308007770344,
3495
- "grad_norm": 0.25,
3496
- "learning_rate": 8.846639919759279e-05,
3497
- "loss": 1.1638,
3498
- "step": 11800
3499
- },
3500
- {
3501
- "epoch": 10.20289229440967,
3502
- "grad_norm": 0.244140625,
3503
- "learning_rate": 8.844132397191574e-05,
3504
- "loss": 1.1991,
3505
- "step": 11825
3506
- },
3507
- {
3508
- "epoch": 10.224476581048997,
3509
- "grad_norm": 0.2138671875,
3510
- "learning_rate": 8.841624874623871e-05,
3511
- "loss": 1.0886,
3512
- "step": 11850
3513
- },
3514
- {
3515
- "epoch": 10.246060867688323,
3516
- "grad_norm": 0.265625,
3517
- "learning_rate": 8.839117352056169e-05,
3518
- "loss": 1.1301,
3519
- "step": 11875
3520
- },
3521
- {
3522
- "epoch": 10.26764515432765,
3523
- "grad_norm": 0.271484375,
3524
- "learning_rate": 8.836609829488466e-05,
3525
- "loss": 1.0516,
3526
- "step": 11900
3527
- },
3528
- {
3529
- "epoch": 10.289229440966976,
3530
- "grad_norm": 0.23828125,
3531
- "learning_rate": 8.834102306920762e-05,
3532
- "loss": 1.1763,
3533
- "step": 11925
3534
- },
3535
- {
3536
- "epoch": 10.310813727606302,
3537
- "grad_norm": 0.2412109375,
3538
- "learning_rate": 8.83159478435306e-05,
3539
- "loss": 0.9276,
3540
- "step": 11950
3541
- },
3542
- {
3543
- "epoch": 10.33239801424563,
3544
- "grad_norm": 0.212890625,
3545
- "learning_rate": 8.829087261785356e-05,
3546
- "loss": 1.2325,
3547
- "step": 11975
3548
- },
3549
- {
3550
- "epoch": 10.353982300884956,
3551
- "grad_norm": 0.181640625,
3552
- "learning_rate": 8.826579739217654e-05,
3553
- "loss": 1.1026,
3554
- "step": 12000
3555
- },
3556
- {
3557
- "epoch": 10.353982300884956,
3558
- "eval_loss": 1.179988980293274,
3559
- "eval_runtime": 118.6831,
3560
- "eval_samples_per_second": 4.213,
3561
- "eval_steps_per_second": 4.213,
3562
- "step": 12000
3563
- }
3564
- ],
3565
- "logging_steps": 25,
3566
- "max_steps": 100000,
3567
- "num_input_tokens_seen": 0,
3568
- "num_train_epochs": 87,
3569
- "save_steps": 3000,
3570
- "stateful_callbacks": {
3571
- "TrainerControl": {
3572
- "args": {
3573
- "should_epoch_stop": false,
3574
- "should_evaluate": false,
3575
- "should_log": false,
3576
- "should_save": true,
3577
- "should_training_stop": false
3578
- },
3579
- "attributes": {}
3580
- }
3581
- },
3582
- "total_flos": 0.0,
3583
- "train_batch_size": 1,
3584
- "trial_name": null,
3585
- "trial_params": null
3586
- }