DmitriyYurckML commited on
Commit
bc349ab
·
verified ·
1 Parent(s): b972232

Delete checkpoint-192/trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. checkpoint-192/trainer_state.json +0 -1378
checkpoint-192/trainer_state.json DELETED
@@ -1,1378 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 3.0,
6
- "eval_steps": 500,
7
- "global_step": 192,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.015625,
14
- "grad_norm": 2.7545292377471924,
15
- "learning_rate": 0.0,
16
- "loss": 1.782,
17
- "step": 1
18
- },
19
- {
20
- "epoch": 0.03125,
21
- "grad_norm": 2.986503839492798,
22
- "learning_rate": 4e-05,
23
- "loss": 1.9094,
24
- "step": 2
25
- },
26
- {
27
- "epoch": 0.046875,
28
- "grad_norm": 1.059362530708313,
29
- "learning_rate": 8e-05,
30
- "loss": 1.555,
31
- "step": 3
32
- },
33
- {
34
- "epoch": 0.0625,
35
- "grad_norm": 0.9862841367721558,
36
- "learning_rate": 0.00012,
37
- "loss": 1.5554,
38
- "step": 4
39
- },
40
- {
41
- "epoch": 0.078125,
42
- "grad_norm": 1.2146193981170654,
43
- "learning_rate": 0.00016,
44
- "loss": 1.5662,
45
- "step": 5
46
- },
47
- {
48
- "epoch": 0.09375,
49
- "grad_norm": 0.6501009464263916,
50
- "learning_rate": 0.0002,
51
- "loss": 1.4973,
52
- "step": 6
53
- },
54
- {
55
- "epoch": 0.109375,
56
- "grad_norm": 0.7994824051856995,
57
- "learning_rate": 0.00019998588839790777,
58
- "loss": 1.5113,
59
- "step": 7
60
- },
61
- {
62
- "epoch": 0.125,
63
- "grad_norm": 0.6340547204017639,
64
- "learning_rate": 0.00019994355757437738,
65
- "loss": 1.4214,
66
- "step": 8
67
- },
68
- {
69
- "epoch": 0.140625,
70
- "grad_norm": 0.6426404118537903,
71
- "learning_rate": 0.00019987301947652352,
72
- "loss": 1.3852,
73
- "step": 9
74
- },
75
- {
76
- "epoch": 0.15625,
77
- "grad_norm": 0.5500874519348145,
78
- "learning_rate": 0.0001997742940124576,
79
- "loss": 1.2877,
80
- "step": 10
81
- },
82
- {
83
- "epoch": 0.171875,
84
- "grad_norm": 0.5563384294509888,
85
- "learning_rate": 0.000199647409045669,
86
- "loss": 1.4758,
87
- "step": 11
88
- },
89
- {
90
- "epoch": 0.1875,
91
- "grad_norm": 0.6488623023033142,
92
- "learning_rate": 0.0001994924003871609,
93
- "loss": 1.4119,
94
- "step": 12
95
- },
96
- {
97
- "epoch": 0.203125,
98
- "grad_norm": 0.5619658827781677,
99
- "learning_rate": 0.0001993093117853435,
100
- "loss": 1.2673,
101
- "step": 13
102
- },
103
- {
104
- "epoch": 0.21875,
105
- "grad_norm": 0.5723183155059814,
106
- "learning_rate": 0.00019909819491368676,
107
- "loss": 1.4025,
108
- "step": 14
109
- },
110
- {
111
- "epoch": 0.234375,
112
- "grad_norm": 0.5521570444107056,
113
- "learning_rate": 0.0001988591093561364,
114
- "loss": 1.3465,
115
- "step": 15
116
- },
117
- {
118
- "epoch": 0.25,
119
- "grad_norm": 0.5407305955886841,
120
- "learning_rate": 0.00019859212259029752,
121
- "loss": 1.1943,
122
- "step": 16
123
- },
124
- {
125
- "epoch": 0.265625,
126
- "grad_norm": 0.5799192190170288,
127
- "learning_rate": 0.0001982973099683902,
128
- "loss": 1.4066,
129
- "step": 17
130
- },
131
- {
132
- "epoch": 0.28125,
133
- "grad_norm": 0.5761305689811707,
134
- "learning_rate": 0.00019797475469598267,
135
- "loss": 1.3993,
136
- "step": 18
137
- },
138
- {
139
- "epoch": 0.296875,
140
- "grad_norm": 0.5504961013793945,
141
- "learning_rate": 0.00019762454780850806,
142
- "loss": 1.2544,
143
- "step": 19
144
- },
145
- {
146
- "epoch": 0.3125,
147
- "grad_norm": 0.5236067175865173,
148
- "learning_rate": 0.00019724678814557128,
149
- "loss": 1.2237,
150
- "step": 20
151
- },
152
- {
153
- "epoch": 0.328125,
154
- "grad_norm": 0.5526680946350098,
155
- "learning_rate": 0.0001968415823230534,
156
- "loss": 1.2591,
157
- "step": 21
158
- },
159
- {
160
- "epoch": 0.34375,
161
- "grad_norm": 0.5238738656044006,
162
- "learning_rate": 0.00019640904470302097,
163
- "loss": 1.1508,
164
- "step": 22
165
- },
166
- {
167
- "epoch": 0.359375,
168
- "grad_norm": 0.6222432851791382,
169
- "learning_rate": 0.00019594929736144976,
170
- "loss": 1.2375,
171
- "step": 23
172
- },
173
- {
174
- "epoch": 0.375,
175
- "grad_norm": 0.5575475692749023,
176
- "learning_rate": 0.00019546247005377065,
177
- "loss": 1.2426,
178
- "step": 24
179
- },
180
- {
181
- "epoch": 0.390625,
182
- "grad_norm": 0.5280503034591675,
183
- "learning_rate": 0.00019494870017824876,
184
- "loss": 1.2745,
185
- "step": 25
186
- },
187
- {
188
- "epoch": 0.40625,
189
- "grad_norm": 0.4982561469078064,
190
- "learning_rate": 0.00019440813273720504,
191
- "loss": 1.2538,
192
- "step": 26
193
- },
194
- {
195
- "epoch": 0.421875,
196
- "grad_norm": 0.5352105498313904,
197
- "learning_rate": 0.0001938409202960922,
198
- "loss": 1.28,
199
- "step": 27
200
- },
201
- {
202
- "epoch": 0.4375,
203
- "grad_norm": 0.5177728533744812,
204
- "learning_rate": 0.00019324722294043558,
205
- "loss": 1.3372,
206
- "step": 28
207
- },
208
- {
209
- "epoch": 0.453125,
210
- "grad_norm": 0.5144365429878235,
211
- "learning_rate": 0.00019262720823065216,
212
- "loss": 1.351,
213
- "step": 29
214
- },
215
- {
216
- "epoch": 0.46875,
217
- "grad_norm": 0.4943947196006775,
218
- "learning_rate": 0.00019198105115475947,
219
- "loss": 1.1735,
220
- "step": 30
221
- },
222
- {
223
- "epoch": 0.484375,
224
- "grad_norm": 0.5117695331573486,
225
- "learning_rate": 0.00019130893407898834,
226
- "loss": 1.2778,
227
- "step": 31
228
- },
229
- {
230
- "epoch": 0.5,
231
- "grad_norm": 0.5207276940345764,
232
- "learning_rate": 0.0001906110466963134,
233
- "loss": 1.3005,
234
- "step": 32
235
- },
236
- {
237
- "epoch": 0.515625,
238
- "grad_norm": 0.49616295099258423,
239
- "learning_rate": 0.00018988758597291577,
240
- "loss": 1.2386,
241
- "step": 33
242
- },
243
- {
244
- "epoch": 0.53125,
245
- "grad_norm": 0.5076295137405396,
246
- "learning_rate": 0.00018913875609259247,
247
- "loss": 1.3428,
248
- "step": 34
249
- },
250
- {
251
- "epoch": 0.546875,
252
- "grad_norm": 0.5187010169029236,
253
- "learning_rate": 0.00018836476839912967,
254
- "loss": 1.2319,
255
- "step": 35
256
- },
257
- {
258
- "epoch": 0.5625,
259
- "grad_norm": 0.7806910872459412,
260
- "learning_rate": 0.00018756584133665448,
261
- "loss": 1.2131,
262
- "step": 36
263
- },
264
- {
265
- "epoch": 0.578125,
266
- "grad_norm": 0.49942177534103394,
267
- "learning_rate": 0.00018674220038798298,
268
- "loss": 1.2394,
269
- "step": 37
270
- },
271
- {
272
- "epoch": 0.59375,
273
- "grad_norm": 0.5420028567314148,
274
- "learning_rate": 0.0001858940780109819,
275
- "loss": 1.2725,
276
- "step": 38
277
- },
278
- {
279
- "epoch": 0.609375,
280
- "grad_norm": 0.5082110166549683,
281
- "learning_rate": 0.00018502171357296144,
282
- "loss": 1.3452,
283
- "step": 39
284
- },
285
- {
286
- "epoch": 0.625,
287
- "grad_norm": 0.5396978855133057,
288
- "learning_rate": 0.00018412535328311814,
289
- "loss": 1.2206,
290
- "step": 40
291
- },
292
- {
293
- "epoch": 0.640625,
294
- "grad_norm": 0.5082774758338928,
295
- "learning_rate": 0.00018320525012304685,
296
- "loss": 1.2709,
297
- "step": 41
298
- },
299
- {
300
- "epoch": 0.65625,
301
- "grad_norm": 0.46711206436157227,
302
- "learning_rate": 0.00018226166377534114,
303
- "loss": 1.3097,
304
- "step": 42
305
- },
306
- {
307
- "epoch": 0.671875,
308
- "grad_norm": 0.5251287221908569,
309
- "learning_rate": 0.00018129486055030257,
310
- "loss": 1.4228,
311
- "step": 43
312
- },
313
- {
314
- "epoch": 0.6875,
315
- "grad_norm": 0.5274046659469604,
316
- "learning_rate": 0.00018030511331077945,
317
- "loss": 1.2037,
318
- "step": 44
319
- },
320
- {
321
- "epoch": 0.703125,
322
- "grad_norm": 0.5073964595794678,
323
- "learning_rate": 0.00017929270139515604,
324
- "loss": 1.1935,
325
- "step": 45
326
- },
327
- {
328
- "epoch": 0.71875,
329
- "grad_norm": 0.4888913035392761,
330
- "learning_rate": 0.0001782579105385145,
331
- "loss": 1.2166,
332
- "step": 46
333
- },
334
- {
335
- "epoch": 0.734375,
336
- "grad_norm": 0.500054121017456,
337
- "learning_rate": 0.0001772010327919912,
338
- "loss": 1.258,
339
- "step": 47
340
- },
341
- {
342
- "epoch": 0.75,
343
- "grad_norm": 0.5388707518577576,
344
- "learning_rate": 0.0001761223664403505,
345
- "loss": 1.3851,
346
- "step": 48
347
- },
348
- {
349
- "epoch": 0.765625,
350
- "grad_norm": 0.5153979063034058,
351
- "learning_rate": 0.0001750222159177993,
352
- "loss": 1.1528,
353
- "step": 49
354
- },
355
- {
356
- "epoch": 0.78125,
357
- "grad_norm": 0.49915435910224915,
358
- "learning_rate": 0.00017390089172206592,
359
- "loss": 1.2669,
360
- "step": 50
361
- },
362
- {
363
- "epoch": 0.796875,
364
- "grad_norm": 0.5064976215362549,
365
- "learning_rate": 0.0001727587103267677,
366
- "loss": 1.2352,
367
- "step": 51
368
- },
369
- {
370
- "epoch": 0.8125,
371
- "grad_norm": 0.5197826623916626,
372
- "learning_rate": 0.00017159599409209193,
373
- "loss": 1.3366,
374
- "step": 52
375
- },
376
- {
377
- "epoch": 0.828125,
378
- "grad_norm": 0.49064281582832336,
379
- "learning_rate": 0.0001704130711738157,
380
- "loss": 1.3363,
381
- "step": 53
382
- },
383
- {
384
- "epoch": 0.84375,
385
- "grad_norm": 0.5057505965232849,
386
- "learning_rate": 0.0001692102754306895,
387
- "loss": 1.2715,
388
- "step": 54
389
- },
390
- {
391
- "epoch": 0.859375,
392
- "grad_norm": 0.49357378482818604,
393
- "learning_rate": 0.00016798794633021192,
394
- "loss": 1.2188,
395
- "step": 55
396
- },
397
- {
398
- "epoch": 0.875,
399
- "grad_norm": 0.5010667443275452,
400
- "learning_rate": 0.0001667464288528207,
401
- "loss": 1.3006,
402
- "step": 56
403
- },
404
- {
405
- "epoch": 0.890625,
406
- "grad_norm": 0.5323114991188049,
407
- "learning_rate": 0.00016548607339452853,
408
- "loss": 1.4191,
409
- "step": 57
410
- },
411
- {
412
- "epoch": 0.90625,
413
- "grad_norm": 0.547682523727417,
414
- "learning_rate": 0.00016420723566802983,
415
- "loss": 1.3198,
416
- "step": 58
417
- },
418
- {
419
- "epoch": 0.921875,
420
- "grad_norm": 0.5031412839889526,
421
- "learning_rate": 0.00016291027660230733,
422
- "loss": 1.269,
423
- "step": 59
424
- },
425
- {
426
- "epoch": 0.9375,
427
- "grad_norm": 0.5092846751213074,
428
- "learning_rate": 0.00016159556224076637,
429
- "loss": 1.1915,
430
- "step": 60
431
- },
432
- {
433
- "epoch": 0.953125,
434
- "grad_norm": 0.560909628868103,
435
- "learning_rate": 0.00016026346363792567,
436
- "loss": 1.3038,
437
- "step": 61
438
- },
439
- {
440
- "epoch": 0.96875,
441
- "grad_norm": 0.540993869304657,
442
- "learning_rate": 0.00015891435675469376,
443
- "loss": 1.4336,
444
- "step": 62
445
- },
446
- {
447
- "epoch": 0.984375,
448
- "grad_norm": 0.4856269061565399,
449
- "learning_rate": 0.000157548622352261,
450
- "loss": 1.2208,
451
- "step": 63
452
- },
453
- {
454
- "epoch": 1.0,
455
- "grad_norm": 0.6747854948043823,
456
- "learning_rate": 0.00015616664588463647,
457
- "loss": 1.3273,
458
- "step": 64
459
- },
460
- {
461
- "epoch": 1.015625,
462
- "grad_norm": 0.5813027024269104,
463
- "learning_rate": 0.00015476881738986037,
464
- "loss": 0.896,
465
- "step": 65
466
- },
467
- {
468
- "epoch": 1.03125,
469
- "grad_norm": 0.49314695596694946,
470
- "learning_rate": 0.00015335553137992285,
471
- "loss": 0.9917,
472
- "step": 66
473
- },
474
- {
475
- "epoch": 1.046875,
476
- "grad_norm": 0.5075877904891968,
477
- "learning_rate": 0.0001519271867294203,
478
- "loss": 0.9436,
479
- "step": 67
480
- },
481
- {
482
- "epoch": 1.0625,
483
- "grad_norm": 0.6151750087738037,
484
- "learning_rate": 0.0001504841865629799,
485
- "loss": 1.0398,
486
- "step": 68
487
- },
488
- {
489
- "epoch": 1.078125,
490
- "grad_norm": 0.6225054860115051,
491
- "learning_rate": 0.0001490269381414849,
492
- "loss": 0.9602,
493
- "step": 69
494
- },
495
- {
496
- "epoch": 1.09375,
497
- "grad_norm": 0.6431002616882324,
498
- "learning_rate": 0.0001475558527471329,
499
- "loss": 0.8976,
500
- "step": 70
501
- },
502
- {
503
- "epoch": 1.109375,
504
- "grad_norm": 0.6990566253662109,
505
- "learning_rate": 0.00014607134556735834,
506
- "loss": 0.8879,
507
- "step": 71
508
- },
509
- {
510
- "epoch": 1.125,
511
- "grad_norm": 0.9784147143363953,
512
- "learning_rate": 0.00014457383557765386,
513
- "loss": 0.9902,
514
- "step": 72
515
- },
516
- {
517
- "epoch": 1.140625,
518
- "grad_norm": 0.6030399799346924,
519
- "learning_rate": 0.00014306374542332143,
520
- "loss": 0.94,
521
- "step": 73
522
- },
523
- {
524
- "epoch": 1.15625,
525
- "grad_norm": 0.5666849613189697,
526
- "learning_rate": 0.00014154150130018866,
527
- "loss": 0.8381,
528
- "step": 74
529
- },
530
- {
531
- "epoch": 1.171875,
532
- "grad_norm": 0.626495361328125,
533
- "learning_rate": 0.00014000753283432266,
534
- "loss": 0.868,
535
- "step": 75
536
- },
537
- {
538
- "epoch": 1.1875,
539
- "grad_norm": 0.5842832922935486,
540
- "learning_rate": 0.00013846227296077568,
541
- "loss": 0.8837,
542
- "step": 76
543
- },
544
- {
545
- "epoch": 1.203125,
546
- "grad_norm": 0.5995466113090515,
547
- "learning_rate": 0.000136906157801397,
548
- "loss": 0.979,
549
- "step": 77
550
- },
551
- {
552
- "epoch": 1.21875,
553
- "grad_norm": 0.6081043481826782,
554
- "learning_rate": 0.0001353396265417454,
555
- "loss": 0.882,
556
- "step": 78
557
- },
558
- {
559
- "epoch": 1.234375,
560
- "grad_norm": 0.5832274556159973,
561
- "learning_rate": 0.00013376312130713687,
562
- "loss": 0.9649,
563
- "step": 79
564
- },
565
- {
566
- "epoch": 1.25,
567
- "grad_norm": 0.6251536011695862,
568
- "learning_rate": 0.0001321770870378628,
569
- "loss": 0.917,
570
- "step": 80
571
- },
572
- {
573
- "epoch": 1.265625,
574
- "grad_norm": 0.6604623794555664,
575
- "learning_rate": 0.00013058197136361343,
576
- "loss": 0.9478,
577
- "step": 81
578
- },
579
- {
580
- "epoch": 1.28125,
581
- "grad_norm": 0.6800036430358887,
582
- "learning_rate": 0.00012897822447714247,
583
- "loss": 0.8825,
584
- "step": 82
585
- },
586
- {
587
- "epoch": 1.296875,
588
- "grad_norm": 0.709237813949585,
589
- "learning_rate": 0.0001273662990072083,
590
- "loss": 0.9113,
591
- "step": 83
592
- },
593
- {
594
- "epoch": 1.3125,
595
- "grad_norm": 0.6891036629676819,
596
- "learning_rate": 0.00012574664989082758,
597
- "loss": 0.9004,
598
- "step": 84
599
- },
600
- {
601
- "epoch": 1.328125,
602
- "grad_norm": 0.5835941433906555,
603
- "learning_rate": 0.0001241197342448775,
604
- "loss": 0.9267,
605
- "step": 85
606
- },
607
- {
608
- "epoch": 1.34375,
609
- "grad_norm": 0.5817910432815552,
610
- "learning_rate": 0.0001224860112370828,
611
- "loss": 0.8757,
612
- "step": 86
613
- },
614
- {
615
- "epoch": 1.359375,
616
- "grad_norm": 0.6036533117294312,
617
- "learning_rate": 0.00012084594195642367,
618
- "loss": 0.8987,
619
- "step": 87
620
- },
621
- {
622
- "epoch": 1.375,
623
- "grad_norm": 0.623345136642456,
624
- "learning_rate": 0.00011919998928300203,
625
- "loss": 0.9115,
626
- "step": 88
627
- },
628
- {
629
- "epoch": 1.390625,
630
- "grad_norm": 0.6265957951545715,
631
- "learning_rate": 0.00011754861775740162,
632
- "loss": 0.874,
633
- "step": 89
634
- },
635
- {
636
- "epoch": 1.40625,
637
- "grad_norm": 0.6529715061187744,
638
- "learning_rate": 0.00011589229344957999,
639
- "loss": 0.8702,
640
- "step": 90
641
- },
642
- {
643
- "epoch": 1.421875,
644
- "grad_norm": 0.6559955477714539,
645
- "learning_rate": 0.00011423148382732853,
646
- "loss": 0.9008,
647
- "step": 91
648
- },
649
- {
650
- "epoch": 1.4375,
651
- "grad_norm": 0.6313908100128174,
652
- "learning_rate": 0.00011256665762433798,
653
- "loss": 0.942,
654
- "step": 92
655
- },
656
- {
657
- "epoch": 1.453125,
658
- "grad_norm": 0.6203813552856445,
659
- "learning_rate": 0.00011089828470790693,
660
- "loss": 0.9118,
661
- "step": 93
662
- },
663
- {
664
- "epoch": 1.46875,
665
- "grad_norm": 0.7021353244781494,
666
- "learning_rate": 0.00010922683594633021,
667
- "loss": 0.7378,
668
- "step": 94
669
- },
670
- {
671
- "epoch": 1.484375,
672
- "grad_norm": 0.6011958122253418,
673
- "learning_rate": 0.00010755278307600458,
674
- "loss": 0.8333,
675
- "step": 95
676
- },
677
- {
678
- "epoch": 1.5,
679
- "grad_norm": 0.7509216070175171,
680
- "learning_rate": 0.0001058765985682898,
681
- "loss": 0.9437,
682
- "step": 96
683
- },
684
- {
685
- "epoch": 1.515625,
686
- "grad_norm": 0.638576865196228,
687
- "learning_rate": 0.00010419875549616196,
688
- "loss": 0.9436,
689
- "step": 97
690
- },
691
- {
692
- "epoch": 1.53125,
693
- "grad_norm": 0.6763246059417725,
694
- "learning_rate": 0.00010251972740069724,
695
- "loss": 0.9515,
696
- "step": 98
697
- },
698
- {
699
- "epoch": 1.546875,
700
- "grad_norm": 0.6507485508918762,
701
- "learning_rate": 0.00010083998815742335,
702
- "loss": 0.9399,
703
- "step": 99
704
- },
705
- {
706
- "epoch": 1.5625,
707
- "grad_norm": 0.615294337272644,
708
- "learning_rate": 9.916001184257668e-05,
709
- "loss": 0.9096,
710
- "step": 100
711
- },
712
- {
713
- "epoch": 1.578125,
714
- "grad_norm": 0.580854594707489,
715
- "learning_rate": 9.748027259930276e-05,
716
- "loss": 1.0188,
717
- "step": 101
718
- },
719
- {
720
- "epoch": 1.59375,
721
- "grad_norm": 0.5845345258712769,
722
- "learning_rate": 9.580124450383803e-05,
723
- "loss": 0.8994,
724
- "step": 102
725
- },
726
- {
727
- "epoch": 1.609375,
728
- "grad_norm": 0.6133562922477722,
729
- "learning_rate": 9.412340143171024e-05,
730
- "loss": 0.8916,
731
- "step": 103
732
- },
733
- {
734
- "epoch": 1.625,
735
- "grad_norm": 0.6874757409095764,
736
- "learning_rate": 9.244721692399545e-05,
737
- "loss": 0.9934,
738
- "step": 104
739
- },
740
- {
741
- "epoch": 1.640625,
742
- "grad_norm": 0.5951738357543945,
743
- "learning_rate": 9.077316405366981e-05,
744
- "loss": 0.8346,
745
- "step": 105
746
- },
747
- {
748
- "epoch": 1.65625,
749
- "grad_norm": 0.6542007923126221,
750
- "learning_rate": 8.910171529209305e-05,
751
- "loss": 0.883,
752
- "step": 106
753
- },
754
- {
755
- "epoch": 1.671875,
756
- "grad_norm": 0.6282486915588379,
757
- "learning_rate": 8.743334237566202e-05,
758
- "loss": 0.8681,
759
- "step": 107
760
- },
761
- {
762
- "epoch": 1.6875,
763
- "grad_norm": 0.6178871989250183,
764
- "learning_rate": 8.57685161726715e-05,
765
- "loss": 0.8562,
766
- "step": 108
767
- },
768
- {
769
- "epoch": 1.703125,
770
- "grad_norm": 0.6271396279335022,
771
- "learning_rate": 8.410770655042003e-05,
772
- "loss": 0.8078,
773
- "step": 109
774
- },
775
- {
776
- "epoch": 1.71875,
777
- "grad_norm": 0.6065046191215515,
778
- "learning_rate": 8.245138224259841e-05,
779
- "loss": 0.8439,
780
- "step": 110
781
- },
782
- {
783
- "epoch": 1.734375,
784
- "grad_norm": 0.6628541350364685,
785
- "learning_rate": 8.0800010716998e-05,
786
- "loss": 0.93,
787
- "step": 111
788
- },
789
- {
790
- "epoch": 1.75,
791
- "grad_norm": 0.6493437886238098,
792
- "learning_rate": 7.915405804357633e-05,
793
- "loss": 0.8781,
794
- "step": 112
795
- },
796
- {
797
- "epoch": 1.765625,
798
- "grad_norm": 0.6871931552886963,
799
- "learning_rate": 7.751398876291725e-05,
800
- "loss": 0.8897,
801
- "step": 113
802
- },
803
- {
804
- "epoch": 1.78125,
805
- "grad_norm": 0.6025776863098145,
806
- "learning_rate": 7.588026575512251e-05,
807
- "loss": 0.8177,
808
- "step": 114
809
- },
810
- {
811
- "epoch": 1.796875,
812
- "grad_norm": 0.6263808608055115,
813
- "learning_rate": 7.425335010917244e-05,
814
- "loss": 0.8712,
815
- "step": 115
816
- },
817
- {
818
- "epoch": 1.8125,
819
- "grad_norm": 0.6137387752532959,
820
- "learning_rate": 7.263370099279172e-05,
821
- "loss": 0.8607,
822
- "step": 116
823
- },
824
- {
825
- "epoch": 1.828125,
826
- "grad_norm": 0.6260087490081787,
827
- "learning_rate": 7.102177552285753e-05,
828
- "loss": 0.9137,
829
- "step": 117
830
- },
831
- {
832
- "epoch": 1.84375,
833
- "grad_norm": 0.6399638056755066,
834
- "learning_rate": 6.941802863638659e-05,
835
- "loss": 0.9687,
836
- "step": 118
837
- },
838
- {
839
- "epoch": 1.859375,
840
- "grad_norm": 0.6690117120742798,
841
- "learning_rate": 6.782291296213722e-05,
842
- "loss": 0.9625,
843
- "step": 119
844
- },
845
- {
846
- "epoch": 1.875,
847
- "grad_norm": 0.6296346783638,
848
- "learning_rate": 6.623687869286313e-05,
849
- "loss": 0.8872,
850
- "step": 120
851
- },
852
- {
853
- "epoch": 1.890625,
854
- "grad_norm": 0.5934789776802063,
855
- "learning_rate": 6.466037345825462e-05,
856
- "loss": 0.7974,
857
- "step": 121
858
- },
859
- {
860
- "epoch": 1.90625,
861
- "grad_norm": 0.6167766451835632,
862
- "learning_rate": 6.309384219860301e-05,
863
- "loss": 0.8482,
864
- "step": 122
865
- },
866
- {
867
- "epoch": 1.921875,
868
- "grad_norm": 0.6420045495033264,
869
- "learning_rate": 6.153772703922433e-05,
870
- "loss": 0.9149,
871
- "step": 123
872
- },
873
- {
874
- "epoch": 1.9375,
875
- "grad_norm": 0.6557563543319702,
876
- "learning_rate": 5.999246716567737e-05,
877
- "loss": 0.9324,
878
- "step": 124
879
- },
880
- {
881
- "epoch": 1.953125,
882
- "grad_norm": 0.6023949980735779,
883
- "learning_rate": 5.845849869981137e-05,
884
- "loss": 0.8399,
885
- "step": 125
886
- },
887
- {
888
- "epoch": 1.96875,
889
- "grad_norm": 0.6570467352867126,
890
- "learning_rate": 5.693625457667862e-05,
891
- "loss": 0.9289,
892
- "step": 126
893
- },
894
- {
895
- "epoch": 1.984375,
896
- "grad_norm": 0.6379124522209167,
897
- "learning_rate": 5.542616442234618e-05,
898
- "loss": 0.9081,
899
- "step": 127
900
- },
901
- {
902
- "epoch": 2.0,
903
- "grad_norm": 0.7978578209877014,
904
- "learning_rate": 5.392865443264163e-05,
905
- "loss": 0.9279,
906
- "step": 128
907
- },
908
- {
909
- "epoch": 2.015625,
910
- "grad_norm": 0.7730198502540588,
911
- "learning_rate": 5.244414725286717e-05,
912
- "loss": 0.6409,
913
- "step": 129
914
- },
915
- {
916
- "epoch": 2.03125,
917
- "grad_norm": 0.7514629364013672,
918
- "learning_rate": 5.0973061858515145e-05,
919
- "loss": 0.6736,
920
- "step": 130
921
- },
922
- {
923
- "epoch": 2.046875,
924
- "grad_norm": 0.6814653873443604,
925
- "learning_rate": 4.9515813437020144e-05,
926
- "loss": 0.6016,
927
- "step": 131
928
- },
929
- {
930
- "epoch": 2.0625,
931
- "grad_norm": 0.8626322150230408,
932
- "learning_rate": 4.807281327057972e-05,
933
- "loss": 0.5432,
934
- "step": 132
935
- },
936
- {
937
- "epoch": 2.078125,
938
- "grad_norm": 1.3772077560424805,
939
- "learning_rate": 4.6644468620077174e-05,
940
- "loss": 0.6443,
941
- "step": 133
942
- },
943
- {
944
- "epoch": 2.09375,
945
- "grad_norm": 1.0154165029525757,
946
- "learning_rate": 4.523118261013969e-05,
947
- "loss": 0.5985,
948
- "step": 134
949
- },
950
- {
951
- "epoch": 2.109375,
952
- "grad_norm": 0.7242003679275513,
953
- "learning_rate": 4.383335411536357e-05,
954
- "loss": 0.5446,
955
- "step": 135
956
- },
957
- {
958
- "epoch": 2.125,
959
- "grad_norm": 0.7211042046546936,
960
- "learning_rate": 4.2451377647738985e-05,
961
- "loss": 0.5153,
962
- "step": 136
963
- },
964
- {
965
- "epoch": 2.140625,
966
- "grad_norm": 0.7623264193534851,
967
- "learning_rate": 4.108564324530626e-05,
968
- "loss": 0.5706,
969
- "step": 137
970
- },
971
- {
972
- "epoch": 2.15625,
973
- "grad_norm": 0.7787522077560425,
974
- "learning_rate": 3.973653636207437e-05,
975
- "loss": 0.666,
976
- "step": 138
977
- },
978
- {
979
- "epoch": 2.171875,
980
- "grad_norm": 0.6728441119194031,
981
- "learning_rate": 3.840443775923365e-05,
982
- "loss": 0.5425,
983
- "step": 139
984
- },
985
- {
986
- "epoch": 2.1875,
987
- "grad_norm": 0.7324855327606201,
988
- "learning_rate": 3.70897233976927e-05,
989
- "loss": 0.5939,
990
- "step": 140
991
- },
992
- {
993
- "epoch": 2.203125,
994
- "grad_norm": 0.7769740223884583,
995
- "learning_rate": 3.5792764331970185e-05,
996
- "loss": 0.6202,
997
- "step": 141
998
- },
999
- {
1000
- "epoch": 2.21875,
1001
- "grad_norm": 0.767734169960022,
1002
- "learning_rate": 3.45139266054715e-05,
1003
- "loss": 0.5615,
1004
- "step": 142
1005
- },
1006
- {
1007
- "epoch": 2.234375,
1008
- "grad_norm": 0.751020073890686,
1009
- "learning_rate": 3.325357114717933e-05,
1010
- "loss": 0.575,
1011
- "step": 143
1012
- },
1013
- {
1014
- "epoch": 2.25,
1015
- "grad_norm": 0.8365825414657593,
1016
- "learning_rate": 3.2012053669788135e-05,
1017
- "loss": 0.601,
1018
- "step": 144
1019
- },
1020
- {
1021
- "epoch": 2.265625,
1022
- "grad_norm": 0.8232370615005493,
1023
- "learning_rate": 3.078972456931053e-05,
1024
- "loss": 0.5701,
1025
- "step": 145
1026
- },
1027
- {
1028
- "epoch": 2.28125,
1029
- "grad_norm": 0.8521618843078613,
1030
- "learning_rate": 2.9586928826184325e-05,
1031
- "loss": 0.5909,
1032
- "step": 146
1033
- },
1034
- {
1035
- "epoch": 2.296875,
1036
- "grad_norm": 0.7347127795219421,
1037
- "learning_rate": 2.8404005907908082e-05,
1038
- "loss": 0.5318,
1039
- "step": 147
1040
- },
1041
- {
1042
- "epoch": 2.3125,
1043
- "grad_norm": 0.7891002893447876,
1044
- "learning_rate": 2.724128967323234e-05,
1045
- "loss": 0.6265,
1046
- "step": 148
1047
- },
1048
- {
1049
- "epoch": 2.328125,
1050
- "grad_norm": 0.7826504111289978,
1051
- "learning_rate": 2.6099108277934103e-05,
1052
- "loss": 0.5961,
1053
- "step": 149
1054
- },
1055
- {
1056
- "epoch": 2.34375,
1057
- "grad_norm": 0.7414980530738831,
1058
- "learning_rate": 2.497778408220073e-05,
1059
- "loss": 0.582,
1060
- "step": 150
1061
- },
1062
- {
1063
- "epoch": 2.359375,
1064
- "grad_norm": 0.8048492074012756,
1065
- "learning_rate": 2.3877633559649505e-05,
1066
- "loss": 0.5838,
1067
- "step": 151
1068
- },
1069
- {
1070
- "epoch": 2.375,
1071
- "grad_norm": 0.7372992634773254,
1072
- "learning_rate": 2.2798967208008804e-05,
1073
- "loss": 0.5877,
1074
- "step": 152
1075
- },
1076
- {
1077
- "epoch": 2.390625,
1078
- "grad_norm": 0.7260457873344421,
1079
- "learning_rate": 2.1742089461485504e-05,
1080
- "loss": 0.5532,
1081
- "step": 153
1082
- },
1083
- {
1084
- "epoch": 2.40625,
1085
- "grad_norm": 0.7830657362937927,
1086
- "learning_rate": 2.070729860484396e-05,
1087
- "loss": 0.5761,
1088
- "step": 154
1089
- },
1090
- {
1091
- "epoch": 2.421875,
1092
- "grad_norm": 0.7378799915313721,
1093
- "learning_rate": 1.9694886689220594e-05,
1094
- "loss": 0.5386,
1095
- "step": 155
1096
- },
1097
- {
1098
- "epoch": 2.4375,
1099
- "grad_norm": 0.7603411078453064,
1100
- "learning_rate": 1.870513944969743e-05,
1101
- "loss": 0.5384,
1102
- "step": 156
1103
- },
1104
- {
1105
- "epoch": 2.453125,
1106
- "grad_norm": 0.7275040745735168,
1107
- "learning_rate": 1.7738336224658882e-05,
1108
- "loss": 0.5791,
1109
- "step": 157
1110
- },
1111
- {
1112
- "epoch": 2.46875,
1113
- "grad_norm": 0.7369095683097839,
1114
- "learning_rate": 1.6794749876953188e-05,
1115
- "loss": 0.5816,
1116
- "step": 158
1117
- },
1118
- {
1119
- "epoch": 2.484375,
1120
- "grad_norm": 0.741500735282898,
1121
- "learning_rate": 1.587464671688187e-05,
1122
- "loss": 0.5534,
1123
- "step": 159
1124
- },
1125
- {
1126
- "epoch": 2.5,
1127
- "grad_norm": 0.6802282929420471,
1128
- "learning_rate": 1.4978286427038601e-05,
1129
- "loss": 0.5091,
1130
- "step": 160
1131
- },
1132
- {
1133
- "epoch": 2.515625,
1134
- "grad_norm": 0.7638409733772278,
1135
- "learning_rate": 1.4105921989018111e-05,
1136
- "loss": 0.6298,
1137
- "step": 161
1138
- },
1139
- {
1140
- "epoch": 2.53125,
1141
- "grad_norm": 0.7683125138282776,
1142
- "learning_rate": 1.325779961201703e-05,
1143
- "loss": 0.551,
1144
- "step": 162
1145
- },
1146
- {
1147
- "epoch": 2.546875,
1148
- "grad_norm": 0.7749364376068115,
1149
- "learning_rate": 1.2434158663345552e-05,
1150
- "loss": 0.5891,
1151
- "step": 163
1152
- },
1153
- {
1154
- "epoch": 2.5625,
1155
- "grad_norm": 0.6828547716140747,
1156
- "learning_rate": 1.1635231600870333e-05,
1157
- "loss": 0.5396,
1158
- "step": 164
1159
- },
1160
- {
1161
- "epoch": 2.578125,
1162
- "grad_norm": 0.7164223194122314,
1163
- "learning_rate": 1.086124390740757e-05,
1164
- "loss": 0.5755,
1165
- "step": 165
1166
- },
1167
- {
1168
- "epoch": 2.59375,
1169
- "grad_norm": 0.7504003643989563,
1170
- "learning_rate": 1.0112414027084261e-05,
1171
- "loss": 0.6454,
1172
- "step": 166
1173
- },
1174
- {
1175
- "epoch": 2.609375,
1176
- "grad_norm": 0.7125520706176758,
1177
- "learning_rate": 9.388953303686588e-06,
1178
- "loss": 0.5439,
1179
- "step": 167
1180
- },
1181
- {
1182
- "epoch": 2.625,
1183
- "grad_norm": 0.7042016386985779,
1184
- "learning_rate": 8.691065921011687e-06,
1185
- "loss": 0.5675,
1186
- "step": 168
1187
- },
1188
- {
1189
- "epoch": 2.640625,
1190
- "grad_norm": 0.7672927975654602,
1191
- "learning_rate": 8.018948845240538e-06,
1192
- "loss": 0.5189,
1193
- "step": 169
1194
- },
1195
- {
1196
- "epoch": 2.65625,
1197
- "grad_norm": 0.701722264289856,
1198
- "learning_rate": 7.372791769347842e-06,
1199
- "loss": 0.5917,
1200
- "step": 170
1201
- },
1202
- {
1203
- "epoch": 2.671875,
1204
- "grad_norm": 0.7036768198013306,
1205
- "learning_rate": 6.75277705956443e-06,
1206
- "loss": 0.5691,
1207
- "step": 171
1208
- },
1209
- {
1210
- "epoch": 2.6875,
1211
- "grad_norm": 0.7126287817955017,
1212
- "learning_rate": 6.159079703907822e-06,
1213
- "loss": 0.5656,
1214
- "step": 172
1215
- },
1216
- {
1217
- "epoch": 2.703125,
1218
- "grad_norm": 0.7400851249694824,
1219
- "learning_rate": 5.59186726279497e-06,
1220
- "loss": 0.573,
1221
- "step": 173
1222
- },
1223
- {
1224
- "epoch": 2.71875,
1225
- "grad_norm": 0.7629361152648926,
1226
- "learning_rate": 5.051299821751254e-06,
1227
- "loss": 0.5996,
1228
- "step": 174
1229
- },
1230
- {
1231
- "epoch": 2.734375,
1232
- "grad_norm": 0.7034224271774292,
1233
- "learning_rate": 4.537529946229368e-06,
1234
- "loss": 0.5706,
1235
- "step": 175
1236
- },
1237
- {
1238
- "epoch": 2.75,
1239
- "grad_norm": 0.7580721974372864,
1240
- "learning_rate": 4.050702638550275e-06,
1241
- "loss": 0.5697,
1242
- "step": 176
1243
- },
1244
- {
1245
- "epoch": 2.765625,
1246
- "grad_norm": 0.7541771531105042,
1247
- "learning_rate": 3.590955296979037e-06,
1248
- "loss": 0.5489,
1249
- "step": 177
1250
- },
1251
- {
1252
- "epoch": 2.78125,
1253
- "grad_norm": 0.7310436964035034,
1254
- "learning_rate": 3.1584176769466346e-06,
1255
- "loss": 0.5879,
1256
- "step": 178
1257
- },
1258
- {
1259
- "epoch": 2.796875,
1260
- "grad_norm": 0.7326951622962952,
1261
- "learning_rate": 2.7532118544287276e-06,
1262
- "loss": 0.5475,
1263
- "step": 179
1264
- },
1265
- {
1266
- "epoch": 2.8125,
1267
- "grad_norm": 0.7719696164131165,
1268
- "learning_rate": 2.3754521914919668e-06,
1269
- "loss": 0.6063,
1270
- "step": 180
1271
- },
1272
- {
1273
- "epoch": 2.828125,
1274
- "grad_norm": 0.8477444648742676,
1275
- "learning_rate": 2.0252453040173647e-06,
1276
- "loss": 0.5877,
1277
- "step": 181
1278
- },
1279
- {
1280
- "epoch": 2.84375,
1281
- "grad_norm": 0.7376056909561157,
1282
- "learning_rate": 1.7026900316098215e-06,
1283
- "loss": 0.5247,
1284
- "step": 182
1285
- },
1286
- {
1287
- "epoch": 2.859375,
1288
- "grad_norm": 0.734314501285553,
1289
- "learning_rate": 1.407877409702496e-06,
1290
- "loss": 0.5798,
1291
- "step": 183
1292
- },
1293
- {
1294
- "epoch": 2.875,
1295
- "grad_norm": 0.7004963755607605,
1296
- "learning_rate": 1.1408906438636236e-06,
1297
- "loss": 0.4904,
1298
- "step": 184
1299
- },
1300
- {
1301
- "epoch": 2.890625,
1302
- "grad_norm": 0.7169924378395081,
1303
- "learning_rate": 9.018050863132565e-07,
1304
- "loss": 0.5529,
1305
- "step": 185
1306
- },
1307
- {
1308
- "epoch": 2.90625,
1309
- "grad_norm": 0.7529072165489197,
1310
- "learning_rate": 6.906882146565096e-07,
1311
- "loss": 0.5226,
1312
- "step": 186
1313
- },
1314
- {
1315
- "epoch": 2.921875,
1316
- "grad_norm": 0.762661337852478,
1317
- "learning_rate": 5.075996128391158e-07,
1318
- "loss": 0.6149,
1319
- "step": 187
1320
- },
1321
- {
1322
- "epoch": 2.9375,
1323
- "grad_norm": 0.7445721626281738,
1324
- "learning_rate": 3.525909543310002e-07,
1325
- "loss": 0.6561,
1326
- "step": 188
1327
- },
1328
- {
1329
- "epoch": 2.953125,
1330
- "grad_norm": 0.7027716040611267,
1331
- "learning_rate": 2.2570598754237947e-07,
1332
- "loss": 0.54,
1333
- "step": 189
1334
- },
1335
- {
1336
- "epoch": 2.96875,
1337
- "grad_norm": 0.7662133574485779,
1338
- "learning_rate": 1.2698052347649426e-07,
1339
- "loss": 0.5341,
1340
- "step": 190
1341
- },
1342
- {
1343
- "epoch": 2.984375,
1344
- "grad_norm": 0.7629265785217285,
1345
- "learning_rate": 5.644242562264923e-08,
1346
- "loss": 0.5647,
1347
- "step": 191
1348
- },
1349
- {
1350
- "epoch": 3.0,
1351
- "grad_norm": 0.8505879044532776,
1352
- "learning_rate": 1.4111602092226062e-08,
1353
- "loss": 0.5023,
1354
- "step": 192
1355
- }
1356
- ],
1357
- "logging_steps": 1,
1358
- "max_steps": 192,
1359
- "num_input_tokens_seen": 0,
1360
- "num_train_epochs": 3,
1361
- "save_steps": 500,
1362
- "stateful_callbacks": {
1363
- "TrainerControl": {
1364
- "args": {
1365
- "should_epoch_stop": false,
1366
- "should_evaluate": false,
1367
- "should_log": false,
1368
- "should_save": true,
1369
- "should_training_stop": true
1370
- },
1371
- "attributes": {}
1372
- }
1373
- },
1374
- "total_flos": 4.417588075387945e+17,
1375
- "train_batch_size": 8,
1376
- "trial_name": null,
1377
- "trial_params": null
1378
- }