Sabbir772 commited on
Commit
dfcfac0
·
verified ·
1 Parent(s): b642bc9

Delete banglat5_bn_sy/trainer_state.json

Browse files
Files changed (1) hide show
  1. banglat5_bn_sy/trainer_state.json +0 -1700
banglat5_bn_sy/trainer_state.json DELETED
@@ -1,1700 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 14.0,
6
- "eval_steps": 500,
7
- "global_step": 22274,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.06285355122564425,
14
- "grad_norm": 608.9674682617188,
15
- "learning_rate": 4.844437460716531e-05,
16
- "loss": 14.2524,
17
- "step": 100
18
- },
19
- {
20
- "epoch": 0.1257071024512885,
21
- "grad_norm": 34.65327453613281,
22
- "learning_rate": 4.6873035826524205e-05,
23
- "loss": 10.3562,
24
- "step": 200
25
- },
26
- {
27
- "epoch": 0.18856065367693275,
28
- "grad_norm": 21.24808120727539,
29
- "learning_rate": 4.5301697045883096e-05,
30
- "loss": 7.8551,
31
- "step": 300
32
- },
33
- {
34
- "epoch": 0.251414204902577,
35
- "grad_norm": 17.404918670654297,
36
- "learning_rate": 4.373035826524199e-05,
37
- "loss": 6.6346,
38
- "step": 400
39
- },
40
- {
41
- "epoch": 0.3142677561282212,
42
- "grad_norm": 12.713433265686035,
43
- "learning_rate": 4.2159019484600884e-05,
44
- "loss": 5.9755,
45
- "step": 500
46
- },
47
- {
48
- "epoch": 0.3771213073538655,
49
- "grad_norm": 10.050477981567383,
50
- "learning_rate": 4.0587680703959775e-05,
51
- "loss": 5.5595,
52
- "step": 600
53
- },
54
- {
55
- "epoch": 0.43997485857950974,
56
- "grad_norm": 13.709216117858887,
57
- "learning_rate": 3.9016341923318666e-05,
58
- "loss": 5.2853,
59
- "step": 700
60
- },
61
- {
62
- "epoch": 0.502828409805154,
63
- "grad_norm": 9.112940788269043,
64
- "learning_rate": 3.744500314267756e-05,
65
- "loss": 5.1417,
66
- "step": 800
67
- },
68
- {
69
- "epoch": 0.5656819610307983,
70
- "grad_norm": 8.267425537109375,
71
- "learning_rate": 3.587366436203646e-05,
72
- "loss": 4.9615,
73
- "step": 900
74
- },
75
- {
76
- "epoch": 0.6285355122564424,
77
- "grad_norm": 9.709076881408691,
78
- "learning_rate": 3.430232558139535e-05,
79
- "loss": 4.6907,
80
- "step": 1000
81
- },
82
- {
83
- "epoch": 0.6913890634820867,
84
- "grad_norm": 845.80859375,
85
- "learning_rate": 3.273098680075424e-05,
86
- "loss": 4.5456,
87
- "step": 1100
88
- },
89
- {
90
- "epoch": 0.754242614707731,
91
- "grad_norm": 5.943735599517822,
92
- "learning_rate": 3.115964802011313e-05,
93
- "loss": 4.4291,
94
- "step": 1200
95
- },
96
- {
97
- "epoch": 0.8170961659333752,
98
- "grad_norm": 5.8759989738464355,
99
- "learning_rate": 2.9588309239472034e-05,
100
- "loss": 4.3252,
101
- "step": 1300
102
- },
103
- {
104
- "epoch": 0.8799497171590195,
105
- "grad_norm": 14.995753288269043,
106
- "learning_rate": 2.8016970458830928e-05,
107
- "loss": 4.2586,
108
- "step": 1400
109
- },
110
- {
111
- "epoch": 0.9428032683846638,
112
- "grad_norm": 23.3351993560791,
113
- "learning_rate": 2.644563167818982e-05,
114
- "loss": 4.1372,
115
- "step": 1500
116
- },
117
- {
118
- "epoch": 1.0,
119
- "eval_loss": 3.215750217437744,
120
- "eval_runtime": 19.7611,
121
- "eval_samples_per_second": 48.479,
122
- "eval_steps_per_second": 6.073,
123
- "step": 1591
124
- },
125
- {
126
- "epoch": 1.005656819610308,
127
- "grad_norm": 8.584565162658691,
128
- "learning_rate": 2.4874292897548713e-05,
129
- "loss": 4.0272,
130
- "step": 1600
131
- },
132
- {
133
- "epoch": 1.0685103708359522,
134
- "grad_norm": 6.45043420791626,
135
- "learning_rate": 2.3302954116907607e-05,
136
- "loss": 3.9602,
137
- "step": 1700
138
- },
139
- {
140
- "epoch": 1.1313639220615965,
141
- "grad_norm": 6.03476095199585,
142
- "learning_rate": 2.17316153362665e-05,
143
- "loss": 3.9052,
144
- "step": 1800
145
- },
146
- {
147
- "epoch": 1.1942174732872408,
148
- "grad_norm": 5.746309280395508,
149
- "learning_rate": 2.0160276555625392e-05,
150
- "loss": 3.9282,
151
- "step": 1900
152
- },
153
- {
154
- "epoch": 1.2570710245128849,
155
- "grad_norm": 8.062549591064453,
156
- "learning_rate": 1.858893777498429e-05,
157
- "loss": 3.8096,
158
- "step": 2000
159
- },
160
- {
161
- "epoch": 1.3199245757385292,
162
- "grad_norm": 8.58310317993164,
163
- "learning_rate": 1.701759899434318e-05,
164
- "loss": 3.803,
165
- "step": 2100
166
- },
167
- {
168
- "epoch": 1.3827781269641735,
169
- "grad_norm": 7.599905490875244,
170
- "learning_rate": 1.5446260213702074e-05,
171
- "loss": 3.8381,
172
- "step": 2200
173
- },
174
- {
175
- "epoch": 1.4456316781898177,
176
- "grad_norm": 22.772512435913086,
177
- "learning_rate": 1.3874921433060969e-05,
178
- "loss": 3.6456,
179
- "step": 2300
180
- },
181
- {
182
- "epoch": 1.508485229415462,
183
- "grad_norm": 6.949570178985596,
184
- "learning_rate": 1.2303582652419863e-05,
185
- "loss": 3.7442,
186
- "step": 2400
187
- },
188
- {
189
- "epoch": 1.5713387806411063,
190
- "grad_norm": 5.7536821365356445,
191
- "learning_rate": 1.0732243871778757e-05,
192
- "loss": 3.691,
193
- "step": 2500
194
- },
195
- {
196
- "epoch": 1.6341923318667506,
197
- "grad_norm": 55.64060974121094,
198
- "learning_rate": 9.160905091137651e-06,
199
- "loss": 3.7461,
200
- "step": 2600
201
- },
202
- {
203
- "epoch": 1.6970458830923947,
204
- "grad_norm": 6.573077201843262,
205
- "learning_rate": 7.589566310496543e-06,
206
- "loss": 3.6186,
207
- "step": 2700
208
- },
209
- {
210
- "epoch": 1.759899434318039,
211
- "grad_norm": 8.615326881408691,
212
- "learning_rate": 6.018227529855437e-06,
213
- "loss": 3.6546,
214
- "step": 2800
215
- },
216
- {
217
- "epoch": 1.8227529855436833,
218
- "grad_norm": 6.359428405761719,
219
- "learning_rate": 4.446888749214331e-06,
220
- "loss": 3.5724,
221
- "step": 2900
222
- },
223
- {
224
- "epoch": 1.8856065367693273,
225
- "grad_norm": 5.5190582275390625,
226
- "learning_rate": 2.8755499685732243e-06,
227
- "loss": 3.6164,
228
- "step": 3000
229
- },
230
- {
231
- "epoch": 1.9484600879949716,
232
- "grad_norm": 5.9382004737854,
233
- "learning_rate": 1.3042111879321182e-06,
234
- "loss": 3.52,
235
- "step": 3100
236
- },
237
- {
238
- "epoch": 2.0,
239
- "eval_loss": 2.803544521331787,
240
- "eval_runtime": 19.8643,
241
- "eval_samples_per_second": 48.227,
242
- "eval_steps_per_second": 6.041,
243
- "step": 3182
244
- },
245
- {
246
- "epoch": 2.011313639220616,
247
- "grad_norm": 10.074417114257812,
248
- "learning_rate": 3.9946574481458206e-05,
249
- "loss": 3.5087,
250
- "step": 3200
251
- },
252
- {
253
- "epoch": 2.07416719044626,
254
- "grad_norm": 6.9990434646606445,
255
- "learning_rate": 3.963230672532998e-05,
256
- "loss": 3.5746,
257
- "step": 3300
258
- },
259
- {
260
- "epoch": 2.1370207416719045,
261
- "grad_norm": 6.968172073364258,
262
- "learning_rate": 3.931803896920176e-05,
263
- "loss": 3.6324,
264
- "step": 3400
265
- },
266
- {
267
- "epoch": 2.1998742928975488,
268
- "grad_norm": 179.99803161621094,
269
- "learning_rate": 3.9003771213073545e-05,
270
- "loss": 3.4072,
271
- "step": 3500
272
- },
273
- {
274
- "epoch": 2.262727844123193,
275
- "grad_norm": 59.86805725097656,
276
- "learning_rate": 3.868950345694532e-05,
277
- "loss": 3.391,
278
- "step": 3600
279
- },
280
- {
281
- "epoch": 2.3255813953488373,
282
- "grad_norm": 7.445355415344238,
283
- "learning_rate": 3.83752357008171e-05,
284
- "loss": 3.2032,
285
- "step": 3700
286
- },
287
- {
288
- "epoch": 2.3884349465744816,
289
- "grad_norm": 5.553746700286865,
290
- "learning_rate": 3.806096794468888e-05,
291
- "loss": 3.3644,
292
- "step": 3800
293
- },
294
- {
295
- "epoch": 2.4512884978001255,
296
- "grad_norm": 6.544325351715088,
297
- "learning_rate": 3.7746700188560656e-05,
298
- "loss": 3.1666,
299
- "step": 3900
300
- },
301
- {
302
- "epoch": 2.5141420490257698,
303
- "grad_norm": 7.863962650299072,
304
- "learning_rate": 3.7432432432432436e-05,
305
- "loss": 3.1982,
306
- "step": 4000
307
- },
308
- {
309
- "epoch": 2.576995600251414,
310
- "grad_norm": 10.573624610900879,
311
- "learning_rate": 3.7118164676304215e-05,
312
- "loss": 3.1336,
313
- "step": 4100
314
- },
315
- {
316
- "epoch": 2.6398491514770583,
317
- "grad_norm": 8.506134986877441,
318
- "learning_rate": 3.680389692017599e-05,
319
- "loss": 3.0191,
320
- "step": 4200
321
- },
322
- {
323
- "epoch": 2.7027027027027026,
324
- "grad_norm": 7.1274518966674805,
325
- "learning_rate": 3.6489629164047774e-05,
326
- "loss": 3.003,
327
- "step": 4300
328
- },
329
- {
330
- "epoch": 2.765556253928347,
331
- "grad_norm": 5.121671199798584,
332
- "learning_rate": 3.617536140791955e-05,
333
- "loss": 3.085,
334
- "step": 4400
335
- },
336
- {
337
- "epoch": 2.828409805153991,
338
- "grad_norm": 6.66685152053833,
339
- "learning_rate": 3.5861093651791327e-05,
340
- "loss": 3.0205,
341
- "step": 4500
342
- },
343
- {
344
- "epoch": 2.8912633563796355,
345
- "grad_norm": 8.410430908203125,
346
- "learning_rate": 3.5546825895663106e-05,
347
- "loss": 2.9611,
348
- "step": 4600
349
- },
350
- {
351
- "epoch": 2.95411690760528,
352
- "grad_norm": 6.266846179962158,
353
- "learning_rate": 3.5232558139534886e-05,
354
- "loss": 2.9299,
355
- "step": 4700
356
- },
357
- {
358
- "epoch": 3.0,
359
- "eval_loss": 2.3084471225738525,
360
- "eval_runtime": 20.0337,
361
- "eval_samples_per_second": 47.819,
362
- "eval_steps_per_second": 5.99,
363
- "step": 4773
364
- },
365
- {
366
- "epoch": 3.016970458830924,
367
- "grad_norm": 6.011202335357666,
368
- "learning_rate": 3.4918290383406665e-05,
369
- "loss": 2.886,
370
- "step": 4800
371
- },
372
- {
373
- "epoch": 3.0798240100565684,
374
- "grad_norm": 7.204225063323975,
375
- "learning_rate": 3.4604022627278445e-05,
376
- "loss": 2.8579,
377
- "step": 4900
378
- },
379
- {
380
- "epoch": 3.1426775612822127,
381
- "grad_norm": 10.316048622131348,
382
- "learning_rate": 3.428975487115022e-05,
383
- "loss": 2.8155,
384
- "step": 5000
385
- },
386
- {
387
- "epoch": 3.2055311125078565,
388
- "grad_norm": 6.55385684967041,
389
- "learning_rate": 3.3975487115022e-05,
390
- "loss": 2.8938,
391
- "step": 5100
392
- },
393
- {
394
- "epoch": 3.268384663733501,
395
- "grad_norm": 6.081694602966309,
396
- "learning_rate": 3.366121935889378e-05,
397
- "loss": 2.7344,
398
- "step": 5200
399
- },
400
- {
401
- "epoch": 3.331238214959145,
402
- "grad_norm": 8.186753273010254,
403
- "learning_rate": 3.3346951602765556e-05,
404
- "loss": 2.7899,
405
- "step": 5300
406
- },
407
- {
408
- "epoch": 3.3940917661847894,
409
- "grad_norm": 7.425989627838135,
410
- "learning_rate": 3.3032683846637335e-05,
411
- "loss": 2.7317,
412
- "step": 5400
413
- },
414
- {
415
- "epoch": 3.4569453174104336,
416
- "grad_norm": 5.459439277648926,
417
- "learning_rate": 3.2718416090509115e-05,
418
- "loss": 2.6456,
419
- "step": 5500
420
- },
421
- {
422
- "epoch": 3.519798868636078,
423
- "grad_norm": 5.077919006347656,
424
- "learning_rate": 3.2404148334380894e-05,
425
- "loss": 2.6816,
426
- "step": 5600
427
- },
428
- {
429
- "epoch": 3.5826524198617222,
430
- "grad_norm": 5.81939172744751,
431
- "learning_rate": 3.2089880578252674e-05,
432
- "loss": 2.64,
433
- "step": 5700
434
- },
435
- {
436
- "epoch": 3.6455059710873665,
437
- "grad_norm": 39.74727249145508,
438
- "learning_rate": 3.177561282212445e-05,
439
- "loss": 2.6725,
440
- "step": 5800
441
- },
442
- {
443
- "epoch": 3.708359522313011,
444
- "grad_norm": 5.927642345428467,
445
- "learning_rate": 3.1461345065996226e-05,
446
- "loss": 2.5395,
447
- "step": 5900
448
- },
449
- {
450
- "epoch": 3.771213073538655,
451
- "grad_norm": 5.984442710876465,
452
- "learning_rate": 3.114707730986801e-05,
453
- "loss": 2.6297,
454
- "step": 6000
455
- },
456
- {
457
- "epoch": 3.834066624764299,
458
- "grad_norm": 5.258358478546143,
459
- "learning_rate": 3.083280955373979e-05,
460
- "loss": 2.6291,
461
- "step": 6100
462
- },
463
- {
464
- "epoch": 3.8969201759899432,
465
- "grad_norm": 5.7379937171936035,
466
- "learning_rate": 3.0518541797611565e-05,
467
- "loss": 2.6116,
468
- "step": 6200
469
- },
470
- {
471
- "epoch": 3.9597737272155875,
472
- "grad_norm": 5.038835048675537,
473
- "learning_rate": 3.0204274041483344e-05,
474
- "loss": 2.6695,
475
- "step": 6300
476
- },
477
- {
478
- "epoch": 4.0,
479
- "eval_loss": 2.0932769775390625,
480
- "eval_runtime": 20.0417,
481
- "eval_samples_per_second": 47.8,
482
- "eval_steps_per_second": 5.988,
483
- "step": 6364
484
- },
485
- {
486
- "epoch": 4.022627278441232,
487
- "grad_norm": 7.459395885467529,
488
- "learning_rate": 2.9890006285355127e-05,
489
- "loss": 2.6404,
490
- "step": 6400
491
- },
492
- {
493
- "epoch": 4.085480829666876,
494
- "grad_norm": 6.721461296081543,
495
- "learning_rate": 2.9575738529226903e-05,
496
- "loss": 2.4614,
497
- "step": 6500
498
- },
499
- {
500
- "epoch": 4.14833438089252,
501
- "grad_norm": 6.69769287109375,
502
- "learning_rate": 2.9261470773098683e-05,
503
- "loss": 2.457,
504
- "step": 6600
505
- },
506
- {
507
- "epoch": 4.211187932118165,
508
- "grad_norm": 5.306356906890869,
509
- "learning_rate": 2.894720301697046e-05,
510
- "loss": 2.513,
511
- "step": 6700
512
- },
513
- {
514
- "epoch": 4.274041483343809,
515
- "grad_norm": 5.425265312194824,
516
- "learning_rate": 2.8632935260842235e-05,
517
- "loss": 2.5467,
518
- "step": 6800
519
- },
520
- {
521
- "epoch": 4.336895034569453,
522
- "grad_norm": 4.722207546234131,
523
- "learning_rate": 2.8318667504714018e-05,
524
- "loss": 2.3467,
525
- "step": 6900
526
- },
527
- {
528
- "epoch": 4.3997485857950975,
529
- "grad_norm": 4.346086502075195,
530
- "learning_rate": 2.8004399748585797e-05,
531
- "loss": 2.5098,
532
- "step": 7000
533
- },
534
- {
535
- "epoch": 4.462602137020742,
536
- "grad_norm": 7.4684319496154785,
537
- "learning_rate": 2.7690131992457573e-05,
538
- "loss": 2.4396,
539
- "step": 7100
540
- },
541
- {
542
- "epoch": 4.525455688246386,
543
- "grad_norm": 5.709039688110352,
544
- "learning_rate": 2.7375864236329353e-05,
545
- "loss": 2.4688,
546
- "step": 7200
547
- },
548
- {
549
- "epoch": 4.58830923947203,
550
- "grad_norm": 4.952858924865723,
551
- "learning_rate": 2.7061596480201136e-05,
552
- "loss": 2.3643,
553
- "step": 7300
554
- },
555
- {
556
- "epoch": 4.651162790697675,
557
- "grad_norm": 6.68017578125,
558
- "learning_rate": 2.6747328724072912e-05,
559
- "loss": 2.4242,
560
- "step": 7400
561
- },
562
- {
563
- "epoch": 4.714016341923319,
564
- "grad_norm": 3.584669828414917,
565
- "learning_rate": 2.6433060967944688e-05,
566
- "loss": 2.4552,
567
- "step": 7500
568
- },
569
- {
570
- "epoch": 4.776869893148963,
571
- "grad_norm": 5.264488220214844,
572
- "learning_rate": 2.6118793211816468e-05,
573
- "loss": 2.4232,
574
- "step": 7600
575
- },
576
- {
577
- "epoch": 4.8397234443746076,
578
- "grad_norm": 4.609414100646973,
579
- "learning_rate": 2.580452545568825e-05,
580
- "loss": 2.4418,
581
- "step": 7700
582
- },
583
- {
584
- "epoch": 4.902576995600251,
585
- "grad_norm": 4.986881256103516,
586
- "learning_rate": 2.5490257699560027e-05,
587
- "loss": 2.4065,
588
- "step": 7800
589
- },
590
- {
591
- "epoch": 4.965430546825896,
592
- "grad_norm": 4.9718098640441895,
593
- "learning_rate": 2.5175989943431806e-05,
594
- "loss": 2.4589,
595
- "step": 7900
596
- },
597
- {
598
- "epoch": 5.0,
599
- "eval_loss": 1.984979271888733,
600
- "eval_runtime": 20.0353,
601
- "eval_samples_per_second": 47.816,
602
- "eval_steps_per_second": 5.989,
603
- "step": 7955
604
- },
605
- {
606
- "epoch": 5.0282840980515395,
607
- "grad_norm": 5.2526750564575195,
608
- "learning_rate": 2.4861722187303586e-05,
609
- "loss": 2.2708,
610
- "step": 8000
611
- },
612
- {
613
- "epoch": 5.091137649277184,
614
- "grad_norm": 5.312747001647949,
615
- "learning_rate": 2.454745443117536e-05,
616
- "loss": 2.3068,
617
- "step": 8100
618
- },
619
- {
620
- "epoch": 5.153991200502828,
621
- "grad_norm": 7.204046726226807,
622
- "learning_rate": 2.423318667504714e-05,
623
- "loss": 2.3729,
624
- "step": 8200
625
- },
626
- {
627
- "epoch": 5.216844751728472,
628
- "grad_norm": 4.8044753074646,
629
- "learning_rate": 2.391891891891892e-05,
630
- "loss": 2.3501,
631
- "step": 8300
632
- },
633
- {
634
- "epoch": 5.279698302954117,
635
- "grad_norm": 6.9473185539245605,
636
- "learning_rate": 2.3604651162790697e-05,
637
- "loss": 2.3398,
638
- "step": 8400
639
- },
640
- {
641
- "epoch": 5.342551854179761,
642
- "grad_norm": 4.014726161956787,
643
- "learning_rate": 2.3290383406662476e-05,
644
- "loss": 2.2938,
645
- "step": 8500
646
- },
647
- {
648
- "epoch": 5.405405405405405,
649
- "grad_norm": 6.722488880157471,
650
- "learning_rate": 2.2976115650534256e-05,
651
- "loss": 2.2354,
652
- "step": 8600
653
- },
654
- {
655
- "epoch": 5.4682589566310495,
656
- "grad_norm": 5.856524467468262,
657
- "learning_rate": 2.2661847894406035e-05,
658
- "loss": 2.2757,
659
- "step": 8700
660
- },
661
- {
662
- "epoch": 5.531112507856694,
663
- "grad_norm": 4.9930644035339355,
664
- "learning_rate": 2.234758013827781e-05,
665
- "loss": 2.2586,
666
- "step": 8800
667
- },
668
- {
669
- "epoch": 5.593966059082338,
670
- "grad_norm": 5.49005126953125,
671
- "learning_rate": 2.2033312382149594e-05,
672
- "loss": 2.3155,
673
- "step": 8900
674
- },
675
- {
676
- "epoch": 5.656819610307982,
677
- "grad_norm": 8.850517272949219,
678
- "learning_rate": 2.171904462602137e-05,
679
- "loss": 2.2841,
680
- "step": 9000
681
- },
682
- {
683
- "epoch": 5.719673161533627,
684
- "grad_norm": 5.094405651092529,
685
- "learning_rate": 2.140477686989315e-05,
686
- "loss": 2.3147,
687
- "step": 9100
688
- },
689
- {
690
- "epoch": 5.782526712759271,
691
- "grad_norm": 4.709909439086914,
692
- "learning_rate": 2.109050911376493e-05,
693
- "loss": 2.1584,
694
- "step": 9200
695
- },
696
- {
697
- "epoch": 5.845380263984915,
698
- "grad_norm": 4.1693525314331055,
699
- "learning_rate": 2.077624135763671e-05,
700
- "loss": 2.2396,
701
- "step": 9300
702
- },
703
- {
704
- "epoch": 5.90823381521056,
705
- "grad_norm": 6.800940036773682,
706
- "learning_rate": 2.0461973601508485e-05,
707
- "loss": 2.301,
708
- "step": 9400
709
- },
710
- {
711
- "epoch": 5.971087366436204,
712
- "grad_norm": 7.419278144836426,
713
- "learning_rate": 2.0147705845380265e-05,
714
- "loss": 2.3142,
715
- "step": 9500
716
- },
717
- {
718
- "epoch": 6.0,
719
- "eval_loss": 1.905881643295288,
720
- "eval_runtime": 20.0332,
721
- "eval_samples_per_second": 47.821,
722
- "eval_steps_per_second": 5.99,
723
- "step": 9546
724
- },
725
- {
726
- "epoch": 6.033940917661848,
727
- "grad_norm": 4.217894077301025,
728
- "learning_rate": 1.9833438089252044e-05,
729
- "loss": 2.1013,
730
- "step": 9600
731
- },
732
- {
733
- "epoch": 6.096794468887492,
734
- "grad_norm": 5.345584869384766,
735
- "learning_rate": 1.9519170333123824e-05,
736
- "loss": 2.2714,
737
- "step": 9700
738
- },
739
- {
740
- "epoch": 6.159648020113137,
741
- "grad_norm": 5.364700794219971,
742
- "learning_rate": 1.92049025769956e-05,
743
- "loss": 2.2381,
744
- "step": 9800
745
- },
746
- {
747
- "epoch": 6.222501571338781,
748
- "grad_norm": 4.380568504333496,
749
- "learning_rate": 1.8890634820867383e-05,
750
- "loss": 2.1527,
751
- "step": 9900
752
- },
753
- {
754
- "epoch": 6.285355122564425,
755
- "grad_norm": 6.300790309906006,
756
- "learning_rate": 1.857636706473916e-05,
757
- "loss": 2.1771,
758
- "step": 10000
759
- },
760
- {
761
- "epoch": 6.348208673790069,
762
- "grad_norm": 5.757110118865967,
763
- "learning_rate": 1.8262099308610938e-05,
764
- "loss": 2.1695,
765
- "step": 10100
766
- },
767
- {
768
- "epoch": 6.411062225015713,
769
- "grad_norm": 4.908361434936523,
770
- "learning_rate": 1.7947831552482718e-05,
771
- "loss": 2.1056,
772
- "step": 10200
773
- },
774
- {
775
- "epoch": 6.473915776241357,
776
- "grad_norm": 5.048102378845215,
777
- "learning_rate": 1.7633563796354494e-05,
778
- "loss": 2.2112,
779
- "step": 10300
780
- },
781
- {
782
- "epoch": 6.536769327467002,
783
- "grad_norm": 8.040143013000488,
784
- "learning_rate": 1.7319296040226273e-05,
785
- "loss": 2.0298,
786
- "step": 10400
787
- },
788
- {
789
- "epoch": 6.599622878692646,
790
- "grad_norm": 5.15581750869751,
791
- "learning_rate": 1.7005028284098053e-05,
792
- "loss": 2.1224,
793
- "step": 10500
794
- },
795
- {
796
- "epoch": 6.66247642991829,
797
- "grad_norm": 4.935842514038086,
798
- "learning_rate": 1.6690760527969832e-05,
799
- "loss": 2.0772,
800
- "step": 10600
801
- },
802
- {
803
- "epoch": 6.725329981143934,
804
- "grad_norm": 5.487718105316162,
805
- "learning_rate": 1.637649277184161e-05,
806
- "loss": 2.2552,
807
- "step": 10700
808
- },
809
- {
810
- "epoch": 6.788183532369579,
811
- "grad_norm": 5.713748455047607,
812
- "learning_rate": 1.6062225015713388e-05,
813
- "loss": 2.1358,
814
- "step": 10800
815
- },
816
- {
817
- "epoch": 6.851037083595223,
818
- "grad_norm": 4.882757186889648,
819
- "learning_rate": 1.5747957259585168e-05,
820
- "loss": 2.1613,
821
- "step": 10900
822
- },
823
- {
824
- "epoch": 6.913890634820867,
825
- "grad_norm": 5.634950637817383,
826
- "learning_rate": 1.5433689503456947e-05,
827
- "loss": 2.2567,
828
- "step": 11000
829
- },
830
- {
831
- "epoch": 6.976744186046512,
832
- "grad_norm": 5.634829044342041,
833
- "learning_rate": 1.5119421747328725e-05,
834
- "loss": 2.1283,
835
- "step": 11100
836
- },
837
- {
838
- "epoch": 7.0,
839
- "eval_loss": 1.84635591506958,
840
- "eval_runtime": 20.0367,
841
- "eval_samples_per_second": 47.812,
842
- "eval_steps_per_second": 5.989,
843
- "step": 11137
844
- },
845
- {
846
- "epoch": 7.039597737272156,
847
- "grad_norm": 5.635861873626709,
848
- "learning_rate": 1.4805153991200504e-05,
849
- "loss": 2.0938,
850
- "step": 11200
851
- },
852
- {
853
- "epoch": 7.1024512884978,
854
- "grad_norm": 5.214977741241455,
855
- "learning_rate": 1.4490886235072282e-05,
856
- "loss": 2.062,
857
- "step": 11300
858
- },
859
- {
860
- "epoch": 7.1653048397234445,
861
- "grad_norm": 7.498839855194092,
862
- "learning_rate": 1.4176618478944062e-05,
863
- "loss": 2.1292,
864
- "step": 11400
865
- },
866
- {
867
- "epoch": 7.228158390949089,
868
- "grad_norm": 5.83459997177124,
869
- "learning_rate": 1.386235072281584e-05,
870
- "loss": 2.0796,
871
- "step": 11500
872
- },
873
- {
874
- "epoch": 7.291011942174733,
875
- "grad_norm": 3.8935282230377197,
876
- "learning_rate": 1.3548082966687619e-05,
877
- "loss": 2.1414,
878
- "step": 11600
879
- },
880
- {
881
- "epoch": 7.353865493400377,
882
- "grad_norm": 5.774020671844482,
883
- "learning_rate": 1.3233815210559397e-05,
884
- "loss": 2.145,
885
- "step": 11700
886
- },
887
- {
888
- "epoch": 7.416719044626022,
889
- "grad_norm": 128.24192810058594,
890
- "learning_rate": 1.2919547454431178e-05,
891
- "loss": 2.0242,
892
- "step": 11800
893
- },
894
- {
895
- "epoch": 7.479572595851666,
896
- "grad_norm": 4.4846367835998535,
897
- "learning_rate": 1.2605279698302954e-05,
898
- "loss": 2.0936,
899
- "step": 11900
900
- },
901
- {
902
- "epoch": 7.54242614707731,
903
- "grad_norm": 5.091222763061523,
904
- "learning_rate": 1.2291011942174734e-05,
905
- "loss": 2.1988,
906
- "step": 12000
907
- },
908
- {
909
- "epoch": 7.6052796983029545,
910
- "grad_norm": 3.3482093811035156,
911
- "learning_rate": 1.1976744186046513e-05,
912
- "loss": 2.1323,
913
- "step": 12100
914
- },
915
- {
916
- "epoch": 7.668133249528598,
917
- "grad_norm": 5.329409599304199,
918
- "learning_rate": 1.1662476429918291e-05,
919
- "loss": 2.0587,
920
- "step": 12200
921
- },
922
- {
923
- "epoch": 7.730986800754243,
924
- "grad_norm": 7.584386348724365,
925
- "learning_rate": 1.134820867379007e-05,
926
- "loss": 2.1341,
927
- "step": 12300
928
- },
929
- {
930
- "epoch": 7.7938403519798864,
931
- "grad_norm": 5.996345520019531,
932
- "learning_rate": 1.1033940917661848e-05,
933
- "loss": 2.1108,
934
- "step": 12400
935
- },
936
- {
937
- "epoch": 7.856693903205531,
938
- "grad_norm": 6.1731648445129395,
939
- "learning_rate": 1.0719673161533628e-05,
940
- "loss": 2.1218,
941
- "step": 12500
942
- },
943
- {
944
- "epoch": 7.919547454431175,
945
- "grad_norm": 5.414481163024902,
946
- "learning_rate": 1.0405405405405407e-05,
947
- "loss": 2.028,
948
- "step": 12600
949
- },
950
- {
951
- "epoch": 7.982401005656819,
952
- "grad_norm": 7.198294639587402,
953
- "learning_rate": 1.0091137649277185e-05,
954
- "loss": 2.0489,
955
- "step": 12700
956
- },
957
- {
958
- "epoch": 8.0,
959
- "eval_loss": 1.8111430406570435,
960
- "eval_runtime": 20.0666,
961
- "eval_samples_per_second": 47.741,
962
- "eval_steps_per_second": 5.98,
963
- "step": 12728
964
- },
965
- {
966
- "epoch": 8.045254556882464,
967
- "grad_norm": 6.677022933959961,
968
- "learning_rate": 9.776869893148963e-06,
969
- "loss": 2.0814,
970
- "step": 12800
971
- },
972
- {
973
- "epoch": 8.108108108108109,
974
- "grad_norm": 5.1916728019714355,
975
- "learning_rate": 9.46260213702074e-06,
976
- "loss": 2.119,
977
- "step": 12900
978
- },
979
- {
980
- "epoch": 8.170961659333752,
981
- "grad_norm": 6.04162073135376,
982
- "learning_rate": 9.14833438089252e-06,
983
- "loss": 2.0058,
984
- "step": 13000
985
- },
986
- {
987
- "epoch": 8.233815210559397,
988
- "grad_norm": 4.764267444610596,
989
- "learning_rate": 8.8340666247643e-06,
990
- "loss": 2.0113,
991
- "step": 13100
992
- },
993
- {
994
- "epoch": 8.29666876178504,
995
- "grad_norm": 5.77971887588501,
996
- "learning_rate": 8.519798868636078e-06,
997
- "loss": 2.0392,
998
- "step": 13200
999
- },
1000
- {
1001
- "epoch": 8.359522313010686,
1002
- "grad_norm": 5.698218822479248,
1003
- "learning_rate": 8.205531112507857e-06,
1004
- "loss": 2.107,
1005
- "step": 13300
1006
- },
1007
- {
1008
- "epoch": 8.42237586423633,
1009
- "grad_norm": 5.236012935638428,
1010
- "learning_rate": 7.891263356379635e-06,
1011
- "loss": 2.0829,
1012
- "step": 13400
1013
- },
1014
- {
1015
- "epoch": 8.485229415461973,
1016
- "grad_norm": 4.379955291748047,
1017
- "learning_rate": 7.576995600251414e-06,
1018
- "loss": 1.9321,
1019
- "step": 13500
1020
- },
1021
- {
1022
- "epoch": 8.548082966687618,
1023
- "grad_norm": 6.034859657287598,
1024
- "learning_rate": 7.262727844123193e-06,
1025
- "loss": 2.1013,
1026
- "step": 13600
1027
- },
1028
- {
1029
- "epoch": 8.610936517913261,
1030
- "grad_norm": 5.320705413818359,
1031
- "learning_rate": 6.948460087994972e-06,
1032
- "loss": 2.0543,
1033
- "step": 13700
1034
- },
1035
- {
1036
- "epoch": 8.673790069138906,
1037
- "grad_norm": 5.735895156860352,
1038
- "learning_rate": 6.634192331866751e-06,
1039
- "loss": 2.0594,
1040
- "step": 13800
1041
- },
1042
- {
1043
- "epoch": 8.73664362036455,
1044
- "grad_norm": 4.845800876617432,
1045
- "learning_rate": 6.31992457573853e-06,
1046
- "loss": 1.9402,
1047
- "step": 13900
1048
- },
1049
- {
1050
- "epoch": 8.799497171590195,
1051
- "grad_norm": 4.628382682800293,
1052
- "learning_rate": 6.0056568196103085e-06,
1053
- "loss": 1.9937,
1054
- "step": 14000
1055
- },
1056
- {
1057
- "epoch": 8.862350722815838,
1058
- "grad_norm": 4.747410774230957,
1059
- "learning_rate": 5.691389063482086e-06,
1060
- "loss": 2.0654,
1061
- "step": 14100
1062
- },
1063
- {
1064
- "epoch": 8.925204274041484,
1065
- "grad_norm": 4.694166660308838,
1066
- "learning_rate": 5.377121307353866e-06,
1067
- "loss": 2.0523,
1068
- "step": 14200
1069
- },
1070
- {
1071
- "epoch": 8.988057825267127,
1072
- "grad_norm": 6.711084365844727,
1073
- "learning_rate": 5.0628535512256445e-06,
1074
- "loss": 1.9856,
1075
- "step": 14300
1076
- },
1077
- {
1078
- "epoch": 9.0,
1079
- "eval_loss": 1.7920939922332764,
1080
- "eval_runtime": 20.0378,
1081
- "eval_samples_per_second": 47.81,
1082
- "eval_steps_per_second": 5.989,
1083
- "step": 14319
1084
- },
1085
- {
1086
- "epoch": 9.050911376492772,
1087
- "grad_norm": 6.053162097930908,
1088
- "learning_rate": 4.748585795097423e-06,
1089
- "loss": 2.0392,
1090
- "step": 14400
1091
- },
1092
- {
1093
- "epoch": 9.113764927718416,
1094
- "grad_norm": 4.806529521942139,
1095
- "learning_rate": 4.434318038969202e-06,
1096
- "loss": 2.0308,
1097
- "step": 14500
1098
- },
1099
- {
1100
- "epoch": 9.17661847894406,
1101
- "grad_norm": 4.725819110870361,
1102
- "learning_rate": 4.1200502828409805e-06,
1103
- "loss": 2.0441,
1104
- "step": 14600
1105
- },
1106
- {
1107
- "epoch": 9.239472030169704,
1108
- "grad_norm": 4.637420177459717,
1109
- "learning_rate": 3.8057825267127596e-06,
1110
- "loss": 2.0061,
1111
- "step": 14700
1112
- },
1113
- {
1114
- "epoch": 9.30232558139535,
1115
- "grad_norm": 6.441665172576904,
1116
- "learning_rate": 3.4915147705845382e-06,
1117
- "loss": 2.1299,
1118
- "step": 14800
1119
- },
1120
- {
1121
- "epoch": 9.365179132620993,
1122
- "grad_norm": 3.506943941116333,
1123
- "learning_rate": 3.1772470144563173e-06,
1124
- "loss": 1.9443,
1125
- "step": 14900
1126
- },
1127
- {
1128
- "epoch": 9.428032683846638,
1129
- "grad_norm": 8.454822540283203,
1130
- "learning_rate": 2.8629792583280956e-06,
1131
- "loss": 2.0327,
1132
- "step": 15000
1133
- },
1134
- {
1135
- "epoch": 9.490886235072281,
1136
- "grad_norm": 5.021187782287598,
1137
- "learning_rate": 2.5487115021998746e-06,
1138
- "loss": 1.9839,
1139
- "step": 15100
1140
- },
1141
- {
1142
- "epoch": 9.553739786297927,
1143
- "grad_norm": 6.3962016105651855,
1144
- "learning_rate": 2.234443746071653e-06,
1145
- "loss": 2.0604,
1146
- "step": 15200
1147
- },
1148
- {
1149
- "epoch": 9.61659333752357,
1150
- "grad_norm": 5.531436443328857,
1151
- "learning_rate": 1.920175989943432e-06,
1152
- "loss": 2.0168,
1153
- "step": 15300
1154
- },
1155
- {
1156
- "epoch": 9.679446888749215,
1157
- "grad_norm": 4.300695896148682,
1158
- "learning_rate": 1.6059082338152106e-06,
1159
- "loss": 1.9994,
1160
- "step": 15400
1161
- },
1162
- {
1163
- "epoch": 9.742300439974859,
1164
- "grad_norm": 3.102018356323242,
1165
- "learning_rate": 1.2916404776869893e-06,
1166
- "loss": 2.0441,
1167
- "step": 15500
1168
- },
1169
- {
1170
- "epoch": 9.805153991200502,
1171
- "grad_norm": 4.91919469833374,
1172
- "learning_rate": 9.773727215587681e-07,
1173
- "loss": 1.9584,
1174
- "step": 15600
1175
- },
1176
- {
1177
- "epoch": 9.868007542426147,
1178
- "grad_norm": 4.21737813949585,
1179
- "learning_rate": 6.631049654305469e-07,
1180
- "loss": 2.0019,
1181
- "step": 15700
1182
- },
1183
- {
1184
- "epoch": 9.930861093651792,
1185
- "grad_norm": 4.098769187927246,
1186
- "learning_rate": 3.4883720930232557e-07,
1187
- "loss": 2.0121,
1188
- "step": 15800
1189
- },
1190
- {
1191
- "epoch": 9.993714644877436,
1192
- "grad_norm": 4.722096920013428,
1193
- "learning_rate": 3.456945317410434e-08,
1194
- "loss": 2.0196,
1195
- "step": 15900
1196
- },
1197
- {
1198
- "epoch": 10.0,
1199
- "eval_loss": 1.787421464920044,
1200
- "eval_runtime": 20.0243,
1201
- "eval_samples_per_second": 47.842,
1202
- "eval_steps_per_second": 5.993,
1203
- "step": 15910
1204
- },
1205
- {
1206
- "epoch": 10.056568196103079,
1207
- "grad_norm": 3.8331987857818604,
1208
- "learning_rate": 2.4860150848522942e-05,
1209
- "loss": 2.0388,
1210
- "step": 16000
1211
- },
1212
- {
1213
- "epoch": 10.119421747328724,
1214
- "grad_norm": 3.9292027950286865,
1215
- "learning_rate": 2.4703016970458832e-05,
1216
- "loss": 2.0913,
1217
- "step": 16100
1218
- },
1219
- {
1220
- "epoch": 10.182275298554368,
1221
- "grad_norm": 5.124855995178223,
1222
- "learning_rate": 2.454588309239472e-05,
1223
- "loss": 2.0452,
1224
- "step": 16200
1225
- },
1226
- {
1227
- "epoch": 10.245128849780013,
1228
- "grad_norm": 5.743933200836182,
1229
- "learning_rate": 2.438874921433061e-05,
1230
- "loss": 2.016,
1231
- "step": 16300
1232
- },
1233
- {
1234
- "epoch": 10.307982401005656,
1235
- "grad_norm": 6.4510931968688965,
1236
- "learning_rate": 2.42316153362665e-05,
1237
- "loss": 1.9785,
1238
- "step": 16400
1239
- },
1240
- {
1241
- "epoch": 10.370835952231301,
1242
- "grad_norm": 6.550465106964111,
1243
- "learning_rate": 2.4074481458202387e-05,
1244
- "loss": 1.9912,
1245
- "step": 16500
1246
- },
1247
- {
1248
- "epoch": 10.433689503456945,
1249
- "grad_norm": 5.37285852432251,
1250
- "learning_rate": 2.391734758013828e-05,
1251
- "loss": 2.0549,
1252
- "step": 16600
1253
- },
1254
- {
1255
- "epoch": 10.49654305468259,
1256
- "grad_norm": 5.4893412590026855,
1257
- "learning_rate": 2.376021370207417e-05,
1258
- "loss": 1.9434,
1259
- "step": 16700
1260
- },
1261
- {
1262
- "epoch": 10.559396605908233,
1263
- "grad_norm": 4.316259384155273,
1264
- "learning_rate": 2.3603079824010057e-05,
1265
- "loss": 1.8413,
1266
- "step": 16800
1267
- },
1268
- {
1269
- "epoch": 10.622250157133879,
1270
- "grad_norm": 3.4342756271362305,
1271
- "learning_rate": 2.3445945945945946e-05,
1272
- "loss": 1.9312,
1273
- "step": 16900
1274
- },
1275
- {
1276
- "epoch": 10.685103708359522,
1277
- "grad_norm": 5.680815696716309,
1278
- "learning_rate": 2.3288812067881836e-05,
1279
- "loss": 1.9678,
1280
- "step": 17000
1281
- },
1282
- {
1283
- "epoch": 10.747957259585167,
1284
- "grad_norm": 6.04569149017334,
1285
- "learning_rate": 2.3131678189817726e-05,
1286
- "loss": 2.0329,
1287
- "step": 17100
1288
- },
1289
- {
1290
- "epoch": 10.81081081081081,
1291
- "grad_norm": 9.336991310119629,
1292
- "learning_rate": 2.2974544311753616e-05,
1293
- "loss": 1.9575,
1294
- "step": 17200
1295
- },
1296
- {
1297
- "epoch": 10.873664362036456,
1298
- "grad_norm": 3.826447010040283,
1299
- "learning_rate": 2.2817410433689505e-05,
1300
- "loss": 1.9692,
1301
- "step": 17300
1302
- },
1303
- {
1304
- "epoch": 10.936517913262099,
1305
- "grad_norm": 4.134801387786865,
1306
- "learning_rate": 2.2660276555625392e-05,
1307
- "loss": 2.0406,
1308
- "step": 17400
1309
- },
1310
- {
1311
- "epoch": 10.999371464487744,
1312
- "grad_norm": 5.291431903839111,
1313
- "learning_rate": 2.2503142677561285e-05,
1314
- "loss": 1.9631,
1315
- "step": 17500
1316
- },
1317
- {
1318
- "epoch": 11.0,
1319
- "eval_loss": 1.7517410516738892,
1320
- "eval_runtime": 21.6572,
1321
- "eval_samples_per_second": 44.235,
1322
- "eval_steps_per_second": 5.541,
1323
- "step": 17501
1324
- },
1325
- {
1326
- "epoch": 11.062225015713388,
1327
- "grad_norm": 4.9575066566467285,
1328
- "learning_rate": 2.234600879949717e-05,
1329
- "loss": 1.9381,
1330
- "step": 17600
1331
- },
1332
- {
1333
- "epoch": 11.125078566939033,
1334
- "grad_norm": 12.871175765991211,
1335
- "learning_rate": 2.218887492143306e-05,
1336
- "loss": 1.8867,
1337
- "step": 17700
1338
- },
1339
- {
1340
- "epoch": 11.187932118164676,
1341
- "grad_norm": 4.3662519454956055,
1342
- "learning_rate": 2.203174104336895e-05,
1343
- "loss": 1.9713,
1344
- "step": 17800
1345
- },
1346
- {
1347
- "epoch": 11.250785669390321,
1348
- "grad_norm": 5.662289619445801,
1349
- "learning_rate": 2.187460716530484e-05,
1350
- "loss": 1.9188,
1351
- "step": 17900
1352
- },
1353
- {
1354
- "epoch": 11.313639220615965,
1355
- "grad_norm": 7.633818626403809,
1356
- "learning_rate": 2.171747328724073e-05,
1357
- "loss": 1.9142,
1358
- "step": 18000
1359
- },
1360
- {
1361
- "epoch": 11.376492771841608,
1362
- "grad_norm": 4.940028667449951,
1363
- "learning_rate": 2.156033940917662e-05,
1364
- "loss": 1.8697,
1365
- "step": 18100
1366
- },
1367
- {
1368
- "epoch": 11.439346323067253,
1369
- "grad_norm": 5.070211410522461,
1370
- "learning_rate": 2.1403205531112506e-05,
1371
- "loss": 1.9624,
1372
- "step": 18200
1373
- },
1374
- {
1375
- "epoch": 11.502199874292897,
1376
- "grad_norm": 7.409548282623291,
1377
- "learning_rate": 2.12460716530484e-05,
1378
- "loss": 1.9283,
1379
- "step": 18300
1380
- },
1381
- {
1382
- "epoch": 11.565053425518542,
1383
- "grad_norm": 6.541192531585693,
1384
- "learning_rate": 2.108893777498429e-05,
1385
- "loss": 1.9357,
1386
- "step": 18400
1387
- },
1388
- {
1389
- "epoch": 11.627906976744185,
1390
- "grad_norm": 5.941864967346191,
1391
- "learning_rate": 2.0931803896920176e-05,
1392
- "loss": 1.869,
1393
- "step": 18500
1394
- },
1395
- {
1396
- "epoch": 11.69076052796983,
1397
- "grad_norm": 9.418646812438965,
1398
- "learning_rate": 2.0774670018856065e-05,
1399
- "loss": 1.8518,
1400
- "step": 18600
1401
- },
1402
- {
1403
- "epoch": 11.753614079195474,
1404
- "grad_norm": 5.367152690887451,
1405
- "learning_rate": 2.061753614079196e-05,
1406
- "loss": 1.8945,
1407
- "step": 18700
1408
- },
1409
- {
1410
- "epoch": 11.81646763042112,
1411
- "grad_norm": 5.896432399749756,
1412
- "learning_rate": 2.0460402262727845e-05,
1413
- "loss": 1.8569,
1414
- "step": 18800
1415
- },
1416
- {
1417
- "epoch": 11.879321181646763,
1418
- "grad_norm": 6.137564182281494,
1419
- "learning_rate": 2.0303268384663735e-05,
1420
- "loss": 1.9179,
1421
- "step": 18900
1422
- },
1423
- {
1424
- "epoch": 11.942174732872408,
1425
- "grad_norm": 4.5933918952941895,
1426
- "learning_rate": 2.0146134506599625e-05,
1427
- "loss": 1.8941,
1428
- "step": 19000
1429
- },
1430
- {
1431
- "epoch": 12.0,
1432
- "eval_loss": 1.7062737941741943,
1433
- "eval_runtime": 21.7167,
1434
- "eval_samples_per_second": 44.114,
1435
- "eval_steps_per_second": 5.526,
1436
- "step": 19092
1437
- },
1438
- {
1439
- "epoch": 12.005028284098051,
1440
- "grad_norm": 5.298050880432129,
1441
- "learning_rate": 1.998900062853551e-05,
1442
- "loss": 1.8681,
1443
- "step": 19100
1444
- },
1445
- {
1446
- "epoch": 12.067881835323696,
1447
- "grad_norm": 7.001854419708252,
1448
- "learning_rate": 1.9831866750471404e-05,
1449
- "loss": 1.8377,
1450
- "step": 19200
1451
- },
1452
- {
1453
- "epoch": 12.13073538654934,
1454
- "grad_norm": 4.692386150360107,
1455
- "learning_rate": 1.9674732872407294e-05,
1456
- "loss": 1.8279,
1457
- "step": 19300
1458
- },
1459
- {
1460
- "epoch": 12.193588937774985,
1461
- "grad_norm": 6.864208221435547,
1462
- "learning_rate": 1.951759899434318e-05,
1463
- "loss": 1.8855,
1464
- "step": 19400
1465
- },
1466
- {
1467
- "epoch": 12.256442489000628,
1468
- "grad_norm": 3.883880853652954,
1469
- "learning_rate": 1.936046511627907e-05,
1470
- "loss": 1.84,
1471
- "step": 19500
1472
- },
1473
- {
1474
- "epoch": 12.319296040226273,
1475
- "grad_norm": 5.302524566650391,
1476
- "learning_rate": 1.920333123821496e-05,
1477
- "loss": 1.8791,
1478
- "step": 19600
1479
- },
1480
- {
1481
- "epoch": 12.382149591451917,
1482
- "grad_norm": 6.854051113128662,
1483
- "learning_rate": 1.904619736015085e-05,
1484
- "loss": 1.9189,
1485
- "step": 19700
1486
- },
1487
- {
1488
- "epoch": 12.445003142677562,
1489
- "grad_norm": 4.728283405303955,
1490
- "learning_rate": 1.888906348208674e-05,
1491
- "loss": 1.8903,
1492
- "step": 19800
1493
- },
1494
- {
1495
- "epoch": 12.507856693903205,
1496
- "grad_norm": 4.314347267150879,
1497
- "learning_rate": 1.8731929604022626e-05,
1498
- "loss": 1.8615,
1499
- "step": 19900
1500
- },
1501
- {
1502
- "epoch": 12.57071024512885,
1503
- "grad_norm": 3.873619318008423,
1504
- "learning_rate": 1.857479572595852e-05,
1505
- "loss": 1.8232,
1506
- "step": 20000
1507
- },
1508
- {
1509
- "epoch": 12.633563796354494,
1510
- "grad_norm": 6.445096969604492,
1511
- "learning_rate": 1.841766184789441e-05,
1512
- "loss": 1.7764,
1513
- "step": 20100
1514
- },
1515
- {
1516
- "epoch": 12.696417347580137,
1517
- "grad_norm": 4.258322715759277,
1518
- "learning_rate": 1.8260527969830295e-05,
1519
- "loss": 1.869,
1520
- "step": 20200
1521
- },
1522
- {
1523
- "epoch": 12.759270898805783,
1524
- "grad_norm": 7.782538414001465,
1525
- "learning_rate": 1.8103394091766185e-05,
1526
- "loss": 1.7986,
1527
- "step": 20300
1528
- },
1529
- {
1530
- "epoch": 12.822124450031426,
1531
- "grad_norm": 7.189488887786865,
1532
- "learning_rate": 1.7946260213702078e-05,
1533
- "loss": 1.8448,
1534
- "step": 20400
1535
- },
1536
- {
1537
- "epoch": 12.884978001257071,
1538
- "grad_norm": 5.59601354598999,
1539
- "learning_rate": 1.7789126335637964e-05,
1540
- "loss": 1.7924,
1541
- "step": 20500
1542
- },
1543
- {
1544
- "epoch": 12.947831552482715,
1545
- "grad_norm": 4.675200939178467,
1546
- "learning_rate": 1.7631992457573854e-05,
1547
- "loss": 1.8212,
1548
- "step": 20600
1549
- },
1550
- {
1551
- "epoch": 13.0,
1552
- "eval_loss": 1.6696668863296509,
1553
- "eval_runtime": 21.645,
1554
- "eval_samples_per_second": 44.26,
1555
- "eval_steps_per_second": 5.544,
1556
- "step": 20683
1557
- },
1558
- {
1559
- "epoch": 13.01068510370836,
1560
- "grad_norm": 3.3650217056274414,
1561
- "learning_rate": 1.7474858579509744e-05,
1562
- "loss": 1.6872,
1563
- "step": 20700
1564
- },
1565
- {
1566
- "epoch": 13.073538654934003,
1567
- "grad_norm": 6.4758219718933105,
1568
- "learning_rate": 1.731772470144563e-05,
1569
- "loss": 1.8029,
1570
- "step": 20800
1571
- },
1572
- {
1573
- "epoch": 13.136392206159648,
1574
- "grad_norm": 4.500367641448975,
1575
- "learning_rate": 1.7160590823381523e-05,
1576
- "loss": 1.8655,
1577
- "step": 20900
1578
- },
1579
- {
1580
- "epoch": 13.199245757385292,
1581
- "grad_norm": 5.369949817657471,
1582
- "learning_rate": 1.7003456945317413e-05,
1583
- "loss": 1.821,
1584
- "step": 21000
1585
- },
1586
- {
1587
- "epoch": 13.262099308610937,
1588
- "grad_norm": 4.84245491027832,
1589
- "learning_rate": 1.68463230672533e-05,
1590
- "loss": 1.7454,
1591
- "step": 21100
1592
- },
1593
- {
1594
- "epoch": 13.32495285983658,
1595
- "grad_norm": 4.510051727294922,
1596
- "learning_rate": 1.668918918918919e-05,
1597
- "loss": 1.8378,
1598
- "step": 21200
1599
- },
1600
- {
1601
- "epoch": 13.387806411062225,
1602
- "grad_norm": 5.163560390472412,
1603
- "learning_rate": 1.653205531112508e-05,
1604
- "loss": 1.7985,
1605
- "step": 21300
1606
- },
1607
- {
1608
- "epoch": 13.450659962287869,
1609
- "grad_norm": 4.454617023468018,
1610
- "learning_rate": 1.637492143306097e-05,
1611
- "loss": 1.8177,
1612
- "step": 21400
1613
- },
1614
- {
1615
- "epoch": 13.513513513513514,
1616
- "grad_norm": 3.672908067703247,
1617
- "learning_rate": 1.6217787554996858e-05,
1618
- "loss": 1.6908,
1619
- "step": 21500
1620
- },
1621
- {
1622
- "epoch": 13.576367064739157,
1623
- "grad_norm": 4.549923419952393,
1624
- "learning_rate": 1.6060653676932748e-05,
1625
- "loss": 1.7603,
1626
- "step": 21600
1627
- },
1628
- {
1629
- "epoch": 13.639220615964803,
1630
- "grad_norm": 5.733989715576172,
1631
- "learning_rate": 1.5903519798868638e-05,
1632
- "loss": 1.7689,
1633
- "step": 21700
1634
- },
1635
- {
1636
- "epoch": 13.702074167190446,
1637
- "grad_norm": 4.507519245147705,
1638
- "learning_rate": 1.5746385920804527e-05,
1639
- "loss": 1.7984,
1640
- "step": 21800
1641
- },
1642
- {
1643
- "epoch": 13.764927718416091,
1644
- "grad_norm": 4.713226795196533,
1645
- "learning_rate": 1.5589252042740414e-05,
1646
- "loss": 1.8011,
1647
- "step": 21900
1648
- },
1649
- {
1650
- "epoch": 13.827781269641735,
1651
- "grad_norm": 4.300686359405518,
1652
- "learning_rate": 1.5432118164676304e-05,
1653
- "loss": 1.7743,
1654
- "step": 22000
1655
- },
1656
- {
1657
- "epoch": 13.89063482086738,
1658
- "grad_norm": 4.702789306640625,
1659
- "learning_rate": 1.5274984286612197e-05,
1660
- "loss": 1.6903,
1661
- "step": 22100
1662
- },
1663
- {
1664
- "epoch": 13.953488372093023,
1665
- "grad_norm": 6.481640815734863,
1666
- "learning_rate": 1.5117850408548085e-05,
1667
- "loss": 1.822,
1668
- "step": 22200
1669
- },
1670
- {
1671
- "epoch": 14.0,
1672
- "eval_loss": 1.648952603340149,
1673
- "eval_runtime": 21.6512,
1674
- "eval_samples_per_second": 44.247,
1675
- "eval_steps_per_second": 5.542,
1676
- "step": 22274
1677
- }
1678
- ],
1679
- "logging_steps": 100,
1680
- "max_steps": 31820,
1681
- "num_input_tokens_seen": 0,
1682
- "num_train_epochs": 20,
1683
- "save_steps": 500,
1684
- "stateful_callbacks": {
1685
- "TrainerControl": {
1686
- "args": {
1687
- "should_epoch_stop": false,
1688
- "should_evaluate": false,
1689
- "should_log": false,
1690
- "should_save": true,
1691
- "should_training_stop": false
1692
- },
1693
- "attributes": {}
1694
- }
1695
- },
1696
- "total_flos": 3.0487459270754304e+16,
1697
- "train_batch_size": 8,
1698
- "trial_name": null,
1699
- "trial_params": null
1700
- }