File size: 24,453 Bytes
1aa6f21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.5675488430095608,
  "eval_steps": 1024,
  "global_step": 12288,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011823934229365849,
      "grad_norm": 1.1338618993759155,
      "learning_rate": 1.9615384615384617e-05,
      "loss": 10.4459,
      "step": 256
    },
    {
      "epoch": 0.023647868458731697,
      "grad_norm": 1.0851895809173584,
      "learning_rate": 3.930769230769231e-05,
      "loss": 7.9458,
      "step": 512
    },
    {
      "epoch": 0.03547180268809755,
      "grad_norm": 0.9216171503067017,
      "learning_rate": 4.999617095521894e-05,
      "loss": 5.6401,
      "step": 768
    },
    {
      "epoch": 0.047295736917463395,
      "grad_norm": 0.5787180066108704,
      "learning_rate": 4.9961092368776736e-05,
      "loss": 3.8256,
      "step": 1024
    },
    {
      "epoch": 0.047295736917463395,
      "eval_acr_loss": 0.9939580324305791,
      "eval_across_var": 0.003025606172010473,
      "eval_bleu": 0.5515564027779047,
      "eval_ce_loss": 2.3591716888288383,
      "eval_cos_loss": 0.9272929947125857,
      "eval_cov": 0.0706253400131992,
      "eval_cov_loss": 0.00802828647306861,
      "eval_global_var": 0.2767718589469178,
      "eval_loss": 2.832596059803549,
      "eval_mse_loss": 1.9087850182023767,
      "eval_per_var": 0.2680246013484589,
      "eval_within_var": 0.27379363795665845,
      "step": 1024
    },
    {
      "epoch": 0.047295736917463395,
      "eval_acr_loss": 0.9939580324305791,
      "eval_across_var": 0.003025606172010473,
      "eval_bleu": 0.5515564027779047,
      "eval_ce_loss": 2.3591716888288383,
      "eval_cos_loss": 0.9272929947125857,
      "eval_cov": 0.0706253400131992,
      "eval_cov_loss": 0.00802828647306861,
      "eval_global_var": 0.2767718589469178,
      "eval_loss": 2.832596059803549,
      "eval_mse_loss": 1.9087850182023767,
      "eval_per_var": 0.2680246013484589,
      "eval_runtime": 159.2542,
      "eval_samples_per_second": 175.776,
      "eval_steps_per_second": 2.75,
      "eval_within_var": 0.27379363795665845,
      "step": 1024
    },
    {
      "epoch": 0.05911967114682925,
      "grad_norm": 0.38924261927604675,
      "learning_rate": 4.988941132556799e-05,
      "loss": 2.7681,
      "step": 1280
    },
    {
      "epoch": 0.0709436053761951,
      "grad_norm": 0.3134535551071167,
      "learning_rate": 4.9781232937269974e-05,
      "loss": 2.1522,
      "step": 1536
    },
    {
      "epoch": 0.08276753960556095,
      "grad_norm": 0.2510242760181427,
      "learning_rate": 4.963671583455164e-05,
      "loss": 1.7487,
      "step": 1792
    },
    {
      "epoch": 0.09459147383492679,
      "grad_norm": 0.22504042088985443,
      "learning_rate": 4.945607193446079e-05,
      "loss": 1.4694,
      "step": 2048
    },
    {
      "epoch": 0.09459147383492679,
      "eval_acr_loss": 0.9901781947645423,
      "eval_across_var": 0.004923156680556261,
      "eval_bleu": 0.8030880203505176,
      "eval_ce_loss": 0.700762519825539,
      "eval_cos_loss": 0.7673689819634233,
      "eval_cov": 0.07080983897866723,
      "eval_cov_loss": 0.008125514746749917,
      "eval_global_var": 0.4061385202625571,
      "eval_loss": 1.1126885499856243,
      "eval_mse_loss": 1.6458245439616512,
      "eval_per_var": 0.3933824513056507,
      "eval_within_var": 0.4013502570592105,
      "step": 2048
    },
    {
      "epoch": 0.09459147383492679,
      "eval_acr_loss": 0.9901781947645423,
      "eval_across_var": 0.004923156680556261,
      "eval_bleu": 0.8030880203505176,
      "eval_ce_loss": 0.700762519825539,
      "eval_cos_loss": 0.7673689819634233,
      "eval_cov": 0.07080983897866723,
      "eval_cov_loss": 0.008125514746749917,
      "eval_global_var": 0.4061385202625571,
      "eval_loss": 1.1126885499856243,
      "eval_mse_loss": 1.6458245439616512,
      "eval_per_var": 0.3933824513056507,
      "eval_runtime": 155.1424,
      "eval_samples_per_second": 180.434,
      "eval_steps_per_second": 2.823,
      "eval_within_var": 0.4013502570592105,
      "step": 2048
    },
    {
      "epoch": 0.10641540806429264,
      "grad_norm": 0.18675386905670166,
      "learning_rate": 4.923956612967301e-05,
      "loss": 1.2664,
      "step": 2304
    },
    {
      "epoch": 0.1182393422936585,
      "grad_norm": 0.18214967846870422,
      "learning_rate": 4.898751590005826e-05,
      "loss": 1.1058,
      "step": 2560
    },
    {
      "epoch": 0.13006327652302435,
      "grad_norm": 0.15246237814426422,
      "learning_rate": 4.870029084713462e-05,
      "loss": 0.981,
      "step": 2816
    },
    {
      "epoch": 0.1418872107523902,
      "grad_norm": 0.1368647962808609,
      "learning_rate": 4.837831215209188e-05,
      "loss": 0.8816,
      "step": 3072
    },
    {
      "epoch": 0.1418872107523902,
      "eval_acr_loss": 0.9858010461613468,
      "eval_across_var": 0.007125071276481089,
      "eval_bleu": 0.8956566730262217,
      "eval_ce_loss": 0.32717657133460587,
      "eval_cos_loss": 0.6143105866974348,
      "eval_cov": 0.06988288496182934,
      "eval_cov_loss": 0.007913092302252032,
      "eval_global_var": 0.5146751926369864,
      "eval_loss": 0.6788079935938256,
      "eval_mse_loss": 1.3672495941593223,
      "eval_per_var": 0.4983278039383562,
      "eval_within_var": 0.5077701699516001,
      "step": 3072
    },
    {
      "epoch": 0.1418872107523902,
      "eval_acr_loss": 0.9858010461613468,
      "eval_across_var": 0.007125071276481089,
      "eval_bleu": 0.8956566730262217,
      "eval_ce_loss": 0.32717657133460587,
      "eval_cos_loss": 0.6143105866974348,
      "eval_cov": 0.06988288496182934,
      "eval_cov_loss": 0.007913092302252032,
      "eval_global_var": 0.5146751926369864,
      "eval_loss": 0.6788079935938256,
      "eval_mse_loss": 1.3672495941593223,
      "eval_per_var": 0.4983278039383562,
      "eval_runtime": 156.011,
      "eval_samples_per_second": 179.43,
      "eval_steps_per_second": 2.807,
      "eval_within_var": 0.5077701699516001,
      "step": 3072
    },
    {
      "epoch": 0.15371114498175603,
      "grad_norm": 0.13020840287208557,
      "learning_rate": 4.802205195817963e-05,
      "loss": 0.8019,
      "step": 3328
    },
    {
      "epoch": 0.1655350792111219,
      "grad_norm": 0.12300444394350052,
      "learning_rate": 4.763203267836576e-05,
      "loss": 0.7339,
      "step": 3584
    },
    {
      "epoch": 0.17735901344048774,
      "grad_norm": 0.10956571996212006,
      "learning_rate": 4.720882622928019e-05,
      "loss": 0.6774,
      "step": 3840
    },
    {
      "epoch": 0.18918294766985358,
      "grad_norm": 0.11182258278131485,
      "learning_rate": 4.675305319256765e-05,
      "loss": 0.6307,
      "step": 4096
    },
    {
      "epoch": 0.18918294766985358,
      "eval_acr_loss": 0.9774447257660296,
      "eval_across_var": 0.011342588644134536,
      "eval_bleu": 0.9352497637682005,
      "eval_ce_loss": 0.18979091641225226,
      "eval_cos_loss": 0.49574217472446563,
      "eval_cov": 0.06955429843571632,
      "eval_cov_loss": 0.007831381105490403,
      "eval_global_var": 0.6115789544092466,
      "eval_loss": 0.49388179734145127,
      "eval_mse_loss": 1.144643609110079,
      "eval_per_var": 0.592599529109589,
      "eval_within_var": 0.600530758568141,
      "step": 4096
    },
    {
      "epoch": 0.18918294766985358,
      "eval_acr_loss": 0.9774447257660296,
      "eval_across_var": 0.011342588644134536,
      "eval_bleu": 0.9352497637682005,
      "eval_ce_loss": 0.18979091641225226,
      "eval_cos_loss": 0.49574217472446563,
      "eval_cov": 0.06955429843571632,
      "eval_cov_loss": 0.007831381105490403,
      "eval_global_var": 0.6115789544092466,
      "eval_loss": 0.49388179734145127,
      "eval_mse_loss": 1.144643609110079,
      "eval_per_var": 0.592599529109589,
      "eval_runtime": 155.2844,
      "eval_samples_per_second": 180.269,
      "eval_steps_per_second": 2.821,
      "eval_within_var": 0.600530758568141,
      "step": 4096
    },
    {
      "epoch": 0.20100688189921945,
      "grad_norm": 0.1080719456076622,
      "learning_rate": 4.6265381904878854e-05,
      "loss": 0.588,
      "step": 4352
    },
    {
      "epoch": 0.2128308161285853,
      "grad_norm": 0.10819243639707565,
      "learning_rate": 4.57465274778347e-05,
      "loss": 0.5554,
      "step": 4608
    },
    {
      "epoch": 0.22465475035795113,
      "grad_norm": 0.1115206629037857,
      "learning_rate": 4.519725074940068e-05,
      "loss": 0.5198,
      "step": 4864
    },
    {
      "epoch": 0.236478684587317,
      "grad_norm": 0.1552964597940445,
      "learning_rate": 4.461835716820895e-05,
      "loss": 0.473,
      "step": 5120
    },
    {
      "epoch": 0.236478684587317,
      "eval_acr_loss": 0.11126784305088222,
      "eval_across_var": 0.680014660641483,
      "eval_bleu": 0.9523306149700821,
      "eval_ce_loss": 0.12964293614165967,
      "eval_cos_loss": 0.4240787292588247,
      "eval_cov": 0.07667060747538527,
      "eval_cov_loss": 0.010102802944969232,
      "eval_global_var": 1.6775805329623288,
      "eval_loss": 0.31928212995126365,
      "eval_mse_loss": 1.0156728980475909,
      "eval_per_var": 1.6460250784817352,
      "eval_within_var": 1.0011268127454471,
      "step": 5120
    },
    {
      "epoch": 0.236478684587317,
      "eval_acr_loss": 0.11126784305088222,
      "eval_across_var": 0.680014660641483,
      "eval_bleu": 0.9523306149700821,
      "eval_ce_loss": 0.12964293614165967,
      "eval_cos_loss": 0.4240787292588247,
      "eval_cov": 0.07667060747538527,
      "eval_cov_loss": 0.010102802944969232,
      "eval_global_var": 1.6775805329623288,
      "eval_loss": 0.31928212995126365,
      "eval_mse_loss": 1.0156728980475909,
      "eval_per_var": 1.6460250784817352,
      "eval_runtime": 154.2182,
      "eval_samples_per_second": 181.516,
      "eval_steps_per_second": 2.84,
      "eval_within_var": 1.0011268127454471,
      "step": 5120
    },
    {
      "epoch": 0.24830261881668284,
      "grad_norm": 0.11613737791776657,
      "learning_rate": 4.401069561246422e-05,
      "loss": 0.3958,
      "step": 5376
    },
    {
      "epoch": 0.2601265530460487,
      "grad_norm": 0.11101594567298889,
      "learning_rate": 4.337515714516545e-05,
      "loss": 0.3648,
      "step": 5632
    },
    {
      "epoch": 0.27195048727541454,
      "grad_norm": 0.14844343066215515,
      "learning_rate": 4.2712673707468434e-05,
      "loss": 0.3464,
      "step": 5888
    },
    {
      "epoch": 0.2837744215047804,
      "grad_norm": 0.10389428585767746,
      "learning_rate": 4.202421675210565e-05,
      "loss": 0.3281,
      "step": 6144
    },
    {
      "epoch": 0.2837744215047804,
      "eval_acr_loss": 0.015462962337612025,
      "eval_across_var": 0.9652468334866441,
      "eval_bleu": 0.9665450842704094,
      "eval_ce_loss": 0.08923274265882904,
      "eval_cos_loss": 0.35504293285276245,
      "eval_cov": 0.06710940844392123,
      "eval_cov_loss": 0.007327720047650884,
      "eval_global_var": 2.2822310216894977,
      "eval_loss": 0.24187510450408883,
      "eval_mse_loss": 0.8843358904803724,
      "eval_per_var": 2.3488914454908674,
      "eval_within_var": 1.3261088448572376,
      "step": 6144
    },
    {
      "epoch": 0.2837744215047804,
      "eval_acr_loss": 0.015462962337612025,
      "eval_across_var": 0.9652468334866441,
      "eval_bleu": 0.9665450842704094,
      "eval_ce_loss": 0.08923274265882904,
      "eval_cos_loss": 0.35504293285276245,
      "eval_cov": 0.06710940844392123,
      "eval_cov_loss": 0.007327720047650884,
      "eval_global_var": 2.2822310216894977,
      "eval_loss": 0.24187510450408883,
      "eval_mse_loss": 0.8843358904803724,
      "eval_per_var": 2.3488914454908674,
      "eval_runtime": 155.0579,
      "eval_samples_per_second": 180.533,
      "eval_steps_per_second": 2.825,
      "eval_within_var": 1.3261088448572376,
      "step": 6144
    },
    {
      "epoch": 0.2955983557341462,
      "grad_norm": 0.129238098859787,
      "learning_rate": 4.131079581886694e-05,
      "loss": 0.3099,
      "step": 6400
    },
    {
      "epoch": 0.30742228996351206,
      "grad_norm": 0.1061507984995842,
      "learning_rate": 4.057345705423016e-05,
      "loss": 0.2963,
      "step": 6656
    },
    {
      "epoch": 0.3192462241928779,
      "grad_norm": 0.10803277790546417,
      "learning_rate": 3.981328167731251e-05,
      "loss": 0.2854,
      "step": 6912
    },
    {
      "epoch": 0.3310701584222438,
      "grad_norm": 0.10297808796167374,
      "learning_rate": 3.9031384394391954e-05,
      "loss": 0.2709,
      "step": 7168
    },
    {
      "epoch": 0.3310701584222438,
      "eval_acr_loss": 0.014716924630300589,
      "eval_across_var": 0.9652062489834006,
      "eval_bleu": 0.9750850142716162,
      "eval_ce_loss": 0.0659905160653945,
      "eval_cos_loss": 0.31133458848413265,
      "eval_cov": 0.06627595805686358,
      "eval_cov_loss": 0.007171058822125537,
      "eval_global_var": 2.3926940639269407,
      "eval_loss": 0.20149783922793113,
      "eval_mse_loss": 0.8055339337211765,
      "eval_per_var": 2.513992936643836,
      "eval_within_var": 1.4371973874906427,
      "step": 7168
    },
    {
      "epoch": 0.3310701584222438,
      "eval_acr_loss": 0.014716924630300589,
      "eval_across_var": 0.9652062489834006,
      "eval_bleu": 0.9750850142716162,
      "eval_ce_loss": 0.0659905160653945,
      "eval_cos_loss": 0.31133458848413265,
      "eval_cov": 0.06627595805686358,
      "eval_cov_loss": 0.007171058822125537,
      "eval_global_var": 2.3926940639269407,
      "eval_loss": 0.20149783922793113,
      "eval_mse_loss": 0.8055339337211765,
      "eval_per_var": 2.513992936643836,
      "eval_runtime": 152.9262,
      "eval_samples_per_second": 183.049,
      "eval_steps_per_second": 2.864,
      "eval_within_var": 1.4371973874906427,
      "step": 7168
    },
    {
      "epoch": 0.34289409265160964,
      "grad_norm": 0.08976765722036362,
      "learning_rate": 3.822891176432382e-05,
      "loss": 0.2629,
      "step": 7424
    },
    {
      "epoch": 0.3547180268809755,
      "grad_norm": 0.25151142477989197,
      "learning_rate": 3.7407040517249335e-05,
      "loss": 0.2533,
      "step": 7680
    },
    {
      "epoch": 0.3665419611103413,
      "grad_norm": 0.09347163140773773,
      "learning_rate": 3.6566975829061614e-05,
      "loss": 0.2437,
      "step": 7936
    },
    {
      "epoch": 0.37836589533970716,
      "grad_norm": 0.15004394948482513,
      "learning_rate": 3.5709949554159355e-05,
      "loss": 0.2348,
      "step": 8192
    },
    {
      "epoch": 0.37836589533970716,
      "eval_acr_loss": 0.014583685287022457,
      "eval_across_var": 0.9538344581649728,
      "eval_bleu": 0.980561100967318,
      "eval_ce_loss": 0.05129089445454073,
      "eval_cos_loss": 0.28137742255104187,
      "eval_cov": 0.06562282614511987,
      "eval_cov_loss": 0.007066378377369482,
      "eval_global_var": 2.479759738869863,
      "eval_loss": 0.17527394500225102,
      "eval_mse_loss": 0.7551626324925793,
      "eval_per_var": 2.6612933433219177,
      "eval_within_var": 1.53611661745533,
      "step": 8192
    },
    {
      "epoch": 0.37836589533970716,
      "eval_acr_loss": 0.014583685287022457,
      "eval_across_var": 0.9538344581649728,
      "eval_bleu": 0.980561100967318,
      "eval_ce_loss": 0.05129089445454073,
      "eval_cos_loss": 0.28137742255104187,
      "eval_cov": 0.06562282614511987,
      "eval_cov_loss": 0.007066378377369482,
      "eval_global_var": 2.479759738869863,
      "eval_loss": 0.17527394500225102,
      "eval_mse_loss": 0.7551626324925793,
      "eval_per_var": 2.6612933433219177,
      "eval_runtime": 151.3473,
      "eval_samples_per_second": 184.959,
      "eval_steps_per_second": 2.894,
      "eval_within_var": 1.53611661745533,
      "step": 8192
    },
    {
      "epoch": 0.390189829569073,
      "grad_norm": 0.09314695745706558,
      "learning_rate": 3.483721841907964e-05,
      "loss": 0.2288,
      "step": 8448
    },
    {
      "epoch": 0.4020137637984389,
      "grad_norm": 0.09459128230810165,
      "learning_rate": 3.395006217965885e-05,
      "loss": 0.2225,
      "step": 8704
    },
    {
      "epoch": 0.41383769802780473,
      "grad_norm": 0.1038607731461525,
      "learning_rate": 3.3049781744423665e-05,
      "loss": 0.215,
      "step": 8960
    },
    {
      "epoch": 0.4256616322571706,
      "grad_norm": 0.06964848935604095,
      "learning_rate": 3.213769726696439e-05,
      "loss": 0.2103,
      "step": 9216
    },
    {
      "epoch": 0.4256616322571706,
      "eval_acr_loss": 0.01336662589730249,
      "eval_across_var": 0.987674045780478,
      "eval_bleu": 0.9840502134478613,
      "eval_ce_loss": 0.0413797390200708,
      "eval_cos_loss": 0.2597663049080056,
      "eval_cov": 0.0652730584688927,
      "eval_cov_loss": 0.007007736591494655,
      "eval_global_var": 2.6066415168378994,
      "eval_loss": 0.15707423451216254,
      "eval_mse_loss": 0.7215510081482804,
      "eval_per_var": 2.776688249143836,
      "eval_within_var": 1.6285416399507218,
      "step": 9216
    },
    {
      "epoch": 0.4256616322571706,
      "eval_acr_loss": 0.01336662589730249,
      "eval_across_var": 0.987674045780478,
      "eval_bleu": 0.9840502134478613,
      "eval_ce_loss": 0.0413797390200708,
      "eval_cos_loss": 0.2597663049080056,
      "eval_cov": 0.0652730584688927,
      "eval_cov_loss": 0.007007736591494655,
      "eval_global_var": 2.6066415168378994,
      "eval_loss": 0.15707423451216254,
      "eval_mse_loss": 0.7215510081482804,
      "eval_per_var": 2.776688249143836,
      "eval_runtime": 151.5839,
      "eval_samples_per_second": 184.67,
      "eval_steps_per_second": 2.889,
      "eval_within_var": 1.6285416399507218,
      "step": 9216
    },
    {
      "epoch": 0.4374855664865364,
      "grad_norm": 0.07443105429410934,
      "learning_rate": 3.121514621008757e-05,
      "loss": 0.2053,
      "step": 9472
    },
    {
      "epoch": 0.44930950071590225,
      "grad_norm": 0.08320944011211395,
      "learning_rate": 3.0283481384586697e-05,
      "loss": 0.2017,
      "step": 9728
    },
    {
      "epoch": 0.4611334349452681,
      "grad_norm": 0.1169746071100235,
      "learning_rate": 2.9344068965507027e-05,
      "loss": 0.1966,
      "step": 9984
    },
    {
      "epoch": 0.472957369174634,
      "grad_norm": 0.08953411877155304,
      "learning_rate": 2.840199155190943e-05,
      "loss": 0.1938,
      "step": 10240
    },
    {
      "epoch": 0.472957369174634,
      "eval_acr_loss": 0.012974254141045301,
      "eval_across_var": 0.9815338524781405,
      "eval_bleu": 0.9866441773938769,
      "eval_ce_loss": 0.034374653984745755,
      "eval_cos_loss": 0.24375769958648508,
      "eval_cov": 0.06485466107930223,
      "eval_cov_loss": 0.006919686747374668,
      "eval_global_var": 2.6645574700342465,
      "eval_loss": 0.1440824061359989,
      "eval_mse_loss": 0.6987405730981261,
      "eval_per_var": 2.866460652111872,
      "eval_within_var": 1.6929740287941886,
      "step": 10240
    },
    {
      "epoch": 0.472957369174634,
      "eval_acr_loss": 0.012974254141045301,
      "eval_across_var": 0.9815338524781405,
      "eval_bleu": 0.9866441773938769,
      "eval_ce_loss": 0.034374653984745755,
      "eval_cos_loss": 0.24375769958648508,
      "eval_cov": 0.06485466107930223,
      "eval_cov_loss": 0.006919686747374668,
      "eval_global_var": 2.6645574700342465,
      "eval_loss": 0.1440824061359989,
      "eval_mse_loss": 0.6987405730981261,
      "eval_per_var": 2.866460652111872,
      "eval_runtime": 151.0941,
      "eval_samples_per_second": 185.269,
      "eval_steps_per_second": 2.899,
      "eval_within_var": 1.6929740287941886,
      "step": 10240
    },
    {
      "epoch": 0.48478130340399983,
      "grad_norm": 0.08886408805847168,
      "learning_rate": 2.745124265175868e-05,
      "loss": 0.1892,
      "step": 10496
    },
    {
      "epoch": 0.49660523763336567,
      "grad_norm": 0.11508477479219437,
      "learning_rate": 2.6496899297412598e-05,
      "loss": 0.1853,
      "step": 10752
    },
    {
      "epoch": 0.5084291718627315,
      "grad_norm": 0.0753609761595726,
      "learning_rate": 2.554036091926675e-05,
      "loss": 0.1839,
      "step": 11008
    },
    {
      "epoch": 0.5202531060920974,
      "grad_norm": 0.09823817759752274,
      "learning_rate": 2.4583030166456618e-05,
      "loss": 0.18,
      "step": 11264
    },
    {
      "epoch": 0.5202531060920974,
      "eval_acr_loss": 0.013261525430046982,
      "eval_across_var": 1.0205278622505327,
      "eval_bleu": 0.9883516965111658,
      "eval_ce_loss": 0.029720396500880316,
      "eval_cos_loss": 0.23187202939840212,
      "eval_cov": 0.06469566310377425,
      "eval_cov_loss": 0.006903172251619569,
      "eval_global_var": 2.7817003781392695,
      "eval_loss": 0.13512653335248498,
      "eval_mse_loss": 0.6834642570040542,
      "eval_per_var": 3.0010434503424657,
      "eval_within_var": 1.7715753001165173,
      "step": 11264
    },
    {
      "epoch": 0.5202531060920974,
      "eval_acr_loss": 0.013261525430046982,
      "eval_across_var": 1.0205278622505327,
      "eval_bleu": 0.9883516965111658,
      "eval_ce_loss": 0.029720396500880316,
      "eval_cos_loss": 0.23187202939840212,
      "eval_cov": 0.06469566310377425,
      "eval_cov_loss": 0.006903172251619569,
      "eval_global_var": 2.7817003781392695,
      "eval_loss": 0.13512653335248498,
      "eval_mse_loss": 0.6834642570040542,
      "eval_per_var": 3.0010434503424657,
      "eval_runtime": 151.0663,
      "eval_samples_per_second": 185.303,
      "eval_steps_per_second": 2.899,
      "eval_within_var": 1.7715753001165173,
      "step": 11264
    },
    {
      "epoch": 0.5320770403214632,
      "grad_norm": 0.09726043790578842,
      "learning_rate": 2.3626310850040373e-05,
      "loss": 0.1772,
      "step": 11520
    },
    {
      "epoch": 0.5439009745508291,
      "grad_norm": 0.0948578342795372,
      "learning_rate": 2.2671605884477816e-05,
      "loss": 0.1754,
      "step": 11776
    },
    {
      "epoch": 0.5557249087801949,
      "grad_norm": 0.08570394665002823,
      "learning_rate": 2.1720315230424133e-05,
      "loss": 0.1733,
      "step": 12032
    },
    {
      "epoch": 0.5675488430095608,
      "grad_norm": 0.10034994781017303,
      "learning_rate": 2.0777519879097458e-05,
      "loss": 0.1722,
      "step": 12288
    },
    {
      "epoch": 0.5675488430095608,
      "eval_acr_loss": 0.01237716493560104,
      "eval_across_var": 0.997331017772901,
      "eval_bleu": 0.9896437649984566,
      "eval_ce_loss": 0.026328031107630222,
      "eval_cos_loss": 0.22285830515296493,
      "eval_cov": 0.064281376529502,
      "eval_cov_loss": 0.006827230591828761,
      "eval_global_var": 2.807202482876712,
      "eval_loss": 0.12841411385702217,
      "eval_mse_loss": 0.6729901853489549,
      "eval_per_var": 3.002407962328767,
      "eval_within_var": 1.8194298801356799,
      "step": 12288
    },
    {
      "epoch": 0.5675488430095608,
      "eval_acr_loss": 0.01237716493560104,
      "eval_across_var": 0.997331017772901,
      "eval_bleu": 0.9896437649984566,
      "eval_ce_loss": 0.026328031107630222,
      "eval_cos_loss": 0.22285830515296493,
      "eval_cov": 0.064281376529502,
      "eval_cov_loss": 0.006827230591828761,
      "eval_global_var": 2.807202482876712,
      "eval_loss": 0.12841411385702217,
      "eval_mse_loss": 0.6729901853489549,
      "eval_per_var": 3.002407962328767,
      "eval_runtime": 149.5452,
      "eval_samples_per_second": 187.188,
      "eval_steps_per_second": 2.929,
      "eval_within_var": 1.8194298801356799,
      "step": 12288
    }
  ],
  "logging_steps": 256,
  "max_steps": 21651,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1024,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}