File size: 23,361 Bytes
4ddc7a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.4854440062921542,
  "eval_steps": 1024,
  "global_step": 11264,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011032818324821687,
      "grad_norm": 0.10309942811727524,
      "learning_rate": 0.000498046875,
      "loss": 1.9074174165725708,
      "step": 256
    },
    {
      "epoch": 0.022065636649643373,
      "grad_norm": 0.2910110056400299,
      "learning_rate": 0.000998046875,
      "loss": 1.5273144245147705,
      "step": 512
    },
    {
      "epoch": 0.03309845497446506,
      "grad_norm": 0.3859289586544037,
      "learning_rate": 0.000999688448778502,
      "loss": 1.3800736665725708,
      "step": 768
    },
    {
      "epoch": 0.04413127329928675,
      "grad_norm": 0.5722110867500305,
      "learning_rate": 0.0009987492950653055,
      "loss": 1.342606544494629,
      "step": 1024
    },
    {
      "epoch": 0.04413127329928675,
      "eval_bleu": 0.9366650964401493,
      "eval_cos_loss": 0.4710617309440174,
      "eval_dec_loss": 0.11786629736169314,
      "eval_loss": 1.3323029561845987,
      "eval_mse2_loss": 0.1665979178824913,
      "eval_mse_loss": 1.3323029561845987,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5289382661329404,
      "flow/improvement_ratio": 0.8936813888010948,
      "flow/mag_ratio_mean": 0.5435932263382462,
      "flow/mag_ratio_std": 0.2489985737210906,
      "step": 1024
    },
    {
      "epoch": 0.04413127329928675,
      "eval_bleu": 0.9366650964401493,
      "eval_cos_loss": 0.4710617309440174,
      "eval_dec_loss": 0.11786629736169314,
      "eval_loss": 1.3323029561845987,
      "eval_mse2_loss": 0.1665979178824913,
      "eval_mse_loss": 1.3323029561845987,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 157.3375,
      "eval_samples_per_second": 190.673,
      "eval_steps_per_second": 2.981,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5289382661329404,
      "flow/improvement_ratio": 0.8936813888010948,
      "flow/mag_ratio_mean": 0.5435932263382462,
      "flow/mag_ratio_std": 0.2489985737210906,
      "step": 1024
    },
    {
      "epoch": 0.05516409162410843,
      "grad_norm": 0.6506242752075195,
      "learning_rate": 0.0009971837136430763,
      "loss": 1.3261979818344116,
      "step": 1280
    },
    {
      "epoch": 0.06619690994893011,
      "grad_norm": 0.6324401497840881,
      "learning_rate": 0.0009949936708776692,
      "loss": 1.3123514652252197,
      "step": 1536
    },
    {
      "epoch": 0.07722972827375181,
      "grad_norm": 1.1031574010849,
      "learning_rate": 0.0009921819174566252,
      "loss": 1.3050185441970825,
      "step": 1792
    },
    {
      "epoch": 0.0882625465985735,
      "grad_norm": 0.762417733669281,
      "learning_rate": 0.000988751984934317,
      "loss": 1.3001574277877808,
      "step": 2048
    },
    {
      "epoch": 0.0882625465985735,
      "eval_bleu": 0.938925796606621,
      "eval_cos_loss": 0.4579503086330032,
      "eval_dec_loss": 0.10506504188690867,
      "eval_loss": 1.2970875999820766,
      "eval_mse2_loss": 0.15707123614768229,
      "eval_mse_loss": 1.2970875999820766,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5420496905409197,
      "flow/improvement_ratio": 0.8918823948038667,
      "flow/mag_ratio_mean": 0.5503126610316702,
      "flow/mag_ratio_std": 0.25175602854823254,
      "step": 2048
    },
    {
      "epoch": 0.0882625465985735,
      "eval_bleu": 0.938925796606621,
      "eval_cos_loss": 0.4579503086330032,
      "eval_dec_loss": 0.10506504188690867,
      "eval_loss": 1.2970875999820766,
      "eval_mse2_loss": 0.15707123614768229,
      "eval_mse_loss": 1.2970875999820766,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 151.9416,
      "eval_samples_per_second": 197.444,
      "eval_steps_per_second": 3.087,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5420496905409197,
      "flow/improvement_ratio": 0.8918823948038667,
      "flow/mag_ratio_mean": 0.5503126610316702,
      "flow/mag_ratio_std": 0.25175602854823254,
      "step": 2048
    },
    {
      "epoch": 0.09929536492339518,
      "grad_norm": 0.39165085554122925,
      "learning_rate": 0.0009847081812963268,
      "loss": 1.2909460067749023,
      "step": 2304
    },
    {
      "epoch": 0.11032818324821686,
      "grad_norm": 0.6050369739532471,
      "learning_rate": 0.0009800555855486275,
      "loss": 1.291382908821106,
      "step": 2560
    },
    {
      "epoch": 0.12136100157303854,
      "grad_norm": 0.6340572237968445,
      "learning_rate": 0.0009748000413383664,
      "loss": 1.2860350608825684,
      "step": 2816
    },
    {
      "epoch": 0.13239381989786023,
      "grad_norm": 0.8046131134033203,
      "learning_rate": 0.0009689481496142604,
      "loss": 1.2806360721588135,
      "step": 3072
    },
    {
      "epoch": 0.13239381989786023,
      "eval_bleu": 0.9365596012238808,
      "eval_cos_loss": 0.4510079253075728,
      "eval_dec_loss": 0.1170106883853801,
      "eval_loss": 1.2785198518208094,
      "eval_mse2_loss": 0.15482012001372603,
      "eval_mse_loss": 1.2785198518208094,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5489920710703966,
      "flow/improvement_ratio": 0.895310169598187,
      "flow/mag_ratio_mean": 0.5600611698398712,
      "flow/mag_ratio_std": 0.2589119763326035,
      "step": 3072
    },
    {
      "epoch": 0.13239381989786023,
      "eval_bleu": 0.9365596012238808,
      "eval_cos_loss": 0.4510079253075728,
      "eval_dec_loss": 0.1170106883853801,
      "eval_loss": 1.2785198518208094,
      "eval_mse2_loss": 0.15482012001372603,
      "eval_mse_loss": 1.2785198518208094,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 150.2303,
      "eval_samples_per_second": 199.693,
      "eval_steps_per_second": 3.122,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5489920710703966,
      "flow/improvement_ratio": 0.895310169598187,
      "flow/mag_ratio_mean": 0.5600611698398712,
      "flow/mag_ratio_std": 0.2589119763326035,
      "step": 3072
    },
    {
      "epoch": 0.14342663822268192,
      "grad_norm": 0.7344346046447754,
      "learning_rate": 0.0009625072603358231,
      "loss": 1.277908444404602,
      "step": 3328
    },
    {
      "epoch": 0.15445945654750362,
      "grad_norm": 0.7456739544868469,
      "learning_rate": 0.0009554854632418371,
      "loss": 1.274967074394226,
      "step": 3584
    },
    {
      "epoch": 0.1654922748723253,
      "grad_norm": 0.528167724609375,
      "learning_rate": 0.000947891577689663,
      "loss": 1.2722811698913574,
      "step": 3840
    },
    {
      "epoch": 0.176525093197147,
      "grad_norm": 0.7374073266983032,
      "learning_rate": 0.0009397351415781539,
      "loss": 1.2716022729873657,
      "step": 4096
    },
    {
      "epoch": 0.176525093197147,
      "eval_bleu": 0.9383145863088955,
      "eval_cos_loss": 0.44795799712890755,
      "eval_dec_loss": 0.11301154795406597,
      "eval_loss": 1.2707049117159488,
      "eval_mse2_loss": 0.15204078735890927,
      "eval_mse_loss": 1.2707049117159488,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.552042003760714,
      "flow/improvement_ratio": 0.8948889724227157,
      "flow/mag_ratio_mean": 0.5576132778674047,
      "flow/mag_ratio_std": 0.25525683488672984,
      "step": 4096
    },
    {
      "epoch": 0.176525093197147,
      "eval_bleu": 0.9383145863088955,
      "eval_cos_loss": 0.44795799712890755,
      "eval_dec_loss": 0.11301154795406597,
      "eval_loss": 1.2707049117159488,
      "eval_mse2_loss": 0.15204078735890927,
      "eval_mse_loss": 1.2707049117159488,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 149.5476,
      "eval_samples_per_second": 200.605,
      "eval_steps_per_second": 3.136,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.552042003760714,
      "flow/improvement_ratio": 0.8948889724227157,
      "flow/mag_ratio_mean": 0.5576132778674047,
      "flow/mag_ratio_std": 0.25525683488672984,
      "step": 4096
    },
    {
      "epoch": 0.18755791152196866,
      "grad_norm": 1.123129963874817,
      "learning_rate": 0.000931026399368079,
      "loss": 1.2691912651062012,
      "step": 4352
    },
    {
      "epoch": 0.19859072984679035,
      "grad_norm": 0.49173882603645325,
      "learning_rate": 0.0009217762892151117,
      "loss": 1.26752769947052,
      "step": 4608
    },
    {
      "epoch": 0.20962354817161205,
      "grad_norm": 0.5665431618690491,
      "learning_rate": 0.0009119964292315354,
      "loss": 1.2669333219528198,
      "step": 4864
    },
    {
      "epoch": 0.22065636649643372,
      "grad_norm": 0.4946308732032776,
      "learning_rate": 0.0009016991028939279,
      "loss": 1.2646225690841675,
      "step": 5120
    },
    {
      "epoch": 0.22065636649643372,
      "eval_bleu": 0.9396675860722136,
      "eval_cos_loss": 0.44516199083724767,
      "eval_dec_loss": 0.10893038547893705,
      "eval_loss": 1.264682760879175,
      "eval_mse2_loss": 0.1498125367073108,
      "eval_mse_loss": 1.264682760879175,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5548380070657872,
      "flow/improvement_ratio": 0.8946977740665997,
      "flow/mag_ratio_mean": 0.5694006043456511,
      "flow/mag_ratio_std": 0.2655116878211625,
      "step": 5120
    },
    {
      "epoch": 0.22065636649643372,
      "eval_bleu": 0.9396675860722136,
      "eval_cos_loss": 0.44516199083724767,
      "eval_dec_loss": 0.10893038547893705,
      "eval_loss": 1.264682760879175,
      "eval_mse2_loss": 0.1498125367073108,
      "eval_mse_loss": 1.264682760879175,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 151.8799,
      "eval_samples_per_second": 197.524,
      "eval_steps_per_second": 3.088,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5548380070657872,
      "flow/improvement_ratio": 0.8946977740665997,
      "flow/mag_ratio_mean": 0.5694006043456511,
      "flow/mag_ratio_std": 0.2655116878211625,
      "step": 5120
    },
    {
      "epoch": 0.23168918482125542,
      "grad_norm": 0.5147830843925476,
      "learning_rate": 0.0008908972436151494,
      "loss": 1.261371374130249,
      "step": 5376
    },
    {
      "epoch": 0.2427220031460771,
      "grad_norm": 0.7221893668174744,
      "learning_rate": 0.0008796044185000127,
      "loss": 1.259010672569275,
      "step": 5632
    },
    {
      "epoch": 0.2537548214708988,
      "grad_norm": 0.6270182132720947,
      "learning_rate": 0.0008678348113050368,
      "loss": 1.2565613985061646,
      "step": 5888
    },
    {
      "epoch": 0.26478763979572045,
      "grad_norm": 0.3954711854457855,
      "learning_rate": 0.0008556032046236897,
      "loss": 1.258548378944397,
      "step": 6144
    },
    {
      "epoch": 0.26478763979572045,
      "eval_bleu": 0.9381239377332383,
      "eval_cos_loss": 0.4434889930524806,
      "eval_dec_loss": 0.11391587999226378,
      "eval_loss": 1.2588644528439812,
      "eval_mse2_loss": 0.15056055846181252,
      "eval_mse_loss": 1.2588644528439812,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5565110067568863,
      "flow/improvement_ratio": 0.8946461940625074,
      "flow/mag_ratio_mean": 0.5628604918146438,
      "flow/mag_ratio_std": 0.2606462057528974,
      "step": 6144
    },
    {
      "epoch": 0.26478763979572045,
      "eval_bleu": 0.9381239377332383,
      "eval_cos_loss": 0.4434889930524806,
      "eval_dec_loss": 0.11391587999226378,
      "eval_loss": 1.2588644528439812,
      "eval_mse2_loss": 0.15056055846181252,
      "eval_mse_loss": 1.2588644528439812,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 153.8457,
      "eval_samples_per_second": 195.001,
      "eval_steps_per_second": 3.049,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5565110067568863,
      "flow/improvement_ratio": 0.8946461940625074,
      "flow/mag_ratio_mean": 0.5628604918146438,
      "flow/mag_ratio_std": 0.2606462057528974,
      "step": 6144
    },
    {
      "epoch": 0.2758204581205422,
      "grad_norm": 0.8126729130744934,
      "learning_rate": 0.000842924961319492,
      "loss": 1.2565950155258179,
      "step": 6400
    },
    {
      "epoch": 0.28685327644536385,
      "grad_norm": 0.84797203540802,
      "learning_rate": 0.0008298160052303045,
      "loss": 1.2548315525054932,
      "step": 6656
    },
    {
      "epoch": 0.2978860947701855,
      "grad_norm": 0.561568021774292,
      "learning_rate": 0.0008162928011680314,
      "loss": 1.2526129484176636,
      "step": 6912
    },
    {
      "epoch": 0.30891891309500724,
      "grad_norm": 0.45474377274513245,
      "learning_rate": 0.000802372334238864,
      "loss": 1.2513761520385742,
      "step": 7168
    },
    {
      "epoch": 0.30891891309500724,
      "eval_bleu": 0.9385536520845816,
      "eval_cos_loss": 0.4402598062557961,
      "eval_dec_loss": 0.11249503215500858,
      "eval_loss": 1.2510530173397267,
      "eval_mse2_loss": 0.1480516226116274,
      "eval_mse_loss": 1.2510530173397267,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5597401952692694,
      "flow/improvement_ratio": 0.895444744939743,
      "flow/mag_ratio_mean": 0.5710282248220464,
      "flow/mag_ratio_std": 0.26387540328858505,
      "step": 7168
    },
    {
      "epoch": 0.30891891309500724,
      "eval_bleu": 0.9385536520845816,
      "eval_cos_loss": 0.4402598062557961,
      "eval_dec_loss": 0.11249503215500858,
      "eval_loss": 1.2510530173397267,
      "eval_mse2_loss": 0.1480516226116274,
      "eval_mse_loss": 1.2510530173397267,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 152.7181,
      "eval_samples_per_second": 196.44,
      "eval_steps_per_second": 3.071,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5597401952692694,
      "flow/improvement_ratio": 0.895444744939743,
      "flow/mag_ratio_mean": 0.5710282248220464,
      "flow/mag_ratio_std": 0.26387540328858505,
      "step": 7168
    },
    {
      "epoch": 0.3199517314198289,
      "grad_norm": 1.3543585538864136,
      "learning_rate": 0.0007880720885100349,
      "loss": 1.2521653175354004,
      "step": 7424
    },
    {
      "epoch": 0.3309845497446506,
      "grad_norm": 0.4370076358318329,
      "learning_rate": 0.0007734100250498788,
      "loss": 1.249273419380188,
      "step": 7680
    },
    {
      "epoch": 0.3420173680694723,
      "grad_norm": 1.0196475982666016,
      "learning_rate": 0.000758404559368781,
      "loss": 1.2500712871551514,
      "step": 7936
    },
    {
      "epoch": 0.353050186394294,
      "grad_norm": 0.733001708984375,
      "learning_rate": 0.0007430745382893488,
      "loss": 1.245364785194397,
      "step": 8192
    },
    {
      "epoch": 0.353050186394294,
      "eval_bleu": 0.9376793187397806,
      "eval_cos_loss": 0.4385024095013706,
      "eval_dec_loss": 0.11364057421017049,
      "eval_loss": 1.2459661925016945,
      "eval_mse2_loss": 0.148339767350571,
      "eval_mse_loss": 1.2459661925016945,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5614975882745755,
      "flow/improvement_ratio": 0.8961417695352518,
      "flow/mag_ratio_mean": 0.5688313084370547,
      "flow/mag_ratio_std": 0.26494109700483554,
      "step": 8192
    },
    {
      "epoch": 0.353050186394294,
      "eval_bleu": 0.9376793187397806,
      "eval_cos_loss": 0.4385024095013706,
      "eval_dec_loss": 0.11364057421017049,
      "eval_loss": 1.2459661925016945,
      "eval_mse2_loss": 0.148339767350571,
      "eval_mse_loss": 1.2459661925016945,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 152.8054,
      "eval_samples_per_second": 196.328,
      "eval_steps_per_second": 3.069,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5614975882745755,
      "flow/improvement_ratio": 0.8961417695352518,
      "flow/mag_ratio_mean": 0.5688313084370547,
      "flow/mag_ratio_std": 0.26494109700483554,
      "step": 8192
    },
    {
      "epoch": 0.36408300471911564,
      "grad_norm": 0.676328718662262,
      "learning_rate": 0.0007274392162748551,
      "loss": 1.2448910474777222,
      "step": 8448
    },
    {
      "epoch": 0.3751158230439373,
      "grad_norm": 0.6379961967468262,
      "learning_rate": 0.000711518231245687,
      "loss": 1.2442706823349,
      "step": 8704
    },
    {
      "epoch": 0.38614864136875904,
      "grad_norm": 0.5386805534362793,
      "learning_rate": 0.0006953315799141723,
      "loss": 1.2446835041046143,
      "step": 8960
    },
    {
      "epoch": 0.3971814596935807,
      "grad_norm": 0.8263258934020996,
      "learning_rate": 0.0006788995926687669,
      "loss": 1.2411766052246094,
      "step": 9216
    },
    {
      "epoch": 0.3971814596935807,
      "eval_bleu": 0.9372486918854673,
      "eval_cos_loss": 0.43675092898452206,
      "eval_dec_loss": 0.11516488874867273,
      "eval_loss": 1.241364901762273,
      "eval_mse2_loss": 0.1478570194196091,
      "eval_mse_loss": 1.241364901762273,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5632490722863659,
      "flow/improvement_ratio": 0.8974738620491679,
      "flow/mag_ratio_mean": 0.5655419154207844,
      "flow/mag_ratio_std": 0.2603240320041998,
      "step": 9216
    },
    {
      "epoch": 0.3971814596935807,
      "eval_bleu": 0.9372486918854673,
      "eval_cos_loss": 0.43675092898452206,
      "eval_dec_loss": 0.11516488874867273,
      "eval_loss": 1.241364901762273,
      "eval_mse2_loss": 0.1478570194196091,
      "eval_mse_loss": 1.241364901762273,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 152.8433,
      "eval_samples_per_second": 196.28,
      "eval_steps_per_second": 3.069,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5632490722863659,
      "flow/improvement_ratio": 0.8974738620491679,
      "flow/mag_ratio_mean": 0.5655419154207844,
      "flow/mag_ratio_std": 0.2603240320041998,
      "step": 9216
    },
    {
      "epoch": 0.4082142780184024,
      "grad_norm": 0.7855456471443176,
      "learning_rate": 0.0006622429080391422,
      "loss": 1.2460049390792847,
      "step": 9472
    },
    {
      "epoch": 0.4192470963432241,
      "grad_norm": 0.4608207941055298,
      "learning_rate": 0.0006453824467742515,
      "loss": 1.2414920330047607,
      "step": 9728
    },
    {
      "epoch": 0.43027991466804577,
      "grad_norm": 0.5247617959976196,
      "learning_rate": 0.0006283393855659275,
      "loss": 1.2424880266189575,
      "step": 9984
    },
    {
      "epoch": 0.44131273299286744,
      "grad_norm": 0.8765453100204468,
      "learning_rate": 0.0006111351304510173,
      "loss": 1.237776517868042,
      "step": 10240
    },
    {
      "epoch": 0.44131273299286744,
      "eval_bleu": 0.937646836000478,
      "eval_cos_loss": 0.4353823194752878,
      "eval_dec_loss": 0.11402556833737632,
      "eval_loss": 1.2377641976260936,
      "eval_mse2_loss": 0.1474350707204357,
      "eval_mse_loss": 1.2377641976260936,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.564617679063191,
      "flow/improvement_ratio": 0.899760089830549,
      "flow/mag_ratio_mean": 0.5730336795229394,
      "flow/mag_ratio_std": 0.26344449729172154,
      "step": 10240
    },
    {
      "epoch": 0.44131273299286744,
      "eval_bleu": 0.937646836000478,
      "eval_cos_loss": 0.4353823194752878,
      "eval_dec_loss": 0.11402556833737632,
      "eval_loss": 1.2377641976260936,
      "eval_mse2_loss": 0.1474350707204357,
      "eval_mse_loss": 1.2377641976260936,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 151.9737,
      "eval_samples_per_second": 197.403,
      "eval_steps_per_second": 3.086,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.564617679063191,
      "flow/improvement_ratio": 0.899760089830549,
      "flow/mag_ratio_mean": 0.5730336795229394,
      "flow/mag_ratio_std": 0.26344449729172154,
      "step": 10240
    },
    {
      "epoch": 0.45234555131768917,
      "grad_norm": 0.6895334124565125,
      "learning_rate": 0.0005937912899254605,
      "loss": 1.2384426593780518,
      "step": 10496
    },
    {
      "epoch": 0.46337836964251083,
      "grad_norm": 0.6421330571174622,
      "learning_rate": 0.0005763296478040787,
      "loss": 1.240878939628601,
      "step": 10752
    },
    {
      "epoch": 0.4744111879673325,
      "grad_norm": 0.7770284414291382,
      "learning_rate": 0.0005587721358601663,
      "loss": 1.2393468618392944,
      "step": 11008
    },
    {
      "epoch": 0.4854440062921542,
      "grad_norm": 1.0520166158676147,
      "learning_rate": 0.0005411408062792448,
      "loss": 1.237922191619873,
      "step": 11264
    },
    {
      "epoch": 0.4854440062921542,
      "eval_bleu": 0.93652744913201,
      "eval_cos_loss": 0.4366011674851497,
      "eval_dec_loss": 0.11468809016390459,
      "eval_loss": 1.2409222840246108,
      "eval_mse2_loss": 0.14564816977804912,
      "eval_mse_loss": 1.2409222840246108,
      "eval_rec_loss": 0.047009017791098624,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5633988297824413,
      "flow/improvement_ratio": 0.897065937773251,
      "flow/mag_ratio_mean": 0.5639294942558956,
      "flow/mag_ratio_std": 0.25510865748564066,
      "step": 11264
    },
    {
      "epoch": 0.4854440062921542,
      "eval_bleu": 0.93652744913201,
      "eval_cos_loss": 0.4366011674851497,
      "eval_dec_loss": 0.11468809016390459,
      "eval_loss": 1.2409222840246108,
      "eval_mse2_loss": 0.14564816977804912,
      "eval_mse_loss": 1.2409222840246108,
      "eval_rec_loss": 0.047009017791098624,
      "eval_runtime": 152.4483,
      "eval_samples_per_second": 196.788,
      "eval_steps_per_second": 3.076,
      "eval_var_loss": 0.01723895594080501,
      "flow/cos_sim": 0.5633988297824413,
      "flow/improvement_ratio": 0.897065937773251,
      "flow/mag_ratio_mean": 0.5639294942558956,
      "flow/mag_ratio_std": 0.25510865748564066,
      "step": 11264
    }
  ],
  "logging_steps": 256,
  "max_steps": 23204,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1024,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}