File size: 33,162 Bytes
99441b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.06666666666666667,
  "eval_steps": 50,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.3988752365112305,
      "epoch": 0.0013333333333333333,
      "grad_norm": 0.01450820083840917,
      "importance_ratio": 0.9983458518981934,
      "learning_rate": 0.0,
      "loss": -0.0028,
      "mismatch_kl": 0.004329901188611984,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 1,
      "timing/generation_ms": 12196.653502061963,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 12196.653502061963,
      "tokens/completion": 562.04296875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 163.39810061454773
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.0297880172729492,
      "epoch": 0.0026666666666666666,
      "grad_norm": 0.006125098422428371,
      "importance_ratio": 0.9977808594703674,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.0118,
      "mismatch_kl": 0.0036596579011529684,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 2,
      "timing/generation_ms": 10855.522208847106,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 10855.522208847106,
      "tokens/completion": 652.203125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 98.15957498550415
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.2343848943710327,
      "epoch": 0.004,
      "grad_norm": 0.0093110934895908,
      "importance_ratio": 0.9983258843421936,
      "learning_rate": 2.0000000000000003e-06,
      "loss": -0.0068,
      "mismatch_kl": 0.00391958886757493,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 3,
      "timing/generation_ms": 14581.869984045625,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 14581.869984045625,
      "tokens/completion": 722.37109375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 171.60404181480408
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.581649661064148,
      "epoch": 0.005333333333333333,
      "grad_norm": 0.007696628408420481,
      "importance_ratio": 0.9986447095870972,
      "learning_rate": 3e-06,
      "loss": -0.0043,
      "mismatch_kl": 0.0024762798566371202,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 4,
      "timing/generation_ms": 11191.347393207252,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 11191.347393207252,
      "tokens/completion": 595.73046875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 186.33580946922302
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.8588891625404358,
      "epoch": 0.006666666666666667,
      "grad_norm": 0.0055080213738955075,
      "importance_ratio": 0.9988943934440613,
      "learning_rate": 4.000000000000001e-06,
      "loss": -0.0033,
      "mismatch_kl": 0.0031517043244093657,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 5,
      "timing/generation_ms": 10668.582463636994,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 10668.582463636994,
      "tokens/completion": 636.53125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 82.27488708496094
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.071407675743103,
      "epoch": 0.008,
      "grad_norm": 0.02271832942623967,
      "importance_ratio": 0.998067319393158,
      "learning_rate": 5e-06,
      "loss": 0.0019,
      "mismatch_kl": 0.003643231000751257,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 6,
      "timing/generation_ms": 3378.591795451939,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 3378.591795451939,
      "tokens/completion": 178.73828125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 105.60203862190247
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.8531922698020935,
      "epoch": 0.009333333333333334,
      "grad_norm": 0.018354067998903482,
      "importance_ratio": 0.9980432391166687,
      "learning_rate": 5e-06,
      "loss": -0.0002,
      "mismatch_kl": 0.003655636915937066,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 7,
      "timing/generation_ms": 12279.695899225771,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 12279.695899225771,
      "tokens/completion": 631.35546875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 104.63563537597656
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.7072162628173828,
      "epoch": 0.010666666666666666,
      "grad_norm": 0.005552532749027135,
      "importance_ratio": 0.9982293844223022,
      "learning_rate": 5e-06,
      "loss": -0.0014,
      "mismatch_kl": 0.0029395928140729666,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 8,
      "timing/generation_ms": 6614.743183366954,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 6614.743183366954,
      "tokens/completion": 339.1640625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 132.164165019989
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.8156145215034485,
      "epoch": 0.012,
      "grad_norm": 0.008176505226750404,
      "importance_ratio": 0.9981797933578491,
      "learning_rate": 5e-06,
      "loss": 0.0027,
      "mismatch_kl": 0.0031279518734663725,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 9,
      "timing/generation_ms": 8826.908372342587,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 8826.908372342587,
      "tokens/completion": 444.53515625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 144.61542773246765
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.8708666563034058,
      "epoch": 0.013333333333333334,
      "grad_norm": 0.009382372847258274,
      "importance_ratio": 0.9981642961502075,
      "learning_rate": 5e-06,
      "loss": 0.0126,
      "mismatch_kl": 0.0030885515734553337,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 10,
      "timing/generation_ms": 7367.4805322662,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 7367.4805322662,
      "tokens/completion": 400.74609375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 176.57727003097534
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.6906348466873169,
      "epoch": 0.014666666666666666,
      "grad_norm": 0.007616251351947248,
      "importance_ratio": 1.0045424699783325,
      "learning_rate": 5e-06,
      "loss": 0.0542,
      "mismatch_kl": 0.03194786608219147,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 11,
      "timing/generation_ms": 26879.562875255942,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 26879.562875255942,
      "tokens/completion": 1682.0546875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 190.3386266231537
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.6506091356277466,
      "epoch": 0.016,
      "grad_norm": 0.004382353798954015,
      "importance_ratio": 0.9982648491859436,
      "learning_rate": 5e-06,
      "loss": 0.043,
      "mismatch_kl": 0.02482638508081436,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 12,
      "timing/generation_ms": 22301.60311050713,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 22301.60311050713,
      "tokens/completion": 1387.734375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 177.6784646511078
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.143129825592041,
      "epoch": 0.017333333333333333,
      "grad_norm": 0.009321138085104996,
      "importance_ratio": 1.001217007637024,
      "learning_rate": 5e-06,
      "loss": -0.0139,
      "mismatch_kl": 0.0036374337505549192,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 13,
      "timing/generation_ms": 6277.724616229534,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 6277.724616229534,
      "tokens/completion": 432.66796875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 129.69259929656982
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.650863766670227,
      "epoch": 0.018666666666666668,
      "grad_norm": 0.0076614251264825245,
      "importance_ratio": 0.9983827471733093,
      "learning_rate": 5e-06,
      "loss": 0.0049,
      "mismatch_kl": 0.0027237425092607737,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 14,
      "timing/generation_ms": 7103.812717832625,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 7103.812717832625,
      "tokens/completion": 404.96484375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 57.65379452705383
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.7439635992050171,
      "epoch": 0.02,
      "grad_norm": 0.009401568464987338,
      "importance_ratio": 0.9981654286384583,
      "learning_rate": 5e-06,
      "loss": 0.0109,
      "mismatch_kl": 0.002909082220867276,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 15,
      "timing/generation_ms": 8292.532542720437,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 8292.532542720437,
      "tokens/completion": 459.85546875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 92.21157336235046
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.7638830542564392,
      "epoch": 0.021333333333333333,
      "grad_norm": 0.010374572910211358,
      "importance_ratio": 0.9969711899757385,
      "learning_rate": 5e-06,
      "loss": -0.005,
      "mismatch_kl": 0.0034673516638576984,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 16,
      "timing/generation_ms": 5712.000676430762,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 5712.000676430762,
      "tokens/completion": 308.4296875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 52.6866238117218
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.18189160525798798,
      "epoch": 0.02266666666666667,
      "grad_norm": 0.00257455457059234,
      "importance_ratio": 0.9984971880912781,
      "learning_rate": 5e-06,
      "loss": 0.0681,
      "mismatch_kl": 0.018514186143875122,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 17,
      "timing/generation_ms": 14281.423358246684,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 14281.423358246684,
      "tokens/completion": 1101.546875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 129.24327325820923
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.5917271375656128,
      "epoch": 0.024,
      "grad_norm": 0.005668903472483887,
      "importance_ratio": 0.999828577041626,
      "learning_rate": 5e-06,
      "loss": 0.027,
      "mismatch_kl": 0.002100760815665126,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 18,
      "timing/generation_ms": 24175.399120897055,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 24175.399120897055,
      "tokens/completion": 1504.0390625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 158.57504653930664
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.3282008171081543,
      "epoch": 0.025333333333333333,
      "grad_norm": 0.006636047431888786,
      "importance_ratio": 1.0022344589233398,
      "learning_rate": 5e-06,
      "loss": -0.0015,
      "mismatch_kl": 0.004634195473045111,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 19,
      "timing/generation_ms": 15713.139976374805,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 15713.139976374805,
      "tokens/completion": 764.3203125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 78.56244468688965
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.02470862865448,
      "epoch": 0.02666666666666667,
      "grad_norm": 0.00833481021786943,
      "importance_ratio": 1.0026451349258423,
      "learning_rate": 5e-06,
      "loss": -0.0052,
      "mismatch_kl": 0.004158638883382082,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 20,
      "timing/generation_ms": 6632.851202040911,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 6632.851202040911,
      "tokens/completion": 382.6171875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 199.43552422523499
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.07275530695915222,
      "epoch": 0.028,
      "grad_norm": 0.005944388738403685,
      "importance_ratio": 0.9988561868667603,
      "learning_rate": 5e-06,
      "loss": 0.0452,
      "mismatch_kl": 0.00023643655003979802,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 21,
      "timing/generation_ms": 119174.6030151844,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 119174.6030151844,
      "tokens/completion": 4008.38671875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 685.4423098564148
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.83598792552948,
      "epoch": 0.029333333333333333,
      "grad_norm": 0.00977617477475085,
      "importance_ratio": 0.995696485042572,
      "learning_rate": 5e-06,
      "loss": 0.0066,
      "mismatch_kl": 0.003957619424909353,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 22,
      "timing/generation_ms": 12322.44247943163,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 12322.44247943163,
      "tokens/completion": 442.85546875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 94.37256598472595
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.8219886422157288,
      "epoch": 0.030666666666666665,
      "grad_norm": 0.0057449599218849946,
      "importance_ratio": 0.9990558624267578,
      "learning_rate": 5e-06,
      "loss": 0.0388,
      "mismatch_kl": 0.031180420890450478,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 23,
      "timing/generation_ms": 29090.628595091403,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 29090.628595091403,
      "tokens/completion": 1716.3515625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 169.45334482192993
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.0089167356491089,
      "epoch": 0.032,
      "grad_norm": 0.009762837519367,
      "importance_ratio": 0.9979202151298523,
      "learning_rate": 5e-06,
      "loss": 0.0012,
      "mismatch_kl": 0.00405939482152462,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 24,
      "timing/generation_ms": 17154.327374882996,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 17154.327374882996,
      "tokens/completion": 883.390625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 130.7891206741333
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.5053093433380127,
      "epoch": 0.03333333333333333,
      "grad_norm": 0.007416974683241316,
      "importance_ratio": 0.9982149600982666,
      "learning_rate": 5e-06,
      "loss": -0.0068,
      "mismatch_kl": 0.0024536694400012493,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 25,
      "timing/generation_ms": 28463.361867703497,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 28463.361867703497,
      "tokens/completion": 1409.54296875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 178.60342526435852
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.4973055422306061,
      "epoch": 0.034666666666666665,
      "grad_norm": 0.004048717808220336,
      "importance_ratio": 1.0012173652648926,
      "learning_rate": 5e-06,
      "loss": 0.0547,
      "mismatch_kl": 0.03473234549164772,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 26,
      "timing/generation_ms": 18848.746892996132,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 18848.746892996132,
      "tokens/completion": 1286.6875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 181.75563287734985
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.5914682149887085,
      "epoch": 0.036,
      "grad_norm": 0.010568088931367656,
      "importance_ratio": 0.9986244440078735,
      "learning_rate": 5e-06,
      "loss": -0.0214,
      "mismatch_kl": 0.002536088228225708,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 27,
      "timing/generation_ms": 11602.461927570403,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 11602.461927570403,
      "tokens/completion": 734.80078125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 188.88015818595886
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.4526905119419098,
      "epoch": 0.037333333333333336,
      "grad_norm": 0.0035728175606856527,
      "importance_ratio": 0.9999799728393555,
      "learning_rate": 5e-06,
      "loss": 0.0026,
      "mismatch_kl": 0.0024842985440045595,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 28,
      "timing/generation_ms": 30549.59301650524,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 30549.59301650524,
      "tokens/completion": 1536.96875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 328.0478210449219
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.32794511318206787,
      "epoch": 0.03866666666666667,
      "grad_norm": 0.003333518820406266,
      "importance_ratio": 0.9995192885398865,
      "learning_rate": 5e-06,
      "loss": 0.056,
      "mismatch_kl": 0.028769802302122116,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 29,
      "timing/generation_ms": 18838.333567604423,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 18838.333567604423,
      "tokens/completion": 1263.640625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 290.5948350429535
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.9063822031021118,
      "epoch": 0.04,
      "grad_norm": 0.007342388496075293,
      "importance_ratio": 0.9953157901763916,
      "learning_rate": 5e-06,
      "loss": 0.0025,
      "mismatch_kl": 0.004266439005732536,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 30,
      "timing/generation_ms": 9477.213966660202,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 9477.213966660202,
      "tokens/completion": 473.26953125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 62.30127143859863
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.7977282404899597,
      "epoch": 0.04133333333333333,
      "grad_norm": 0.00884043332375607,
      "importance_ratio": 0.9971498847007751,
      "learning_rate": 5e-06,
      "loss": -0.0029,
      "mismatch_kl": 0.004033135715872049,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 31,
      "timing/generation_ms": 18995.201839134097,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 18995.201839134097,
      "tokens/completion": 958.625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 88.6347918510437
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.8451470732688904,
      "epoch": 0.042666666666666665,
      "grad_norm": 0.018842389370386323,
      "importance_ratio": 0.9982671141624451,
      "learning_rate": 5e-06,
      "loss": 0.0369,
      "mismatch_kl": 0.003600390162318945,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 32,
      "timing/generation_ms": 5587.277088314295,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 5587.277088314295,
      "tokens/completion": 407.12109375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 140.4788475036621
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.1521912813186646,
      "epoch": 0.044,
      "grad_norm": 0.006379742039913797,
      "importance_ratio": 0.997858464717865,
      "learning_rate": 5e-06,
      "loss": -0.0065,
      "mismatch_kl": 0.005035887472331524,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 33,
      "timing/generation_ms": 18916.152058169246,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 18916.152058169246,
      "tokens/completion": 966.55859375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 90.41954302787781
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.1553761959075928,
      "epoch": 0.04533333333333334,
      "grad_norm": 0.010733713274389883,
      "importance_ratio": 1.0111567974090576,
      "learning_rate": 5e-06,
      "loss": 0.0014,
      "mismatch_kl": 0.006704343948513269,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 34,
      "timing/generation_ms": 17302.85968258977,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 17302.85968258977,
      "tokens/completion": 864.44140625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 143.8659963607788
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.3105199635028839,
      "epoch": 0.04666666666666667,
      "grad_norm": 0.003940130100379767,
      "importance_ratio": 1.0006911754608154,
      "learning_rate": 5e-06,
      "loss": 0.0315,
      "mismatch_kl": 0.022524980828166008,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 35,
      "timing/generation_ms": 29806.298807263374,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 29806.298807263374,
      "tokens/completion": 1672.48828125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 164.04821372032166
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.7972971200942993,
      "epoch": 0.048,
      "grad_norm": 0.008409173142054645,
      "importance_ratio": 0.9948906898498535,
      "learning_rate": 5e-06,
      "loss": 0.004,
      "mismatch_kl": 0.004282351583242416,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 36,
      "timing/generation_ms": 14936.399303376675,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 14936.399303376675,
      "tokens/completion": 787.78125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 196.79586815834045
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.5769950747489929,
      "epoch": 0.04933333333333333,
      "grad_norm": 0.009636377939254703,
      "importance_ratio": 0.9972301721572876,
      "learning_rate": 5e-06,
      "loss": -0.0011,
      "mismatch_kl": 0.003603809280321002,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 37,
      "timing/generation_ms": 13729.571803472936,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 13729.571803472936,
      "tokens/completion": 697.64453125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 76.77378511428833
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.715777575969696,
      "epoch": 0.050666666666666665,
      "grad_norm": 0.005305945077729364,
      "importance_ratio": 0.9969701766967773,
      "learning_rate": 5e-06,
      "loss": 0.0093,
      "mismatch_kl": 0.004232620354741812,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 38,
      "timing/generation_ms": 26689.202761277556,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 26689.202761277556,
      "tokens/completion": 1302.75390625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 119.53459739685059
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.108477234840393,
      "epoch": 0.052,
      "grad_norm": 0.01158876392732835,
      "importance_ratio": 0.9918505549430847,
      "learning_rate": 5e-06,
      "loss": 0.0069,
      "mismatch_kl": 0.0055715711787343025,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 39,
      "timing/generation_ms": 9316.26115180552,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 9316.26115180552,
      "tokens/completion": 510.88671875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 134.0537760257721
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.0468562841415405,
      "epoch": 0.05333333333333334,
      "grad_norm": 0.006250915142780056,
      "importance_ratio": 0.993874192237854,
      "learning_rate": 5e-06,
      "loss": -0.004,
      "mismatch_kl": 0.00569565873593092,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 40,
      "timing/generation_ms": 22657.30178449303,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 22657.30178449303,
      "tokens/completion": 1117.0390625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 104.31990480422974
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 1.0242066383361816,
      "epoch": 0.05466666666666667,
      "grad_norm": 0.009730238448609988,
      "importance_ratio": 1.0014866590499878,
      "learning_rate": 5e-06,
      "loss": 0.0029,
      "mismatch_kl": 0.006813807878643274,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 41,
      "timing/generation_ms": 15266.15516282618,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 15266.15516282618,
      "tokens/completion": 789.296875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 137.6475269794464
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.9917812943458557,
      "epoch": 0.056,
      "grad_norm": 0.015130940878153589,
      "importance_ratio": 0.9915910959243774,
      "learning_rate": 5e-06,
      "loss": -0.0016,
      "mismatch_kl": 0.006494010798633099,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 42,
      "timing/generation_ms": 8552.51188017428,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 8552.51188017428,
      "tokens/completion": 427.00390625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 58.246270418167114
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.7529230117797852,
      "epoch": 0.05733333333333333,
      "grad_norm": 0.017225340266775354,
      "importance_ratio": 0.9983583092689514,
      "learning_rate": 5e-06,
      "loss": 0.0017,
      "mismatch_kl": 0.005849814508110285,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 43,
      "timing/generation_ms": 5776.03021170944,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 5776.03021170944,
      "tokens/completion": 292.6484375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 138.62879586219788
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.9116057753562927,
      "epoch": 0.058666666666666666,
      "grad_norm": 0.013240792131345649,
      "importance_ratio": 0.993713915348053,
      "learning_rate": 5e-06,
      "loss": -0.006,
      "mismatch_kl": 0.00599726801738143,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 44,
      "timing/generation_ms": 4909.729053266346,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 4909.729053266346,
      "tokens/completion": 252.2890625,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 53.461458683013916
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.6952740550041199,
      "epoch": 0.06,
      "grad_norm": 0.007271643900369788,
      "importance_ratio": 0.9978048205375671,
      "learning_rate": 5e-06,
      "loss": -0.0094,
      "mismatch_kl": 0.004028764553368092,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 45,
      "timing/generation_ms": 12042.251928709447,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 12042.251928709447,
      "tokens/completion": 668.03125,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 77.72424340248108
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.8735002279281616,
      "epoch": 0.06133333333333333,
      "grad_norm": 0.00817327643152143,
      "importance_ratio": 1.0016076564788818,
      "learning_rate": 5e-06,
      "loss": 0.0002,
      "mismatch_kl": 0.004535754211246967,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 46,
      "timing/generation_ms": 8553.523855283856,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 8553.523855283856,
      "tokens/completion": 459.87109375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 104.68091750144958
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.7288662195205688,
      "epoch": 0.06266666666666666,
      "grad_norm": 0.016180435920793518,
      "importance_ratio": 1.0001567602157593,
      "learning_rate": 5e-06,
      "loss": 0.0002,
      "mismatch_kl": 0.006666674744337797,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 47,
      "timing/generation_ms": 7466.575676575303,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 7466.575676575303,
      "tokens/completion": 361.484375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 59.11225175857544
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.6449630856513977,
      "epoch": 0.064,
      "grad_norm": 0.004581873635760183,
      "importance_ratio": 1.0026441812515259,
      "learning_rate": 5e-06,
      "loss": 0.0588,
      "mismatch_kl": 0.059744831174612045,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 48,
      "timing/generation_ms": 14945.35976741463,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 14945.35976741463,
      "tokens/completion": 1044.0,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 182.3247947692871
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.8048098683357239,
      "epoch": 0.06533333333333333,
      "grad_norm": 0.0052364810032066635,
      "importance_ratio": 0.9973055720329285,
      "learning_rate": 5e-06,
      "loss": 0.0347,
      "mismatch_kl": 0.0451083704829216,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 49,
      "timing/generation_ms": 28440.53523708135,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 28440.53523708135,
      "tokens/completion": 1630.4296875,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 221.0147523880005
    },
    {
      "advantage/absmean": 0.12451171875,
      "entropy": 0.7735000252723694,
      "epoch": 0.06666666666666667,
      "grad_norm": 0.015103596816141955,
      "importance_ratio": 0.9899436831474304,
      "learning_rate": 5e-06,
      "loss": -0.0022,
      "mismatch_kl": 0.008240272291004658,
      "reward": 0.12451171875,
      "reward/std": 0.1738164722919464,
      "step": 50,
      "timing/generation_ms": 6061.147706583142,
      "timing/scoring_ms": 0.0,
      "timing/total_ms": 6061.147706583142,
      "tokens/completion": 331.859375,
      "tokens/masked_fraction": 0.0,
      "wall_clock/generate_s": 108.05560183525085
    }
  ],
  "logging_steps": 1,
  "max_steps": 750,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 5,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}