File size: 55,302 Bytes
130e1a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.0006,
  "eval_steps": 500,
  "global_step": 60,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.54889988899231,
      "epoch": 1e-05,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.03602602332830429,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0011,
      "num_tokens": 37068.0,
      "reward": -0.20241966843605042,
      "reward_std": 0.48188072443008423,
      "rewards/rollout_reward_func/mean": -0.20241966843605042,
      "rewards/rollout_reward_func/std": 0.5643806457519531,
      "sampling/importance_sampling_ratio/max": 0.07946816831827164,
      "sampling/importance_sampling_ratio/mean": 0.04483851045370102,
      "sampling/importance_sampling_ratio/min": 0.013508557341992855,
      "sampling/sampling_logp_difference/max": 2.2406487464904785,
      "sampling/sampling_logp_difference/mean": 1.6288080215454102,
      "step": 1,
      "step_time": 5.987766452998585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 8.54889988899231,
      "epoch": 2e-05,
      "grad_norm": 0.03546445816755295,
      "kl": 0.0,
      "learning_rate": 2.8571428571428575e-07,
      "loss": -0.0011,
      "step": 2,
      "step_time": 2.7118166719992587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.612898111343384,
      "epoch": 3e-05,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.028961816802620888,
      "kl": 0.0006349515169858932,
      "learning_rate": 5.714285714285715e-07,
      "loss": -0.0011,
      "num_tokens": 73706.0,
      "reward": -0.41709062457084656,
      "reward_std": 0.38032135367393494,
      "rewards/rollout_reward_func/mean": -0.41709062457084656,
      "rewards/rollout_reward_func/std": 0.5864750146865845,
      "sampling/importance_sampling_ratio/max": 0.07319076359272003,
      "sampling/importance_sampling_ratio/mean": 0.04485338553786278,
      "sampling/importance_sampling_ratio/min": 0.010412708856165409,
      "sampling/sampling_logp_difference/max": 2.62746524810791,
      "sampling/sampling_logp_difference/mean": 1.631974458694458,
      "step": 3,
      "step_time": 5.237267270999837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 8.621538519859314,
      "epoch": 4e-05,
      "grad_norm": 0.027420829981565475,
      "kl": 0.0009405575692653656,
      "learning_rate": 8.571428571428572e-07,
      "loss": -0.0011,
      "step": 4,
      "step_time": 2.7475997250003275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 5.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.096774101257324,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.608618140220642,
      "epoch": 5e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04249607026576996,
      "kl": 0.0009066914208233356,
      "learning_rate": 1.142857142857143e-06,
      "loss": -0.0019,
      "num_tokens": 108621.0,
      "reward": -0.4377398192882538,
      "reward_std": 0.6512277126312256,
      "rewards/rollout_reward_func/mean": -0.4377398192882538,
      "rewards/rollout_reward_func/std": 0.8918678760528564,
      "sampling/importance_sampling_ratio/max": 0.0737227350473404,
      "sampling/importance_sampling_ratio/mean": 0.03858622536063194,
      "sampling/importance_sampling_ratio/min": 1.0664673354490728e-12,
      "sampling/sampling_logp_difference/max": 4.4544358253479,
      "sampling/sampling_logp_difference/mean": 1.7643263339996338,
      "step": 5,
      "step_time": 6.21450720699886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 8.600322604179382,
      "epoch": 6e-05,
      "grad_norm": 0.04186399653553963,
      "kl": 0.0009581162594258785,
      "learning_rate": 1.4285714285714286e-06,
      "loss": -0.002,
      "step": 6,
      "step_time": 2.7577716289997625
    },
    {
      "clip_ratio/high_max": 0.013888888992369175,
      "clip_ratio/high_mean": 0.0069444444961845875,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0069444444961845875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 2.15625,
      "completions/mean_terminated_length": 2.15625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.887233972549438,
      "epoch": 7e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.01845015399158001,
      "kl": 0.001277084978937637,
      "learning_rate": 1.7142857142857145e-06,
      "loss": -0.0001,
      "num_tokens": 142642.0,
      "reward": -0.24416625499725342,
      "reward_std": 0.7532411813735962,
      "rewards/rollout_reward_func/mean": -0.24416625499725342,
      "rewards/rollout_reward_func/std": 0.8073007464408875,
      "sampling/importance_sampling_ratio/max": 0.05460228770971298,
      "sampling/importance_sampling_ratio/mean": 0.028035324066877365,
      "sampling/importance_sampling_ratio/min": 1.4402675105884555e-06,
      "sampling/sampling_logp_difference/max": 3.2391209602355957,
      "sampling/sampling_logp_difference/mean": 1.8511791229248047,
      "step": 7,
      "step_time": 5.192916050999884
    },
    {
      "clip_ratio/high_max": 0.02777777798473835,
      "clip_ratio/high_mean": 0.013888888992369175,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.013888888992369175,
      "entropy": 8.885694026947021,
      "epoch": 8e-05,
      "grad_norm": 0.018649807199835777,
      "kl": 0.0008755421440582722,
      "learning_rate": 2.0000000000000003e-06,
      "loss": -0.0001,
      "step": 8,
      "step_time": 2.734651068999483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.617319464683533,
      "epoch": 9e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.03367479518055916,
      "kl": 0.0011472837650217116,
      "learning_rate": 2.285714285714286e-06,
      "loss": -0.0017,
      "num_tokens": 180338.0,
      "reward": -0.2762250006198883,
      "reward_std": 0.6308639645576477,
      "rewards/rollout_reward_func/mean": -0.2762250006198883,
      "rewards/rollout_reward_func/std": 0.619946300983429,
      "sampling/importance_sampling_ratio/max": 0.08318282663822174,
      "sampling/importance_sampling_ratio/mean": 0.04022577404975891,
      "sampling/importance_sampling_ratio/min": 8.560036707239149e-11,
      "sampling/sampling_logp_difference/max": 2.8513259887695312,
      "sampling/sampling_logp_difference/mean": 1.6676621437072754,
      "step": 9,
      "step_time": 5.192849637998734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 8.604341268539429,
      "epoch": 0.0001,
      "grad_norm": 0.03420183062553406,
      "kl": 0.001177078731416259,
      "learning_rate": 2.571428571428571e-06,
      "loss": -0.0017,
      "step": 10,
      "step_time": 2.779620520998833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.559729814529419,
      "epoch": 0.00011,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.02679450251162052,
      "kl": 0.0014951191842556,
      "learning_rate": 2.8571428571428573e-06,
      "loss": -0.0013,
      "num_tokens": 215884.0,
      "reward": 0.05839650332927704,
      "reward_std": 0.5475454926490784,
      "rewards/rollout_reward_func/mean": 0.05839650332927704,
      "rewards/rollout_reward_func/std": 0.7081521153450012,
      "sampling/importance_sampling_ratio/max": 0.09729397296905518,
      "sampling/importance_sampling_ratio/mean": 0.04614107310771942,
      "sampling/importance_sampling_ratio/min": 0.009668753482401371,
      "sampling/sampling_logp_difference/max": 2.5881471633911133,
      "sampling/sampling_logp_difference/mean": 1.6770999431610107,
      "step": 11,
      "step_time": 6.151045407999845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 8.533240914344788,
      "epoch": 0.00012,
      "grad_norm": 0.027015963569283485,
      "kl": 0.002467602491378784,
      "learning_rate": 3.142857142857143e-06,
      "loss": -0.0013,
      "step": 12,
      "step_time": 2.712993424999695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5.0,
      "completions/max_terminated_length": 5.0,
      "completions/mean_length": 2.09375,
      "completions/mean_terminated_length": 2.09375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.65865409374237,
      "epoch": 0.00013,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.03438998758792877,
      "kl": 0.0025057614548131824,
      "learning_rate": 3.428571428571429e-06,
      "loss": -0.0012,
      "num_tokens": 250983.0,
      "reward": -0.3477444052696228,
      "reward_std": 0.8011652231216431,
      "rewards/rollout_reward_func/mean": -0.3477444052696228,
      "rewards/rollout_reward_func/std": 0.8506107926368713,
      "sampling/importance_sampling_ratio/max": 0.08270177990198135,
      "sampling/importance_sampling_ratio/mean": 0.03867126256227493,
      "sampling/importance_sampling_ratio/min": 2.151069793399074e-06,
      "sampling/sampling_logp_difference/max": 4.863964557647705,
      "sampling/sampling_logp_difference/mean": 1.7552906274795532,
      "step": 13,
      "step_time": 5.348588517999815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 8.637501120567322,
      "epoch": 0.00014,
      "grad_norm": 0.03334236517548561,
      "kl": 0.003458394785411656,
      "learning_rate": 3.7142857142857146e-06,
      "loss": -0.0013,
      "step": 14,
      "step_time": 2.796233564999966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 2.1875,
      "completions/mean_terminated_length": 2.1875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.605337858200073,
      "epoch": 0.00015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0344076044857502,
      "kl": 0.007246186607517302,
      "learning_rate": 4.000000000000001e-06,
      "loss": -0.0011,
      "num_tokens": 287257.0,
      "reward": -0.5175027251243591,
      "reward_std": 0.7213335633277893,
      "rewards/rollout_reward_func/mean": -0.5175027251243591,
      "rewards/rollout_reward_func/std": 0.7736045718193054,
      "sampling/importance_sampling_ratio/max": 0.07941123098134995,
      "sampling/importance_sampling_ratio/mean": 0.03353440761566162,
      "sampling/importance_sampling_ratio/min": 8.447619620710611e-05,
      "sampling/sampling_logp_difference/max": 2.514495372772217,
      "sampling/sampling_logp_difference/mean": 1.715531349182129,
      "step": 15,
      "step_time": 5.0371513690006395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 8.572421252727509,
      "epoch": 0.00016,
      "grad_norm": 0.0361904576420784,
      "kl": 0.010158459888771176,
      "learning_rate": 4.2857142857142855e-06,
      "loss": -0.0012,
      "step": 16,
      "step_time": 3.6766240449996985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0052083334885537624,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0052083334885537624,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 10.0,
      "completions/mean_length": 2.6875,
      "completions/mean_terminated_length": 2.2580645084381104,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.298084318637848,
      "epoch": 0.00017,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.03764951229095459,
      "kl": 0.01280841525294818,
      "learning_rate": 4.571428571428572e-06,
      "loss": -0.0025,
      "num_tokens": 319698.0,
      "reward": -0.4354173243045807,
      "reward_std": 0.6479126214981079,
      "rewards/rollout_reward_func/mean": -0.4354173243045807,
      "rewards/rollout_reward_func/std": 0.8862241506576538,
      "sampling/importance_sampling_ratio/max": 0.11163407564163208,
      "sampling/importance_sampling_ratio/mean": 0.04720360040664673,
      "sampling/importance_sampling_ratio/min": 5.244973292489741e-12,
      "sampling/sampling_logp_difference/max": 4.713146686553955,
      "sampling/sampling_logp_difference/mean": 1.6690685749053955,
      "step": 17,
      "step_time": 5.1438336040009744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0052083334885537624,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0052083334885537624,
      "entropy": 8.210463047027588,
      "epoch": 0.00018,
      "grad_norm": 0.03806741535663605,
      "kl": 0.01803363612270914,
      "learning_rate": 4.857142857142858e-06,
      "loss": -0.0026,
      "step": 18,
      "step_time": 2.7552450699986366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 8.558752536773682,
      "epoch": 0.00019,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04838458448648453,
      "kl": 0.022023072466254234,
      "learning_rate": 5.142857142857142e-06,
      "loss": -0.0019,
      "num_tokens": 351318.0,
      "reward": -0.6331132054328918,
      "reward_std": 0.8535451292991638,
      "rewards/rollout_reward_func/mean": -0.6331132054328918,
      "rewards/rollout_reward_func/std": 0.9285341501235962,
      "sampling/importance_sampling_ratio/max": 0.11876487731933594,
      "sampling/importance_sampling_ratio/mean": 0.04305056482553482,
      "sampling/importance_sampling_ratio/min": 0.01192709431052208,
      "sampling/sampling_logp_difference/max": 2.4490432739257812,
      "sampling/sampling_logp_difference/mean": 1.6915147304534912,
      "step": 19,
      "step_time": 4.906003911999505
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "entropy": 8.480961680412292,
      "epoch": 0.0002,
      "grad_norm": 0.047831419855356216,
      "kl": 0.03148091956973076,
      "learning_rate": 5.428571428571429e-06,
      "loss": -0.0021,
      "step": 20,
      "step_time": 2.542041312000947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 7.914846777915955,
      "epoch": 0.00021,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0461098738014698,
      "kl": 0.05386248789727688,
      "learning_rate": 5.7142857142857145e-06,
      "loss": -0.004,
      "num_tokens": 382194.0,
      "reward": 0.12508951127529144,
      "reward_std": 1.0255252122879028,
      "rewards/rollout_reward_func/mean": 0.12508951127529144,
      "rewards/rollout_reward_func/std": 1.0441889762878418,
      "sampling/importance_sampling_ratio/max": 0.15263937413692474,
      "sampling/importance_sampling_ratio/mean": 0.07228268682956696,
      "sampling/importance_sampling_ratio/min": 0.011906024999916553,
      "sampling/sampling_logp_difference/max": 2.5268516540527344,
      "sampling/sampling_logp_difference/mean": 1.4949290752410889,
      "step": 21,
      "step_time": 5.119830969998475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 7.746252477169037,
      "epoch": 0.00022,
      "grad_norm": 0.04356187954545021,
      "kl": 0.06908445432782173,
      "learning_rate": 6e-06,
      "loss": -0.0043,
      "step": 22,
      "step_time": 3.6642874440012747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 7.505157113075256,
      "epoch": 0.00023,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.027521340176463127,
      "kl": 0.11650293320417404,
      "learning_rate": 6.285714285714286e-06,
      "loss": -0.0037,
      "num_tokens": 418193.0,
      "reward": 0.050230324268341064,
      "reward_std": 0.5550512671470642,
      "rewards/rollout_reward_func/mean": 0.050230324268341064,
      "rewards/rollout_reward_func/std": 0.7158081531524658,
      "sampling/importance_sampling_ratio/max": 0.17737992107868195,
      "sampling/importance_sampling_ratio/mean": 0.08728897571563721,
      "sampling/importance_sampling_ratio/min": 0.01220005378127098,
      "sampling/sampling_logp_difference/max": 2.312082290649414,
      "sampling/sampling_logp_difference/mean": 1.4727132320404053,
      "step": 23,
      "step_time": 5.118901103000098
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.03125,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.046875,
      "entropy": 7.405115187168121,
      "epoch": 0.00024,
      "grad_norm": 0.027487488463521004,
      "kl": 0.18349953554570675,
      "learning_rate": 6.571428571428572e-06,
      "loss": -0.0038,
      "step": 24,
      "step_time": 2.720049919002122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.015625,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 2.8125,
      "completions/mean_terminated_length": 2.387096643447876,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 7.812176287174225,
      "epoch": 0.00025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04480979964137077,
      "kl": 0.12202088022604585,
      "learning_rate": 6.857142857142858e-06,
      "loss": -0.003,
      "num_tokens": 453429.0,
      "reward": -0.5567976832389832,
      "reward_std": 0.5721786022186279,
      "rewards/rollout_reward_func/mean": -0.5567976832389832,
      "rewards/rollout_reward_func/std": 0.6177526116371155,
      "sampling/importance_sampling_ratio/max": 0.19887018203735352,
      "sampling/importance_sampling_ratio/mean": 0.06785006076097488,
      "sampling/importance_sampling_ratio/min": 6.38808743654907e-15,
      "sampling/sampling_logp_difference/max": 4.993244647979736,
      "sampling/sampling_logp_difference/mean": 1.6287214756011963,
      "step": 25,
      "step_time": 5.136989613000878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 7.775655627250671,
      "epoch": 0.00026,
      "grad_norm": 0.051532234996557236,
      "kl": 0.13951302412897348,
      "learning_rate": 7.1428571428571436e-06,
      "loss": -0.0031,
      "step": 26,
      "step_time": 2.7561433119990397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 2.40625,
      "completions/mean_terminated_length": 2.40625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 7.139402210712433,
      "epoch": 0.00027,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1159820482134819,
      "kl": 0.21849708445370197,
      "learning_rate": 7.428571428571429e-06,
      "loss": -0.0033,
      "num_tokens": 488130.0,
      "reward": 0.0451994352042675,
      "reward_std": 0.5705078840255737,
      "rewards/rollout_reward_func/mean": 0.0451994352042675,
      "rewards/rollout_reward_func/std": 0.7002898454666138,
      "sampling/importance_sampling_ratio/max": 0.21661195158958435,
      "sampling/importance_sampling_ratio/mean": 0.11537822335958481,
      "sampling/importance_sampling_ratio/min": 1.9111427718154772e-11,
      "sampling/sampling_logp_difference/max": 5.494892120361328,
      "sampling/sampling_logp_difference/mean": 1.4119889736175537,
      "step": 27,
      "step_time": 5.695921420998275
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.015625,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.03125,
      "entropy": 7.091515064239502,
      "epoch": 0.00028,
      "grad_norm": 0.05888032913208008,
      "kl": 0.19798127934336662,
      "learning_rate": 7.714285714285716e-06,
      "loss": -0.0036,
      "step": 28,
      "step_time": 3.262695406999228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 2.84375,
      "completions/mean_terminated_length": 2.4193546772003174,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 6.829086482524872,
      "epoch": 0.00029,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07569188624620438,
      "kl": 0.3484500087797642,
      "learning_rate": 8.000000000000001e-06,
      "loss": -0.0047,
      "num_tokens": 523810.0,
      "reward": 0.25646376609802246,
      "reward_std": 0.19004222750663757,
      "rewards/rollout_reward_func/mean": 0.25646376609802246,
      "rewards/rollout_reward_func/std": 0.5962904691696167,
      "sampling/importance_sampling_ratio/max": 0.2295530140399933,
      "sampling/importance_sampling_ratio/mean": 0.12002766132354736,
      "sampling/importance_sampling_ratio/min": 3.580996610352827e-10,
      "sampling/sampling_logp_difference/max": 4.408937454223633,
      "sampling/sampling_logp_difference/mean": 1.4061723947525024,
      "step": 29,
      "step_time": 5.195359340000323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 6.82170957326889,
      "epoch": 0.0003,
      "grad_norm": 0.072876937687397,
      "kl": 0.3563471883535385,
      "learning_rate": 8.285714285714287e-06,
      "loss": -0.0048,
      "step": 30,
      "step_time": 2.7664619609995498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 7.176153600215912,
      "epoch": 0.00031,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.012421791441738605,
      "kl": 0.19252930209040642,
      "learning_rate": 8.571428571428571e-06,
      "loss": -0.0031,
      "num_tokens": 562344.0,
      "reward": 0.007906697690486908,
      "reward_std": 0.25298231840133667,
      "rewards/rollout_reward_func/mean": 0.007906697690486908,
      "rewards/rollout_reward_func/std": 0.34386366605758667,
      "sampling/importance_sampling_ratio/max": 0.2527497410774231,
      "sampling/importance_sampling_ratio/mean": 0.12306913733482361,
      "sampling/importance_sampling_ratio/min": 1.3953619618263524e-13,
      "sampling/sampling_logp_difference/max": 4.638323783874512,
      "sampling/sampling_logp_difference/mean": 1.436547040939331,
      "step": 31,
      "step_time": 5.2235907870026494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 7.177456617355347,
      "epoch": 0.00032,
      "grad_norm": 0.011528298258781433,
      "kl": 0.19802813418209553,
      "learning_rate": 8.857142857142858e-06,
      "loss": -0.0031,
      "step": 32,
      "step_time": 2.7817133249991457
    },
    {
      "clip_ratio/high_max": 0.013888888992369175,
      "clip_ratio/high_mean": 0.0069444444961845875,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0069444444961845875,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 7.5588818192481995,
      "epoch": 0.00033,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.019391389563679695,
      "kl": 0.19253090023994446,
      "learning_rate": 9.142857142857144e-06,
      "loss": -0.0068,
      "num_tokens": 596903.0,
      "reward": -0.23663240671157837,
      "reward_std": 0.7337036728858948,
      "rewards/rollout_reward_func/mean": -0.23663240671157837,
      "rewards/rollout_reward_func/std": 0.7588505148887634,
      "sampling/importance_sampling_ratio/max": 0.24936437606811523,
      "sampling/importance_sampling_ratio/mean": 0.09548068046569824,
      "sampling/importance_sampling_ratio/min": 2.8611614813489616e-11,
      "sampling/sampling_logp_difference/max": 4.682908058166504,
      "sampling/sampling_logp_difference/mean": 1.4331482648849487,
      "step": 33,
      "step_time": 5.650446067998928
    },
    {
      "clip_ratio/high_max": 0.045138888992369175,
      "clip_ratio/high_mean": 0.022569444496184587,
      "clip_ratio/low_mean": 0.015625,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.03819444449618459,
      "entropy": 7.561095893383026,
      "epoch": 0.00034,
      "grad_norm": 0.0243048295378685,
      "kl": 0.2237775418907404,
      "learning_rate": 9.42857142857143e-06,
      "loss": -0.0068,
      "step": 34,
      "step_time": 3.267381875997671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 7.21353954076767,
      "epoch": 0.00035,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.03242993354797363,
      "kl": 0.23187845014035702,
      "learning_rate": 9.714285714285715e-06,
      "loss": -0.0051,
      "num_tokens": 630878.0,
      "reward": 0.07425153255462646,
      "reward_std": 0.39872053265571594,
      "rewards/rollout_reward_func/mean": 0.07425153255462646,
      "rewards/rollout_reward_func/std": 0.5568375587463379,
      "sampling/importance_sampling_ratio/max": 0.26548439264297485,
      "sampling/importance_sampling_ratio/mean": 0.12363378703594208,
      "sampling/importance_sampling_ratio/min": 0.00769618758931756,
      "sampling/sampling_logp_difference/max": 3.031831979751587,
      "sampling/sampling_logp_difference/mean": 1.3642754554748535,
      "step": 35,
      "step_time": 5.206078858000183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 7.159900069236755,
      "epoch": 0.00036,
      "grad_norm": 0.0284445621073246,
      "kl": 0.249108312651515,
      "learning_rate": 1e-05,
      "loss": -0.0052,
      "step": 36,
      "step_time": 2.761536221001734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 5.0,
      "completions/mean_length": 2.53125,
      "completions/mean_terminated_length": 2.096774101257324,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 6.710843145847321,
      "epoch": 0.00037,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.012415582314133644,
      "kl": 0.21114499121904373,
      "learning_rate": 9.9999999995372e-06,
      "loss": -0.0052,
      "num_tokens": 668297.0,
      "reward": -0.05626409500837326,
      "reward_std": 0.34207823872566223,
      "rewards/rollout_reward_func/mean": -0.05626409500837326,
      "rewards/rollout_reward_func/std": 0.40576615929603577,
      "sampling/importance_sampling_ratio/max": 0.28296810388565063,
      "sampling/importance_sampling_ratio/mean": 0.16521048545837402,
      "sampling/importance_sampling_ratio/min": 7.181252052973312e-15,
      "sampling/sampling_logp_difference/max": 3.9721055030822754,
      "sampling/sampling_logp_difference/mean": 1.2990117073059082,
      "step": 37,
      "step_time": 5.382229439999719
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "entropy": 6.636479735374451,
      "epoch": 0.00038,
      "grad_norm": 0.011466137133538723,
      "kl": 0.22695374488830566,
      "learning_rate": 9.999999998148802e-06,
      "loss": -0.0053,
      "step": 38,
      "step_time": 2.8060927069982426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.019097222248092294,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.019097222248092294,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 5.0,
      "completions/mean_length": 3.40625,
      "completions/mean_terminated_length": 2.1034481525421143,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 6.852519392967224,
      "epoch": 0.00039,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.024172235280275345,
      "kl": 0.25472032092511654,
      "learning_rate": 9.999999995834804e-06,
      "loss": -0.009,
      "num_tokens": 704978.0,
      "reward": -0.23096521198749542,
      "reward_std": 0.4871903955936432,
      "rewards/rollout_reward_func/mean": -0.23096521198749542,
      "rewards/rollout_reward_func/std": 0.5004134774208069,
      "sampling/importance_sampling_ratio/max": 0.2927253544330597,
      "sampling/importance_sampling_ratio/mean": 0.1400977373123169,
      "sampling/importance_sampling_ratio/min": 3.20386295618591e-13,
      "sampling/sampling_logp_difference/max": 4.607293128967285,
      "sampling/sampling_logp_difference/mean": 1.322880744934082,
      "step": 39,
      "step_time": 5.8398218920001455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0034722222480922937,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0034722222480922937,
      "entropy": 6.7819119691848755,
      "epoch": 0.0004,
      "grad_norm": 0.023572752252221107,
      "kl": 0.2616057936102152,
      "learning_rate": 9.999999992595207e-06,
      "loss": -0.0092,
      "step": 40,
      "step_time": 2.772617318999437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.03125,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.03125,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 6.629358530044556,
      "epoch": 0.00041,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.021721255034208298,
      "kl": 0.3800278306007385,
      "learning_rate": 9.999999988430008e-06,
      "loss": -0.0068,
      "num_tokens": 739943.0,
      "reward": -0.0039390698075294495,
      "reward_std": 0.5532426238059998,
      "rewards/rollout_reward_func/mean": -0.0039390698075294495,
      "rewards/rollout_reward_func/std": 0.6965489983558655,
      "sampling/importance_sampling_ratio/max": 0.31811973452568054,
      "sampling/importance_sampling_ratio/mean": 0.16932719945907593,
      "sampling/importance_sampling_ratio/min": 2.988759240096783e-11,
      "sampling/sampling_logp_difference/max": 3.9750123023986816,
      "sampling/sampling_logp_difference/mean": 1.231735110282898,
      "step": 41,
      "step_time": 5.137522204998277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 6.514784038066864,
      "epoch": 0.00042,
      "grad_norm": 0.016385503113269806,
      "kl": 0.3676511310040951,
      "learning_rate": 9.999999983339212e-06,
      "loss": -0.0069,
      "step": 42,
      "step_time": 2.7499034580005173
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 6.131894528865814,
      "epoch": 0.00043,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.021295806393027306,
      "kl": 0.3743965122848749,
      "learning_rate": 9.999999977322818e-06,
      "loss": -0.0082,
      "num_tokens": 775585.0,
      "reward": -0.1512046605348587,
      "reward_std": 0.5463583469390869,
      "rewards/rollout_reward_func/mean": -0.1512046605348587,
      "rewards/rollout_reward_func/std": 0.6555477976799011,
      "sampling/importance_sampling_ratio/max": 0.33739665150642395,
      "sampling/importance_sampling_ratio/mean": 0.2010621726512909,
      "sampling/importance_sampling_ratio/min": 0.013526393100619316,
      "sampling/sampling_logp_difference/max": 2.7704052925109863,
      "sampling/sampling_logp_difference/mean": 1.0442215204238892,
      "step": 43,
      "step_time": 4.971326774000772
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "entropy": 6.021390020847321,
      "epoch": 0.00044,
      "grad_norm": 0.024328157305717468,
      "kl": 0.3970394376665354,
      "learning_rate": 9.999999970380822e-06,
      "loss": -0.0084,
      "step": 44,
      "step_time": 2.748106813000959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 5.561417460441589,
      "epoch": 0.00045,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.0709402859210968,
      "kl": 0.5104918628931046,
      "learning_rate": 9.999999962513228e-06,
      "loss": -0.0063,
      "num_tokens": 813291.0,
      "reward": 0.015873141586780548,
      "reward_std": 0.19914725422859192,
      "rewards/rollout_reward_func/mean": 0.015873141586780548,
      "rewards/rollout_reward_func/std": 0.27820026874542236,
      "sampling/importance_sampling_ratio/max": 0.35048311948776245,
      "sampling/importance_sampling_ratio/mean": 0.24818521738052368,
      "sampling/importance_sampling_ratio/min": 1.603953254936119e-11,
      "sampling/sampling_logp_difference/max": 2.631711959838867,
      "sampling/sampling_logp_difference/mean": 0.938794732093811,
      "step": 45,
      "step_time": 5.729797389999476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 5.534043729305267,
      "epoch": 0.00046,
      "grad_norm": 0.07074378430843353,
      "kl": 0.5150652844458818,
      "learning_rate": 9.999999953720035e-06,
      "loss": -0.0064,
      "step": 46,
      "step_time": 2.7925665359998675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.875,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 6.189878046512604,
      "epoch": 0.00047,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.018252786248922348,
      "kl": 0.3964756764471531,
      "learning_rate": 9.99999994400124e-06,
      "loss": -0.0065,
      "num_tokens": 850551.0,
      "reward": -0.0004236232489347458,
      "reward_std": 0.22690175473690033,
      "rewards/rollout_reward_func/mean": -0.0004236232489347458,
      "rewards/rollout_reward_func/std": 0.31507667899131775,
      "sampling/importance_sampling_ratio/max": 0.36628416180610657,
      "sampling/importance_sampling_ratio/mean": 0.19846494495868683,
      "sampling/importance_sampling_ratio/min": 1.2659386039448606e-10,
      "sampling/sampling_logp_difference/max": 4.142038822174072,
      "sampling/sampling_logp_difference/mean": 1.0495840311050415,
      "step": 47,
      "step_time": 5.31281194299936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 6.272528171539307,
      "epoch": 0.00048,
      "grad_norm": 0.01557189505547285,
      "kl": 0.37549079209566116,
      "learning_rate": 9.999999933356848e-06,
      "loss": -0.0065,
      "step": 48,
      "step_time": 2.7983943749995888
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.4375,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 5.748881280422211,
      "epoch": 0.00049,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.02466733008623123,
      "kl": 0.4844143260270357,
      "learning_rate": 9.999999921786855e-06,
      "loss": -0.0128,
      "num_tokens": 888151.0,
      "reward": -0.03596695885062218,
      "reward_std": 0.2966480255126953,
      "rewards/rollout_reward_func/mean": -0.03596695885062218,
      "rewards/rollout_reward_func/std": 0.41368940472602844,
      "sampling/importance_sampling_ratio/max": 0.3809582591056824,
      "sampling/importance_sampling_ratio/mean": 0.24179524183273315,
      "sampling/importance_sampling_ratio/min": 1.7608540181512922e-09,
      "sampling/sampling_logp_difference/max": 3.226844310760498,
      "sampling/sampling_logp_difference/mean": 1.0408389568328857,
      "step": 49,
      "step_time": 5.312362946001485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.03125,
      "clip_ratio/low_min": 0.03125,
      "clip_ratio/region_mean": 0.03125,
      "entropy": 5.748369365930557,
      "epoch": 0.0005,
      "grad_norm": 0.022601323202252388,
      "kl": 0.5012617446482182,
      "learning_rate": 9.999999909291265e-06,
      "loss": -0.013,
      "step": 50,
      "step_time": 3.2023114370003896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 5.002339065074921,
      "epoch": 0.00051,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.017942914739251137,
      "kl": 0.3239247240126133,
      "learning_rate": 9.999999895870075e-06,
      "loss": -0.0091,
      "num_tokens": 926183.0,
      "reward": 0.09451328217983246,
      "reward_std": 0.026711096987128258,
      "rewards/rollout_reward_func/mean": 0.09451328217983246,
      "rewards/rollout_reward_func/std": 0.038156598806381226,
      "sampling/importance_sampling_ratio/max": 0.39242714643478394,
      "sampling/importance_sampling_ratio/mean": 0.2970275282859802,
      "sampling/importance_sampling_ratio/min": 0.010224375873804092,
      "sampling/sampling_logp_difference/max": 3.037482500076294,
      "sampling/sampling_logp_difference/mean": 0.8098665475845337,
      "step": 51,
      "step_time": 5.696433805002016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 5.001863986253738,
      "epoch": 0.00052,
      "grad_norm": 0.016663307324051857,
      "kl": 0.3272556010633707,
      "learning_rate": 9.999999881523285e-06,
      "loss": -0.0093,
      "step": 52,
      "step_time": 2.7697423050003636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 2.0,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 6.3582839369773865,
      "epoch": 0.00053,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04750742390751839,
      "kl": 0.5478413961827755,
      "learning_rate": 9.999999866250896e-06,
      "loss": -0.0016,
      "num_tokens": 960803.0,
      "reward": -0.5398619174957275,
      "reward_std": 0.47637394070625305,
      "rewards/rollout_reward_func/mean": -0.5398619174957275,
      "rewards/rollout_reward_func/std": 0.5747888684272766,
      "sampling/importance_sampling_ratio/max": 0.4040625989437103,
      "sampling/importance_sampling_ratio/mean": 0.21569415926933289,
      "sampling/importance_sampling_ratio/min": 0.005049743689596653,
      "sampling/sampling_logp_difference/max": 3.7417404651641846,
      "sampling/sampling_logp_difference/mean": 1.302799940109253,
      "step": 53,
      "step_time": 5.149267007999697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 6.336145222187042,
      "epoch": 0.00054,
      "grad_norm": 0.038806021213531494,
      "kl": 0.5272092521190643,
      "learning_rate": 9.999999850052909e-06,
      "loss": -0.0016,
      "step": 54,
      "step_time": 2.742318255999635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 4.0,
      "completions/mean_length": 2.5,
      "completions/mean_terminated_length": 2.064516067504883,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 5.6335746347904205,
      "epoch": 0.00055,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.029065297916531563,
      "kl": 0.5286761000752449,
      "learning_rate": 9.99999983292932e-06,
      "loss": -0.0135,
      "num_tokens": 998217.0,
      "reward": -0.1721905618906021,
      "reward_std": 0.5463275909423828,
      "rewards/rollout_reward_func/mean": -0.1721905618906021,
      "rewards/rollout_reward_func/std": 0.5355534553527832,
      "sampling/importance_sampling_ratio/max": 0.41675877571105957,
      "sampling/importance_sampling_ratio/mean": 0.2475878894329071,
      "sampling/importance_sampling_ratio/min": 2.238332399429055e-07,
      "sampling/sampling_logp_difference/max": 3.269535541534424,
      "sampling/sampling_logp_difference/mean": 1.0780588388442993,
      "step": 55,
      "step_time": 5.403120343999035
    },
    {
      "clip_ratio/high_max": 0.03125,
      "clip_ratio/high_mean": 0.015625,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.015625,
      "entropy": 5.5938030779361725,
      "epoch": 0.00056,
      "grad_norm": 0.02255568467080593,
      "kl": 0.4606229439377785,
      "learning_rate": 9.999999814880132e-06,
      "loss": -0.0136,
      "step": 56,
      "step_time": 3.2759828810003455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 2.133333444595337,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 5.78191477060318,
      "epoch": 0.00057,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.01589336432516575,
      "kl": 0.42849645018577576,
      "learning_rate": 9.999999795905347e-06,
      "loss": -0.009,
      "num_tokens": 1031529.0,
      "reward": -0.07622712850570679,
      "reward_std": 0.6089849472045898,
      "rewards/rollout_reward_func/mean": -0.07622712850570679,
      "rewards/rollout_reward_func/std": 1.0020984411239624,
      "sampling/importance_sampling_ratio/max": 0.425922691822052,
      "sampling/importance_sampling_ratio/mean": 0.229965940117836,
      "sampling/importance_sampling_ratio/min": 6.920892747785956e-09,
      "sampling/sampling_logp_difference/max": 3.239529609680176,
      "sampling/sampling_logp_difference/mean": 1.154854416847229,
      "step": 57,
      "step_time": 5.6268242130017825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 5.738904058933258,
      "epoch": 0.00058,
      "grad_norm": 0.013298124074935913,
      "kl": 0.3968820311129093,
      "learning_rate": 9.999999776004962e-06,
      "loss": -0.009,
      "step": 58,
      "step_time": 2.7349948540004334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 16.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 3.21875,
      "completions/mean_terminated_length": 2.366666793823242,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "entropy": 6.8580867648124695,
      "epoch": 0.00059,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.013808239251375198,
      "kl": 0.29788459837436676,
      "learning_rate": 9.999999755178978e-06,
      "loss": -0.0089,
      "num_tokens": 1065697.0,
      "reward": -0.1516963541507721,
      "reward_std": 0.6373498439788818,
      "rewards/rollout_reward_func/mean": -0.1516963541507721,
      "rewards/rollout_reward_func/std": 0.7994406819343567,
      "sampling/importance_sampling_ratio/max": 0.4323439598083496,
      "sampling/importance_sampling_ratio/mean": 0.16014467179775238,
      "sampling/importance_sampling_ratio/min": 3.285566610444768e-11,
      "sampling/sampling_logp_difference/max": 3.304318428039551,
      "sampling/sampling_logp_difference/mean": 1.2896037101745605,
      "step": 59,
      "step_time": 5.172425778999241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "entropy": 6.81160169839859,
      "epoch": 0.0006,
      "grad_norm": 0.012560456059873104,
      "kl": 0.2846912257373333,
      "learning_rate": 9.999999733427394e-06,
      "loss": -0.0089,
      "step": 60,
      "step_time": 2.6920222089984236
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 200000,
  "num_input_tokens_seen": 1065697,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}