File size: 102,588 Bytes
4a369f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.0015,
  "eval_steps": 500,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.04213663242626353,
      "epoch": 2e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.36506643891334534,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0001,
      "num_tokens": 34720.0,
      "reward": -0.4953559935092926,
      "reward_std": 0.10753969848155975,
      "rewards/rollout_reward_func/mean": -0.4953559935092926,
      "rewards/rollout_reward_func/std": 0.11737043410539627,
      "sampling/importance_sampling_ratio/max": 1.01706862449646,
      "sampling/importance_sampling_ratio/mean": 0.9976009130477905,
      "sampling/importance_sampling_ratio/min": 0.8514507412910461,
      "sampling/sampling_logp_difference/max": 0.160813570022583,
      "sampling/sampling_logp_difference/mean": 0.0033812490291893482,
      "step": 1,
      "step_time": 10.583001638000042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.056682895723497495,
      "epoch": 4e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.230589509010315,
      "kl": 0.0,
      "learning_rate": 2.8571428571428573e-06,
      "loss": -0.0012,
      "num_tokens": 72329.0,
      "reward": -0.4865216612815857,
      "reward_std": 0.08774720132350922,
      "rewards/rollout_reward_func/mean": -0.4865216612815857,
      "rewards/rollout_reward_func/std": 0.1045704185962677,
      "sampling/importance_sampling_ratio/max": 1.5042492151260376,
      "sampling/importance_sampling_ratio/mean": 1.0045233964920044,
      "sampling/importance_sampling_ratio/min": 0.7999074459075928,
      "sampling/sampling_logp_difference/max": 0.4082939624786377,
      "sampling/sampling_logp_difference/mean": 0.010556299239397049,
      "step": 2,
      "step_time": 9.847310764999861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.04655165857911925,
      "epoch": 6e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6786770224571228,
      "kl": 0.0003507627650863876,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 0.0005,
      "num_tokens": 107126.0,
      "reward": -0.4738577902317047,
      "reward_std": 0.14315202832221985,
      "rewards/rollout_reward_func/mean": -0.4738577902317047,
      "rewards/rollout_reward_func/std": 0.14606907963752747,
      "sampling/importance_sampling_ratio/max": 1.2076377868652344,
      "sampling/importance_sampling_ratio/mean": 1.0011494159698486,
      "sampling/importance_sampling_ratio/min": 0.8115481734275818,
      "sampling/sampling_logp_difference/max": 0.20881152153015137,
      "sampling/sampling_logp_difference/mean": 0.005916388239711523,
      "step": 3,
      "step_time": 9.515110119000042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.06136378643714124,
      "epoch": 8e-05,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5827341079711914,
      "kl": 0.0021565575205606535,
      "learning_rate": 8.571428571428573e-06,
      "loss": 0.0007,
      "num_tokens": 147637.0,
      "reward": -0.42704540491104126,
      "reward_std": 0.1624506115913391,
      "rewards/rollout_reward_func/mean": -0.42704540491104126,
      "rewards/rollout_reward_func/std": 0.17692908644676208,
      "sampling/importance_sampling_ratio/max": 1.2322131395339966,
      "sampling/importance_sampling_ratio/mean": 1.0034774541854858,
      "sampling/importance_sampling_ratio/min": 0.7550826072692871,
      "sampling/sampling_logp_difference/max": 0.2809281349182129,
      "sampling/sampling_logp_difference/mean": 0.013284817337989807,
      "step": 4,
      "step_time": 10.327476732999912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.06213088113145204,
      "epoch": 0.0001,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1783960461616516,
      "kl": 0.010104762356288433,
      "learning_rate": 1.1428571428571429e-05,
      "loss": 0.0004,
      "num_tokens": 183808.0,
      "reward": -0.5187968015670776,
      "reward_std": 0.10742770880460739,
      "rewards/rollout_reward_func/mean": -0.5187968015670776,
      "rewards/rollout_reward_func/std": 0.114708311855793,
      "sampling/importance_sampling_ratio/max": 1.5868523120880127,
      "sampling/importance_sampling_ratio/mean": 1.0063602924346924,
      "sampling/importance_sampling_ratio/min": 0.8623139262199402,
      "sampling/sampling_logp_difference/max": 0.46175241470336914,
      "sampling/sampling_logp_difference/mean": 0.009860638529062271,
      "step": 5,
      "step_time": 9.938454288000116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.083112089203496,
      "epoch": 0.00012,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.643200635910034,
      "kl": 0.04417062703578267,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 0.0007,
      "num_tokens": 221524.0,
      "reward": -0.526121973991394,
      "reward_std": 0.16247841715812683,
      "rewards/rollout_reward_func/mean": -0.526121973991394,
      "rewards/rollout_reward_func/std": 0.16897279024124146,
      "sampling/importance_sampling_ratio/max": 1.3004112243652344,
      "sampling/importance_sampling_ratio/mean": 0.9950670003890991,
      "sampling/importance_sampling_ratio/min": 0.6120707392692566,
      "sampling/sampling_logp_difference/max": 0.4909074306488037,
      "sampling/sampling_logp_difference/mean": 0.01979595422744751,
      "step": 6,
      "step_time": 11.122546247000173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.10465507118351525,
      "epoch": 0.00014,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.6755809783935547,
      "kl": 0.06485261598226089,
      "learning_rate": 1.7142857142857145e-05,
      "loss": 0.0006,
      "num_tokens": 262058.0,
      "reward": -0.45354732871055603,
      "reward_std": 0.11865675449371338,
      "rewards/rollout_reward_func/mean": -0.45354732871055603,
      "rewards/rollout_reward_func/std": 0.1317674219608307,
      "sampling/importance_sampling_ratio/max": 1.3646963834762573,
      "sampling/importance_sampling_ratio/mean": 0.9980586767196655,
      "sampling/importance_sampling_ratio/min": 0.7788010835647583,
      "sampling/sampling_logp_difference/max": 0.310931921005249,
      "sampling/sampling_logp_difference/mean": 0.013520372100174427,
      "step": 7,
      "step_time": 10.180680208999888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.06767257605679333,
      "epoch": 0.00016,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7358646392822266,
      "kl": 0.10022372181984451,
      "learning_rate": 2e-05,
      "loss": -0.0003,
      "num_tokens": 300372.0,
      "reward": -0.438113808631897,
      "reward_std": 0.1628333181142807,
      "rewards/rollout_reward_func/mean": -0.438113808631897,
      "rewards/rollout_reward_func/std": 0.18601371347904205,
      "sampling/importance_sampling_ratio/max": 1.5303767919540405,
      "sampling/importance_sampling_ratio/mean": 1.010486364364624,
      "sampling/importance_sampling_ratio/min": 0.5898699164390564,
      "sampling/sampling_logp_difference/max": 0.52785325050354,
      "sampling/sampling_logp_difference/mean": 0.026673050597310066,
      "step": 8,
      "step_time": 10.86024607800016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.010416666977107525,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.05736969155987026,
      "epoch": 0.00018,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4610992968082428,
      "kl": 0.14014234533533454,
      "learning_rate": 2.2857142857142858e-05,
      "loss": 0.0011,
      "num_tokens": 340975.0,
      "reward": -0.5177702307701111,
      "reward_std": 0.10829603672027588,
      "rewards/rollout_reward_func/mean": -0.5177702307701111,
      "rewards/rollout_reward_func/std": 0.11044981330633163,
      "sampling/importance_sampling_ratio/max": 1.7585923671722412,
      "sampling/importance_sampling_ratio/mean": 1.0206780433654785,
      "sampling/importance_sampling_ratio/min": 0.9417206048965454,
      "sampling/sampling_logp_difference/max": 0.5645136833190918,
      "sampling/sampling_logp_difference/mean": 0.019472315907478333,
      "step": 9,
      "step_time": 11.569765732000178
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.05415748237646767,
      "epoch": 0.0002,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.42527446150779724,
      "kl": 0.15265877915955645,
      "learning_rate": 2.5714285714285714e-05,
      "loss": 0.0002,
      "num_tokens": 380815.0,
      "reward": -0.5060328245162964,
      "reward_std": 0.12003964185714722,
      "rewards/rollout_reward_func/mean": -0.5060328245162964,
      "rewards/rollout_reward_func/std": 0.13507631421089172,
      "sampling/importance_sampling_ratio/max": 1.2756245136260986,
      "sampling/importance_sampling_ratio/mean": 1.0032804012298584,
      "sampling/importance_sampling_ratio/min": 0.8109822869300842,
      "sampling/sampling_logp_difference/max": 0.24343585968017578,
      "sampling/sampling_logp_difference/mean": 0.010368866845965385,
      "step": 10,
      "step_time": 11.035615327000073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.03652133769719512,
      "epoch": 0.00022,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.7140417098999023,
      "kl": 0.5853048088065407,
      "learning_rate": 2.857142857142857e-05,
      "loss": 0.0007,
      "num_tokens": 419201.0,
      "reward": -0.49228060245513916,
      "reward_std": 0.1299666315317154,
      "rewards/rollout_reward_func/mean": -0.49228060245513916,
      "rewards/rollout_reward_func/std": 0.1354321837425232,
      "sampling/importance_sampling_ratio/max": 1.192307949066162,
      "sampling/importance_sampling_ratio/mean": 0.9962027668952942,
      "sampling/importance_sampling_ratio/min": 0.18285171687602997,
      "sampling/sampling_logp_difference/max": 1.6990797519683838,
      "sampling/sampling_logp_difference/mean": 0.022965161129832268,
      "step": 11,
      "step_time": 12.10885376400006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.010421836544992402,
      "epoch": 0.00024,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0725681409239769,
      "kl": 0.09164208328971435,
      "learning_rate": 3.142857142857143e-05,
      "loss": 0.0002,
      "num_tokens": 455342.0,
      "reward": -0.42868927121162415,
      "reward_std": 0.1808679699897766,
      "rewards/rollout_reward_func/mean": -0.42868927121162415,
      "rewards/rollout_reward_func/std": 0.1826806664466858,
      "sampling/importance_sampling_ratio/max": 1.0040708780288696,
      "sampling/importance_sampling_ratio/mean": 0.9997765421867371,
      "sampling/importance_sampling_ratio/min": 0.9736456274986267,
      "sampling/sampling_logp_difference/max": 0.026707857847213745,
      "sampling/sampling_logp_difference/mean": 0.0005151446675881743,
      "step": 12,
      "step_time": 11.46827613999983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.005160837485163938,
      "epoch": 0.00026,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.003414501203224063,
      "kl": 0.054030168646306775,
      "learning_rate": 3.428571428571429e-05,
      "loss": 0.0001,
      "num_tokens": 487838.0,
      "reward": -0.5129345655441284,
      "reward_std": 0.09682037681341171,
      "rewards/rollout_reward_func/mean": -0.5129345655441284,
      "rewards/rollout_reward_func/std": 0.10943721234798431,
      "sampling/importance_sampling_ratio/max": 1.0242013931274414,
      "sampling/importance_sampling_ratio/mean": 1.0002690553665161,
      "sampling/importance_sampling_ratio/min": 0.9995214939117432,
      "sampling/sampling_logp_difference/max": 0.023913156241178513,
      "sampling/sampling_logp_difference/mean": 0.00029178019030950963,
      "step": 13,
      "step_time": 13.078657843000315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0049433065178163815,
      "epoch": 0.00028,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.006202552933245897,
      "kl": 0.11296933640962958,
      "learning_rate": 3.7142857142857143e-05,
      "loss": 0.0002,
      "num_tokens": 523251.0,
      "reward": -0.4549490213394165,
      "reward_std": 0.1518395096063614,
      "rewards/rollout_reward_func/mean": -0.4549490213394165,
      "rewards/rollout_reward_func/std": 0.15026704967021942,
      "sampling/importance_sampling_ratio/max": 1.0146245956420898,
      "sampling/importance_sampling_ratio/mean": 1.0001541376113892,
      "sampling/importance_sampling_ratio/min": 0.9956898093223572,
      "sampling/sampling_logp_difference/max": 0.014518730342388153,
      "sampling/sampling_logp_difference/mean": 0.0002873566118068993,
      "step": 14,
      "step_time": 11.818882993999978
    },
    {
      "clip_ratio/high_max": 0.02083333395421505,
      "clip_ratio/high_mean": 0.010416666977107525,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.010416666977107525,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.05449013704492245,
      "epoch": 0.0003,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.2176028490066528,
      "kl": 0.3055893306495818,
      "learning_rate": 4e-05,
      "loss": 0.0004,
      "num_tokens": 554972.0,
      "reward": -0.4694768786430359,
      "reward_std": 0.1240941733121872,
      "rewards/rollout_reward_func/mean": -0.4694768786430359,
      "rewards/rollout_reward_func/std": 0.13716520369052887,
      "sampling/importance_sampling_ratio/max": 2.6555840969085693,
      "sampling/importance_sampling_ratio/mean": 0.9897008538246155,
      "sampling/importance_sampling_ratio/min": 0.24389615654945374,
      "sampling/sampling_logp_difference/max": 1.4110127687454224,
      "sampling/sampling_logp_difference/mean": 0.05285795032978058,
      "step": 15,
      "step_time": 12.903524508000032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.02083333395421505,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.02083333395421505,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.029744337291049305,
      "epoch": 0.00032,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5681190490722656,
      "kl": 1.5352033322367333,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 0.0024,
      "num_tokens": 591125.0,
      "reward": -0.49898892641067505,
      "reward_std": 0.1363566815853119,
      "rewards/rollout_reward_func/mean": -0.49898892641067505,
      "rewards/rollout_reward_func/std": 0.17710748314857483,
      "sampling/importance_sampling_ratio/max": 1.0899990797042847,
      "sampling/importance_sampling_ratio/mean": 0.9730724692344666,
      "sampling/importance_sampling_ratio/min": 0.11985374242067337,
      "sampling/sampling_logp_difference/max": 2.121483087539673,
      "sampling/sampling_logp_difference/mean": 0.06464457511901855,
      "step": 16,
      "step_time": 10.869735754999738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.005307975381583674,
      "epoch": 0.00034,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.022409122437238693,
      "kl": 2.8874126337468624,
      "learning_rate": 4.5714285714285716e-05,
      "loss": 0.0053,
      "num_tokens": 624998.0,
      "reward": -0.42809420824050903,
      "reward_std": 0.10570663213729858,
      "rewards/rollout_reward_func/mean": -0.42809420824050903,
      "rewards/rollout_reward_func/std": 0.11102946847677231,
      "sampling/importance_sampling_ratio/max": 1.113768219947815,
      "sampling/importance_sampling_ratio/mean": 1.001915454864502,
      "sampling/importance_sampling_ratio/min": 0.9997419118881226,
      "sampling/sampling_logp_difference/max": 0.10774913430213928,
      "sampling/sampling_logp_difference/mean": 0.0018536553252488375,
      "step": 17,
      "step_time": 10.573343859999682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0018984277339768596,
      "epoch": 0.00036,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.001003155717626214,
      "kl": 2.4075989934303834,
      "learning_rate": 4.8571428571428576e-05,
      "loss": 0.0045,
      "num_tokens": 660422.0,
      "reward": -0.4815124273300171,
      "reward_std": 0.17238172888755798,
      "rewards/rollout_reward_func/mean": -0.4815124273300171,
      "rewards/rollout_reward_func/std": 0.17162199318408966,
      "sampling/importance_sampling_ratio/max": 1.0038193464279175,
      "sampling/importance_sampling_ratio/mean": 1.0001835823059082,
      "sampling/importance_sampling_ratio/min": 0.9999402761459351,
      "sampling/sampling_logp_difference/max": 0.00381203880533576,
      "sampling/sampling_logp_difference/mean": 0.00019016250735148787,
      "step": 18,
      "step_time": 10.092614914000478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.001787107794370968,
      "epoch": 0.00038,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0019859045278280973,
      "kl": 2.131671732679621,
      "learning_rate": 5.142857142857143e-05,
      "loss": 0.004,
      "num_tokens": 695119.0,
      "reward": -0.45420968532562256,
      "reward_std": 0.1451229602098465,
      "rewards/rollout_reward_func/mean": -0.45420968532562256,
      "rewards/rollout_reward_func/std": 0.16084149479866028,
      "sampling/importance_sampling_ratio/max": 1.012938380241394,
      "sampling/importance_sampling_ratio/mean": 1.0002269744873047,
      "sampling/importance_sampling_ratio/min": 0.9999643564224243,
      "sampling/sampling_logp_difference/max": 0.012855470180511475,
      "sampling/sampling_logp_difference/mean": 0.0002334651944693178,
      "step": 19,
      "step_time": 11.423286533999544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0023332082309934776,
      "epoch": 0.0004,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0014479915844276547,
      "kl": 1.2728006265497243,
      "learning_rate": 5.428571428571428e-05,
      "loss": 0.0024,
      "num_tokens": 734861.0,
      "reward": -0.4497176706790924,
      "reward_std": 0.12124527990818024,
      "rewards/rollout_reward_func/mean": -0.4497176706790924,
      "rewards/rollout_reward_func/std": 0.12396564334630966,
      "sampling/importance_sampling_ratio/max": 1.0169790983200073,
      "sampling/importance_sampling_ratio/mean": 1.0002472400665283,
      "sampling/importance_sampling_ratio/min": 0.999942421913147,
      "sampling/sampling_logp_difference/max": 0.01683656871318817,
      "sampling/sampling_logp_difference/mean": 0.00025150534929707646,
      "step": 20,
      "step_time": 10.984022518999609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0017557634673721623,
      "epoch": 0.00042,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008899558335542679,
      "kl": 2.1504041726390426,
      "learning_rate": 5.714285714285714e-05,
      "loss": 0.004,
      "num_tokens": 770251.0,
      "reward": -0.4525429606437683,
      "reward_std": 0.1307295560836792,
      "rewards/rollout_reward_func/mean": -0.4525429606437683,
      "rewards/rollout_reward_func/std": 0.13270536065101624,
      "sampling/importance_sampling_ratio/max": 1.007300615310669,
      "sampling/importance_sampling_ratio/mean": 1.0000929832458496,
      "sampling/importance_sampling_ratio/min": 0.9999077320098877,
      "sampling/sampling_logp_difference/max": 0.007273993454873562,
      "sampling/sampling_logp_difference/mean": 9.861911530606449e-05,
      "step": 21,
      "step_time": 11.750048858000582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0014675547063234262,
      "epoch": 0.00044,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0015787131851539016,
      "kl": 2.142753223429594,
      "learning_rate": 6e-05,
      "loss": 0.004,
      "num_tokens": 804919.0,
      "reward": -0.4373324513435364,
      "reward_std": 0.14926397800445557,
      "rewards/rollout_reward_func/mean": -0.4373324513435364,
      "rewards/rollout_reward_func/std": 0.1527157872915268,
      "sampling/importance_sampling_ratio/max": 1.0027562379837036,
      "sampling/importance_sampling_ratio/mean": 1.0000301599502563,
      "sampling/importance_sampling_ratio/min": 0.9997082352638245,
      "sampling/sampling_logp_difference/max": 0.002752469852566719,
      "sampling/sampling_logp_difference/mean": 4.924799213767983e-05,
      "step": 22,
      "step_time": 10.57756851799968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0016250821427092887,
      "epoch": 0.00046,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.007080785930156708,
      "kl": 2.471247175708413,
      "learning_rate": 6.285714285714286e-05,
      "loss": 0.0046,
      "num_tokens": 840304.0,
      "reward": -0.5088313221931458,
      "reward_std": 0.12063620984554291,
      "rewards/rollout_reward_func/mean": -0.5088313221931458,
      "rewards/rollout_reward_func/std": 0.11942265927791595,
      "sampling/importance_sampling_ratio/max": 1.000611424446106,
      "sampling/importance_sampling_ratio/mean": 0.9999845027923584,
      "sampling/importance_sampling_ratio/min": 0.9976289868354797,
      "sampling/sampling_logp_difference/max": 0.0023738397285342216,
      "sampling/sampling_logp_difference/mean": 5.626135680358857e-05,
      "step": 23,
      "step_time": 11.196098771000152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.002501545510313008,
      "epoch": 0.00048,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002404371974989772,
      "kl": 1.4448929702630267,
      "learning_rate": 6.571428571428571e-05,
      "loss": 0.0027,
      "num_tokens": 878691.0,
      "reward": -0.4859924912452698,
      "reward_std": 0.17074629664421082,
      "rewards/rollout_reward_func/mean": -0.4859924912452698,
      "rewards/rollout_reward_func/std": 0.18796174228191376,
      "sampling/importance_sampling_ratio/max": 1.008969783782959,
      "sampling/importance_sampling_ratio/mean": 1.0001083612442017,
      "sampling/importance_sampling_ratio/min": 0.9983657002449036,
      "sampling/sampling_logp_difference/max": 0.008929748088121414,
      "sampling/sampling_logp_difference/mean": 0.00018690252909436822,
      "step": 24,
      "step_time": 11.436561123000047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.00125838804160594,
      "epoch": 0.0005,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.00037121796049177647,
      "kl": 2.728724977933325,
      "learning_rate": 6.857142857142858e-05,
      "loss": 0.0051,
      "num_tokens": 912630.0,
      "reward": -0.43685033917427063,
      "reward_std": 0.14631888270378113,
      "rewards/rollout_reward_func/mean": -0.43685033917427063,
      "rewards/rollout_reward_func/std": 0.14818531274795532,
      "sampling/importance_sampling_ratio/max": 1.0000602006912231,
      "sampling/importance_sampling_ratio/mean": 0.999987781047821,
      "sampling/importance_sampling_ratio/min": 0.9996044039726257,
      "sampling/sampling_logp_difference/max": 0.000395664683310315,
      "sampling/sampling_logp_difference/mean": 2.7497631890582852e-05,
      "step": 25,
      "step_time": 10.338325488999317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.004701976798969554,
      "epoch": 0.00052,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0168001726269722,
      "kl": 2.670653583481908,
      "learning_rate": 7.142857142857143e-05,
      "loss": 0.005,
      "num_tokens": 945129.0,
      "reward": -0.49788784980773926,
      "reward_std": 0.11309097707271576,
      "rewards/rollout_reward_func/mean": -0.49788784980773926,
      "rewards/rollout_reward_func/std": 0.11171143501996994,
      "sampling/importance_sampling_ratio/max": 1.0177559852600098,
      "sampling/importance_sampling_ratio/mean": 1.0001691579818726,
      "sampling/importance_sampling_ratio/min": 0.9972944855690002,
      "sampling/sampling_logp_difference/max": 0.017600178718566895,
      "sampling/sampling_logp_difference/mean": 0.0002922487910836935,
      "step": 26,
      "step_time": 11.807520030999513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.005813408846734092,
      "epoch": 0.00054,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.04519229754805565,
      "kl": 2.455354232341051,
      "learning_rate": 7.428571428571429e-05,
      "loss": 0.0046,
      "num_tokens": 979857.0,
      "reward": -0.48440974950790405,
      "reward_std": 0.1276542842388153,
      "rewards/rollout_reward_func/mean": -0.48440974950790405,
      "rewards/rollout_reward_func/std": 0.13863544166088104,
      "sampling/importance_sampling_ratio/max": 1.0226891040802002,
      "sampling/importance_sampling_ratio/mean": 1.0000253915786743,
      "sampling/importance_sampling_ratio/min": 0.9920601844787598,
      "sampling/sampling_logp_difference/max": 0.02243557944893837,
      "sampling/sampling_logp_difference/mean": 0.00045728118857368827,
      "step": 27,
      "step_time": 11.074748723000539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.007451985773514025,
      "epoch": 0.00056,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.020447248592972755,
      "kl": 2.786820446451202,
      "learning_rate": 7.714285714285715e-05,
      "loss": 0.0052,
      "num_tokens": 1010868.0,
      "reward": -0.4477247893810272,
      "reward_std": 0.12471087276935577,
      "rewards/rollout_reward_func/mean": -0.4477247893810272,
      "rewards/rollout_reward_func/std": 0.1252235472202301,
      "sampling/importance_sampling_ratio/max": 1.0000356435775757,
      "sampling/importance_sampling_ratio/mean": 0.9992380142211914,
      "sampling/importance_sampling_ratio/min": 0.9429383277893066,
      "sampling/sampling_logp_difference/max": 0.058754414319992065,
      "sampling/sampling_logp_difference/mean": 0.0007906818063929677,
      "step": 28,
      "step_time": 11.204566938999733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.005258000070170965,
      "epoch": 0.00058,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12415958195924759,
      "kl": 1.8602933554793708,
      "learning_rate": 8e-05,
      "loss": 0.0036,
      "num_tokens": 1047027.0,
      "reward": -0.47356218099594116,
      "reward_std": 0.17148526012897491,
      "rewards/rollout_reward_func/mean": -0.47356218099594116,
      "rewards/rollout_reward_func/std": 0.17493870854377747,
      "sampling/importance_sampling_ratio/max": 1.0000348091125488,
      "sampling/importance_sampling_ratio/mean": 0.9999445676803589,
      "sampling/importance_sampling_ratio/min": 0.9985746145248413,
      "sampling/sampling_logp_difference/max": 0.0014263943303376436,
      "sampling/sampling_logp_difference/mean": 6.454815593315288e-05,
      "step": 29,
      "step_time": 10.754735779999919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.005060947762103751,
      "epoch": 0.0006,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.015699295327067375,
      "kl": 3.0101311548302476,
      "learning_rate": 8.285714285714287e-05,
      "loss": 0.0056,
      "num_tokens": 1078040.0,
      "reward": -0.44326093792915344,
      "reward_std": 0.1546049863100052,
      "rewards/rollout_reward_func/mean": -0.44326093792915344,
      "rewards/rollout_reward_func/std": 0.16109547019004822,
      "sampling/importance_sampling_ratio/max": 1.0001112222671509,
      "sampling/importance_sampling_ratio/mean": 0.999657928943634,
      "sampling/importance_sampling_ratio/min": 0.9917479157447815,
      "sampling/sampling_logp_difference/max": 0.00828632339835167,
      "sampling/sampling_logp_difference/mean": 0.00035381870111450553,
      "step": 30,
      "step_time": 11.130074380999304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.010435697484354023,
      "epoch": 0.00062,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.011954938992857933,
      "kl": 2.1385079924175443,
      "learning_rate": 8.571428571428571e-05,
      "loss": 0.004,
      "num_tokens": 1114270.0,
      "reward": -0.5106649994850159,
      "reward_std": 0.15311290323734283,
      "rewards/rollout_reward_func/mean": -0.5106649994850159,
      "rewards/rollout_reward_func/std": 0.16842880845069885,
      "sampling/importance_sampling_ratio/max": 1.0115782022476196,
      "sampling/importance_sampling_ratio/mean": 0.9992225170135498,
      "sampling/importance_sampling_ratio/min": 0.9679325819015503,
      "sampling/sampling_logp_difference/max": 0.03259281814098358,
      "sampling/sampling_logp_difference/mean": 0.001033698208630085,
      "step": 31,
      "step_time": 10.868044704000113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.015111156193597708,
      "epoch": 0.00064,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.07620638608932495,
      "kl": 1.894992141673356,
      "learning_rate": 8.857142857142857e-05,
      "loss": 0.0033,
      "num_tokens": 1148908.0,
      "reward": -0.42877840995788574,
      "reward_std": 0.12491989135742188,
      "rewards/rollout_reward_func/mean": -0.42877840995788574,
      "rewards/rollout_reward_func/std": 0.15547259151935577,
      "sampling/importance_sampling_ratio/max": 1.0025962591171265,
      "sampling/importance_sampling_ratio/mean": 0.997442364692688,
      "sampling/importance_sampling_ratio/min": 0.8747695088386536,
      "sampling/sampling_logp_difference/max": 0.133794903755188,
      "sampling/sampling_logp_difference/mean": 0.0027670767158269882,
      "step": 32,
      "step_time": 12.017423004999955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.03369333760929294,
      "epoch": 0.00066,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15489332377910614,
      "kl": 2.431340977549553,
      "learning_rate": 9.142857142857143e-05,
      "loss": 0.0043,
      "num_tokens": 1182817.0,
      "reward": -0.43676623702049255,
      "reward_std": 0.19219474494457245,
      "rewards/rollout_reward_func/mean": -0.43676623702049255,
      "rewards/rollout_reward_func/std": 0.18783578276634216,
      "sampling/importance_sampling_ratio/max": 1.029738187789917,
      "sampling/importance_sampling_ratio/mean": 0.9952020645141602,
      "sampling/importance_sampling_ratio/min": 0.8523503541946411,
      "sampling/sampling_logp_difference/max": 0.1597576141357422,
      "sampling/sampling_logp_difference/mean": 0.005681644193828106,
      "step": 33,
      "step_time": 11.107914947999916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.08210996998241171,
      "epoch": 0.00068,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.0649993419647217,
      "kl": 2.27309197653085,
      "learning_rate": 9.428571428571429e-05,
      "loss": 0.0194,
      "num_tokens": 1216117.0,
      "reward": -0.5223650932312012,
      "reward_std": 0.15171676874160767,
      "rewards/rollout_reward_func/mean": -0.5223650932312012,
      "rewards/rollout_reward_func/std": 0.14959992468357086,
      "sampling/importance_sampling_ratio/max": 2.7629706859588623,
      "sampling/importance_sampling_ratio/mean": 1.0301706790924072,
      "sampling/importance_sampling_ratio/min": 0.6142791509628296,
      "sampling/sampling_logp_difference/max": 1.0163064002990723,
      "sampling/sampling_logp_difference/mean": 0.04744146391749382,
      "step": 34,
      "step_time": 11.677880323999716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0031382560227939393,
      "epoch": 0.0007,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.008347594179213047,
      "kl": 2.0830320498597246,
      "learning_rate": 9.714285714285715e-05,
      "loss": 0.0039,
      "num_tokens": 1249382.0,
      "reward": -0.45180174708366394,
      "reward_std": 0.19802165031433105,
      "rewards/rollout_reward_func/mean": -0.45180174708366394,
      "rewards/rollout_reward_func/std": 0.19667227566242218,
      "sampling/importance_sampling_ratio/max": 1.0000340938568115,
      "sampling/importance_sampling_ratio/mean": 0.9999133348464966,
      "sampling/importance_sampling_ratio/min": 0.997832179069519,
      "sampling/sampling_logp_difference/max": 0.002170148305594921,
      "sampling/sampling_logp_difference/mean": 9.547994704917073e-05,
      "step": 35,
      "step_time": 10.335512492000134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.010472146794199944,
      "epoch": 0.00072,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0902613028883934,
      "kl": 2.3452741815708578,
      "learning_rate": 0.0001,
      "loss": 0.0043,
      "num_tokens": 1281161.0,
      "reward": -0.4157106280326843,
      "reward_std": 0.1772732436656952,
      "rewards/rollout_reward_func/mean": -0.4157106280326843,
      "rewards/rollout_reward_func/std": 0.1765340268611908,
      "sampling/importance_sampling_ratio/max": 1.0063142776489258,
      "sampling/importance_sampling_ratio/mean": 0.9980408549308777,
      "sampling/importance_sampling_ratio/min": 0.8053573369979858,
      "sampling/sampling_logp_difference/max": 0.21646922826766968,
      "sampling/sampling_logp_difference/mean": 0.0023312268313020468,
      "step": 36,
      "step_time": 11.72868796600028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0007935629546409473,
      "epoch": 0.00074,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.5115637223934755e-05,
      "kl": 2.979110558206912,
      "learning_rate": 9.999942030039711e-05,
      "loss": 0.0056,
      "num_tokens": 1309941.0,
      "reward": -0.4328516721725464,
      "reward_std": 0.11836086213588715,
      "rewards/rollout_reward_func/mean": -0.4328516721725464,
      "rewards/rollout_reward_func/std": 0.12694095075130463,
      "sampling/importance_sampling_ratio/max": 1.0000559091567993,
      "sampling/importance_sampling_ratio/mean": 1.0000051259994507,
      "sampling/importance_sampling_ratio/min": 0.9999697208404541,
      "sampling/sampling_logp_difference/max": 5.5903590691741556e-05,
      "sampling/sampling_logp_difference/mean": 6.916895472386386e-06,
      "step": 37,
      "step_time": 10.544788530000005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0007049098967399914,
      "epoch": 0.00076,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.5064314538904e-05,
      "kl": 3.3144653998315334,
      "learning_rate": 9.999768121951115e-05,
      "loss": 0.0062,
      "num_tokens": 1340982.0,
      "reward": -0.4486263394355774,
      "reward_std": 0.18141409754753113,
      "rewards/rollout_reward_func/mean": -0.4486263394355774,
      "rewards/rollout_reward_func/std": 0.18459345400333405,
      "sampling/importance_sampling_ratio/max": 1.0000298023223877,
      "sampling/importance_sampling_ratio/mean": 1.0000050067901611,
      "sampling/importance_sampling_ratio/min": 0.9999734163284302,
      "sampling/sampling_logp_difference/max": 2.9786722734570503e-05,
      "sampling/sampling_logp_difference/mean": 6.072340511309449e-06,
      "step": 38,
      "step_time": 10.22810908700012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.002045353794528637,
      "epoch": 0.00078,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.02569839172065258,
      "kl": 2.3597085239986058,
      "learning_rate": 9.999478281110987e-05,
      "loss": 0.0045,
      "num_tokens": 1374938.0,
      "reward": -0.4885009527206421,
      "reward_std": 0.12055516242980957,
      "rewards/rollout_reward_func/mean": -0.4885009527206421,
      "rewards/rollout_reward_func/std": 0.12474851310253143,
      "sampling/importance_sampling_ratio/max": 1.0000395774841309,
      "sampling/importance_sampling_ratio/mean": 0.9997438788414001,
      "sampling/importance_sampling_ratio/min": 0.9747907519340515,
      "sampling/sampling_logp_difference/max": 0.025532402098178864,
      "sampling/sampling_logp_difference/mean": 0.00027372626936994493,
      "step": 39,
      "step_time": 10.025046984000483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.001170793577330187,
      "epoch": 0.0008,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0011198727879673243,
      "kl": 2.575058995746076,
      "learning_rate": 9.999072516480423e-05,
      "loss": 0.0048,
      "num_tokens": 1405986.0,
      "reward": -0.49351146817207336,
      "reward_std": 0.11518004536628723,
      "rewards/rollout_reward_func/mean": -0.49351146817207336,
      "rewards/rollout_reward_func/std": 0.12014364451169968,
      "sampling/importance_sampling_ratio/max": 1.0233973264694214,
      "sampling/importance_sampling_ratio/mean": 1.0002632141113281,
      "sampling/importance_sampling_ratio/min": 0.9999645948410034,
      "sampling/sampling_logp_difference/max": 0.023127814754843712,
      "sampling/sampling_logp_difference/mean": 0.0002636197314132005,
      "step": 40,
      "step_time": 11.72272543899976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.000821109762910055,
      "epoch": 0.00082,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.002684971783310175,
      "kl": 2.9493437994582337,
      "learning_rate": 9.998550840604579e-05,
      "loss": 0.0055,
      "num_tokens": 1438504.0,
      "reward": -0.44643768668174744,
      "reward_std": 0.18104322254657745,
      "rewards/rollout_reward_func/mean": -0.44643768668174744,
      "rewards/rollout_reward_func/std": 0.19707725942134857,
      "sampling/importance_sampling_ratio/max": 1.0007014274597168,
      "sampling/importance_sampling_ratio/mean": 1.000012755393982,
      "sampling/importance_sampling_ratio/min": 0.9998999834060669,
      "sampling/sampling_logp_difference/max": 0.0007011198904365301,
      "sampling/sampling_logp_difference/mean": 1.5726367564639077e-05,
      "step": 41,
      "step_time": 10.699221135000244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006376944766088855,
      "epoch": 0.00084,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.305521962232888e-05,
      "kl": 1.718819092882768,
      "learning_rate": 9.997913269612266e-05,
      "loss": 0.0032,
      "num_tokens": 1473957.0,
      "reward": -0.4666598439216614,
      "reward_std": 0.1580410599708557,
      "rewards/rollout_reward_func/mean": -0.4666598439216614,
      "rewards/rollout_reward_func/std": 0.15718111395835876,
      "sampling/importance_sampling_ratio/max": 1.000121831893921,
      "sampling/importance_sampling_ratio/mean": 1.0000066757202148,
      "sampling/importance_sampling_ratio/min": 0.9999860525131226,
      "sampling/sampling_logp_difference/max": 0.00012180398334749043,
      "sampling/sampling_logp_difference/mean": 8.024451744859107e-06,
      "step": 42,
      "step_time": 11.311061955000014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.000830665525427321,
      "epoch": 0.00086,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0017081554979085922,
      "kl": 2.11324807908386,
      "learning_rate": 9.997159823215467e-05,
      "loss": 0.004,
      "num_tokens": 1507226.0,
      "reward": -0.5063143372535706,
      "reward_std": 0.11844731867313385,
      "rewards/rollout_reward_func/mean": -0.5063143372535706,
      "rewards/rollout_reward_func/std": 0.11648620665073395,
      "sampling/importance_sampling_ratio/max": 1.0000935792922974,
      "sampling/importance_sampling_ratio/mean": 1.0000015497207642,
      "sampling/importance_sampling_ratio/min": 0.999706506729126,
      "sampling/sampling_logp_difference/max": 0.00029356300365179777,
      "sampling/sampling_logp_difference/mean": 1.2860547030868474e-05,
      "step": 43,
      "step_time": 12.482276944999057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006207615115272347,
      "epoch": 0.00088,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.307548129465431e-05,
      "kl": 2.9673098786734045,
      "learning_rate": 9.996290524708723e-05,
      "loss": 0.0056,
      "num_tokens": 1539066.0,
      "reward": -0.4601210951805115,
      "reward_std": 0.150915265083313,
      "rewards/rollout_reward_func/mean": -0.4601210951805115,
      "rewards/rollout_reward_func/std": 0.1748569756746292,
      "sampling/importance_sampling_ratio/max": 1.0000370740890503,
      "sampling/importance_sampling_ratio/mean": 1.000004768371582,
      "sampling/importance_sampling_ratio/min": 0.9999825358390808,
      "sampling/sampling_logp_difference/max": 3.706202551256865e-05,
      "sampling/sampling_logp_difference/mean": 6.1202204051369336e-06,
      "step": 44,
      "step_time": 10.131044015999805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006315578757494222,
      "epoch": 0.0009,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.3875041733845137e-05,
      "kl": 2.4858804441367397,
      "learning_rate": 9.995305400968402e-05,
      "loss": 0.0047,
      "num_tokens": 1570815.0,
      "reward": -0.46754807233810425,
      "reward_std": 0.11122138798236847,
      "rewards/rollout_reward_func/mean": -0.46754807233810425,
      "rewards/rollout_reward_func/std": 0.12286069989204407,
      "sampling/importance_sampling_ratio/max": 1.0000466108322144,
      "sampling/importance_sampling_ratio/mean": 1.0000091791152954,
      "sampling/importance_sampling_ratio/min": 0.9999758005142212,
      "sampling/sampling_logp_difference/max": 4.659905971493572e-05,
      "sampling/sampling_logp_difference/mean": 1.0045773706224281e-05,
      "step": 45,
      "step_time": 11.542584926000018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006228533748071641,
      "epoch": 0.00092,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.2772155059792567e-05,
      "kl": 2.317846190145474,
      "learning_rate": 9.994204482451885e-05,
      "loss": 0.0043,
      "num_tokens": 1602554.0,
      "reward": -0.4817233085632324,
      "reward_std": 0.14001183211803436,
      "rewards/rollout_reward_func/mean": -0.4817233085632324,
      "rewards/rollout_reward_func/std": 0.13829727470874786,
      "sampling/importance_sampling_ratio/max": 1.0000641345977783,
      "sampling/importance_sampling_ratio/mean": 1.0000078678131104,
      "sampling/importance_sampling_ratio/min": 0.9999889731407166,
      "sampling/sampling_logp_difference/max": 6.411726644728333e-05,
      "sampling/sampling_logp_difference/mean": 9.003964805742726e-06,
      "step": 46,
      "step_time": 11.153836768000701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006155803639558144,
      "epoch": 0.00094,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8806122170644812e-05,
      "kl": 2.859031245112419,
      "learning_rate": 9.992987803196614e-05,
      "loss": 0.0054,
      "num_tokens": 1633642.0,
      "reward": -0.47458043694496155,
      "reward_std": 0.13620741665363312,
      "rewards/rollout_reward_func/mean": -0.47458043694496155,
      "rewards/rollout_reward_func/std": 0.1397487074136734,
      "sampling/importance_sampling_ratio/max": 1.0000464916229248,
      "sampling/importance_sampling_ratio/mean": 1.0000081062316895,
      "sampling/importance_sampling_ratio/min": 0.9999859929084778,
      "sampling/sampling_logp_difference/max": 4.647710011340678e-05,
      "sampling/sampling_logp_difference/mean": 9.343058081867639e-06,
      "step": 47,
      "step_time": 11.58096628200019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006193299777805805,
      "epoch": 0.00096,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.139285763609223e-05,
      "kl": 1.9367534266784787,
      "learning_rate": 9.99165540081904e-05,
      "loss": 0.0036,
      "num_tokens": 1668978.0,
      "reward": -0.41765451431274414,
      "reward_std": 0.1612691581249237,
      "rewards/rollout_reward_func/mean": -0.41765451431274414,
      "rewards/rollout_reward_func/std": 0.17802941799163818,
      "sampling/importance_sampling_ratio/max": 1.0000407695770264,
      "sampling/importance_sampling_ratio/mean": 1.0000057220458984,
      "sampling/importance_sampling_ratio/min": 0.9999812841415405,
      "sampling/sampling_logp_difference/max": 4.07575280405581e-05,
      "sampling/sampling_logp_difference/mean": 6.66259802528657e-06,
      "step": 48,
      "step_time": 11.587671636999858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006365674234984908,
      "epoch": 0.00098,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.2500289560412057e-05,
      "kl": 2.572330966591835,
      "learning_rate": 9.990207316513463e-05,
      "loss": 0.0048,
      "num_tokens": 1697807.0,
      "reward": -0.5218106508255005,
      "reward_std": 0.10971593111753464,
      "rewards/rollout_reward_func/mean": -0.5218106508255005,
      "rewards/rollout_reward_func/std": 0.11586499959230423,
      "sampling/importance_sampling_ratio/max": 1.00003981590271,
      "sampling/importance_sampling_ratio/mean": 1.0000057220458984,
      "sampling/importance_sampling_ratio/min": 0.999975323677063,
      "sampling/sampling_logp_difference/max": 3.980120527558029e-05,
      "sampling/sampling_logp_difference/mean": 7.030243068584241e-06,
      "step": 49,
      "step_time": 12.058197169999858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006443986458180007,
      "epoch": 0.001,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.7668247639667243e-05,
      "kl": 2.39994593441952,
      "learning_rate": 9.98864359505076e-05,
      "loss": 0.0045,
      "num_tokens": 1732535.0,
      "reward": -0.4483959674835205,
      "reward_std": 0.1457299292087555,
      "rewards/rollout_reward_func/mean": -0.4483959674835205,
      "rewards/rollout_reward_func/std": 0.15753024816513062,
      "sampling/importance_sampling_ratio/max": 1.0000965595245361,
      "sampling/importance_sampling_ratio/mean": 1.000012755393982,
      "sampling/importance_sampling_ratio/min": 0.9999797344207764,
      "sampling/sampling_logp_difference/max": 9.653686720412225e-05,
      "sampling/sampling_logp_difference/mean": 1.4131255738902837e-05,
      "step": 50,
      "step_time": 10.359382950999816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006518985392176546,
      "epoch": 0.00102,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.547618689481169e-05,
      "kl": 1.7859944765614273,
      "learning_rate": 9.986964284776992e-05,
      "loss": 0.0033,
      "num_tokens": 1763607.0,
      "reward": -0.47022631764411926,
      "reward_std": 0.16651226580142975,
      "rewards/rollout_reward_func/mean": -0.47022631764411926,
      "rewards/rollout_reward_func/std": 0.17189878225326538,
      "sampling/importance_sampling_ratio/max": 1.0000604391098022,
      "sampling/importance_sampling_ratio/mean": 1.000004768371582,
      "sampling/importance_sampling_ratio/min": 0.9999511241912842,
      "sampling/sampling_logp_difference/max": 6.0423146351240575e-05,
      "sampling/sampling_logp_difference/mean": 8.4705961853615e-06,
      "step": 51,
      "step_time": 12.569274266999855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006323890884232242,
      "epoch": 0.00104,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.1937954695895314e-05,
      "kl": 2.6566081009805202,
      "learning_rate": 9.985169437611922e-05,
      "loss": 0.005,
      "num_tokens": 1796876.0,
      "reward": -0.509475827217102,
      "reward_std": 0.11160407960414886,
      "rewards/rollout_reward_func/mean": -0.509475827217102,
      "rewards/rollout_reward_func/std": 0.1130366399884224,
      "sampling/importance_sampling_ratio/max": 1.0000765323638916,
      "sampling/importance_sampling_ratio/mean": 1.0000137090682983,
      "sampling/importance_sampling_ratio/min": 0.9999876618385315,
      "sampling/sampling_logp_difference/max": 7.65095028327778e-05,
      "sampling/sampling_logp_difference/mean": 1.4244134035834577e-05,
      "step": 52,
      "step_time": 10.43953393899983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006390170674421825,
      "epoch": 0.00106,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.9535973226302303e-05,
      "kl": 1.9292238809430273,
      "learning_rate": 9.983259109047396e-05,
      "loss": 0.0036,
      "num_tokens": 1830883.0,
      "reward": -0.4491024613380432,
      "reward_std": 0.15517105162143707,
      "rewards/rollout_reward_func/mean": -0.4491024613380432,
      "rewards/rollout_reward_func/std": 0.16186949610710144,
      "sampling/importance_sampling_ratio/max": 1.0000708103179932,
      "sampling/importance_sampling_ratio/mean": 1.0000100135803223,
      "sampling/importance_sampling_ratio/min": 0.9999610185623169,
      "sampling/sampling_logp_difference/max": 7.079487841110677e-05,
      "sampling/sampling_logp_difference/mean": 1.1913212802028283e-05,
      "step": 53,
      "step_time": 12.08518075500001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006665495311608538,
      "epoch": 0.00108,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.6661115296301432e-05,
      "kl": 2.2144708826672286,
      "learning_rate": 9.981233358145643e-05,
      "loss": 0.0042,
      "num_tokens": 1862673.0,
      "reward": -0.4556786119937897,
      "reward_std": 0.16825318336486816,
      "rewards/rollout_reward_func/mean": -0.4556786119937897,
      "rewards/rollout_reward_func/std": 0.1777384877204895,
      "sampling/importance_sampling_ratio/max": 1.0000735521316528,
      "sampling/importance_sampling_ratio/mean": 1.0000085830688477,
      "sampling/importance_sampling_ratio/min": 0.999980628490448,
      "sampling/sampling_logp_difference/max": 7.353299588430673e-05,
      "sampling/sampling_logp_difference/mean": 1.047209843818564e-05,
      "step": 54,
      "step_time": 11.007667734000506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006211438776517753,
      "epoch": 0.0011,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.1390176041459199e-05,
      "kl": 3.496194064617157,
      "learning_rate": 9.979092247537435e-05,
      "loss": 0.0066,
      "num_tokens": 1894457.0,
      "reward": -0.5075165033340454,
      "reward_std": 0.13905443251132965,
      "rewards/rollout_reward_func/mean": -0.5075165033340454,
      "rewards/rollout_reward_func/std": 0.14110973477363586,
      "sampling/importance_sampling_ratio/max": 1.000069260597229,
      "sampling/importance_sampling_ratio/mean": 1.0000133514404297,
      "sampling/importance_sampling_ratio/min": 0.999997615814209,
      "sampling/sampling_logp_difference/max": 6.924373155925423e-05,
      "sampling/sampling_logp_difference/mean": 1.3386375940172002e-05,
      "step": 55,
      "step_time": 11.282138439999471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006376511955750175,
      "epoch": 0.00112,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.527481435914524e-05,
      "kl": 2.392844100482762,
      "learning_rate": 9.976835843420156e-05,
      "loss": 0.0045,
      "num_tokens": 1927719.0,
      "reward": -0.4271199703216553,
      "reward_std": 0.20685634016990662,
      "rewards/rollout_reward_func/mean": -0.4271199703216553,
      "rewards/rollout_reward_func/std": 0.20447339117527008,
      "sampling/importance_sampling_ratio/max": 1.0000630617141724,
      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
      "sampling/importance_sampling_ratio/min": 0.9999785423278809,
      "sampling/sampling_logp_difference/max": 6.304663838818669e-05,
      "sampling/sampling_logp_difference/mean": 9.967272490030155e-06,
      "step": 56,
      "step_time": 11.81715834400029
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006685723128612153,
      "epoch": 0.00114,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.624979788379278e-05,
      "kl": 2.3886548094451427,
      "learning_rate": 9.974464215555756e-05,
      "loss": 0.0045,
      "num_tokens": 1959495.0,
      "reward": -0.4609224796295166,
      "reward_std": 0.12867671251296997,
      "rewards/rollout_reward_func/mean": -0.4609224796295166,
      "rewards/rollout_reward_func/std": 0.14006492495536804,
      "sampling/importance_sampling_ratio/max": 1.0000592470169067,
      "sampling/importance_sampling_ratio/mean": 1.0000089406967163,
      "sampling/importance_sampling_ratio/min": 0.9999776482582092,
      "sampling/sampling_logp_difference/max": 5.92273281654343e-05,
      "sampling/sampling_logp_difference/mean": 1.0794034096761607e-05,
      "step": 57,
      "step_time": 10.286673817000292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006622151267947629,
      "epoch": 0.00116,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8251206711283885e-05,
      "kl": 2.1817052587866783,
      "learning_rate": 9.971977437268594e-05,
      "loss": 0.0041,
      "num_tokens": 1992702.0,
      "reward": -0.44977936148643494,
      "reward_std": 0.11797778308391571,
      "rewards/rollout_reward_func/mean": -0.44977936148643494,
      "rewards/rollout_reward_func/std": 0.11889227479696274,
      "sampling/importance_sampling_ratio/max": 1.0000721216201782,
      "sampling/importance_sampling_ratio/mean": 1.0000098943710327,
      "sampling/importance_sampling_ratio/min": 0.9999783039093018,
      "sampling/sampling_logp_difference/max": 7.20885582268238e-05,
      "sampling/sampling_logp_difference/mean": 1.1335238013998605e-05,
      "step": 58,
      "step_time": 11.882596950000561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006698454853903968,
      "epoch": 0.00118,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.0426890841918066e-05,
      "kl": 2.7415281403809786,
      "learning_rate": 9.969375585443172e-05,
      "loss": 0.0051,
      "num_tokens": 2023008.0,
      "reward": -0.45059582591056824,
      "reward_std": 0.133949413895607,
      "rewards/rollout_reward_func/mean": -0.45059582591056824,
      "rewards/rollout_reward_func/std": 0.15718908607959747,
      "sampling/importance_sampling_ratio/max": 1.0000840425491333,
      "sampling/importance_sampling_ratio/mean": 1.0000112056732178,
      "sampling/importance_sampling_ratio/min": 0.999981701374054,
      "sampling/sampling_logp_difference/max": 8.402469393331558e-05,
      "sampling/sampling_logp_difference/mean": 1.222399623657111e-05,
      "step": 59,
      "step_time": 10.513273939999635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006553210005222354,
      "epoch": 0.0012,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.9337732485146262e-05,
      "kl": 2.6224523090447036,
      "learning_rate": 9.966658740521754e-05,
      "loss": 0.0049,
      "num_tokens": 2054084.0,
      "reward": -0.5213782787322998,
      "reward_std": 0.11130588501691818,
      "rewards/rollout_reward_func/mean": -0.5213782787322998,
      "rewards/rollout_reward_func/std": 0.13300269842147827,
      "sampling/importance_sampling_ratio/max": 1.000054121017456,
      "sampling/importance_sampling_ratio/mean": 1.0000088214874268,
      "sampling/importance_sampling_ratio/min": 0.9999767541885376,
      "sampling/sampling_logp_difference/max": 5.4102332796901464e-05,
      "sampling/sampling_logp_difference/mean": 1.0590176316327415e-05,
      "step": 60,
      "step_time": 11.05362786600017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006773007989977486,
      "epoch": 0.00122,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.810842877603136e-05,
      "kl": 3.1815832147064356,
      "learning_rate": 9.963826986501882e-05,
      "loss": 0.006,
      "num_tokens": 2084335.0,
      "reward": -0.4689787030220032,
      "reward_std": 0.08655819296836853,
      "rewards/rollout_reward_func/mean": -0.4689787030220032,
      "rewards/rollout_reward_func/std": 0.09671928733587265,
      "sampling/importance_sampling_ratio/max": 1.0000590085983276,
      "sampling/importance_sampling_ratio/mean": 1.0000102519989014,
      "sampling/importance_sampling_ratio/min": 0.9999881982803345,
      "sampling/sampling_logp_difference/max": 5.898933159187436e-05,
      "sampling/sampling_logp_difference/mean": 1.1276682016614359e-05,
      "step": 61,
      "step_time": 9.853760416000114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006641297259193379,
      "epoch": 0.00124,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.755722354981117e-05,
      "kl": 2.3530580727383494,
      "learning_rate": 9.960880410933783e-05,
      "loss": 0.0044,
      "num_tokens": 2116836.0,
      "reward": -0.48839297890663147,
      "reward_std": 0.11407047510147095,
      "rewards/rollout_reward_func/mean": -0.48839297890663147,
      "rewards/rollout_reward_func/std": 0.13152439892292023,
      "sampling/importance_sampling_ratio/max": 1.0000629425048828,
      "sampling/importance_sampling_ratio/mean": 1.0000091791152954,
      "sampling/importance_sampling_ratio/min": 0.9999599456787109,
      "sampling/sampling_logp_difference/max": 6.291928002610803e-05,
      "sampling/sampling_logp_difference/mean": 1.072325358109083e-05,
      "step": 62,
      "step_time": 12.218522935000692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006711007954436354,
      "epoch": 0.00126,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.2082309442339465e-05,
      "kl": 2.3587748343124986,
      "learning_rate": 9.957819104917648e-05,
      "loss": 0.0044,
      "num_tokens": 2147147.0,
      "reward": -0.4642190635204315,
      "reward_std": 0.17485395073890686,
      "rewards/rollout_reward_func/mean": -0.4642190635204315,
      "rewards/rollout_reward_func/std": 0.17033183574676514,
      "sampling/importance_sampling_ratio/max": 1.0000776052474976,
      "sampling/importance_sampling_ratio/mean": 1.0000108480453491,
      "sampling/importance_sampling_ratio/min": 0.9999765753746033,
      "sampling/sampling_logp_difference/max": 7.758568972349167e-05,
      "sampling/sampling_logp_difference/mean": 1.2784492355422117e-05,
      "step": 63,
      "step_time": 10.816209253000125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006661249899480026,
      "epoch": 0.00128,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.198375659645535e-05,
      "kl": 3.386210630647838,
      "learning_rate": 9.954643163100835e-05,
      "loss": 0.0063,
      "num_tokens": 2175955.0,
      "reward": -0.4366099238395691,
      "reward_std": 0.11404688656330109,
      "rewards/rollout_reward_func/mean": -0.4366099238395691,
      "rewards/rollout_reward_func/std": 0.11409632116556168,
      "sampling/importance_sampling_ratio/max": 1.0000742673873901,
      "sampling/importance_sampling_ratio/mean": 1.0000113248825073,
      "sampling/importance_sampling_ratio/min": 0.9999960660934448,
      "sampling/sampling_logp_difference/max": 7.424886280205101e-05,
      "sampling/sampling_logp_difference/mean": 1.218299803440459e-05,
      "step": 64,
      "step_time": 11.2057395060001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006794004984840285,
      "epoch": 0.0013,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.13264058099594e-05,
      "kl": 3.4261841475963593,
      "learning_rate": 9.951352683674924e-05,
      "loss": 0.0064,
      "num_tokens": 2204870.0,
      "reward": -0.47110214829444885,
      "reward_std": 0.14154572784900665,
      "rewards/rollout_reward_func/mean": -0.47110214829444885,
      "rewards/rollout_reward_func/std": 0.15598046779632568,
      "sampling/importance_sampling_ratio/max": 1.0000603199005127,
      "sampling/importance_sampling_ratio/mean": 1.0000087022781372,
      "sampling/importance_sampling_ratio/min": 0.999940037727356,
      "sampling/sampling_logp_difference/max": 6.02992222411558e-05,
      "sampling/sampling_logp_difference/mean": 1.1587392691581044e-05,
      "step": 65,
      "step_time": 10.006985341000927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006701685379084665,
      "epoch": 0.00132,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.4127660910598934e-05,
      "kl": 3.0201471662148833,
      "learning_rate": 9.947947768372698e-05,
      "loss": 0.0057,
      "num_tokens": 2234457.0,
      "reward": -0.5099438428878784,
      "reward_std": 0.09435595571994781,
      "rewards/rollout_reward_func/mean": -0.5099438428878784,
      "rewards/rollout_reward_func/std": 0.09857697784900665,
      "sampling/importance_sampling_ratio/max": 1.0000659227371216,
      "sampling/importance_sampling_ratio/mean": 1.0000109672546387,
      "sampling/importance_sampling_ratio/min": 0.9999938607215881,
      "sampling/sampling_logp_difference/max": 6.590578414034098e-05,
      "sampling/sampling_logp_difference/mean": 1.2094817066099495e-05,
      "step": 66,
      "step_time": 11.312162051000314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006757073315384332,
      "epoch": 0.00134,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.9790588087053038e-05,
      "kl": 2.9807451650267467,
      "learning_rate": 9.944428522464987e-05,
      "loss": 0.0056,
      "num_tokens": 2265528.0,
      "reward": -0.46639153361320496,
      "reward_std": 0.15289044380187988,
      "rewards/rollout_reward_func/mean": -0.46639153361320496,
      "rewards/rollout_reward_func/std": 0.15913020074367523,
      "sampling/importance_sampling_ratio/max": 1.0000736713409424,
      "sampling/importance_sampling_ratio/mean": 1.0000100135803223,
      "sampling/importance_sampling_ratio/min": 0.9999826550483704,
      "sampling/sampling_logp_difference/max": 7.36506626708433e-05,
      "sampling/sampling_logp_difference/mean": 1.1719173926394433e-05,
      "step": 67,
      "step_time": 10.908072316999778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006568060161953326,
      "epoch": 0.00136,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.412132355151698e-05,
      "kl": 3.028280718252063,
      "learning_rate": 9.940795054757413e-05,
      "loss": 0.0057,
      "num_tokens": 2295782.0,
      "reward": -0.4513282775878906,
      "reward_std": 0.12359032034873962,
      "rewards/rollout_reward_func/mean": -0.4513282775878906,
      "rewards/rollout_reward_func/std": 0.1289566457271576,
      "sampling/importance_sampling_ratio/max": 1.0000699758529663,
      "sampling/importance_sampling_ratio/mean": 1.0000100135803223,
      "sampling/importance_sampling_ratio/min": 0.9999933242797852,
      "sampling/sampling_logp_difference/max": 6.99566735420376e-05,
      "sampling/sampling_logp_difference/mean": 1.1112841093563475e-05,
      "step": 68,
      "step_time": 11.857117537999557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006762990342394914,
      "epoch": 0.00138,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.2748485207557678e-05,
      "kl": 2.8810745738446712,
      "learning_rate": 9.937047477587032e-05,
      "loss": 0.0054,
      "num_tokens": 2326067.0,
      "reward": -0.48495256900787354,
      "reward_std": 0.10630611330270767,
      "rewards/rollout_reward_func/mean": -0.48495256900787354,
      "rewards/rollout_reward_func/std": 0.1172296553850174,
      "sampling/importance_sampling_ratio/max": 1.0000613927841187,
      "sampling/importance_sampling_ratio/mean": 1.000011920928955,
      "sampling/importance_sampling_ratio/min": 0.9999844431877136,
      "sampling/sampling_logp_difference/max": 6.137110176496208e-05,
      "sampling/sampling_logp_difference/mean": 1.3601927093986887e-05,
      "step": 69,
      "step_time": 10.554767250000168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0007178151245170739,
      "epoch": 0.0014,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.0173592424253e-05,
      "kl": 2.523578153923154,
      "learning_rate": 9.933185906818858e-05,
      "loss": 0.0047,
      "num_tokens": 2356445.0,
      "reward": -0.5431434512138367,
      "reward_std": 0.09289485216140747,
      "rewards/rollout_reward_func/mean": -0.5431434512138367,
      "rewards/rollout_reward_func/std": 0.11382929980754852,
      "sampling/importance_sampling_ratio/max": 1.000108003616333,
      "sampling/importance_sampling_ratio/mean": 1.000010371208191,
      "sampling/importance_sampling_ratio/min": 0.9999796152114868,
      "sampling/sampling_logp_difference/max": 0.00010799059236887842,
      "sampling/sampling_logp_difference/mean": 1.2703325410257094e-05,
      "step": 70,
      "step_time": 10.974104634000241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006705400555802044,
      "epoch": 0.00142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.3166499886428937e-05,
      "kl": 3.6824486549012363,
      "learning_rate": 9.929210461842278e-05,
      "loss": 0.0069,
      "num_tokens": 2383101.0,
      "reward": -0.47830814123153687,
      "reward_std": 0.14856326580047607,
      "rewards/rollout_reward_func/mean": -0.47830814123153687,
      "rewards/rollout_reward_func/std": 0.14815828204154968,
      "sampling/importance_sampling_ratio/max": 1.0000710487365723,
      "sampling/importance_sampling_ratio/mean": 1.0000113248825073,
      "sampling/importance_sampling_ratio/min": 0.9999920129776001,
      "sampling/sampling_logp_difference/max": 7.103009556885809e-05,
      "sampling/sampling_logp_difference/mean": 1.2573122148751281e-05,
      "step": 71,
      "step_time": 11.173864316999925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006955651369935367,
      "epoch": 0.00144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.546903462847695e-05,
      "kl": 2.761172599857673,
      "learning_rate": 9.925121265567366e-05,
      "loss": 0.0052,
      "num_tokens": 2413413.0,
      "reward": -0.4632667899131775,
      "reward_std": 0.15058688819408417,
      "rewards/rollout_reward_func/mean": -0.4632667899131775,
      "rewards/rollout_reward_func/std": 0.16061994433403015,
      "sampling/importance_sampling_ratio/max": 1.0000720024108887,
      "sampling/importance_sampling_ratio/mean": 1.0000090599060059,
      "sampling/importance_sampling_ratio/min": 0.9999868273735046,
      "sampling/sampling_logp_difference/max": 7.198394450824708e-05,
      "sampling/sampling_logp_difference/mean": 1.0825106073752977e-05,
      "step": 72,
      "step_time": 11.13052461299958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.000684412763803266,
      "epoch": 0.00146,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.656666219991166e-05,
      "kl": 2.885673111770302,
      "learning_rate": 9.920918444421082e-05,
      "loss": 0.0054,
      "num_tokens": 2443679.0,
      "reward": -0.45213764905929565,
      "reward_std": 0.17062178254127502,
      "rewards/rollout_reward_func/mean": -0.45213764905929565,
      "rewards/rollout_reward_func/std": 0.16972234845161438,
      "sampling/importance_sampling_ratio/max": 1.0000693798065186,
      "sampling/importance_sampling_ratio/mean": 1.000009536743164,
      "sampling/importance_sampling_ratio/min": 0.9999594688415527,
      "sampling/sampling_logp_difference/max": 6.936074350960553e-05,
      "sampling/sampling_logp_difference/mean": 1.2211980902065989e-05,
      "step": 73,
      "step_time": 11.717069765000133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006837020846433006,
      "epoch": 0.00148,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.652607069350779e-05,
      "kl": 3.2031972631812096,
      "learning_rate": 9.916602128343356e-05,
      "loss": 0.006,
      "num_tokens": 2473305.0,
      "reward": -0.45073336362838745,
      "reward_std": 0.1874859631061554,
      "rewards/rollout_reward_func/mean": -0.45073336362838745,
      "rewards/rollout_reward_func/std": 0.19179198145866394,
      "sampling/importance_sampling_ratio/max": 1.0000991821289062,
      "sampling/importance_sampling_ratio/mean": 1.0000112056732178,
      "sampling/importance_sampling_ratio/min": 0.9999768733978271,
      "sampling/sampling_logp_difference/max": 9.916012641042471e-05,
      "sampling/sampling_logp_difference/mean": 1.318043996434426e-05,
      "step": 74,
      "step_time": 10.713412969000046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 3.0,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "entropy": 0.0006799578113714233,
      "epoch": 0.0015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.1676881917519495e-05,
      "kl": 3.322963882237673,
      "learning_rate": 9.91217245078308e-05,
      "loss": 0.0062,
      "num_tokens": 2501412.0,
      "reward": -0.4546876847743988,
      "reward_std": 0.14404551684856415,
      "rewards/rollout_reward_func/mean": -0.4546876847743988,
      "rewards/rollout_reward_func/std": 0.14299967885017395,
      "sampling/importance_sampling_ratio/max": 1.0000700950622559,
      "sampling/importance_sampling_ratio/mean": 1.0000104904174805,
      "sampling/importance_sampling_ratio/min": 0.9999879002571106,
      "sampling/sampling_logp_difference/max": 7.007511158008128e-05,
      "sampling/sampling_logp_difference/mean": 1.1978003385593183e-05,
      "step": 75,
      "step_time": 11.524892917999523
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 600,
  "num_input_tokens_seen": 2501412,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}