File size: 70,807 Bytes
d8c3758
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984,
  "eval_steps": 50,
  "global_step": 312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03251953125,
      "completions/max_length": 1536.0,
      "completions/max_terminated_length": 1501.4,
      "completions/mean_length": 261.44921875,
      "completions/mean_terminated_length": 218.59018249511718,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.016,
      "grad_norm": 0.04527007043361664,
      "learning_rate": 3.1249999999999997e-07,
      "loss": 0.0817,
      "num_tokens": 17521272.0,
      "reward": 0.6406179428100586,
      "reward_std": 0.49268757104873656,
      "rewards/accuracy_reward": 0.2216796875,
      "rewards/brier_reward": 0.37469087839126586,
      "rewards/confidence_one_or_zero": 0.26728515625,
      "rewards/format_reward": 0.68486328125,
      "rewards/mean_confidence_reward": 0.7439393520355224,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03271484375,
      "completions/max_length": 1536.0,
      "completions/max_terminated_length": 1502.0,
      "completions/mean_length": 256.41748046875,
      "completions/mean_terminated_length": 213.14424743652344,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.032,
      "grad_norm": 0.13754118978977203,
      "learning_rate": 6.249999999999999e-07,
      "loss": 0.0794,
      "num_tokens": 35247339.0,
      "reward": 0.6558520913124084,
      "reward_std": 0.46240503787994386,
      "rewards/accuracy_reward": 0.2123046875,
      "rewards/brier_reward": 0.37713180780410765,
      "rewards/confidence_one_or_zero": 0.27294921875,
      "rewards/format_reward": 0.722265625,
      "rewards/mean_confidence_reward": 0.7526058673858642,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.023046875,
      "completions/max_length": 1536.0,
      "completions/max_terminated_length": 1481.6,
      "completions/mean_length": 217.0734375,
      "completions/mean_terminated_length": 186.05869750976564,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.048,
      "grad_norm": 0.07561526447534561,
      "learning_rate": 9.374999999999999e-07,
      "loss": 0.074,
      "num_tokens": 52518907.0,
      "reward": 0.7986767530441284,
      "reward_std": 0.38383460640907285,
      "rewards/accuracy_reward": 0.26533203125,
      "rewards/brier_reward": 0.4729374527931213,
      "rewards/confidence_one_or_zero": 0.252734375,
      "rewards/format_reward": 0.85908203125,
      "rewards/mean_confidence_reward": 0.7597096920013428,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0064453125,
      "completions/max_length": 1536.0,
      "completions/max_terminated_length": 1263.0,
      "completions/mean_length": 151.86923828125,
      "completions/mean_terminated_length": 142.8934539794922,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.064,
      "grad_norm": 0.016455456614494324,
      "learning_rate": 1e-06,
      "loss": 0.0127,
      "num_tokens": 68992448.0,
      "reward": 0.935137140750885,
      "reward_std": 0.29728189706802366,
      "rewards/accuracy_reward": 0.33779296875,
      "rewards/brier_reward": 0.5799404501914978,
      "rewards/confidence_one_or_zero": 0.1966796875,
      "rewards/format_reward": 0.9525390625,
      "rewards/mean_confidence_reward": 0.7396757483482361,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0021484375,
      "completions/max_length": 1536.0,
      "completions/max_terminated_length": 923.6,
      "completions/mean_length": 128.238671875,
      "completions/mean_terminated_length": 125.20802764892578,
      "completions/min_length": 11.4,
      "completions/min_terminated_length": 11.4,
      "epoch": 0.08,
      "grad_norm": 0.05269403010606766,
      "learning_rate": 1e-06,
      "loss": 0.0013,
      "num_tokens": 85238764.0,
      "reward": 1.0060295939445496,
      "reward_std": 0.2248561441898346,
      "rewards/accuracy_reward": 0.35712890625,
      "rewards/brier_reward": 0.6683077096939087,
      "rewards/confidence_one_or_zero": 0.09072265625,
      "rewards/format_reward": 0.98662109375,
      "rewards/mean_confidence_reward": 0.6470906853675842,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00224609375,
      "completions/max_length": 1306.2,
      "completions/max_terminated_length": 937.4,
      "completions/mean_length": 133.3306640625,
      "completions/mean_terminated_length": 130.1755615234375,
      "completions/min_length": 10.8,
      "completions/min_terminated_length": 10.8,
      "epoch": 0.096,
      "grad_norm": 0.11787062138319016,
      "learning_rate": 1e-06,
      "loss": 0.0043,
      "num_tokens": 101648678.0,
      "reward": 1.0414881467819215,
      "reward_std": 0.18446856439113618,
      "rewards/accuracy_reward": 0.3625,
      "rewards/brier_reward": 0.7302408814430237,
      "rewards/confidence_one_or_zero": 0.04970703125,
      "rewards/format_reward": 0.990234375,
      "rewards/mean_confidence_reward": 0.5119157314300538,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00146484375,
      "completions/max_length": 1536.0,
      "completions/max_terminated_length": 1072.2,
      "completions/mean_length": 143.08466796875,
      "completions/mean_terminated_length": 141.0427459716797,
      "completions/min_length": 2.6,
      "completions/min_terminated_length": 2.6,
      "epoch": 0.112,
      "grad_norm": 0.03638599067926407,
      "learning_rate": 1e-06,
      "loss": 0.004,
      "num_tokens": 118223337.0,
      "reward": 1.0774429082870483,
      "reward_std": 0.13774871528148652,
      "rewards/accuracy_reward": 0.39619140625,
      "rewards/brier_reward": 0.7643576622009277,
      "rewards/confidence_one_or_zero": 0.05927734375,
      "rewards/format_reward": 0.9943359375,
      "rewards/mean_confidence_reward": 0.37070313692092893,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00048828125,
      "completions/max_length": 913.2,
      "completions/max_terminated_length": 723.4,
      "completions/mean_length": 151.192578125,
      "completions/mean_terminated_length": 150.51485290527344,
      "completions/min_length": 43.2,
      "completions/min_terminated_length": 43.2,
      "epoch": 0.128,
      "grad_norm": 0.008543482981622219,
      "learning_rate": 1e-06,
      "loss": 0.0007,
      "num_tokens": 134688221.0,
      "reward": 1.0826910495758058,
      "reward_std": 0.10313712060451508,
      "rewards/accuracy_reward": 0.405078125,
      "rewards/brier_reward": 0.7624517321586609,
      "rewards/confidence_one_or_zero": 0.0619140625,
      "rewards/format_reward": 0.9978515625,
      "rewards/mean_confidence_reward": 0.31029855012893676,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.000390625,
      "completions/max_length": 899.6,
      "completions/max_terminated_length": 555.0,
      "completions/mean_length": 155.73017578125,
      "completions/mean_terminated_length": 155.19070434570312,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.144,
      "grad_norm": 0.010719305835664272,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 151233330.0,
      "reward": 1.1194574117660523,
      "reward_std": 0.10182622820138931,
      "rewards/accuracy_reward": 0.4982421875,
      "rewards/brier_reward": 0.7418437123298645,
      "rewards/confidence_one_or_zero": 0.04111328125,
      "rewards/format_reward": 0.998828125,
      "rewards/mean_confidence_reward": 0.3395689487457275,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00078125,
      "completions/max_length": 635.4,
      "completions/max_terminated_length": 433.6,
      "completions/mean_length": 156.94189453125,
      "completions/mean_terminated_length": 155.86400146484374,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.16,
      "grad_norm": 0.02116214483976364,
      "learning_rate": 1e-06,
      "loss": 0.0021,
      "num_tokens": 167861343.0,
      "reward": 1.1079134941101074,
      "reward_std": 0.10681741833686828,
      "rewards/accuracy_reward": 0.45244140625,
      "rewards/brier_reward": 0.7644589900970459,
      "rewards/confidence_one_or_zero": 0.02294921875,
      "rewards/format_reward": 0.99892578125,
      "rewards/mean_confidence_reward": 0.3910071313381195,
      "step": 50
    },
    {
      "epoch": 0.16,
      "eval_completions/clipped_ratio": 0.0,
      "eval_completions/max_length": 351.75,
      "eval_completions/max_terminated_length": 351.75,
      "eval_completions/mean_length": 158.1922149658203,
      "eval_completions/mean_terminated_length": 158.1922149658203,
      "eval_completions/min_length": 87.0,
      "eval_completions/min_terminated_length": 87.0,
      "eval_loss": 0.0,
      "eval_num_tokens": 167861343.0,
      "eval_reward": 1.0662100315093994,
      "eval_reward_std": 0.21462075412273407,
      "eval_rewards/accuracy_reward": 0.357421875,
      "eval_rewards/brier_reward": 0.774997279047966,
      "eval_rewards/confidence_one_or_zero": 0.013671875,
      "eval_rewards/format_reward": 1.0,
      "eval_rewards/mean_confidence_reward": 0.4146093651652336,
      "eval_runtime": 21.8666,
      "eval_samples_per_second": 22.866,
      "eval_steps_per_second": 0.183,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.000390625,
      "completions/max_length": 915.6,
      "completions/max_terminated_length": 474.8,
      "completions/mean_length": 159.457421875,
      "completions/mean_terminated_length": 158.91932067871093,
      "completions/min_length": 45.8,
      "completions/min_terminated_length": 45.8,
      "epoch": 0.176,
      "grad_norm": 0.009603990241885185,
      "learning_rate": 1e-06,
      "loss": 0.0014,
      "num_tokens": 184731307.0,
      "reward": 1.1049310684204101,
      "reward_std": 0.11344930976629257,
      "rewards/accuracy_reward": 0.441796875,
      "rewards/brier_reward": 0.769236147403717,
      "rewards/confidence_one_or_zero": 0.01845703125,
      "rewards/format_reward": 0.998828125,
      "rewards/mean_confidence_reward": 0.4387405276298523,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0005859375,
      "completions/max_length": 995.2,
      "completions/max_terminated_length": 543.0,
      "completions/mean_length": 163.6873046875,
      "completions/mean_terminated_length": 162.88194580078124,
      "completions/min_length": 77.4,
      "completions/min_terminated_length": 77.4,
      "epoch": 0.192,
      "grad_norm": 0.007208758499473333,
      "learning_rate": 1e-06,
      "loss": 0.0024,
      "num_tokens": 201222281.0,
      "reward": 1.117084002494812,
      "reward_std": 0.11234763264656067,
      "rewards/accuracy_reward": 0.4650390625,
      "rewards/brier_reward": 0.7699091911315918,
      "rewards/confidence_one_or_zero": 0.0126953125,
      "rewards/format_reward": 0.99921875,
      "rewards/mean_confidence_reward": 0.47076983451843263,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.000390625,
      "completions/max_length": 695.4,
      "completions/max_terminated_length": 507.2,
      "completions/mean_length": 167.3060546875,
      "completions/mean_terminated_length": 166.7700225830078,
      "completions/min_length": 62.8,
      "completions/min_terminated_length": 62.8,
      "epoch": 0.208,
      "grad_norm": 0.010833042673766613,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 217967719.0,
      "reward": 1.1401428937911988,
      "reward_std": 0.11306387037038804,
      "rewards/accuracy_reward": 0.5169921875,
      "rewards/brier_reward": 0.7639761686325073,
      "rewards/confidence_one_or_zero": 0.00966796875,
      "rewards/format_reward": 0.99931640625,
      "rewards/mean_confidence_reward": 0.48724169135093687,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 9.765625e-05,
      "completions/max_length": 730.4,
      "completions/max_terminated_length": 518.4,
      "completions/mean_length": 166.8869140625,
      "completions/mean_terminated_length": 166.7537078857422,
      "completions/min_length": 81.8,
      "completions/min_terminated_length": 81.8,
      "epoch": 0.224,
      "grad_norm": 0.00990777276456356,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 234829825.0,
      "reward": 1.117465901374817,
      "reward_std": 0.11148046851158142,
      "rewards/accuracy_reward": 0.4650390625,
      "rewards/brier_reward": 0.7703800439834595,
      "rewards/confidence_one_or_zero": 0.0109375,
      "rewards/format_reward": 0.99951171875,
      "rewards/mean_confidence_reward": 0.48857617378234863,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0005859375,
      "completions/max_length": 1110.6,
      "completions/max_terminated_length": 485.6,
      "completions/mean_length": 173.951171875,
      "completions/mean_terminated_length": 173.1517333984375,
      "completions/min_length": 81.6,
      "completions/min_terminated_length": 81.6,
      "epoch": 0.24,
      "grad_norm": 0.012188726104795933,
      "learning_rate": 1e-06,
      "loss": 0.0026,
      "num_tokens": 251862765.0,
      "reward": 1.1460721731185912,
      "reward_std": 0.1271799236536026,
      "rewards/accuracy_reward": 0.52431640625,
      "rewards/brier_reward": 0.7685105323791503,
      "rewards/confidence_one_or_zero": 0.00810546875,
      "rewards/format_reward": 0.99931640625,
      "rewards/mean_confidence_reward": 0.5015208303928376,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 9.765625e-05,
      "completions/max_length": 779.0,
      "completions/max_terminated_length": 570.2,
      "completions/mean_length": 171.87490234375,
      "completions/mean_terminated_length": 171.741259765625,
      "completions/min_length": 83.8,
      "completions/min_terminated_length": 83.8,
      "epoch": 0.256,
      "grad_norm": 0.00730907404795289,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 268677580.0,
      "reward": 1.13645658493042,
      "reward_std": 0.11724818050861359,
      "rewards/accuracy_reward": 0.49716796875,
      "rewards/brier_reward": 0.7759395122528077,
      "rewards/confidence_one_or_zero": 0.0126953125,
      "rewards/format_reward": 0.9998046875,
      "rewards/mean_confidence_reward": 0.5036328196525574,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00048828125,
      "completions/max_length": 863.2,
      "completions/max_terminated_length": 460.8,
      "completions/mean_length": 177.0607421875,
      "completions/mean_terminated_length": 176.3964630126953,
      "completions/min_length": 57.6,
      "completions/min_terminated_length": 57.6,
      "epoch": 0.272,
      "grad_norm": 0.009313421323895454,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 285456378.0,
      "reward": 1.1312544345855713,
      "reward_std": 0.11844458729028702,
      "rewards/accuracy_reward": 0.49052734375,
      "rewards/brier_reward": 0.7730547547340393,
      "rewards/confidence_one_or_zero": 0.0130859375,
      "rewards/format_reward": 0.99892578125,
      "rewards/mean_confidence_reward": 0.4925888657569885,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0001953125,
      "completions/max_length": 890.4,
      "completions/max_terminated_length": 446.2,
      "completions/mean_length": 173.14130859375,
      "completions/mean_terminated_length": 172.87517395019532,
      "completions/min_length": 66.2,
      "completions/min_terminated_length": 66.2,
      "epoch": 0.288,
      "grad_norm": 0.021724838763475418,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 302187521.0,
      "reward": 1.1318594217300415,
      "reward_std": 0.11806258261203766,
      "rewards/accuracy_reward": 0.4912109375,
      "rewards/brier_reward": 0.7728975296020508,
      "rewards/confidence_one_or_zero": 0.01240234375,
      "rewards/format_reward": 0.999609375,
      "rewards/mean_confidence_reward": 0.4906367301940918,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0005859375,
      "completions/max_length": 1014.6,
      "completions/max_terminated_length": 857.8,
      "completions/mean_length": 176.462890625,
      "completions/mean_terminated_length": 175.66776123046876,
      "completions/min_length": 62.4,
      "completions/min_terminated_length": 62.4,
      "epoch": 0.304,
      "grad_norm": 0.006397055462002754,
      "learning_rate": 1e-06,
      "loss": 0.0018,
      "num_tokens": 318924453.0,
      "reward": 1.1344104766845704,
      "reward_std": 0.11458559930324555,
      "rewards/accuracy_reward": 0.4953125,
      "rewards/brier_reward": 0.7745817184448243,
      "rewards/confidence_one_or_zero": 0.01767578125,
      "rewards/format_reward": 0.99892578125,
      "rewards/mean_confidence_reward": 0.47802149057388305,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0009765625,
      "completions/max_length": 1325.6,
      "completions/max_terminated_length": 647.8,
      "completions/mean_length": 176.624609375,
      "completions/mean_terminated_length": 175.29497375488282,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.32,
      "grad_norm": 0.13792400062084198,
      "learning_rate": 1e-06,
      "loss": 0.0028,
      "num_tokens": 335821793.0,
      "reward": 1.1494127273559571,
      "reward_std": 0.10425310283899307,
      "rewards/accuracy_reward": 0.51708984375,
      "rewards/brier_reward": 0.7831019043922425,
      "rewards/confidence_one_or_zero": 0.01220703125,
      "rewards/format_reward": 0.9986328125,
      "rewards/mean_confidence_reward": 0.4873457133769989,
      "step": 100
    },
    {
      "epoch": 0.32,
      "eval_completions/clipped_ratio": 0.0,
      "eval_completions/max_length": 408.0,
      "eval_completions/max_terminated_length": 408.0,
      "eval_completions/mean_length": 175.2724952697754,
      "eval_completions/mean_terminated_length": 175.2724952697754,
      "eval_completions/min_length": 94.75,
      "eval_completions/min_terminated_length": 94.75,
      "eval_loss": 0.0,
      "eval_num_tokens": 335821793.0,
      "eval_reward": 1.0855459570884705,
      "eval_reward_std": 0.25649269297719,
      "eval_rewards/accuracy_reward": 0.392578125,
      "eval_rewards/brier_reward": 0.7785128951072693,
      "eval_rewards/confidence_one_or_zero": 0.021484375,
      "eval_rewards/format_reward": 1.0,
      "eval_rewards/mean_confidence_reward": 0.47496095299720764,
      "eval_runtime": 22.7529,
      "eval_samples_per_second": 21.975,
      "eval_steps_per_second": 0.176,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001171875,
      "completions/max_length": 1462.2,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 175.45908203125,
      "completions/mean_terminated_length": 173.86156005859374,
      "completions/min_length": 49.6,
      "completions/min_terminated_length": 49.6,
      "epoch": 0.336,
      "grad_norm": 0.017879005521535873,
      "learning_rate": 1e-06,
      "loss": 0.0028,
      "num_tokens": 352340926.0,
      "reward": 1.1430244445800781,
      "reward_std": 0.12031411826610565,
      "rewards/accuracy_reward": 0.51044921875,
      "rewards/brier_reward": 0.7780401229858398,
      "rewards/confidence_one_or_zero": 0.015625,
      "rewards/format_reward": 0.99755859375,
      "rewards/mean_confidence_reward": 0.4848623156547546,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0017578125,
      "completions/max_length": 1461.4,
      "completions/max_terminated_length": 807.2,
      "completions/mean_length": 180.69208984375,
      "completions/mean_terminated_length": 178.3111328125,
      "completions/min_length": 1.0,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.352,
      "grad_norm": 0.039632029831409454,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 369451629.0,
      "reward": 1.1116411447525025,
      "reward_std": 0.12863886207342148,
      "rewards/accuracy_reward": 0.4513671875,
      "rewards/brier_reward": 0.7790430426597595,
      "rewards/confidence_one_or_zero": 0.01787109375,
      "rewards/format_reward": 0.99287109375,
      "rewards/mean_confidence_reward": 0.45101758241653445,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00048828125,
      "completions/max_length": 1182.2,
      "completions/max_terminated_length": 745.6,
      "completions/mean_length": 183.9287109375,
      "completions/mean_terminated_length": 183.26663513183593,
      "completions/min_length": 7.2,
      "completions/min_terminated_length": 7.2,
      "epoch": 0.368,
      "grad_norm": 0.03524937480688095,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 386400531.0,
      "reward": 1.1271000146865844,
      "reward_std": 0.11355644613504409,
      "rewards/accuracy_reward": 0.475,
      "rewards/brier_reward": 0.7824217438697815,
      "rewards/confidence_one_or_zero": 0.01640625,
      "rewards/format_reward": 0.99677734375,
      "rewards/mean_confidence_reward": 0.465801864862442,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001171875,
      "completions/max_length": 1536.0,
      "completions/max_terminated_length": 745.8,
      "completions/mean_length": 184.33974609375,
      "completions/mean_terminated_length": 182.75419921875,
      "completions/min_length": 62.2,
      "completions/min_terminated_length": 62.2,
      "epoch": 0.384,
      "grad_norm": 0.017022427171468735,
      "learning_rate": 1e-06,
      "loss": 0.004,
      "num_tokens": 403144682.0,
      "reward": 1.1463651657104492,
      "reward_std": 0.12106073200702668,
      "rewards/accuracy_reward": 0.506640625,
      "rewards/brier_reward": 0.7902879238128662,
      "rewards/confidence_one_or_zero": 0.02177734375,
      "rewards/format_reward": 0.99580078125,
      "rewards/mean_confidence_reward": 0.46812206506729126,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00126953125,
      "completions/max_length": 1335.0,
      "completions/max_terminated_length": 753.8,
      "completions/mean_length": 184.96728515625,
      "completions/mean_terminated_length": 183.24875183105468,
      "completions/min_length": 60.2,
      "completions/min_terminated_length": 60.2,
      "epoch": 0.4,
      "grad_norm": 0.017970727756619453,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 420075195.0,
      "reward": 1.1312126159667968,
      "reward_std": 0.12662589848041533,
      "rewards/accuracy_reward": 0.48759765625,
      "rewards/brier_reward": 0.7792211294174194,
      "rewards/confidence_one_or_zero": 0.0115234375,
      "rewards/format_reward": 0.99560546875,
      "rewards/mean_confidence_reward": 0.5128134965896607,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.001171875,
      "completions/max_length": 1323.8,
      "completions/max_terminated_length": 799.8,
      "completions/mean_length": 190.72177734375,
      "completions/mean_terminated_length": 189.14300231933595,
      "completions/min_length": 83.2,
      "completions/min_terminated_length": 83.2,
      "epoch": 0.416,
      "grad_norm": 0.01795245334506035,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 436909370.0,
      "reward": 1.1423231840133667,
      "reward_std": 0.12082898765802383,
      "rewards/accuracy_reward": 0.49794921875,
      "rewards/brier_reward": 0.7881609320640564,
      "rewards/confidence_one_or_zero": 0.00966796875,
      "rewards/format_reward": 0.99853515625,
      "rewards/mean_confidence_reward": 0.5266152441501617,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0005859375,
      "completions/max_length": 1316.6,
      "completions/max_terminated_length": 564.4,
      "completions/mean_length": 189.984765625,
      "completions/mean_terminated_length": 189.19561157226562,
      "completions/min_length": 65.4,
      "completions/min_terminated_length": 65.4,
      "epoch": 0.432,
      "grad_norm": 0.12305645644664764,
      "learning_rate": 1e-06,
      "loss": 0.0022,
      "num_tokens": 453869150.0,
      "reward": 1.167378830909729,
      "reward_std": 0.11343645602464676,
      "rewards/accuracy_reward": 0.53447265625,
      "rewards/brier_reward": 0.8010651707649231,
      "rewards/confidence_one_or_zero": 0.00986328125,
      "rewards/format_reward": 0.99921875,
      "rewards/mean_confidence_reward": 0.5294726729393006,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.000390625,
      "completions/max_length": 1255.2,
      "completions/max_terminated_length": 826.8,
      "completions/mean_length": 201.4958984375,
      "completions/mean_terminated_length": 200.97523803710936,
      "completions/min_length": 76.6,
      "completions/min_terminated_length": 76.6,
      "epoch": 0.448,
      "grad_norm": 0.01579131931066513,
      "learning_rate": 1e-06,
      "loss": 0.0017,
      "num_tokens": 470885268.0,
      "reward": 1.1469329595565796,
      "reward_std": 0.1118901401758194,
      "rewards/accuracy_reward": 0.49921875,
      "rewards/brier_reward": 0.7957204103469848,
      "rewards/confidence_one_or_zero": 0.0072265625,
      "rewards/format_reward": 0.99892578125,
      "rewards/mean_confidence_reward": 0.5328691601753235,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0001953125,
      "completions/max_length": 736.6,
      "completions/max_terminated_length": 513.0,
      "completions/mean_length": 211.05078125,
      "completions/mean_terminated_length": 210.79130859375,
      "completions/min_length": 81.8,
      "completions/min_terminated_length": 81.8,
      "epoch": 0.464,
      "grad_norm": 0.011122009716928005,
      "learning_rate": 1e-06,
      "loss": 0.0008,
      "num_tokens": 488217244.0,
      "reward": 1.1156266927719116,
      "reward_std": 0.10946736484766006,
      "rewards/accuracy_reward": 0.4498046875,
      "rewards/brier_reward": 0.781838345527649,
      "rewards/confidence_one_or_zero": 0.009765625,
      "rewards/format_reward": 0.999609375,
      "rewards/mean_confidence_reward": 0.5123164117336273,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0001953125,
      "completions/max_length": 999.8,
      "completions/max_terminated_length": 586.0,
      "completions/mean_length": 209.76787109375,
      "completions/mean_terminated_length": 209.50829467773437,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "epoch": 0.48,
      "grad_norm": 0.011441366747021675,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 505413299.0,
      "reward": 1.1479370832443236,
      "reward_std": 0.10615950524806976,
      "rewards/accuracy_reward": 0.5056640625,
      "rewards/brier_reward": 0.7905020475387573,
      "rewards/confidence_one_or_zero": 0.01357421875,
      "rewards/format_reward": 0.99970703125,
      "rewards/mean_confidence_reward": 0.49698535799980165,
      "step": 150
    },
    {
      "epoch": 0.48,
      "eval_completions/clipped_ratio": 0.0,
      "eval_completions/max_length": 454.75,
      "eval_completions/max_terminated_length": 454.75,
      "eval_completions/mean_length": 213.07852935791016,
      "eval_completions/mean_terminated_length": 213.07852935791016,
      "eval_completions/min_length": 122.75,
      "eval_completions/min_terminated_length": 122.75,
      "eval_loss": 0.0,
      "eval_num_tokens": 505413299.0,
      "eval_reward": 1.1056718528270721,
      "eval_reward_std": 0.2651190534234047,
      "eval_rewards/accuracy_reward": 0.41015625,
      "eval_rewards/brier_reward": 0.8011865168809891,
      "eval_rewards/confidence_one_or_zero": 0.005859375,
      "eval_rewards/format_reward": 1.0,
      "eval_rewards/mean_confidence_reward": 0.4764648526906967,
      "eval_runtime": 25.1582,
      "eval_samples_per_second": 19.874,
      "eval_steps_per_second": 0.159,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 575.8,
      "completions/max_terminated_length": 575.8,
      "completions/mean_length": 212.034765625,
      "completions/mean_terminated_length": 212.034765625,
      "completions/min_length": 104.2,
      "completions/min_terminated_length": 104.2,
      "epoch": 0.496,
      "grad_norm": 0.0033662207424640656,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 522892375.0,
      "reward": 1.1667110204696656,
      "reward_std": 0.1078746810555458,
      "rewards/accuracy_reward": 0.5400390625,
      "rewards/brier_reward": 0.7934796333312988,
      "rewards/confidence_one_or_zero": 0.01025390625,
      "rewards/format_reward": 0.99990234375,
      "rewards/mean_confidence_reward": 0.5157265901565552,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0001953125,
      "completions/max_length": 924.6,
      "completions/max_terminated_length": 532.0,
      "completions/mean_length": 210.298828125,
      "completions/mean_terminated_length": 210.0409729003906,
      "completions/min_length": 97.4,
      "completions/min_terminated_length": 97.4,
      "epoch": 0.512,
      "grad_norm": 0.012034610845148563,
      "learning_rate": 1e-06,
      "loss": 0.0012,
      "num_tokens": 540191499.0,
      "reward": 1.1708500623703002,
      "reward_std": 0.11257555186748505,
      "rewards/accuracy_reward": 0.5369140625,
      "rewards/brier_reward": 0.8050778865814209,
      "rewards/confidence_one_or_zero": 0.00703125,
      "rewards/format_reward": 0.99970703125,
      "rewards/mean_confidence_reward": 0.5358349800109863,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00029296875,
      "completions/max_length": 1004.8,
      "completions/max_terminated_length": 661.4,
      "completions/mean_length": 207.5166015625,
      "completions/mean_terminated_length": 207.12822265625,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.528,
      "grad_norm": 0.015975475311279297,
      "learning_rate": 1e-06,
      "loss": 0.001,
      "num_tokens": 557346005.0,
      "reward": 1.1701508998870849,
      "reward_std": 0.10177138149738311,
      "rewards/accuracy_reward": 0.5318359375,
      "rewards/brier_reward": 0.8088555216789246,
      "rewards/confidence_one_or_zero": 0.00927734375,
      "rewards/format_reward": 0.999609375,
      "rewards/mean_confidence_reward": 0.5180878877639771,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 9.765625e-05,
      "completions/max_length": 768.2,
      "completions/max_terminated_length": 551.8,
      "completions/mean_length": 209.73466796875,
      "completions/mean_terminated_length": 209.60549926757812,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.544,
      "grad_norm": 0.0062843854539096355,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 574657272.0,
      "reward": 1.1808686017990113,
      "reward_std": 0.10841628313064575,
      "rewards/accuracy_reward": 0.561328125,
      "rewards/brier_reward": 0.8007986664772033,
      "rewards/confidence_one_or_zero": 0.00556640625,
      "rewards/format_reward": 0.999609375,
      "rewards/mean_confidence_reward": 0.5436318397521973,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00029296875,
      "completions/max_length": 918.4,
      "completions/max_terminated_length": 479.4,
      "completions/mean_length": 208.92236328125,
      "completions/mean_terminated_length": 208.53166809082032,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.56,
      "grad_norm": 0.012134869582951069,
      "learning_rate": 1e-06,
      "loss": 0.0014,
      "num_tokens": 591618045.0,
      "reward": 1.1645673036575317,
      "reward_std": 0.09899833053350449,
      "rewards/accuracy_reward": 0.51962890625,
      "rewards/brier_reward": 0.809797728061676,
      "rewards/confidence_one_or_zero": 0.0025390625,
      "rewards/format_reward": 0.99970703125,
      "rewards/mean_confidence_reward": 0.5343828201293945,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.000390625,
      "completions/max_length": 1103.4,
      "completions/max_terminated_length": 482.4,
      "completions/mean_length": 208.23701171875,
      "completions/mean_terminated_length": 207.72026672363282,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.576,
      "grad_norm": 0.014361800625920296,
      "learning_rate": 1e-06,
      "loss": 0.0014,
      "num_tokens": 608937016.0,
      "reward": 1.153262686729431,
      "reward_std": 0.09391747117042541,
      "rewards/accuracy_reward": 0.50830078125,
      "rewards/brier_reward": 0.7986142754554748,
      "rewards/confidence_one_or_zero": 0.0078125,
      "rewards/format_reward": 0.999609375,
      "rewards/mean_confidence_reward": 0.5241601765155792,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.000390625,
      "completions/max_length": 1119.4,
      "completions/max_terminated_length": 530.6,
      "completions/mean_length": 206.19326171875,
      "completions/mean_terminated_length": 205.6750518798828,
      "completions/min_length": 102.8,
      "completions/min_terminated_length": 102.8,
      "epoch": 0.592,
      "grad_norm": 0.008724602870643139,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 626216147.0,
      "reward": 1.160974383354187,
      "reward_std": 0.0895046427845955,
      "rewards/accuracy_reward": 0.51416015625,
      "rewards/brier_reward": 0.8081783413887024,
      "rewards/confidence_one_or_zero": 0.0060546875,
      "rewards/format_reward": 0.999609375,
      "rewards/mean_confidence_reward": 0.5143349647521973,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 9.765625e-05,
      "completions/max_length": 663.0,
      "completions/max_terminated_length": 556.8,
      "completions/mean_length": 205.84892578125,
      "completions/mean_terminated_length": 205.7189514160156,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.608,
      "grad_norm": 0.008571009151637554,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 643323528.0,
      "reward": 1.170970630645752,
      "reward_std": 0.0794813334941864,
      "rewards/accuracy_reward": 0.5240234375,
      "rewards/brier_reward": 0.8181121468544006,
      "rewards/confidence_one_or_zero": 0.00751953125,
      "rewards/format_reward": 0.9998046875,
      "rewards/mean_confidence_reward": 0.48931640982627866,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 573.0,
      "completions/max_terminated_length": 573.0,
      "completions/mean_length": 204.29423828125,
      "completions/mean_terminated_length": 204.29423828125,
      "completions/min_length": 99.4,
      "completions/min_terminated_length": 99.4,
      "epoch": 0.624,
      "grad_norm": 0.006929480005055666,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 660759405.0,
      "reward": 1.1707647800445558,
      "reward_std": 0.08714393228292465,
      "rewards/accuracy_reward": 0.5267578125,
      "rewards/brier_reward": 0.8147708535194397,
      "rewards/confidence_one_or_zero": 0.00556640625,
      "rewards/format_reward": 1.0,
      "rewards/mean_confidence_reward": 0.4799599587917328,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00048828125,
      "completions/max_length": 876.8,
      "completions/max_terminated_length": 465.2,
      "completions/mean_length": 204.7708984375,
      "completions/mean_terminated_length": 204.12095947265624,
      "completions/min_length": 101.4,
      "completions/min_terminated_length": 101.4,
      "epoch": 0.64,
      "grad_norm": 0.006568757817149162,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 678198947.0,
      "reward": 1.188051414489746,
      "reward_std": 0.08363442420959473,
      "rewards/accuracy_reward": 0.56201171875,
      "rewards/brier_reward": 0.8145784258842468,
      "rewards/confidence_one_or_zero": 0.0029296875,
      "rewards/format_reward": 0.99951171875,
      "rewards/mean_confidence_reward": 0.5187148451805115,
      "step": 200
    },
    {
      "epoch": 0.64,
      "eval_completions/clipped_ratio": 0.0,
      "eval_completions/max_length": 362.5,
      "eval_completions/max_terminated_length": 362.5,
      "eval_completions/mean_length": 202.62493133544922,
      "eval_completions/mean_terminated_length": 202.62493133544922,
      "eval_completions/min_length": 116.25,
      "eval_completions/min_terminated_length": 116.25,
      "eval_loss": 0.0,
      "eval_num_tokens": 678198947.0,
      "eval_reward": 1.1124199032783508,
      "eval_reward_std": 0.2762097716331482,
      "eval_rewards/accuracy_reward": 0.41796875,
      "eval_rewards/brier_reward": 0.8068701177835464,
      "eval_rewards/confidence_one_or_zero": 0.001953125,
      "eval_rewards/format_reward": 1.0,
      "eval_rewards/mean_confidence_reward": 0.5002929642796516,
      "eval_runtime": 20.9461,
      "eval_samples_per_second": 23.871,
      "eval_steps_per_second": 0.191,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.000390625,
      "completions/max_length": 1170.6,
      "completions/max_terminated_length": 699.2,
      "completions/mean_length": 200.96318359375,
      "completions/mean_terminated_length": 200.4418518066406,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.656,
      "grad_norm": 0.0061631170101463795,
      "learning_rate": 1e-06,
      "loss": 0.0011,
      "num_tokens": 695113354.0,
      "reward": 1.1497117042541505,
      "reward_std": 0.09324304014444351,
      "rewards/accuracy_reward": 0.50517578125,
      "rewards/brier_reward": 0.7946373224258423,
      "rewards/confidence_one_or_zero": 0.00361328125,
      "rewards/format_reward": 0.999609375,
      "rewards/mean_confidence_reward": 0.5214970707893372,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 9.765625e-05,
      "completions/max_length": 666.0,
      "completions/max_terminated_length": 434.4,
      "completions/mean_length": 199.86826171875,
      "completions/mean_terminated_length": 199.73834228515625,
      "completions/min_length": 101.8,
      "completions/min_terminated_length": 101.8,
      "epoch": 0.672,
      "grad_norm": 0.005808599293231964,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 712073445.0,
      "reward": 1.168108344078064,
      "reward_std": 0.08731473982334137,
      "rewards/accuracy_reward": 0.5232421875,
      "rewards/brier_reward": 0.8130711436271667,
      "rewards/confidence_one_or_zero": 0.003515625,
      "rewards/format_reward": 0.99990234375,
      "rewards/mean_confidence_reward": 0.5128886938095093,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 9.765625e-05,
      "completions/max_length": 675.2,
      "completions/max_terminated_length": 629.4,
      "completions/mean_length": 205.4484375,
      "completions/mean_terminated_length": 205.31842651367188,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.688,
      "grad_norm": 0.005605604965239763,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 729131157.0,
      "reward": 1.1653831958770753,
      "reward_std": 0.09237445890903473,
      "rewards/accuracy_reward": 0.5251953125,
      "rewards/brier_reward": 0.8056677460670472,
      "rewards/confidence_one_or_zero": 0.0005859375,
      "rewards/format_reward": 0.99990234375,
      "rewards/mean_confidence_reward": 0.5064306616783142,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 411.0,
      "completions/max_terminated_length": 411.0,
      "completions/mean_length": 202.72197265625,
      "completions/mean_terminated_length": 202.72197265625,
      "completions/min_length": 104.2,
      "completions/min_terminated_length": 104.2,
      "epoch": 0.704,
      "grad_norm": 0.0074613383039832115,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 746073174.0,
      "reward": 1.176201581954956,
      "reward_std": 0.07743649333715438,
      "rewards/accuracy_reward": 0.53837890625,
      "rewards/brier_reward": 0.8140231966972351,
      "rewards/confidence_one_or_zero": 0.00517578125,
      "rewards/format_reward": 1.0,
      "rewards/mean_confidence_reward": 0.5200888633728027,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 656.2,
      "completions/max_terminated_length": 656.2,
      "completions/mean_length": 202.00751953125,
      "completions/mean_terminated_length": 202.00751953125,
      "completions/min_length": 99.4,
      "completions/min_terminated_length": 99.4,
      "epoch": 0.72,
      "grad_norm": 0.008289781399071217,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 763151587.0,
      "reward": 1.186832022666931,
      "reward_std": 0.08442019075155258,
      "rewards/accuracy_reward": 0.55576171875,
      "rewards/brier_reward": 0.817901360988617,
      "rewards/confidence_one_or_zero": 0.0025390625,
      "rewards/format_reward": 1.0,
      "rewards/mean_confidence_reward": 0.5230859518051147,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0005859375,
      "completions/max_length": 647.4,
      "completions/max_terminated_length": 516.6,
      "completions/mean_length": 205.50849609375,
      "completions/mean_terminated_length": 204.73146057128906,
      "completions/min_length": 97.6,
      "completions/min_terminated_length": 97.6,
      "epoch": 0.736,
      "grad_norm": 0.008065270259976387,
      "learning_rate": 1e-06,
      "loss": 0.0016,
      "num_tokens": 780195578.0,
      "reward": 1.1865583658218384,
      "reward_std": 0.08369777351617813,
      "rewards/accuracy_reward": 0.56474609375,
      "rewards/brier_reward": 0.8089555978775025,
      "rewards/confidence_one_or_zero": 0.003125,
      "rewards/format_reward": 0.9994140625,
      "rewards/mean_confidence_reward": 0.5270449399948121,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 606.4,
      "completions/max_terminated_length": 606.4,
      "completions/mean_length": 206.17548828125,
      "completions/mean_terminated_length": 206.17548828125,
      "completions/min_length": 102.2,
      "completions/min_terminated_length": 102.2,
      "epoch": 0.752,
      "grad_norm": 0.009517704136669636,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 797534015.0,
      "reward": 1.1857917547225951,
      "reward_std": 0.08198632448911666,
      "rewards/accuracy_reward": 0.5568359375,
      "rewards/brier_reward": 0.8151372194290161,
      "rewards/confidence_one_or_zero": 0.005859375,
      "rewards/format_reward": 0.999609375,
      "rewards/mean_confidence_reward": 0.5247080326080322,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.000390625,
      "completions/max_length": 1174.4,
      "completions/max_terminated_length": 875.4,
      "completions/mean_length": 215.8525390625,
      "completions/mean_terminated_length": 215.33623962402345,
      "completions/min_length": 103.4,
      "completions/min_terminated_length": 103.4,
      "epoch": 0.768,
      "grad_norm": 0.02318732999265194,
      "learning_rate": 1e-06,
      "loss": 0.0013,
      "num_tokens": 814677049.0,
      "reward": 1.1634629249572754,
      "reward_std": 0.08838685750961303,
      "rewards/accuracy_reward": 0.50859375,
      "rewards/brier_reward": 0.8190146684646606,
      "rewards/confidence_one_or_zero": 0.00322265625,
      "rewards/format_reward": 0.99931640625,
      "rewards/mean_confidence_reward": 0.535248053073883,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 467.2,
      "completions/max_terminated_length": 467.2,
      "completions/mean_length": 216.6044921875,
      "completions/mean_terminated_length": 216.6044921875,
      "completions/min_length": 108.2,
      "completions/min_terminated_length": 108.2,
      "epoch": 0.784,
      "grad_norm": 0.02482554130256176,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 832069447.0,
      "reward": 1.1771643161773682,
      "reward_std": 0.09045170843601227,
      "rewards/accuracy_reward": 0.55361328125,
      "rewards/brier_reward": 0.8010073304176331,
      "rewards/confidence_one_or_zero": 0.00498046875,
      "rewards/format_reward": 0.99970703125,
      "rewards/mean_confidence_reward": 0.5427734732627869,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 497.2,
      "completions/max_terminated_length": 497.2,
      "completions/mean_length": 218.06123046875,
      "completions/mean_terminated_length": 218.06123046875,
      "completions/min_length": 113.6,
      "completions/min_terminated_length": 113.6,
      "epoch": 0.8,
      "grad_norm": 0.005663494113832712,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 849312954.0,
      "reward": 1.2057418823242188,
      "reward_std": 0.08701288551092148,
      "rewards/accuracy_reward": 0.587109375,
      "rewards/brier_reward": 0.8243734002113342,
      "rewards/confidence_one_or_zero": 0.0033203125,
      "rewards/format_reward": 1.0,
      "rewards/mean_confidence_reward": 0.5567070245742798,
      "step": 250
    },
    {
      "epoch": 0.8,
      "eval_completions/clipped_ratio": 0.0,
      "eval_completions/max_length": 408.0,
      "eval_completions/max_terminated_length": 408.0,
      "eval_completions/mean_length": 220.35998153686523,
      "eval_completions/mean_terminated_length": 220.35998153686523,
      "eval_completions/min_length": 134.5,
      "eval_completions/min_terminated_length": 134.5,
      "eval_loss": 0.0,
      "eval_num_tokens": 849312954.0,
      "eval_reward": 1.118152379989624,
      "eval_reward_std": 0.28543028980493546,
      "eval_rewards/accuracy_reward": 0.435546875,
      "eval_rewards/brier_reward": 0.8007568567991257,
      "eval_rewards/confidence_one_or_zero": 0.001953125,
      "eval_rewards/format_reward": 1.0,
      "eval_rewards/mean_confidence_reward": 0.5190429389476776,
      "eval_runtime": 22.0758,
      "eval_samples_per_second": 22.649,
      "eval_steps_per_second": 0.181,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 425.8,
      "completions/max_terminated_length": 425.8,
      "completions/mean_length": 218.22822265625,
      "completions/mean_terminated_length": 218.22822265625,
      "completions/min_length": 110.4,
      "completions/min_terminated_length": 110.4,
      "epoch": 0.816,
      "grad_norm": 0.021623145788908005,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 866646779.0,
      "reward": 1.1943198680877685,
      "reward_std": 0.08455176651477814,
      "rewards/accuracy_reward": 0.5806640625,
      "rewards/brier_reward": 0.808072280883789,
      "rewards/confidence_one_or_zero": 0.00185546875,
      "rewards/format_reward": 0.99990234375,
      "rewards/mean_confidence_reward": 0.5697656273841858,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 9.765625e-05,
      "completions/max_length": 688.2,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 219.43544921875,
      "completions/mean_terminated_length": 219.30695190429688,
      "completions/min_length": 112.6,
      "completions/min_terminated_length": 112.6,
      "epoch": 0.832,
      "grad_norm": 0.010291030630469322,
      "learning_rate": 1e-06,
      "loss": 0.0003,
      "num_tokens": 883902150.0,
      "reward": 1.1886512756347656,
      "reward_std": 0.08867516815662384,
      "rewards/accuracy_reward": 0.552734375,
      "rewards/brier_reward": 0.8246647596359253,
      "rewards/confidence_one_or_zero": 0.001953125,
      "rewards/format_reward": 0.99990234375,
      "rewards/mean_confidence_reward": 0.5912207126617431,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 444.2,
      "completions/max_terminated_length": 444.2,
      "completions/mean_length": 219.04970703125,
      "completions/mean_terminated_length": 219.04970703125,
      "completions/min_length": 118.6,
      "completions/min_terminated_length": 118.6,
      "epoch": 0.848,
      "grad_norm": 0.007072034757584333,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 901159587.0,
      "reward": 1.1735345602035523,
      "reward_std": 0.08179984986782074,
      "rewards/accuracy_reward": 0.53017578125,
      "rewards/brier_reward": 0.8168921947479248,
      "rewards/confidence_one_or_zero": 0.001171875,
      "rewards/format_reward": 1.0,
      "rewards/mean_confidence_reward": 0.5902080178260803,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 9.765625e-05,
      "completions/max_length": 655.6,
      "completions/max_terminated_length": 498.2,
      "completions/mean_length": 225.3056640625,
      "completions/mean_terminated_length": 225.17782287597657,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "epoch": 0.864,
      "grad_norm": 0.009159094654023647,
      "learning_rate": 1e-06,
      "loss": 0.0004,
      "num_tokens": 918453533.0,
      "reward": 1.199086856842041,
      "reward_std": 0.08750579506158829,
      "rewards/accuracy_reward": 0.5861328125,
      "rewards/brier_reward": 0.8121374368667602,
      "rewards/confidence_one_or_zero": 0.00283203125,
      "rewards/format_reward": 0.99990234375,
      "rewards/mean_confidence_reward": 0.552389633655548,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 488.2,
      "completions/max_terminated_length": 488.2,
      "completions/mean_length": 225.61953125,
      "completions/mean_terminated_length": 225.61953125,
      "completions/min_length": 117.4,
      "completions/min_terminated_length": 117.4,
      "epoch": 0.88,
      "grad_norm": 0.013025188818573952,
      "learning_rate": 1e-06,
      "loss": -0.0002,
      "num_tokens": 935910949.0,
      "reward": 1.1634913206100463,
      "reward_std": 0.08096154034137726,
      "rewards/accuracy_reward": 0.508984375,
      "rewards/brier_reward": 0.8179973006248474,
      "rewards/confidence_one_or_zero": 0.001953125,
      "rewards/format_reward": 1.0,
      "rewards/mean_confidence_reward": 0.5161884605884552,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00087890625,
      "completions/max_length": 1493.2,
      "completions/max_terminated_length": 1388.4,
      "completions/mean_length": 230.47041015625,
      "completions/mean_terminated_length": 229.32456970214844,
      "completions/min_length": 111.4,
      "completions/min_terminated_length": 111.4,
      "epoch": 0.896,
      "grad_norm": 0.013893580064177513,
      "learning_rate": 1e-06,
      "loss": 0.002,
      "num_tokens": 953381814.0,
      "reward": 1.171729063987732,
      "reward_std": 0.07993723750114441,
      "rewards/accuracy_reward": 0.53193359375,
      "rewards/brier_reward": 0.8125000953674316,
      "rewards/confidence_one_or_zero": 0.00244140625,
      "rewards/format_reward": 0.9990234375,
      "rewards/mean_confidence_reward": 0.5306591987609863,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00185546875,
      "completions/max_length": 1531.0,
      "completions/max_terminated_length": 1445.0,
      "completions/mean_length": 230.8126953125,
      "completions/mean_terminated_length": 228.377685546875,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.912,
      "grad_norm": 0.036593444645404816,
      "learning_rate": 1e-06,
      "loss": 0.0051,
      "num_tokens": 970796632.0,
      "reward": 1.1708195447921752,
      "reward_std": 0.09443000853061675,
      "rewards/accuracy_reward": 0.529296875,
      "rewards/brier_reward": 0.8148803591728211,
      "rewards/confidence_one_or_zero": 0.00146484375,
      "rewards/format_reward": 0.9974609375,
      "rewards/mean_confidence_reward": 0.5297421932220459,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00048828125,
      "completions/max_length": 1076.0,
      "completions/max_terminated_length": 947.4,
      "completions/mean_length": 224.44541015625,
      "completions/mean_terminated_length": 223.80473937988282,
      "completions/min_length": 112.2,
      "completions/min_terminated_length": 112.2,
      "epoch": 0.928,
      "grad_norm": 0.032313406467437744,
      "learning_rate": 1e-06,
      "loss": 0.0014,
      "num_tokens": 988121769.0,
      "reward": 1.1626015663146974,
      "reward_std": 0.08359554558992385,
      "rewards/accuracy_reward": 0.52421875,
      "rewards/brier_reward": 0.8017645239830017,
      "rewards/confidence_one_or_zero": 9.765625e-05,
      "rewards/format_reward": 0.99921875,
      "rewards/mean_confidence_reward": 0.5579541087150574,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 9.765625e-05,
      "completions/max_length": 1049.2,
      "completions/max_terminated_length": 1025.4,
      "completions/mean_length": 229.90673828125,
      "completions/mean_terminated_length": 229.7791961669922,
      "completions/min_length": 100.6,
      "completions/min_terminated_length": 100.6,
      "epoch": 0.944,
      "grad_norm": 0.03256411850452423,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 1005451438.0,
      "reward": 1.1718948125839233,
      "reward_std": 0.09756192564964294,
      "rewards/accuracy_reward": 0.53125,
      "rewards/brier_reward": 0.8127339124679566,
      "rewards/confidence_one_or_zero": 0.00029296875,
      "rewards/format_reward": 0.9998046875,
      "rewards/mean_confidence_reward": 0.5793359398841857,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 664.8,
      "completions/max_terminated_length": 664.8,
      "completions/mean_length": 237.31884765625,
      "completions/mean_terminated_length": 237.31884765625,
      "completions/min_length": 110.4,
      "completions/min_terminated_length": 110.4,
      "epoch": 0.96,
      "grad_norm": 0.005703456234186888,
      "learning_rate": 1e-06,
      "loss": -0.0003,
      "num_tokens": 1022821903.0,
      "reward": 1.1647626161575317,
      "reward_std": 0.07187836617231369,
      "rewards/accuracy_reward": 0.51796875,
      "rewards/brier_reward": 0.8118483901023865,
      "rewards/confidence_one_or_zero": 0.00029296875,
      "rewards/format_reward": 0.99970703125,
      "rewards/mean_confidence_reward": 0.5718408465385437,
      "step": 300
    },
    {
      "epoch": 0.96,
      "eval_completions/clipped_ratio": 0.0,
      "eval_completions/max_length": 407.0,
      "eval_completions/max_terminated_length": 407.0,
      "eval_completions/mean_length": 245.04660415649414,
      "eval_completions/mean_terminated_length": 245.04660415649414,
      "eval_completions/min_length": 155.25,
      "eval_completions/min_terminated_length": 155.25,
      "eval_loss": 0.0,
      "eval_num_tokens": 1022821903.0,
      "eval_reward": 1.12132129073143,
      "eval_reward_std": 0.27718404680490494,
      "eval_rewards/accuracy_reward": 0.43359375,
      "eval_rewards/brier_reward": 0.8090478628873825,
      "eval_rewards/confidence_one_or_zero": 0.001953125,
      "eval_rewards/format_reward": 1.0,
      "eval_rewards/mean_confidence_reward": 0.508496105670929,
      "eval_runtime": 22.3953,
      "eval_samples_per_second": 22.326,
      "eval_steps_per_second": 0.179,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0001953125,
      "completions/max_length": 980.4,
      "completions/max_terminated_length": 564.8,
      "completions/mean_length": 244.62080078125,
      "completions/mean_terminated_length": 244.36865234375,
      "completions/min_length": 118.8,
      "completions/min_terminated_length": 118.8,
      "epoch": 0.976,
      "grad_norm": 0.01138161402195692,
      "learning_rate": 1e-06,
      "loss": 0.0007,
      "num_tokens": 1040187940.0,
      "reward": 1.1814841508865357,
      "reward_std": 0.07296017110347748,
      "rewards/accuracy_reward": 0.5513671875,
      "rewards/brier_reward": 0.8119907259941102,
      "rewards/confidence_one_or_zero": 0.0001953125,
      "rewards/format_reward": 0.999609375,
      "rewards/mean_confidence_reward": 0.5193847775459289,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 682.0,
      "completions/max_terminated_length": 682.0,
      "completions/mean_length": 238.13291015625,
      "completions/mean_terminated_length": 238.13291015625,
      "completions/min_length": 127.2,
      "completions/min_terminated_length": 127.2,
      "epoch": 0.992,
      "grad_norm": 0.006512052845209837,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 1057754901.0,
      "reward": 1.1691525459289551,
      "reward_std": 0.06929974779486656,
      "rewards/accuracy_reward": 0.5232421875,
      "rewards/brier_reward": 0.8151596069335938,
      "rewards/confidence_one_or_zero": 0.00087890625,
      "rewards/format_reward": 0.99990234375,
      "rewards/mean_confidence_reward": 0.4931289255619049,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 559.5,
      "completions/max_terminated_length": 559.5,
      "completions/mean_length": 235.83663940429688,
      "completions/mean_terminated_length": 235.83663940429688,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.9984,
      "num_tokens": 1064720670.0,
      "reward": 1.1648695468902588,
      "reward_std": 0.073847196996212,
      "rewards/accuracy_reward": 0.54248046875,
      "rewards/brier_reward": 0.787746012210846,
      "rewards/confidence_one_or_zero": 0.000244140625,
      "rewards/format_reward": 0.99951171875,
      "rewards/mean_confidence_reward": 0.5165649354457855,
      "step": 312,
      "total_flos": 0.0,
      "train_loss": 0.005198169167087121,
      "train_runtime": 92786.4477,
      "train_samples_per_second": 0.216,
      "train_steps_per_second": 0.003
    }
  ],
  "logging_steps": 5,
  "max_steps": 312,
  "num_input_tokens_seen": 1064720670,
  "num_train_epochs": 1,
  "save_steps": 60,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}