File size: 50,147 Bytes
8f2a06c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8028904054596547,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 155.3333339691162,
      "epoch": 0.008028904054596548,
      "grad_norm": 0.8266991671440796,
      "kl": 0.0,
      "learning_rate": 3.0000000000000004e-08,
      "loss": 0.0,
      "reward": 2.3694444835186004,
      "reward_std": 0.706648226082325,
      "rewards/accuracy_reward_log": 1.4888889163732528,
      "rewards/format_number_reward": 0.4138888940215111,
      "rewards/format_reasoning_reward": 0.46666666865348816,
      "step": 1
    },
    {
      "completion_length": 177.53333435058593,
      "epoch": 0.016057808109193095,
      "grad_norm": 0.853670554176349,
      "kl": 0.0,
      "learning_rate": 6.000000000000001e-08,
      "loss": 0.0,
      "reward": 2.3472222685813904,
      "reward_std": 0.6901098385453224,
      "rewards/accuracy_reward_log": 1.500000025331974,
      "rewards/format_number_reward": 0.4000000059604645,
      "rewards/format_reasoning_reward": 0.4472222253680229,
      "step": 2
    },
    {
      "completion_length": 160.18333435058594,
      "epoch": 0.02408671216378964,
      "grad_norm": 0.6327729129725634,
      "kl": 0.00010757744312286376,
      "learning_rate": 9e-08,
      "loss": 0.0,
      "reward": 2.4138889491558073,
      "reward_std": 0.5638244189321995,
      "rewards/accuracy_reward_log": 1.5222222447395324,
      "rewards/format_number_reward": 0.4194444492459297,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 3
    },
    {
      "completion_length": 167.59444580078124,
      "epoch": 0.03211561621838619,
      "grad_norm": 0.6798573023319466,
      "kl": 0.00010120868682861328,
      "learning_rate": 1.2000000000000002e-07,
      "loss": 0.0,
      "reward": 2.20833335518837,
      "reward_std": 0.5129089742898941,
      "rewards/accuracy_reward_log": 1.333333359658718,
      "rewards/format_number_reward": 0.4027777835726738,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 4
    },
    {
      "completion_length": 147.56666870117186,
      "epoch": 0.04014452027298274,
      "grad_norm": 0.5872669908882053,
      "kl": 9.968876838684082e-05,
      "learning_rate": 1.5000000000000002e-07,
      "loss": 0.0,
      "reward": 2.4361111402511595,
      "reward_std": 0.48325310088694096,
      "rewards/accuracy_reward_log": 1.5333333551883697,
      "rewards/format_number_reward": 0.42222222685813904,
      "rewards/format_reasoning_reward": 0.48055555671453476,
      "step": 5
    },
    {
      "completion_length": 168.4333324432373,
      "epoch": 0.04817342432757928,
      "grad_norm": 0.842231210713151,
      "kl": 0.000108298659324646,
      "learning_rate": 1.8e-07,
      "loss": 0.0,
      "reward": 1.9972222298383713,
      "reward_std": 0.6861602704972029,
      "rewards/accuracy_reward_log": 1.188888917118311,
      "rewards/format_number_reward": 0.35555556118488313,
      "rewards/format_reasoning_reward": 0.45277778059244156,
      "step": 6
    },
    {
      "completion_length": 161.47222175598145,
      "epoch": 0.056202328382175835,
      "grad_norm": 0.6301911132102126,
      "kl": 0.00010362863540649414,
      "learning_rate": 2.1000000000000003e-07,
      "loss": 0.0,
      "reward": 2.344444477558136,
      "reward_std": 0.5787256445735693,
      "rewards/accuracy_reward_log": 1.455555585026741,
      "rewards/format_number_reward": 0.41111111640930176,
      "rewards/format_reasoning_reward": 0.47777777910232544,
      "step": 7
    },
    {
      "completion_length": 178.705558013916,
      "epoch": 0.06423123243677238,
      "grad_norm": 0.7904000046666478,
      "kl": 0.00015439987182617188,
      "learning_rate": 2.4000000000000003e-07,
      "loss": 0.0,
      "reward": 2.033333358168602,
      "reward_std": 0.6374398373067379,
      "rewards/accuracy_reward_log": 1.2000000223517417,
      "rewards/format_number_reward": 0.37777778320014477,
      "rewards/format_reasoning_reward": 0.4555555582046509,
      "step": 8
    },
    {
      "completion_length": 154.6055564880371,
      "epoch": 0.07226013649136893,
      "grad_norm": 0.8113384704385274,
      "kl": 0.00010915249586105347,
      "learning_rate": 2.7e-07,
      "loss": 0.0,
      "reward": 2.4722222566604612,
      "reward_std": 0.38092764765024184,
      "rewards/accuracy_reward_log": 1.5777777969837188,
      "rewards/format_number_reward": 0.4194444492459297,
      "rewards/format_reasoning_reward": 0.4750000014901161,
      "step": 9
    },
    {
      "completion_length": 155.86111221313476,
      "epoch": 0.08028904054596547,
      "grad_norm": 0.6734680070959974,
      "kl": 0.00010448098182678223,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 0.0,
      "reward": 2.3611111462116243,
      "reward_std": 0.5814844127744436,
      "rewards/accuracy_reward_log": 1.4777778029441833,
      "rewards/format_number_reward": 0.413888893276453,
      "rewards/format_reasoning_reward": 0.4694444462656975,
      "step": 10
    },
    {
      "completion_length": 167.8611110687256,
      "epoch": 0.08831794460056203,
      "grad_norm": 0.6192549717392329,
      "kl": 0.0002144932746887207,
      "learning_rate": 3.3e-07,
      "loss": 0.0,
      "reward": 2.3972222745418548,
      "reward_std": 0.5403781462460756,
      "rewards/accuracy_reward_log": 1.511111134290695,
      "rewards/format_number_reward": 0.4194444492459297,
      "rewards/format_reasoning_reward": 0.46666666865348816,
      "step": 11
    },
    {
      "completion_length": 159.29444694519043,
      "epoch": 0.09634684865515857,
      "grad_norm": 0.6219588398194654,
      "kl": 0.00014747381210327147,
      "learning_rate": 3.6e-07,
      "loss": 0.0,
      "reward": 2.22777781188488,
      "reward_std": 0.6599382009357214,
      "rewards/accuracy_reward_log": 1.3777778007090091,
      "rewards/format_number_reward": 0.3805555604398251,
      "rewards/format_reasoning_reward": 0.4694444462656975,
      "step": 12
    },
    {
      "completion_length": 163.36666717529297,
      "epoch": 0.10437575270975512,
      "grad_norm": 0.5303599869712791,
      "kl": 0.0001802980899810791,
      "learning_rate": 3.9e-07,
      "loss": 0.0,
      "reward": 2.388888931274414,
      "reward_std": 0.38289184793829917,
      "rewards/accuracy_reward_log": 1.4888889119029045,
      "rewards/format_number_reward": 0.4111111145466566,
      "rewards/format_reasoning_reward": 0.4888888895511627,
      "step": 13
    },
    {
      "completion_length": 153.45000038146972,
      "epoch": 0.11240465676435167,
      "grad_norm": 0.5530377333718409,
      "kl": 0.0004338264465332031,
      "learning_rate": 4.2000000000000006e-07,
      "loss": 0.0,
      "reward": 2.569444465637207,
      "reward_std": 0.42339018881320956,
      "rewards/accuracy_reward_log": 1.6333333551883698,
      "rewards/format_number_reward": 0.45277778059244156,
      "rewards/format_reasoning_reward": 0.4833333343267441,
      "step": 14
    },
    {
      "completion_length": 165.07777938842773,
      "epoch": 0.12043356081894821,
      "grad_norm": 0.49102029519105816,
      "kl": 0.0006180524826049805,
      "learning_rate": 4.5e-07,
      "loss": 0.0,
      "reward": 2.597222250699997,
      "reward_std": 0.3478126596659422,
      "rewards/accuracy_reward_log": 1.6444444566965104,
      "rewards/format_number_reward": 0.46388889104127884,
      "rewards/format_reasoning_reward": 0.4888888895511627,
      "step": 15
    },
    {
      "completion_length": 164.92778129577636,
      "epoch": 0.12846246487354476,
      "grad_norm": 0.6338598904220616,
      "kl": 0.0007488012313842773,
      "learning_rate": 4.800000000000001e-07,
      "loss": 0.0,
      "reward": 2.319444465637207,
      "reward_std": 0.5170031324028969,
      "rewards/accuracy_reward_log": 1.4222222447395325,
      "rewards/format_number_reward": 0.4277777820825577,
      "rewards/format_reasoning_reward": 0.4694444462656975,
      "step": 16
    },
    {
      "completion_length": 156.144446182251,
      "epoch": 0.13649136892814132,
      "grad_norm": 0.5574851133438189,
      "kl": 0.0012537002563476562,
      "learning_rate": 5.100000000000001e-07,
      "loss": 0.0001,
      "reward": 2.5555556058883666,
      "reward_std": 0.44343185126781465,
      "rewards/accuracy_reward_log": 1.6222222417593002,
      "rewards/format_number_reward": 0.45000000223517417,
      "rewards/format_reasoning_reward": 0.4833333343267441,
      "step": 17
    },
    {
      "completion_length": 153.38888931274414,
      "epoch": 0.14452027298273787,
      "grad_norm": 0.6029065735373917,
      "kl": 0.001799297332763672,
      "learning_rate": 5.4e-07,
      "loss": 0.0001,
      "reward": 2.4750000178813933,
      "reward_std": 0.5368539825081825,
      "rewards/accuracy_reward_log": 1.5444444686174392,
      "rewards/format_number_reward": 0.45000000298023224,
      "rewards/format_reasoning_reward": 0.48055555671453476,
      "step": 18
    },
    {
      "completion_length": 170.45000228881835,
      "epoch": 0.1525491770373344,
      "grad_norm": 0.6238637844164427,
      "kl": 0.0022980213165283204,
      "learning_rate": 5.7e-07,
      "loss": 0.0001,
      "reward": 2.3472222328186034,
      "reward_std": 0.5638264730572701,
      "rewards/accuracy_reward_log": 1.4444444686174394,
      "rewards/format_number_reward": 0.4333333373069763,
      "rewards/format_reasoning_reward": 0.4694444462656975,
      "step": 19
    },
    {
      "completion_length": 146.12777786254884,
      "epoch": 0.16057808109193095,
      "grad_norm": 0.4196477311949672,
      "kl": 0.00455629825592041,
      "learning_rate": 6.000000000000001e-07,
      "loss": 0.0002,
      "reward": 2.5250000238418577,
      "reward_std": 0.27503596656024454,
      "rewards/accuracy_reward_log": 1.5888888999819755,
      "rewards/format_number_reward": 0.44722222462296485,
      "rewards/format_reasoning_reward": 0.4888888895511627,
      "step": 20
    },
    {
      "completion_length": 179.50000228881837,
      "epoch": 0.1686069851465275,
      "grad_norm": 0.5891348987729047,
      "kl": 0.0013088226318359376,
      "learning_rate": 6.3e-07,
      "loss": 0.0001,
      "reward": 2.4500000417232513,
      "reward_std": 0.5685264855623245,
      "rewards/accuracy_reward_log": 1.5666666895151138,
      "rewards/format_number_reward": 0.43611111491918564,
      "rewards/format_reasoning_reward": 0.4472222253680229,
      "step": 21
    },
    {
      "completion_length": 167.82222518920898,
      "epoch": 0.17663588920112405,
      "grad_norm": 0.5080236888533528,
      "kl": 0.0015691757202148438,
      "learning_rate": 6.6e-07,
      "loss": 0.0001,
      "reward": 2.4805555999279023,
      "reward_std": 0.38282057382166385,
      "rewards/accuracy_reward_log": 1.5333333551883697,
      "rewards/format_number_reward": 0.4694444462656975,
      "rewards/format_reasoning_reward": 0.47777777910232544,
      "step": 22
    },
    {
      "completion_length": 159.62222480773926,
      "epoch": 0.1846647932557206,
      "grad_norm": 0.5410426478942184,
      "kl": 0.0024078369140625,
      "learning_rate": 6.900000000000001e-07,
      "loss": 0.0001,
      "reward": 2.6277777940034865,
      "reward_std": 0.332701800763607,
      "rewards/accuracy_reward_log": 1.6777777917683125,
      "rewards/format_number_reward": 0.46388889029622077,
      "rewards/format_reasoning_reward": 0.4861111119389534,
      "step": 23
    },
    {
      "completion_length": 157.90000114440917,
      "epoch": 0.19269369731031713,
      "grad_norm": 0.5882381452711394,
      "kl": 0.002088165283203125,
      "learning_rate": 7.2e-07,
      "loss": 0.0001,
      "reward": 2.6722222566604614,
      "reward_std": 0.38149141892790794,
      "rewards/accuracy_reward_log": 1.7111111283302307,
      "rewards/format_number_reward": 0.48055555671453476,
      "rewards/format_reasoning_reward": 0.48055555671453476,
      "step": 24
    },
    {
      "completion_length": 148.97222328186035,
      "epoch": 0.20072260136491368,
      "grad_norm": 0.46880008245690374,
      "kl": 0.002790069580078125,
      "learning_rate": 7.5e-07,
      "loss": 0.0001,
      "reward": 2.6666667222976685,
      "reward_std": 0.42418686002492906,
      "rewards/accuracy_reward_log": 1.7111111283302307,
      "rewards/format_number_reward": 0.46388889104127884,
      "rewards/format_reasoning_reward": 0.49166666716337204,
      "step": 25
    },
    {
      "completion_length": 166.62222595214843,
      "epoch": 0.20875150541951024,
      "grad_norm": 0.6233073148075161,
      "kl": 0.003025054931640625,
      "learning_rate": 7.8e-07,
      "loss": 0.0001,
      "reward": 2.544444477558136,
      "reward_std": 0.5540193915367126,
      "rewards/accuracy_reward_log": 1.6111111342906952,
      "rewards/format_number_reward": 0.45277778059244156,
      "rewards/format_reasoning_reward": 0.48055555671453476,
      "step": 26
    },
    {
      "completion_length": 153.51666755676268,
      "epoch": 0.2167804094741068,
      "grad_norm": 0.4559712721597118,
      "kl": 0.003208160400390625,
      "learning_rate": 8.100000000000001e-07,
      "loss": 0.0001,
      "reward": 2.5555555880069734,
      "reward_std": 0.34781265556812285,
      "rewards/accuracy_reward_log": 1.5888889104127883,
      "rewards/format_number_reward": 0.4750000014901161,
      "rewards/format_reasoning_reward": 0.49166666716337204,
      "step": 27
    },
    {
      "completion_length": 172.8500011444092,
      "epoch": 0.22480931352870334,
      "grad_norm": 0.5449553128247491,
      "kl": 0.0030437469482421874,
      "learning_rate": 8.400000000000001e-07,
      "loss": 0.0001,
      "reward": 2.4750000476837157,
      "reward_std": 0.3911139152944088,
      "rewards/accuracy_reward_log": 1.544444465637207,
      "rewards/format_number_reward": 0.45000000223517417,
      "rewards/format_reasoning_reward": 0.48055555671453476,
      "step": 28
    },
    {
      "completion_length": 164.09999961853026,
      "epoch": 0.23283821758329987,
      "grad_norm": 0.5619230788736052,
      "kl": 0.00468292236328125,
      "learning_rate": 8.699999999999999e-07,
      "loss": 0.0002,
      "reward": 2.502777820825577,
      "reward_std": 0.46127643398940565,
      "rewards/accuracy_reward_log": 1.5666666895151138,
      "rewards/format_number_reward": 0.46388889104127884,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 29
    },
    {
      "completion_length": 151.90000228881837,
      "epoch": 0.24086712163789642,
      "grad_norm": 0.7524148953577687,
      "kl": 0.0029834747314453126,
      "learning_rate": 9e-07,
      "loss": 0.0001,
      "reward": 2.7583333969116213,
      "reward_std": 0.31546305269002917,
      "rewards/accuracy_reward_log": 1.7777777910232544,
      "rewards/format_number_reward": 0.4861111119389534,
      "rewards/format_reasoning_reward": 0.49444444477558136,
      "step": 30
    },
    {
      "completion_length": 150.7055564880371,
      "epoch": 0.24889602569249297,
      "grad_norm": 1.70651807869166,
      "kl": 0.013134002685546875,
      "learning_rate": 9.3e-07,
      "loss": 0.0005,
      "reward": 2.8583333492279053,
      "reward_std": 0.1264950528740883,
      "rewards/accuracy_reward_log": 1.8666666746139526,
      "rewards/format_number_reward": 0.49444444477558136,
      "rewards/format_reasoning_reward": 0.4972222223877907,
      "step": 31
    },
    {
      "completion_length": 169.57222366333008,
      "epoch": 0.2569249297470895,
      "grad_norm": 0.41158376714433204,
      "kl": 0.004754638671875,
      "learning_rate": 9.600000000000001e-07,
      "loss": 0.0002,
      "reward": 2.6888889491558077,
      "reward_std": 0.29488888159394266,
      "rewards/accuracy_reward_log": 1.722222238779068,
      "rewards/format_number_reward": 0.4833333343267441,
      "rewards/format_reasoning_reward": 0.4833333343267441,
      "step": 32
    },
    {
      "completion_length": 160.33333473205568,
      "epoch": 0.2649538338016861,
      "grad_norm": 0.5042425185498995,
      "kl": 0.005771636962890625,
      "learning_rate": 9.9e-07,
      "loss": 0.0002,
      "reward": 2.6055556178092956,
      "reward_std": 0.33884221240878104,
      "rewards/accuracy_reward_log": 1.644444465637207,
      "rewards/format_number_reward": 0.4750000014901161,
      "rewards/format_reasoning_reward": 0.4861111119389534,
      "step": 33
    },
    {
      "completion_length": 165.0555561065674,
      "epoch": 0.27298273785628263,
      "grad_norm": 0.5229992920065437,
      "kl": 0.006603240966796875,
      "learning_rate": 1.0200000000000002e-06,
      "loss": 0.0003,
      "reward": 2.663888943195343,
      "reward_std": 0.4261931136250496,
      "rewards/accuracy_reward_log": 1.6888889074325562,
      "rewards/format_number_reward": 0.4833333343267441,
      "rewards/format_reasoning_reward": 0.49166666716337204,
      "step": 34
    },
    {
      "completion_length": 164.63888969421387,
      "epoch": 0.2810116419108792,
      "grad_norm": 0.4700519781333619,
      "kl": 0.006055450439453125,
      "learning_rate": 1.05e-06,
      "loss": 0.0002,
      "reward": 2.6194444715976717,
      "reward_std": 0.34433056265115736,
      "rewards/accuracy_reward_log": 1.6444444596767425,
      "rewards/format_number_reward": 0.4861111119389534,
      "rewards/format_reasoning_reward": 0.4888888895511627,
      "step": 35
    },
    {
      "completion_length": 195.3277786254883,
      "epoch": 0.28904054596547574,
      "grad_norm": 0.5312518629436677,
      "kl": 0.005257415771484375,
      "learning_rate": 1.08e-06,
      "loss": 0.0002,
      "reward": 2.4138889372348786,
      "reward_std": 0.4737018562853336,
      "rewards/accuracy_reward_log": 1.5000000193715095,
      "rewards/format_number_reward": 0.45277778059244156,
      "rewards/format_reasoning_reward": 0.4611111134290695,
      "step": 36
    },
    {
      "completion_length": 162.9777805328369,
      "epoch": 0.29706945002007223,
      "grad_norm": 0.3947867705421231,
      "kl": 0.00660400390625,
      "learning_rate": 1.11e-06,
      "loss": 0.0003,
      "reward": 2.8111111521720886,
      "reward_std": 0.21169509664177893,
      "rewards/accuracy_reward_log": 1.8222222328186035,
      "rewards/format_number_reward": 0.49166666716337204,
      "rewards/format_reasoning_reward": 0.4972222223877907,
      "step": 37
    },
    {
      "completion_length": 171.81111373901368,
      "epoch": 0.3050983540746688,
      "grad_norm": 0.49680622935913366,
      "kl": 0.0083709716796875,
      "learning_rate": 1.14e-06,
      "loss": 0.0003,
      "reward": 2.600000059604645,
      "reward_std": 0.3381901502609253,
      "rewards/accuracy_reward_log": 1.6333333551883698,
      "rewards/format_number_reward": 0.47777777910232544,
      "rewards/format_reasoning_reward": 0.4888888895511627,
      "step": 38
    },
    {
      "completion_length": 159.78333473205566,
      "epoch": 0.31312725812926534,
      "grad_norm": 0.4841125000891351,
      "kl": 0.01015167236328125,
      "learning_rate": 1.17e-06,
      "loss": 0.0004,
      "reward": 2.7166666865348814,
      "reward_std": 0.25784134939312936,
      "rewards/accuracy_reward_log": 1.7555555701255798,
      "rewards/format_number_reward": 0.4750000014901161,
      "rewards/format_reasoning_reward": 0.4861111119389534,
      "step": 39
    },
    {
      "completion_length": 160.47777938842773,
      "epoch": 0.3211561621838619,
      "grad_norm": 0.4051878172293322,
      "kl": 0.00718231201171875,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 0.0003,
      "reward": 2.6527778089046476,
      "reward_std": 0.27083261907100675,
      "rewards/accuracy_reward_log": 1.6777777969837189,
      "rewards/format_number_reward": 0.4861111119389534,
      "rewards/format_reasoning_reward": 0.4888888895511627,
      "step": 40
    },
    {
      "completion_length": 164.8000015258789,
      "epoch": 0.32918506623845845,
      "grad_norm": 0.474618622817464,
      "kl": 0.006967926025390625,
      "learning_rate": 1.2299999999999999e-06,
      "loss": 0.0003,
      "reward": 2.7750000238418577,
      "reward_std": 0.24537386298179625,
      "rewards/accuracy_reward_log": 1.800000011920929,
      "rewards/format_number_reward": 0.4861111119389534,
      "rewards/format_reasoning_reward": 0.4888888895511627,
      "step": 41
    },
    {
      "completion_length": 166.5111141204834,
      "epoch": 0.337213970293055,
      "grad_norm": 0.3629585253598538,
      "kl": 0.007363128662109375,
      "learning_rate": 1.26e-06,
      "loss": 0.0003,
      "reward": 2.63055557012558,
      "reward_std": 0.22272009253501893,
      "rewards/accuracy_reward_log": 1.6555555701255797,
      "rewards/format_number_reward": 0.4861111119389534,
      "rewards/format_reasoning_reward": 0.4888888895511627,
      "step": 42
    },
    {
      "completion_length": 166.7388900756836,
      "epoch": 0.34524287434765155,
      "grad_norm": 0.4008326525358567,
      "kl": 0.009012603759765625,
      "learning_rate": 1.29e-06,
      "loss": 0.0004,
      "reward": 2.708333361148834,
      "reward_std": 0.29148011431097987,
      "rewards/accuracy_reward_log": 1.7444444596767426,
      "rewards/format_number_reward": 0.47777777910232544,
      "rewards/format_reasoning_reward": 0.4861111119389534,
      "step": 43
    },
    {
      "completion_length": 185.93888893127442,
      "epoch": 0.3532717784022481,
      "grad_norm": 0.5017342406963871,
      "kl": 0.00979156494140625,
      "learning_rate": 1.32e-06,
      "loss": 0.0004,
      "reward": 2.5722222566604613,
      "reward_std": 0.3904368232935667,
      "rewards/accuracy_reward_log": 1.6555555760860443,
      "rewards/format_number_reward": 0.4472222253680229,
      "rewards/format_reasoning_reward": 0.4694444462656975,
      "step": 44
    },
    {
      "completion_length": 170.97777862548827,
      "epoch": 0.36130068245684466,
      "grad_norm": 0.36508724491351696,
      "kl": 0.00971527099609375,
      "learning_rate": 1.35e-06,
      "loss": 0.0004,
      "reward": 2.680555593967438,
      "reward_std": 0.1924500897526741,
      "rewards/accuracy_reward_log": 1.7111111253499984,
      "rewards/format_number_reward": 0.48055555671453476,
      "rewards/format_reasoning_reward": 0.4888888895511627,
      "step": 45
    },
    {
      "completion_length": 200.09444770812988,
      "epoch": 0.3693295865114412,
      "grad_norm": 0.5859797241606328,
      "kl": 0.01236724853515625,
      "learning_rate": 1.3800000000000001e-06,
      "loss": 0.0005,
      "reward": 2.4972222447395325,
      "reward_std": 0.4179751716554165,
      "rewards/accuracy_reward_log": 1.588888904452324,
      "rewards/format_number_reward": 0.4527777798473835,
      "rewards/format_reasoning_reward": 0.4555555582046509,
      "step": 46
    },
    {
      "completion_length": 190.55000267028808,
      "epoch": 0.37735849056603776,
      "grad_norm": 0.6735251651031329,
      "kl": 0.013592529296875,
      "learning_rate": 1.41e-06,
      "loss": 0.0005,
      "reward": 2.525000035762787,
      "reward_std": 0.39592516496777536,
      "rewards/accuracy_reward_log": 1.6000000178813933,
      "rewards/format_number_reward": 0.4611111134290695,
      "rewards/format_reasoning_reward": 0.46388889104127884,
      "step": 47
    },
    {
      "completion_length": 177.14444732666016,
      "epoch": 0.38538739462063426,
      "grad_norm": 0.4013696331659362,
      "kl": 0.01161346435546875,
      "learning_rate": 1.44e-06,
      "loss": 0.0005,
      "reward": 2.725000035762787,
      "reward_std": 0.30932264029979706,
      "rewards/accuracy_reward_log": 1.7666666805744171,
      "rewards/format_number_reward": 0.4750000014901161,
      "rewards/format_reasoning_reward": 0.4833333343267441,
      "step": 48
    },
    {
      "completion_length": 183.58333511352538,
      "epoch": 0.3934162986752308,
      "grad_norm": 0.413245950296645,
      "kl": 0.0140655517578125,
      "learning_rate": 1.4700000000000001e-06,
      "loss": 0.0006,
      "reward": 2.547222238779068,
      "reward_std": 0.28799803368747234,
      "rewards/accuracy_reward_log": 1.6000000208616256,
      "rewards/format_number_reward": 0.4694444462656975,
      "rewards/format_reasoning_reward": 0.47777777910232544,
      "step": 49
    },
    {
      "completion_length": 179.96666717529297,
      "epoch": 0.40144520272982737,
      "grad_norm": 0.41408601639844395,
      "kl": 0.01335296630859375,
      "learning_rate": 1.5e-06,
      "loss": 0.0005,
      "reward": 2.658333349227905,
      "reward_std": 0.3410413548350334,
      "rewards/accuracy_reward_log": 1.7222222357988357,
      "rewards/format_number_reward": 0.46388889104127884,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 50
    },
    {
      "completion_length": 185.75556030273438,
      "epoch": 0.4094741067844239,
      "grad_norm": 0.4053783571755214,
      "kl": 0.0148193359375,
      "learning_rate": 1.53e-06,
      "loss": 0.0006,
      "reward": 2.6555556058883667,
      "reward_std": 0.35194680131971834,
      "rewards/accuracy_reward_log": 1.722222238779068,
      "rewards/format_number_reward": 0.46388889104127884,
      "rewards/format_reasoning_reward": 0.4694444462656975,
      "step": 51
    },
    {
      "completion_length": 209.23333358764648,
      "epoch": 0.41750301083902047,
      "grad_norm": 0.55024118806201,
      "kl": 0.018719482421875,
      "learning_rate": 1.56e-06,
      "loss": 0.0007,
      "reward": 2.4083333551883697,
      "reward_std": 0.5498852420598268,
      "rewards/accuracy_reward_log": 1.5333333522081376,
      "rewards/format_number_reward": 0.430555559694767,
      "rewards/format_reasoning_reward": 0.4444444477558136,
      "step": 52
    },
    {
      "completion_length": 201.15000343322754,
      "epoch": 0.425531914893617,
      "grad_norm": 0.4665326355575322,
      "kl": 0.01983795166015625,
      "learning_rate": 1.59e-06,
      "loss": 0.0008,
      "reward": 2.5222222208976746,
      "reward_std": 0.5347743809223175,
      "rewards/accuracy_reward_log": 1.6333333492279052,
      "rewards/format_number_reward": 0.4416666693985462,
      "rewards/format_reasoning_reward": 0.44722222462296485,
      "step": 53
    },
    {
      "completion_length": 192.3111145019531,
      "epoch": 0.4335608189482136,
      "grad_norm": 0.4264344321145415,
      "kl": 0.02133026123046875,
      "learning_rate": 1.6200000000000002e-06,
      "loss": 0.0009,
      "reward": 2.566666692495346,
      "reward_std": 0.47150271385908127,
      "rewards/accuracy_reward_log": 1.6555555760860443,
      "rewards/format_number_reward": 0.45277778059244156,
      "rewards/format_reasoning_reward": 0.4583333358168602,
      "step": 54
    },
    {
      "completion_length": 207.86111373901366,
      "epoch": 0.44158972300281013,
      "grad_norm": 0.41848310682566847,
      "kl": 0.0194488525390625,
      "learning_rate": 1.65e-06,
      "loss": 0.0008,
      "reward": 2.525000047683716,
      "reward_std": 0.5073827020823956,
      "rewards/accuracy_reward_log": 1.6333333492279052,
      "rewards/format_number_reward": 0.4444444477558136,
      "rewards/format_reasoning_reward": 0.4472222253680229,
      "step": 55
    },
    {
      "completion_length": 218.4888916015625,
      "epoch": 0.4496186270574067,
      "grad_norm": 0.493209444659726,
      "kl": 0.0301055908203125,
      "learning_rate": 1.6800000000000002e-06,
      "loss": 0.0012,
      "reward": 2.4250000417232513,
      "reward_std": 0.6275906786322594,
      "rewards/accuracy_reward_log": 1.5777777969837188,
      "rewards/format_number_reward": 0.41388889253139494,
      "rewards/format_reasoning_reward": 0.4333333373069763,
      "step": 56
    },
    {
      "completion_length": 207.85555877685547,
      "epoch": 0.45764753111200324,
      "grad_norm": 0.4781117283691274,
      "kl": 0.028985595703125,
      "learning_rate": 1.71e-06,
      "loss": 0.0012,
      "reward": 2.4833333492279053,
      "reward_std": 0.4261951830238104,
      "rewards/accuracy_reward_log": 1.588888907432556,
      "rewards/format_number_reward": 0.4444444477558136,
      "rewards/format_reasoning_reward": 0.45000000298023224,
      "step": 57
    },
    {
      "completion_length": 207.41666984558105,
      "epoch": 0.46567643516659973,
      "grad_norm": 9.7179275371274,
      "kl": 0.066973876953125,
      "learning_rate": 1.7399999999999999e-06,
      "loss": 0.0027,
      "reward": 2.4972222566604616,
      "reward_std": 0.6165656700730324,
      "rewards/accuracy_reward_log": 1.6222222447395325,
      "rewards/format_number_reward": 0.4333333373069763,
      "rewards/format_reasoning_reward": 0.4416666701436043,
      "step": 58
    },
    {
      "completion_length": 199.16111221313477,
      "epoch": 0.4737053392211963,
      "grad_norm": 0.4594537727679852,
      "kl": 0.03179931640625,
      "learning_rate": 1.77e-06,
      "loss": 0.0013,
      "reward": 2.5444444954395293,
      "reward_std": 0.4083905890583992,
      "rewards/accuracy_reward_log": 1.6333333522081375,
      "rewards/format_number_reward": 0.45000000298023224,
      "rewards/format_reasoning_reward": 0.4611111134290695,
      "step": 59
    },
    {
      "completion_length": 185.04444580078126,
      "epoch": 0.48173424327579284,
      "grad_norm": 0.35591417052929925,
      "kl": 0.02755126953125,
      "learning_rate": 1.8e-06,
      "loss": 0.0011,
      "reward": 2.708333361148834,
      "reward_std": 0.30322221145033834,
      "rewards/accuracy_reward_log": 1.7555555701255798,
      "rewards/format_number_reward": 0.4750000014901161,
      "rewards/format_reasoning_reward": 0.47777777910232544,
      "step": 60
    },
    {
      "completion_length": 217.3444480895996,
      "epoch": 0.4897631473303894,
      "grad_norm": 0.47620034091897007,
      "kl": 0.043768310546875,
      "learning_rate": 1.83e-06,
      "loss": 0.0017,
      "reward": 2.45833335518837,
      "reward_std": 0.48677313327789307,
      "rewards/accuracy_reward_log": 1.5777777925133705,
      "rewards/format_number_reward": 0.4305555589497089,
      "rewards/format_reasoning_reward": 0.45000000298023224,
      "step": 61
    },
    {
      "completion_length": 190.76111221313477,
      "epoch": 0.49779205138498595,
      "grad_norm": 0.381306777730851,
      "kl": 0.03514404296875,
      "learning_rate": 1.86e-06,
      "loss": 0.0014,
      "reward": 2.694444453716278,
      "reward_std": 0.2467763565480709,
      "rewards/accuracy_reward_log": 1.7555555671453476,
      "rewards/format_number_reward": 0.4694444462656975,
      "rewards/format_reasoning_reward": 0.4694444462656975,
      "step": 62
    },
    {
      "completion_length": 187.97777671813964,
      "epoch": 0.5058209554395825,
      "grad_norm": 0.594069782615144,
      "kl": 0.039031982421875,
      "learning_rate": 1.8900000000000001e-06,
      "loss": 0.0016,
      "reward": 2.6111111283302306,
      "reward_std": 0.39384557902812956,
      "rewards/accuracy_reward_log": 1.6888889014720916,
      "rewards/format_number_reward": 0.4555555582046509,
      "rewards/format_reasoning_reward": 0.46666666865348816,
      "step": 63
    },
    {
      "completion_length": 193.00555725097655,
      "epoch": 0.513849859494179,
      "grad_norm": 0.5261946833334382,
      "kl": 0.04935302734375,
      "learning_rate": 1.9200000000000003e-06,
      "loss": 0.002,
      "reward": 2.586111146211624,
      "reward_std": 0.4021389245986938,
      "rewards/accuracy_reward_log": 1.6777777940034866,
      "rewards/format_number_reward": 0.45277778059244156,
      "rewards/format_reasoning_reward": 0.4555555582046509,
      "step": 64
    },
    {
      "completion_length": 193.02222366333007,
      "epoch": 0.5218787635487756,
      "grad_norm": 0.4720147986556917,
      "kl": 0.0471923828125,
      "learning_rate": 1.95e-06,
      "loss": 0.0019,
      "reward": 2.6138889133930205,
      "reward_std": 0.444037689268589,
      "rewards/accuracy_reward_log": 1.7000000149011611,
      "rewards/format_number_reward": 0.4555555582046509,
      "rewards/format_reasoning_reward": 0.4583333358168602,
      "step": 65
    },
    {
      "completion_length": 210.9277801513672,
      "epoch": 0.5299076676033722,
      "grad_norm": 0.5124703016443702,
      "kl": 0.06097412109375,
      "learning_rate": 1.98e-06,
      "loss": 0.0024,
      "reward": 2.4833333492279053,
      "reward_std": 0.5216697975993156,
      "rewards/accuracy_reward_log": 1.6111111342906952,
      "rewards/format_number_reward": 0.4277777820825577,
      "rewards/format_reasoning_reward": 0.4444444477558136,
      "step": 66
    },
    {
      "completion_length": 195.95000152587892,
      "epoch": 0.5379365716579687,
      "grad_norm": 0.3914341348179692,
      "kl": 0.05845947265625,
      "learning_rate": 2.0100000000000002e-06,
      "loss": 0.0023,
      "reward": 2.5666666865348815,
      "reward_std": 0.4269164353609085,
      "rewards/accuracy_reward_log": 1.666666680574417,
      "rewards/format_number_reward": 0.4416666701436043,
      "rewards/format_reasoning_reward": 0.4583333358168602,
      "step": 67
    },
    {
      "completion_length": 189.8333354949951,
      "epoch": 0.5459654757125653,
      "grad_norm": 1.8409712600042354,
      "kl": 0.104864501953125,
      "learning_rate": 2.0400000000000004e-06,
      "loss": 0.0042,
      "reward": 2.6055555939674377,
      "reward_std": 0.3877051591873169,
      "rewards/accuracy_reward_log": 1.7000000119209289,
      "rewards/format_number_reward": 0.45000000298023224,
      "rewards/format_reasoning_reward": 0.4555555582046509,
      "step": 68
    },
    {
      "completion_length": 216.18333587646484,
      "epoch": 0.5539943797671618,
      "grad_norm": 1.234592815069376,
      "kl": 0.11241455078125,
      "learning_rate": 2.07e-06,
      "loss": 0.0045,
      "reward": 2.3138889193534853,
      "reward_std": 0.5511893726885319,
      "rewards/accuracy_reward_log": 1.46666669100523,
      "rewards/format_number_reward": 0.4138888940215111,
      "rewards/format_reasoning_reward": 0.4333333373069763,
      "step": 69
    },
    {
      "completion_length": 216.01666946411132,
      "epoch": 0.5620232838217584,
      "grad_norm": 0.47189027920553445,
      "kl": 0.086163330078125,
      "learning_rate": 2.1e-06,
      "loss": 0.0034,
      "reward": 2.4277777910232543,
      "reward_std": 0.5567510481923819,
      "rewards/accuracy_reward_log": 1.588888907432556,
      "rewards/format_number_reward": 0.413888893276453,
      "rewards/format_reasoning_reward": 0.4250000037252903,
      "step": 70
    },
    {
      "completion_length": 180.42222366333007,
      "epoch": 0.5700521878763549,
      "grad_norm": 0.5661866241275311,
      "kl": 0.055902099609375,
      "learning_rate": 2.13e-06,
      "loss": 0.0022,
      "reward": 2.508333349227905,
      "reward_std": 0.416572679579258,
      "rewards/accuracy_reward_log": 1.6000000208616256,
      "rewards/format_number_reward": 0.4472222253680229,
      "rewards/format_reasoning_reward": 0.4611111134290695,
      "step": 71
    },
    {
      "completion_length": 212.58333511352538,
      "epoch": 0.5780810919309515,
      "grad_norm": 0.5852592346007427,
      "kl": 0.108624267578125,
      "learning_rate": 2.16e-06,
      "loss": 0.0043,
      "reward": 2.2777777791023253,
      "reward_std": 0.53003438860178,
      "rewards/accuracy_reward_log": 1.4888889133930205,
      "rewards/format_number_reward": 0.38611111640930174,
      "rewards/format_reasoning_reward": 0.40277778208255766,
      "step": 72
    },
    {
      "completion_length": 203.32777824401856,
      "epoch": 0.586109995985548,
      "grad_norm": 1.0262474142127427,
      "kl": 0.104498291015625,
      "learning_rate": 2.19e-06,
      "loss": 0.0042,
      "reward": 2.4166666686534883,
      "reward_std": 0.40687683820724485,
      "rewards/accuracy_reward_log": 1.5777777969837188,
      "rewards/format_number_reward": 0.4166666708886623,
      "rewards/format_reasoning_reward": 0.42222222611308097,
      "step": 73
    },
    {
      "completion_length": 205.4111114501953,
      "epoch": 0.5941389000401445,
      "grad_norm": 0.6749287604370076,
      "kl": 0.106732177734375,
      "learning_rate": 2.22e-06,
      "loss": 0.0043,
      "reward": 2.2611111223697664,
      "reward_std": 0.5402376987040043,
      "rewards/accuracy_reward_log": 1.4333333566784858,
      "rewards/format_number_reward": 0.4027777828276157,
      "rewards/format_reasoning_reward": 0.42500000447034836,
      "step": 74
    },
    {
      "completion_length": 204.6277805328369,
      "epoch": 0.602167804094741,
      "grad_norm": 1.3836757790747922,
      "kl": 0.118878173828125,
      "learning_rate": 2.25e-06,
      "loss": 0.0048,
      "reward": 2.211111146211624,
      "reward_std": 0.7402307014912367,
      "rewards/accuracy_reward_log": 1.4000000268220902,
      "rewards/format_number_reward": 0.38055556192994117,
      "rewards/format_reasoning_reward": 0.430555559694767,
      "step": 75
    },
    {
      "completion_length": 192.80555725097656,
      "epoch": 0.6101967081493376,
      "grad_norm": 0.5732652733178961,
      "kl": 0.081439208984375,
      "learning_rate": 2.28e-06,
      "loss": 0.0033,
      "reward": 2.444444465637207,
      "reward_std": 0.5093889623880387,
      "rewards/accuracy_reward_log": 1.533333358168602,
      "rewards/format_number_reward": 0.45277778059244156,
      "rewards/format_reasoning_reward": 0.4583333358168602,
      "step": 76
    },
    {
      "completion_length": 159.32222518920898,
      "epoch": 0.6182256122039341,
      "grad_norm": 0.4884865339246385,
      "kl": 0.057391357421875,
      "learning_rate": 2.31e-06,
      "loss": 0.0023,
      "reward": 2.5805555820465087,
      "reward_std": 0.16979632191359997,
      "rewards/accuracy_reward_log": 1.611111131310463,
      "rewards/format_number_reward": 0.47777777910232544,
      "rewards/format_reasoning_reward": 0.49166666716337204,
      "step": 77
    },
    {
      "completion_length": 190.20000076293945,
      "epoch": 0.6262545162585307,
      "grad_norm": 0.5209974936705293,
      "kl": 0.067266845703125,
      "learning_rate": 2.34e-06,
      "loss": 0.0027,
      "reward": 2.4777778029441833,
      "reward_std": 0.4935527116060257,
      "rewards/accuracy_reward_log": 1.5555555790662765,
      "rewards/format_number_reward": 0.4583333358168602,
      "rewards/format_reasoning_reward": 0.46388889104127884,
      "step": 78
    },
    {
      "completion_length": 169.23889122009277,
      "epoch": 0.6342834203131272,
      "grad_norm": 0.4459209602371647,
      "kl": 0.05482177734375,
      "learning_rate": 2.37e-06,
      "loss": 0.0022,
      "reward": 2.6750000178813935,
      "reward_std": 0.23374302312731743,
      "rewards/accuracy_reward_log": 1.7000000149011611,
      "rewards/format_number_reward": 0.4833333343267441,
      "rewards/format_reasoning_reward": 0.49166666716337204,
      "step": 79
    },
    {
      "completion_length": 182.30000267028808,
      "epoch": 0.6423123243677238,
      "grad_norm": 0.45736704423541613,
      "kl": 0.07120361328125,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.0028,
      "reward": 2.5750000059604643,
      "reward_std": 0.31974178850650786,
      "rewards/accuracy_reward_log": 1.6444444626569747,
      "rewards/format_number_reward": 0.46388889104127884,
      "rewards/format_reasoning_reward": 0.46666666865348816,
      "step": 80
    },
    {
      "completion_length": 163.07778053283693,
      "epoch": 0.6503412284223203,
      "grad_norm": 0.4682785728868757,
      "kl": 0.06102294921875,
      "learning_rate": 2.43e-06,
      "loss": 0.0024,
      "reward": 2.686111146211624,
      "reward_std": 0.3670576632022858,
      "rewards/accuracy_reward_log": 1.7444444596767426,
      "rewards/format_number_reward": 0.4694444462656975,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 81
    },
    {
      "completion_length": 165.8722225189209,
      "epoch": 0.6583701324769169,
      "grad_norm": 0.509179920003658,
      "kl": 0.061614990234375,
      "learning_rate": 2.4599999999999997e-06,
      "loss": 0.0025,
      "reward": 2.6250000596046448,
      "reward_std": 0.402138914167881,
      "rewards/accuracy_reward_log": 1.6777777969837189,
      "rewards/format_number_reward": 0.4694444462656975,
      "rewards/format_reasoning_reward": 0.47777777910232544,
      "step": 82
    },
    {
      "completion_length": 159.80555725097656,
      "epoch": 0.6663990365315134,
      "grad_norm": 0.5994985713489502,
      "kl": 0.054705810546875,
      "learning_rate": 2.49e-06,
      "loss": 0.0022,
      "reward": 2.6194444835186004,
      "reward_std": 0.4117614269256592,
      "rewards/accuracy_reward_log": 1.6555555760860443,
      "rewards/format_number_reward": 0.48055555671453476,
      "rewards/format_reasoning_reward": 0.4833333343267441,
      "step": 83
    },
    {
      "completion_length": 175.34444770812988,
      "epoch": 0.67442794058611,
      "grad_norm": 0.518979425053283,
      "kl": 0.08248291015625,
      "learning_rate": 2.52e-06,
      "loss": 0.0033,
      "reward": 2.5138889074325563,
      "reward_std": 0.4138893112540245,
      "rewards/accuracy_reward_log": 1.6000000208616256,
      "rewards/format_number_reward": 0.45277778059244156,
      "rewards/format_reasoning_reward": 0.4611111134290695,
      "step": 84
    },
    {
      "completion_length": 165.01666755676268,
      "epoch": 0.6824568446407066,
      "grad_norm": 0.49612998471580416,
      "kl": 0.065625,
      "learning_rate": 2.55e-06,
      "loss": 0.0026,
      "reward": 2.469444477558136,
      "reward_std": 0.5141268767416477,
      "rewards/accuracy_reward_log": 1.588888907432556,
      "rewards/format_number_reward": 0.43333333656191825,
      "rewards/format_reasoning_reward": 0.44722222462296485,
      "step": 85
    },
    {
      "completion_length": 165.6388900756836,
      "epoch": 0.6904857486953031,
      "grad_norm": 0.552553452651074,
      "kl": 0.06807861328125,
      "learning_rate": 2.58e-06,
      "loss": 0.0027,
      "reward": 2.4944444805383683,
      "reward_std": 0.37747681848704817,
      "rewards/accuracy_reward_log": 1.577777798473835,
      "rewards/format_number_reward": 0.4527777798473835,
      "rewards/format_reasoning_reward": 0.46388889104127884,
      "step": 86
    },
    {
      "completion_length": 155.51666641235352,
      "epoch": 0.6985146527498997,
      "grad_norm": 0.5440640607374735,
      "kl": 0.060748291015625,
      "learning_rate": 2.61e-06,
      "loss": 0.0024,
      "reward": 2.4611111283302307,
      "reward_std": 0.5409881249070168,
      "rewards/accuracy_reward_log": 1.577777799963951,
      "rewards/format_number_reward": 0.43888889253139496,
      "rewards/format_reasoning_reward": 0.4444444477558136,
      "step": 87
    },
    {
      "completion_length": 150.6777774810791,
      "epoch": 0.7065435568044962,
      "grad_norm": 0.4538101109952547,
      "kl": 0.05191650390625,
      "learning_rate": 2.64e-06,
      "loss": 0.0021,
      "reward": 2.6416666924953462,
      "reward_std": 0.2997001264244318,
      "rewards/accuracy_reward_log": 1.7000000149011611,
      "rewards/format_number_reward": 0.4694444462656975,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 88
    },
    {
      "completion_length": 165.80000228881835,
      "epoch": 0.7145724608590928,
      "grad_norm": 0.5131296250407812,
      "kl": 0.06552734375,
      "learning_rate": 2.6700000000000003e-06,
      "loss": 0.0026,
      "reward": 2.600000035762787,
      "reward_std": 0.48052144795656204,
      "rewards/accuracy_reward_log": 1.6777777969837189,
      "rewards/format_number_reward": 0.4583333358168602,
      "rewards/format_reasoning_reward": 0.46388889104127884,
      "step": 89
    },
    {
      "completion_length": 151.10555686950684,
      "epoch": 0.7226013649136893,
      "grad_norm": 0.5165638439339917,
      "kl": 0.0578857421875,
      "learning_rate": 2.7e-06,
      "loss": 0.0023,
      "reward": 2.6555556058883667,
      "reward_std": 0.362246410548687,
      "rewards/accuracy_reward_log": 1.7111111283302307,
      "rewards/format_number_reward": 0.4722222238779068,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 90
    },
    {
      "completion_length": 159.4166687011719,
      "epoch": 0.7306302689682859,
      "grad_norm": 0.49295723950300413,
      "kl": 0.058404541015625,
      "learning_rate": 2.73e-06,
      "loss": 0.0023,
      "reward": 2.5833333671092986,
      "reward_std": 0.44544019252061845,
      "rewards/accuracy_reward_log": 1.6444444626569747,
      "rewards/format_number_reward": 0.4694444462656975,
      "rewards/format_reasoning_reward": 0.4694444462656975,
      "step": 91
    },
    {
      "completion_length": 159.53333625793456,
      "epoch": 0.7386591730228824,
      "grad_norm": 1.0273725644592662,
      "kl": 0.06968994140625,
      "learning_rate": 2.7600000000000003e-06,
      "loss": 0.0028,
      "reward": 2.5111111402511597,
      "reward_std": 0.44476102106273174,
      "rewards/accuracy_reward_log": 1.577777799963951,
      "rewards/format_number_reward": 0.4611111134290695,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 92
    },
    {
      "completion_length": 154.76666793823242,
      "epoch": 0.746688077077479,
      "grad_norm": 0.3153647070205455,
      "kl": 0.053741455078125,
      "learning_rate": 2.7900000000000004e-06,
      "loss": 0.0021,
      "reward": 2.688888907432556,
      "reward_std": 0.2062800731509924,
      "rewards/accuracy_reward_log": 1.733333346247673,
      "rewards/format_number_reward": 0.4750000014901161,
      "rewards/format_reasoning_reward": 0.48055555671453476,
      "step": 93
    },
    {
      "completion_length": 166.42777900695802,
      "epoch": 0.7547169811320755,
      "grad_norm": 0.5748531305032764,
      "kl": 0.066802978515625,
      "learning_rate": 2.82e-06,
      "loss": 0.0027,
      "reward": 2.600000041723251,
      "reward_std": 0.3458063915371895,
      "rewards/accuracy_reward_log": 1.655555573105812,
      "rewards/format_number_reward": 0.4722222238779068,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 94
    },
    {
      "completion_length": 158.03888969421388,
      "epoch": 0.762745885186672,
      "grad_norm": 0.5545909018402391,
      "kl": 0.066021728515625,
      "learning_rate": 2.85e-06,
      "loss": 0.0026,
      "reward": 2.7111111342906953,
      "reward_std": 0.3430013954639435,
      "rewards/accuracy_reward_log": 1.7666666775941848,
      "rewards/format_number_reward": 0.4722222238779068,
      "rewards/format_reasoning_reward": 0.4722222238779068,
      "step": 95
    },
    {
      "completion_length": 165.80000228881835,
      "epoch": 0.7707747892412685,
      "grad_norm": 0.42499585100246984,
      "kl": 0.05565185546875,
      "learning_rate": 2.88e-06,
      "loss": 0.0022,
      "reward": 2.602777808904648,
      "reward_std": 0.3154630549252033,
      "rewards/accuracy_reward_log": 1.6444444626569747,
      "rewards/format_number_reward": 0.47777777910232544,
      "rewards/format_reasoning_reward": 0.48055555671453476,
      "step": 96
    },
    {
      "completion_length": 174.88333435058593,
      "epoch": 0.7788036932958651,
      "grad_norm": 0.5630736371263277,
      "kl": 0.07933349609375,
      "learning_rate": 2.91e-06,
      "loss": 0.0032,
      "reward": 2.394444453716278,
      "reward_std": 0.4866618663072586,
      "rewards/accuracy_reward_log": 1.4777778044342995,
      "rewards/format_number_reward": 0.4555555582046509,
      "rewards/format_reasoning_reward": 0.4611111134290695,
      "step": 97
    },
    {
      "completion_length": 194.8888916015625,
      "epoch": 0.7868325973504616,
      "grad_norm": 0.5354591296941339,
      "kl": 0.08121337890625,
      "learning_rate": 2.9400000000000002e-06,
      "loss": 0.0032,
      "reward": 2.4166667103767394,
      "reward_std": 0.467490179464221,
      "rewards/accuracy_reward_log": 1.5444444686174392,
      "rewards/format_number_reward": 0.4333333373069763,
      "rewards/format_reasoning_reward": 0.43888889253139496,
      "step": 98
    },
    {
      "completion_length": 198.76111335754393,
      "epoch": 0.7948615014050582,
      "grad_norm": 0.7579425034647131,
      "kl": 0.092218017578125,
      "learning_rate": 2.97e-06,
      "loss": 0.0037,
      "reward": 2.472222238779068,
      "reward_std": 0.44544018656015394,
      "rewards/accuracy_reward_log": 1.577777799963951,
      "rewards/format_number_reward": 0.4472222253680229,
      "rewards/format_reasoning_reward": 0.4472222253680229,
      "step": 99
    },
    {
      "completion_length": 181.00000228881837,
      "epoch": 0.8028904054596547,
      "grad_norm": 3.752908048104343,
      "kl": 0.29696044921875,
      "learning_rate": 3e-06,
      "loss": 0.0119,
      "reward": 2.644444489479065,
      "reward_std": 0.3877051673829556,
      "rewards/accuracy_reward_log": 1.722222238779068,
      "rewards/format_number_reward": 0.4611111134290695,
      "rewards/format_reasoning_reward": 0.4611111134290695,
      "step": 100
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}