diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,28034 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.291005291005291,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 676.0,
+      "completions/mean_length": 723.75,
+      "completions/mean_terminated_length": 490.22222900390625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5195164084434509,
+      "epoch": 0.005291005291005291,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.453125,
+      "kl": 0.0010675216326490045,
+      "learning_rate": 0.0,
+      "loss": -0.1441,
+      "num_tokens": 18452.0,
+      "reward": 0.828125,
+      "reward_std": 0.3463020324707031,
+      "rewards/itbench_correctness/mean": 0.828125,
+      "rewards/itbench_correctness/std": 0.33811673521995544,
+      "step": 1,
+      "step_time": 91.14044637419283
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 893.0,
+      "completions/mean_length": 604.75,
+      "completions/mean_terminated_length": 544.857177734375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3654402792453766,
+      "epoch": 0.010582010582010581,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.8046875,
+      "kl": 0.0009289107983931899,
+      "learning_rate": 2e-08,
+      "loss": -0.0658,
+      "num_tokens": 33008.0,
+      "reward": 0.3645833432674408,
+      "reward_std": 0.1873345822095871,
+      "rewards/itbench_correctness/mean": 0.3645833432674408,
+      "rewards/itbench_correctness/std": 0.4552929401397705,
+      "step": 2,
+      "step_time": 828.1970858396962
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 957.5625,
+      "completions/mean_terminated_length": 905.888916015625,
+      "completions/min_length": 763.0,
+      "completions/min_terminated_length": 763.0,
+      "entropy": 0.5472227931022644,
+      "epoch": 0.015873015873015872,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.25,
+      "kl": 0.001058843918144703,
+      "learning_rate": 4e-08,
+      "loss": 0.0343,
+      "num_tokens": 55673.0,
+      "reward": 0.34687501192092896,
+      "reward_std": 0.3456803262233734,
+      "rewards/itbench_correctness/mean": 0.34687501192092896,
+      "rewards/itbench_correctness/std": 0.4120957851409912,
+      "step": 3,
+      "step_time": 151.529059112072
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 791.0,
+      "completions/max_terminated_length": 791.0,
+      "completions/mean_length": 532.5625,
+      "completions/mean_terminated_length": 532.5625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5482924580574036,
+      "epoch": 0.021164021164021163,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5078125,
+      "kl": 0.0013292429503053427,
+      "learning_rate": 6e-08,
+      "loss": -0.1305,
+      "num_tokens": 68794.0,
+      "reward": 0.7916666865348816,
+      "reward_std": 0.32439103722572327,
+      "rewards/itbench_correctness/mean": 0.7916666865348816,
+      "rewards/itbench_correctness/std": 0.34960296750068665,
+      "step": 4,
+      "step_time": 417.0535086672753
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 711.4375,
+      "completions/mean_terminated_length": 468.3333435058594,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.47509443759918213,
+      "epoch": 0.026455026455026454,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.0010377272265031934,
+      "learning_rate": 8e-08,
+      "loss": -0.0456,
+      "num_tokens": 83449.0,
+      "reward": 0.3515625,
+      "reward_std": 0.2974616289138794,
+      "rewards/itbench_correctness/mean": 0.3515625,
+      "rewards/itbench_correctness/std": 0.32021722197532654,
+      "step": 5,
+      "step_time": 128.02622807957232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 879.0,
+      "completions/mean_length": 951.0,
+      "completions/mean_terminated_length": 440.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5846477150917053,
+      "epoch": 0.031746031746031744,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5625,
+      "kl": 0.000945708598010242,
+      "learning_rate": 1e-07,
+      "loss": 0.0034,
+      "num_tokens": 122025.0,
+      "reward": 0.25,
+      "reward_std": 0.2182178944349289,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.394405335187912,
+      "step": 6,
+      "step_time": 145.1888073068112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 909.0,
+      "completions/max_terminated_length": 909.0,
+      "completions/mean_length": 544.4375,
+      "completions/mean_terminated_length": 544.4375,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "entropy": 0.27000343799591064,
+      "epoch": 0.037037037037037035,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5234375,
+      "kl": 0.001091663958504796,
+      "learning_rate": 1.2e-07,
+      "loss": -0.0527,
+      "num_tokens": 135296.0,
+      "reward": 0.3736979365348816,
+      "reward_std": 0.31324487924575806,
+      "rewards/itbench_correctness/mean": 0.3736979365348816,
+      "rewards/itbench_correctness/std": 0.3162706792354584,
+      "step": 7,
+      "step_time": 83.35824911855161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 683.25,
+      "completions/mean_terminated_length": 660.5333862304688,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.47713136672973633,
+      "epoch": 0.042328042328042326,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1953125,
+      "kl": 0.0009320093668065965,
+      "learning_rate": 1.4e-07,
+      "loss": 0.0447,
+      "num_tokens": 150748.0,
+      "reward": 0.9322916269302368,
+      "reward_std": 0.062747523188591,
+      "rewards/itbench_correctness/mean": 0.9322916269302368,
+      "rewards/itbench_correctness/std": 0.11063265055418015,
+      "step": 8,
+      "step_time": 179.78012859076262
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 892.0,
+      "completions/mean_length": 518.6875,
+      "completions/mean_terminated_length": 446.5000305175781,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.42993131279945374,
+      "epoch": 0.047619047619047616,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4765625,
+      "kl": 0.001125591923482716,
+      "learning_rate": 1.6e-07,
+      "loss": -0.0315,
+      "num_tokens": 165327.0,
+      "reward": 0.578125,
+      "reward_std": 0.24882009625434875,
+      "rewards/itbench_correctness/mean": 0.578125,
+      "rewards/itbench_correctness/std": 0.2660909593105316,
+      "step": 9,
+      "step_time": 145.98578487429768
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 994.0,
+      "completions/mean_length": 700.0625,
+      "completions/mean_terminated_length": 552.8181762695312,
+      "completions/min_length": 381.0,
+      "completions/min_terminated_length": 381.0,
+      "entropy": 0.4028211832046509,
+      "epoch": 0.05291005291005291,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7890625,
+      "kl": 0.0010170488385483623,
+      "learning_rate": 1.8e-07,
+      "loss": -0.0257,
+      "num_tokens": 181176.0,
+      "reward": 0.5083333253860474,
+      "reward_std": 0.3309464454650879,
+      "rewards/itbench_correctness/mean": 0.5083333253860474,
+      "rewards/itbench_correctness/std": 0.3380225598812103,
+      "step": 10,
+      "step_time": 135.02681362256408
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 694.0,
+      "completions/max_terminated_length": 694.0,
+      "completions/mean_length": 466.4375,
+      "completions/mean_terminated_length": 466.4375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.37089642882347107,
+      "epoch": 0.0582010582010582,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.15625,
+      "kl": 0.0011742996284738183,
+      "learning_rate": 2e-07,
+      "loss": -0.0049,
+      "num_tokens": 192455.0,
+      "reward": 0.46875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.4989572763442993,
+      "step": 11,
+      "step_time": 994.5879717040807
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 772.0,
+      "completions/mean_length": 501.25,
+      "completions/mean_terminated_length": 327.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.49077308177948,
+      "epoch": 0.06349206349206349,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.890625,
+      "kl": 0.0011536800302565098,
+      "learning_rate": 2.1999999999999998e-07,
+      "loss": -0.0581,
+      "num_tokens": 210611.0,
+      "reward": 0.09375,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.09375,
+      "rewards/itbench_correctness/std": 0.20155644416809082,
+      "step": 12,
+      "step_time": 106.51383402384818
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 627.0,
+      "completions/max_terminated_length": 627.0,
+      "completions/mean_length": 420.9375,
+      "completions/mean_terminated_length": 420.9375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5107646584510803,
+      "epoch": 0.06878306878306878,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.98046875,
+      "kl": 0.0012960818130522966,
+      "learning_rate": 2.4e-07,
+      "loss": -0.1036,
+      "num_tokens": 220666.0,
+      "reward": 0.5572916865348816,
+      "reward_std": 0.2719196677207947,
+      "rewards/itbench_correctness/mean": 0.5572916865348816,
+      "rewards/itbench_correctness/std": 0.2750736474990845,
+      "step": 13,
+      "step_time": 78.42556338571012
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 819.0,
+      "completions/mean_length": 553.4375,
+      "completions/mean_terminated_length": 486.21429443359375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.6179559826850891,
+      "epoch": 0.07407407407407407,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6484375,
+      "kl": 0.0013870123075321317,
+      "learning_rate": 2.6e-07,
+      "loss": -0.1253,
+      "num_tokens": 237537.0,
+      "reward": 0.5,
+      "reward_std": 0.3650856614112854,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 14,
+      "step_time": 266.15765621792525
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 659.0,
+      "completions/max_terminated_length": 659.0,
+      "completions/mean_length": 504.9375,
+      "completions/mean_terminated_length": 504.9375,
+      "completions/min_length": 387.0,
+      "completions/min_terminated_length": 387.0,
+      "entropy": 0.5228369832038879,
+      "epoch": 0.07936507936507936,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.53125,
+      "kl": 0.0013084918027743697,
+      "learning_rate": 2.8e-07,
+      "loss": 0.0175,
+      "num_tokens": 253968.0,
+      "reward": 0.9035714864730835,
+      "reward_std": 0.06060914695262909,
+      "rewards/itbench_correctness/mean": 0.9035714864730835,
+      "rewards/itbench_correctness/std": 0.10054273903369904,
+      "step": 15,
+      "step_time": 132.4059884781018
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 945.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 420.5,
+      "completions/mean_terminated_length": 420.5,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "entropy": 0.4851367473602295,
+      "epoch": 0.08465608465608465,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6484375,
+      "kl": 0.001197831821627915,
+      "learning_rate": 3e-07,
+      "loss": 0.0638,
+      "num_tokens": 263192.0,
+      "reward": 0.4375,
+      "reward_std": 0.38298875093460083,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 16,
+      "step_time": 94.08578859362751
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 573.8125,
+      "completions/mean_terminated_length": 573.8125,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 0.3694586753845215,
+      "epoch": 0.08994708994708994,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.28125,
+      "kl": 0.0009872535010799766,
+      "learning_rate": 3.2e-07,
+      "loss": -0.0021,
+      "num_tokens": 276349.0,
+      "reward": 0.7132352590560913,
+      "reward_std": 0.24745365977287292,
+      "rewards/itbench_correctness/mean": 0.7132352590560913,
+      "rewards/itbench_correctness/std": 0.44946467876434326,
+      "step": 17,
+      "step_time": 803.3225803021342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 918.625,
+      "completions/mean_terminated_length": 783.1428833007812,
+      "completions/min_length": 574.0,
+      "completions/min_terminated_length": 574.0,
+      "entropy": 0.3722955584526062,
+      "epoch": 0.09523809523809523,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.7421875,
+      "kl": 0.0011494335485622287,
+      "learning_rate": 3.4000000000000003e-07,
+      "loss": 0.0019,
+      "num_tokens": 305519.0,
+      "reward": 0.7291666865348816,
+      "reward_std": 0.23464766144752502,
+      "rewards/itbench_correctness/mean": 0.7291666865348816,
+      "rewards/itbench_correctness/std": 0.4254627227783203,
+      "step": 18,
+      "step_time": 293.30187319312245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 973.0,
+      "completions/max_terminated_length": 973.0,
+      "completions/mean_length": 638.5,
+      "completions/mean_terminated_length": 638.5,
+      "completions/min_length": 380.0,
+      "completions/min_terminated_length": 380.0,
+      "entropy": 0.4792482256889343,
+      "epoch": 0.10052910052910052,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.390625,
+      "kl": 0.001083478331565857,
+      "learning_rate": 3.6e-07,
+      "loss": -0.0201,
+      "num_tokens": 319703.0,
+      "reward": 0.71875,
+      "reward_std": 0.09797047078609467,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.16520188748836517,
+      "step": 19,
+      "step_time": 138.92694834899157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 836.0,
+      "completions/mean_length": 680.625,
+      "completions/mean_terminated_length": 524.5454711914062,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.39669421315193176,
+      "epoch": 0.10582010582010581,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.96875,
+      "kl": 0.0011998838745057583,
+      "learning_rate": 3.7999999999999996e-07,
+      "loss": -0.1584,
+      "num_tokens": 342409.0,
+      "reward": 0.1770833432674408,
+      "reward_std": 0.3077988028526306,
+      "rewards/itbench_correctness/mean": 0.1770833432674408,
+      "rewards/itbench_correctness/std": 0.3413955569267273,
+      "step": 20,
+      "step_time": 374.13402384892106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 655.375,
+      "completions/mean_terminated_length": 655.375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4669082462787628,
+      "epoch": 0.1111111111111111,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6953125,
+      "kl": 0.0013768852222710848,
+      "learning_rate": 4e-07,
+      "loss": -0.115,
+      "num_tokens": 365695.0,
+      "reward": 0.875,
+      "reward_std": 0.3535533845424652,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 21,
+      "step_time": 114.49692635703832
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 969.0,
+      "completions/mean_length": 880.375,
+      "completions/mean_terminated_length": 847.2307739257812,
+      "completions/min_length": 599.0,
+      "completions/min_terminated_length": 599.0,
+      "entropy": 0.511145830154419,
+      "epoch": 0.1164021164021164,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4609375,
+      "kl": 0.0011111602652817965,
+      "learning_rate": 4.1999999999999995e-07,
+      "loss": 0.0192,
+      "num_tokens": 389429.0,
+      "reward": 0.59375,
+      "reward_std": 0.03788072243332863,
+      "rewards/itbench_correctness/mean": 0.59375,
+      "rewards/itbench_correctness/std": 0.4227531850337982,
+      "step": 22,
+      "step_time": 103.79572249855846
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1008.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 653.375,
+      "completions/mean_terminated_length": 653.375,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.5203749537467957,
+      "epoch": 0.12169312169312169,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.021240234375,
+      "kl": 0.0011424734257161617,
+      "learning_rate": 4.3999999999999997e-07,
+      "loss": 0.0,
+      "num_tokens": 405051.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 23,
+      "step_time": 158.34662247169763
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 675.0,
+      "completions/max_terminated_length": 675.0,
+      "completions/mean_length": 565.0,
+      "completions/mean_terminated_length": 565.0,
+      "completions/min_length": 492.0,
+      "completions/min_terminated_length": 492.0,
+      "entropy": 0.5274336338043213,
+      "epoch": 0.12698412698412698,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.0010382338659837842,
+      "learning_rate": 4.6e-07,
+      "loss": -0.0043,
+      "num_tokens": 417107.0,
+      "reward": 0.578125,
+      "reward_std": 0.1099528968334198,
+      "rewards/itbench_correctness/mean": 0.578125,
+      "rewards/itbench_correctness/std": 0.19116783142089844,
+      "step": 24,
+      "step_time": 93.74338541273028
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 988.0,
+      "completions/mean_length": 806.9375,
+      "completions/mean_terminated_length": 734.5833740234375,
+      "completions/min_length": 487.0,
+      "completions/min_terminated_length": 487.0,
+      "entropy": 0.4733947813510895,
+      "epoch": 0.13227513227513227,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.90625,
+      "kl": 0.0012686237460002303,
+      "learning_rate": 4.8e-07,
+      "loss": -0.027,
+      "num_tokens": 443242.0,
+      "reward": 0.5,
+      "reward_std": 0.39511844515800476,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.4654746949672699,
+      "step": 25,
+      "step_time": 117.60556835308671
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 717.125,
+      "completions/mean_terminated_length": 673.2857666015625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.46017083525657654,
+      "epoch": 0.13756613756613756,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.234375,
+      "kl": 0.0011198390275239944,
+      "learning_rate": 5e-07,
+      "loss": -0.0578,
+      "num_tokens": 460940.0,
+      "reward": 0.15625,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.15625,
+      "rewards/itbench_correctness/std": 0.23935678601264954,
+      "step": 26,
+      "step_time": 435.78113711997867
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 709.125,
+      "completions/mean_terminated_length": 664.1428833007812,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.38639166951179504,
+      "epoch": 0.14285714285714285,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.001125096925534308,
+      "learning_rate": 5.2e-07,
+      "loss": 0.0106,
+      "num_tokens": 477014.0,
+      "reward": 0.3166666626930237,
+      "reward_std": 0.175833061337471,
+      "rewards/itbench_correctness/mean": 0.3166666626930237,
+      "rewards/itbench_correctness/std": 0.2388242930173874,
+      "step": 27,
+      "step_time": 136.88797108456492
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 603.9375,
+      "completions/mean_terminated_length": 543.9285888671875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.42057332396507263,
+      "epoch": 0.14814814814814814,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0390625,
+      "kl": 0.0010505876271054149,
+      "learning_rate": 5.4e-07,
+      "loss": -0.012,
+      "num_tokens": 491261.0,
+      "reward": 0.46875,
+      "reward_std": 0.04312910512089729,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.4876958429813385,
+      "step": 28,
+      "step_time": 450.98891491629183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 822.4375,
+      "completions/mean_terminated_length": 755.25,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 0.4498822093009949,
+      "epoch": 0.15343915343915343,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.0008466003928333521,
+      "learning_rate": 5.6e-07,
+      "loss": -0.0265,
+      "num_tokens": 510580.0,
+      "reward": 0.831250011920929,
+      "reward_std": 0.2088201940059662,
+      "rewards/itbench_correctness/mean": 0.831250011920929,
+      "rewards/itbench_correctness/std": 0.24958299100399017,
+      "step": 29,
+      "step_time": 82.54267377220094
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 790.4375,
+      "completions/mean_terminated_length": 650.2999877929688,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "entropy": 0.33146199584007263,
+      "epoch": 0.15873015873015872,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3125,
+      "kl": 0.000913652591407299,
+      "learning_rate": 5.8e-07,
+      "loss": -0.0192,
+      "num_tokens": 529675.0,
+      "reward": 0.46875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.4989572763442993,
+      "step": 30,
+      "step_time": 153.0279028210789
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 852.0625,
+      "completions/mean_terminated_length": 773.9091186523438,
+      "completions/min_length": 553.0,
+      "completions/min_terminated_length": 553.0,
+      "entropy": 0.624367356300354,
+      "epoch": 0.164021164021164,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.125,
+      "kl": 0.0011790527496486902,
+      "learning_rate": 6e-07,
+      "loss": 0.0451,
+      "num_tokens": 554588.0,
+      "reward": 0.17812499403953552,
+      "reward_std": 0.21488739550113678,
+      "rewards/itbench_correctness/mean": 0.17812499403953552,
+      "rewards/itbench_correctness/std": 0.21210749447345734,
+      "step": 31,
+      "step_time": 496.71210376080126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 639.0,
+      "completions/max_terminated_length": 639.0,
+      "completions/mean_length": 464.375,
+      "completions/mean_terminated_length": 464.375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4242261052131653,
+      "epoch": 0.1693121693121693,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3828125,
+      "kl": 0.001077913912013173,
+      "learning_rate": 6.2e-07,
+      "loss": -0.0869,
+      "num_tokens": 565106.0,
+      "reward": 0.53125,
+      "reward_std": 0.12696418166160583,
+      "rewards/itbench_correctness/mean": 0.53125,
+      "rewards/itbench_correctness/std": 0.43977582454681396,
+      "step": 32,
+      "step_time": 62.5571150816977
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 699.0,
+      "completions/mean_length": 790.375,
+      "completions/mean_terminated_length": 490.0000305175781,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5136802196502686,
+      "epoch": 0.1746031746031746,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.34375,
+      "kl": 0.0010994027834385633,
+      "learning_rate": 6.4e-07,
+      "loss": -0.0752,
+      "num_tokens": 590992.0,
+      "reward": 0.2723214328289032,
+      "reward_std": 0.22582654654979706,
+      "rewards/itbench_correctness/mean": 0.2723214328289032,
+      "rewards/itbench_correctness/std": 0.417490690946579,
+      "step": 33,
+      "step_time": 873.944114420563
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 580.0,
+      "completions/max_terminated_length": 580.0,
+      "completions/mean_length": 382.3125,
+      "completions/mean_terminated_length": 382.3125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.41327446699142456,
+      "epoch": 0.17989417989417988,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.38671875,
+      "kl": 0.0016043909126892686,
+      "learning_rate": 6.6e-07,
+      "loss": -0.05,
+      "num_tokens": 599933.0,
+      "reward": 0.71875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 34,
+      "step_time": 811.917650568299
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 961.25,
+      "completions/mean_terminated_length": 856.6666870117188,
+      "completions/min_length": 705.0,
+      "completions/min_terminated_length": 705.0,
+      "entropy": 0.6283485293388367,
+      "epoch": 0.18518518518518517,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.515625,
+      "kl": 0.0012013108935207129,
+      "learning_rate": 6.800000000000001e-07,
+      "loss": 0.0029,
+      "num_tokens": 628801.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 35,
+      "step_time": 104.12166160158813
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 571.0,
+      "completions/mean_length": 765.4375,
+      "completions/mean_terminated_length": 506.875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.29917532205581665,
+      "epoch": 0.19047619047619047,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.390625,
+      "kl": 0.0010058790212497115,
+      "learning_rate": 7e-07,
+      "loss": 0.0029,
+      "num_tokens": 648056.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 36,
+      "step_time": 884.992473276332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 713.0,
+      "completions/max_terminated_length": 713.0,
+      "completions/mean_length": 540.0625,
+      "completions/mean_terminated_length": 540.0625,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "entropy": 0.4203217327594757,
+      "epoch": 0.19576719576719576,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0106201171875,
+      "kl": 0.001015704357996583,
+      "learning_rate": 7.2e-07,
+      "loss": 0.0,
+      "num_tokens": 660377.0,
+      "reward": 0.5833333134651184,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5833333134651184,
+      "rewards/itbench_correctness/std": 0.4303314983844757,
+      "step": 37,
+      "step_time": 85.28049738146365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 456.1875,
+      "completions/mean_terminated_length": 456.1875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5830935835838318,
+      "epoch": 0.20105820105820105,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.017822265625,
+      "kl": 0.0013373795663937926,
+      "learning_rate": 7.4e-07,
+      "loss": 0.0,
+      "num_tokens": 688692.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 38,
+      "step_time": 211.86649047024548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 988.0,
+      "completions/mean_length": 711.375,
+      "completions/mean_terminated_length": 639.2307739257812,
+      "completions/min_length": 377.0,
+      "completions/min_terminated_length": 377.0,
+      "entropy": 0.2656826674938202,
+      "epoch": 0.20634920634920634,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.265625,
+      "kl": 0.0009032340021803975,
+      "learning_rate": 7.599999999999999e-07,
+      "loss": 0.0308,
+      "num_tokens": 707554.0,
+      "reward": 0.484375,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.484375,
+      "rewards/itbench_correctness/std": 0.503891110420227,
+      "step": 39,
+      "step_time": 823.7539153788239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 798.0,
+      "completions/max_terminated_length": 798.0,
+      "completions/mean_length": 455.6875,
+      "completions/mean_terminated_length": 455.6875,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "entropy": 0.4432862401008606,
+      "epoch": 0.21164021164021163,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.9609375,
+      "kl": 0.0011500748805701733,
+      "learning_rate": 7.799999999999999e-07,
+      "loss": -0.0166,
+      "num_tokens": 717741.0,
+      "reward": 0.2395833432674408,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.2395833432674408,
+      "rewards/itbench_correctness/std": 0.19214914739131927,
+      "step": 40,
+      "step_time": 798.0437586428598
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 811.0,
+      "completions/mean_length": 644.25,
+      "completions/mean_terminated_length": 416.3999938964844,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.6643383502960205,
+      "epoch": 0.21693121693121692,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3984375,
+      "kl": 0.0012484320905059576,
+      "learning_rate": 8e-07,
+      "loss": 0.0101,
+      "num_tokens": 743129.0,
+      "reward": 0.015625,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.015625,
+      "rewards/itbench_correctness/std": 0.0625,
+      "step": 41,
+      "step_time": 98.21953046228737
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 986.0,
+      "completions/mean_length": 725.875,
+      "completions/mean_terminated_length": 547.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5951437950134277,
+      "epoch": 0.2222222222222222,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7265625,
+      "kl": 0.0012599489418789744,
+      "learning_rate": 8.199999999999999e-07,
+      "loss": -0.1275,
+      "num_tokens": 779247.0,
+      "reward": 0.4375,
+      "reward_std": 0.4082317352294922,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 42,
+      "step_time": 374.9474004274234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 585.0,
+      "completions/max_terminated_length": 585.0,
+      "completions/mean_length": 411.625,
+      "completions/mean_terminated_length": 411.625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.449438214302063,
+      "epoch": 0.2275132275132275,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.2109375,
+      "kl": 0.0011913544731214643,
+      "learning_rate": 8.399999999999999e-07,
+      "loss": -0.063,
+      "num_tokens": 794473.0,
+      "reward": 0.2911931872367859,
+      "reward_std": 0.16020165383815765,
+      "rewards/itbench_correctness/mean": 0.2911931872367859,
+      "rewards/itbench_correctness/std": 0.1646159142255783,
+      "step": 43,
+      "step_time": 82.59138822741807
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 823.0,
+      "completions/max_terminated_length": 823.0,
+      "completions/mean_length": 500.0625,
+      "completions/mean_terminated_length": 500.0625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5279340147972107,
+      "epoch": 0.2328042328042328,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.91796875,
+      "kl": 0.0011128420010209084,
+      "learning_rate": 8.599999999999999e-07,
+      "loss": -0.0494,
+      "num_tokens": 805986.0,
+      "reward": 0.65625,
+      "reward_std": 0.09643959254026413,
+      "rewards/itbench_correctness/mean": 0.65625,
+      "rewards/itbench_correctness/std": 0.3786855936050415,
+      "step": 44,
+      "step_time": 486.3739328915253
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 614.0,
+      "completions/mean_length": 970.5625,
+      "completions/mean_terminated_length": 596.5,
+      "completions/min_length": 579.0,
+      "completions/min_terminated_length": 579.0,
+      "entropy": 0.5398930907249451,
+      "epoch": 0.23809523809523808,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5625,
+      "kl": 0.0010154710616916418,
+      "learning_rate": 8.799999999999999e-07,
+      "loss": 0.0,
+      "num_tokens": 836355.0,
+      "reward": 0.109375,
+      "reward_std": 0.14074896275997162,
+      "rewards/itbench_correctness/mean": 0.109375,
+      "rewards/itbench_correctness/std": 0.22302372753620148,
+      "step": 45,
+      "step_time": 134.77385379187763
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 686.0,
+      "completions/max_terminated_length": 686.0,
+      "completions/mean_length": 522.5,
+      "completions/mean_terminated_length": 522.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.45741626620292664,
+      "epoch": 0.24338624338624337,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.02490234375,
+      "kl": 0.0011397113557904959,
+      "learning_rate": 9e-07,
+      "loss": 0.0,
+      "num_tokens": 849099.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 46,
+      "step_time": 88.64014313649386
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 596.6875,
+      "completions/mean_terminated_length": 596.6875,
+      "completions/min_length": 448.0,
+      "completions/min_terminated_length": 448.0,
+      "entropy": 0.3787577152252197,
+      "epoch": 0.24867724867724866,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.0011281240731477737,
+      "learning_rate": 9.2e-07,
+      "loss": -0.0062,
+      "num_tokens": 862838.0,
+      "reward": 0.7421875,
+      "reward_std": 0.3093565106391907,
+      "rewards/itbench_correctness/mean": 0.7421875,
+      "rewards/itbench_correctness/std": 0.3337562382221222,
+      "step": 47,
+      "step_time": 70.17125954851508
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 728.0,
+      "completions/max_terminated_length": 728.0,
+      "completions/mean_length": 423.0625,
+      "completions/mean_terminated_length": 423.0625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.45146992802619934,
+      "epoch": 0.25396825396825395,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2109375,
+      "kl": 0.0013216013321653008,
+      "learning_rate": 9.399999999999999e-07,
+      "loss": 0.0073,
+      "num_tokens": 872559.0,
+      "reward": 0.11328125,
+      "reward_std": 0.08985587954521179,
+      "rewards/itbench_correctness/mean": 0.11328125,
+      "rewards/itbench_correctness/std": 0.16958704590797424,
+      "step": 48,
+      "step_time": 87.19072807300836
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 938.0,
+      "completions/mean_length": 815.375,
+      "completions/mean_terminated_length": 653.1111450195312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.39981603622436523,
+      "epoch": 0.25925925925925924,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7265625,
+      "kl": 0.0008777660550549626,
+      "learning_rate": 9.6e-07,
+      "loss": 0.0344,
+      "num_tokens": 891605.0,
+      "reward": 0.32207342982292175,
+      "reward_std": 0.2425267994403839,
+      "rewards/itbench_correctness/mean": 0.32207342982292175,
+      "rewards/itbench_correctness/std": 0.32837510108947754,
+      "step": 49,
+      "step_time": 145.8219982078299
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 709.0,
+      "completions/mean_length": 576.25,
+      "completions/mean_terminated_length": 546.4000244140625,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.5032538175582886,
+      "epoch": 0.26455026455026454,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6796875,
+      "kl": 0.0010894184233620763,
+      "learning_rate": 9.8e-07,
+      "loss": 0.002,
+      "num_tokens": 903569.0,
+      "reward": 0.3849431872367859,
+      "reward_std": 0.13158553838729858,
+      "rewards/itbench_correctness/mean": 0.3849431872367859,
+      "rewards/itbench_correctness/std": 0.20182853937149048,
+      "step": 50,
+      "step_time": 374.8412516852841
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1000.0,
+      "completions/mean_length": 721.0625,
+      "completions/mean_terminated_length": 539.2999877929688,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.3356158435344696,
+      "epoch": 0.2698412698412698,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0703125,
+      "kl": 0.0009243504609912634,
+      "learning_rate": 1e-06,
+      "loss": 0.0018,
+      "num_tokens": 926762.0,
+      "reward": 0.4732142686843872,
+      "reward_std": 0.07576144486665726,
+      "rewards/itbench_correctness/mean": 0.4732142686843872,
+      "rewards/itbench_correctness/std": 0.4995746612548828,
+      "step": 51,
+      "step_time": 99.96388372033834
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 950.0,
+      "completions/mean_length": 644.8125,
+      "completions/mean_terminated_length": 590.6428833007812,
+      "completions/min_length": 353.0,
+      "completions/min_terminated_length": 353.0,
+      "entropy": 0.43733644485473633,
+      "epoch": 0.2751322751322751,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.09375,
+      "kl": 0.0011704964563250542,
+      "learning_rate": 9.999972660400534e-07,
+      "loss": -0.0123,
+      "num_tokens": 941111.0,
+      "reward": 0.4375,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 52,
+      "step_time": 114.90833497233689
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 913.0,
+      "completions/max_terminated_length": 913.0,
+      "completions/mean_length": 588.4375,
+      "completions/mean_terminated_length": 588.4375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.42825278639793396,
+      "epoch": 0.2804232804232804,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.140625,
+      "kl": 0.0011681180913001299,
+      "learning_rate": 9.999890641901124e-07,
+      "loss": -0.0885,
+      "num_tokens": 953494.0,
+      "reward": 0.8244047164916992,
+      "reward_std": 0.27204394340515137,
+      "rewards/itbench_correctness/mean": 0.8244047164916992,
+      "rewards/itbench_correctness/std": 0.2754608690738678,
+      "step": 53,
+      "step_time": 127.88445741310716
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 788.75,
+      "completions/mean_terminated_length": 647.6000366210938,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3549920618534088,
+      "epoch": 0.2857142857142857,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3359375,
+      "kl": 0.0009949615923687816,
+      "learning_rate": 9.999753945398703e-07,
+      "loss": -0.1075,
+      "num_tokens": 981738.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.044543541967868805,
+      "rewards/itbench_correctness/mean": 0.5416666865348816,
+      "rewards/itbench_correctness/std": 0.4772607088088989,
+      "step": 54,
+      "step_time": 270.70763381849974
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 955.0,
+      "completions/mean_length": 641.1875,
+      "completions/mean_terminated_length": 615.6666870117188,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4897163510322571,
+      "epoch": 0.291005291005291,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8125,
+      "kl": 0.0010646218433976173,
+      "learning_rate": 9.99956257238817e-07,
+      "loss": -0.0731,
+      "num_tokens": 996773.0,
+      "reward": 0.6666666865348816,
+      "reward_std": 0.35634830594062805,
+      "rewards/itbench_correctness/mean": 0.6666666865348816,
+      "rewards/itbench_correctness/std": 0.42163705825805664,
+      "step": 55,
+      "step_time": 172.69540655519813
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 557.0,
+      "completions/max_terminated_length": 557.0,
+      "completions/mean_length": 365.875,
+      "completions/mean_terminated_length": 365.875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3881106972694397,
+      "epoch": 0.2962962962962963,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.25,
+      "kl": 0.001151208532974124,
+      "learning_rate": 9.999316524962345e-07,
+      "loss": -0.0416,
+      "num_tokens": 1010651.0,
+      "reward": 0.359375,
+      "reward_std": 0.1751839816570282,
+      "rewards/itbench_correctness/mean": 0.359375,
+      "rewards/itbench_correctness/std": 0.1875,
+      "step": 56,
+      "step_time": 88.2503134328872
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 928.0,
+      "completions/mean_length": 699.6875,
+      "completions/mean_terminated_length": 552.2727661132812,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.523090660572052,
+      "epoch": 0.30158730158730157,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.001052780426107347,
+      "learning_rate": 9.999015805811963e-07,
+      "loss": -0.1995,
+      "num_tokens": 1031726.0,
+      "reward": 0.2864583432674408,
+      "reward_std": 0.1927037239074707,
+      "rewards/itbench_correctness/mean": 0.2864583432674408,
+      "rewards/itbench_correctness/std": 0.2652195990085602,
+      "step": 57,
+      "step_time": 354.65077784564346
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 657.0625,
+      "completions/mean_terminated_length": 490.2727355957031,
+      "completions/min_length": 336.0,
+      "completions/min_terminated_length": 336.0,
+      "entropy": 0.47179681062698364,
+      "epoch": 0.30687830687830686,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.03125,
+      "kl": 0.0011720317415893078,
+      "learning_rate": 9.998660418225644e-07,
+      "loss": 0.0026,
+      "num_tokens": 1048359.0,
+      "reward": 0.6420454978942871,
+      "reward_std": 0.07464002817869186,
+      "rewards/itbench_correctness/mean": 0.6420454978942871,
+      "rewards/itbench_correctness/std": 0.38350099325180054,
+      "step": 58,
+      "step_time": 612.9088207762688
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 757.1875,
+      "completions/mean_terminated_length": 597.1000366210938,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5388361811637878,
+      "epoch": 0.31216931216931215,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8125,
+      "kl": 0.001013660104945302,
+      "learning_rate": 9.998250366089846e-07,
+      "loss": -0.0625,
+      "num_tokens": 1067810.0,
+      "reward": 0.375,
+      "reward_std": 0.33407655358314514,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.3979112207889557,
+      "step": 59,
+      "step_time": 368.7298939973116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 921.625,
+      "completions/mean_terminated_length": 696.4000244140625,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "entropy": 0.49694833159446716,
+      "epoch": 0.31746031746031744,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.15625,
+      "kl": 0.0011713014682754874,
+      "learning_rate": 9.997785653888834e-07,
+      "loss": 0.0514,
+      "num_tokens": 1093876.0,
+      "reward": 0.140625,
+      "reward_std": 0.26977968215942383,
+      "rewards/itbench_correctness/mean": 0.140625,
+      "rewards/itbench_correctness/std": 0.2733854353427887,
+      "step": 60,
+      "step_time": 751.7327463729307
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 893.0,
+      "completions/mean_length": 780.5625,
+      "completions/mean_terminated_length": 634.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4586436152458191,
+      "epoch": 0.32275132275132273,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9296875,
+      "kl": 0.0011096000671386719,
+      "learning_rate": 9.99726628670463e-07,
+      "loss": -0.0469,
+      "num_tokens": 1113397.0,
+      "reward": 0.5833333134651184,
+      "reward_std": 0.3903999924659729,
+      "rewards/itbench_correctness/mean": 0.5833333134651184,
+      "rewards/itbench_correctness/std": 0.3884918689727783,
+      "step": 61,
+      "step_time": 380.8789173979312
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 782.6875,
+      "completions/mean_terminated_length": 702.25,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5136149525642395,
+      "epoch": 0.328042328042328,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.03125,
+      "kl": 0.0011364103993400931,
+      "learning_rate": 9.996692270216946e-07,
+      "loss": -0.0176,
+      "num_tokens": 1133432.0,
+      "reward": 0.581250011920929,
+      "reward_std": 0.37959763407707214,
+      "rewards/itbench_correctness/mean": 0.581250011920929,
+      "rewards/itbench_correctness/std": 0.4445503354072571,
+      "step": 62,
+      "step_time": 86.81900852825493
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 585.0,
+      "completions/max_terminated_length": 585.0,
+      "completions/mean_length": 403.1875,
+      "completions/mean_terminated_length": 403.1875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3745155930519104,
+      "epoch": 0.3333333333333333,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0078125,
+      "kl": 0.0013223676942288876,
+      "learning_rate": 9.996063610703135e-07,
+      "loss": 0.0032,
+      "num_tokens": 1148299.0,
+      "reward": 0.4635416865348816,
+      "reward_std": 0.06842001527547836,
+      "rewards/itbench_correctness/mean": 0.4635416865348816,
+      "rewards/itbench_correctness/std": 0.48778483271598816,
+      "step": 63,
+      "step_time": 649.0491365483031
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 997.625,
+      "completions/mean_terminated_length": 883.3333740234375,
+      "completions/min_length": 725.0,
+      "completions/min_terminated_length": 725.0,
+      "entropy": 0.3588522672653198,
+      "epoch": 0.3386243386243386,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4765625,
+      "kl": 0.0009991804836317897,
+      "learning_rate": 9.995380315038117e-07,
+      "loss": 0.0,
+      "num_tokens": 1174997.0,
+      "reward": 0.2083333432674408,
+      "reward_std": 0.11785111576318741,
+      "rewards/itbench_correctness/mean": 0.2083333432674408,
+      "rewards/itbench_correctness/std": 0.2687419056892395,
+      "step": 64,
+      "step_time": 110.76476481370628
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 632.0,
+      "completions/max_terminated_length": 632.0,
+      "completions/mean_length": 454.0625,
+      "completions/mean_terminated_length": 454.0625,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.5461803078651428,
+      "epoch": 0.3439153439153439,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2421875,
+      "kl": 0.0015210240380838513,
+      "learning_rate": 9.994642390694308e-07,
+      "loss": -0.0216,
+      "num_tokens": 1187070.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 65,
+      "step_time": 79.26396809145808
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 948.0,
+      "completions/mean_length": 719.8125,
+      "completions/mean_terminated_length": 581.5454711914062,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "entropy": 0.5056872367858887,
+      "epoch": 0.3492063492063492,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.0013450310798361897,
+      "learning_rate": 9.993849845741523e-07,
+      "loss": -0.0131,
+      "num_tokens": 1207347.0,
+      "reward": 0.375,
+      "reward_std": 0.5175491571426392,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 66,
+      "step_time": 337.5469845244661
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 438.0,
+      "completions/max_terminated_length": 438.0,
+      "completions/mean_length": 396.625,
+      "completions/mean_terminated_length": 396.625,
+      "completions/min_length": 300.0,
+      "completions/min_terminated_length": 300.0,
+      "entropy": 0.35297825932502747,
+      "epoch": 0.3544973544973545,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5546875,
+      "kl": 0.0014365667011588812,
+      "learning_rate": 9.993002688846912e-07,
+      "loss": 0.0055,
+      "num_tokens": 1216421.0,
+      "reward": 0.28125,
+      "reward_std": 0.3061639666557312,
+      "rewards/itbench_correctness/mean": 0.28125,
+      "rewards/itbench_correctness/std": 0.3400367796421051,
+      "step": 67,
+      "step_time": 1142.9677757564932
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 682.5,
+      "completions/mean_terminated_length": 603.6923217773438,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4981684982776642,
+      "epoch": 0.35978835978835977,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0014837104827165604,
+      "learning_rate": 9.992100929274846e-07,
+      "loss": -0.0849,
+      "num_tokens": 1231053.0,
+      "reward": 0.5208333134651184,
+      "reward_std": 0.4382143020629883,
+      "rewards/itbench_correctness/mean": 0.5208333134651184,
+      "rewards/itbench_correctness/std": 0.4326561689376831,
+      "step": 68,
+      "step_time": 479.8328125309199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 817.875,
+      "completions/mean_terminated_length": 694.2000122070312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5012990832328796,
+      "epoch": 0.36507936507936506,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.25,
+      "kl": 0.001043464639224112,
+      "learning_rate": 9.991144576886822e-07,
+      "loss": 0.0323,
+      "num_tokens": 1249699.0,
+      "reward": 0.4642857015132904,
+      "reward_std": 0.22637419402599335,
+      "rewards/itbench_correctness/mean": 0.4642857015132904,
+      "rewards/itbench_correctness/std": 0.4928053915500641,
+      "step": 69,
+      "step_time": 82.86210318095982
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1020.0,
+      "completions/mean_length": 693.5625,
+      "completions/mean_terminated_length": 646.357177734375,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.2220419943332672,
+      "epoch": 0.37037037037037035,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.388671875,
+      "kl": 0.0010405785869807005,
+      "learning_rate": 9.990133642141357e-07,
+      "loss": -0.0727,
+      "num_tokens": 1271964.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 70,
+      "step_time": 202.81221913732588
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1007.0,
+      "completions/mean_length": 769.125,
+      "completions/mean_terminated_length": 710.3077392578125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4264586269855499,
+      "epoch": 0.37566137566137564,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.890625,
+      "kl": 0.0012975148856639862,
+      "learning_rate": 9.989068136093872e-07,
+      "loss": -0.0651,
+      "num_tokens": 1293950.0,
+      "reward": 0.6597222089767456,
+      "reward_std": 0.4048736095428467,
+      "rewards/itbench_correctness/mean": 0.6597222089767456,
+      "rewards/itbench_correctness/std": 0.4526442587375641,
+      "step": 71,
+      "step_time": 454.2652143603191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 981.0,
+      "completions/mean_length": 711.125,
+      "completions/mean_terminated_length": 690.2667236328125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.28124451637268066,
+      "epoch": 0.38095238095238093,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.51953125,
+      "kl": 0.0010576838394626975,
+      "learning_rate": 9.98794807039657e-07,
+      "loss": -0.085,
+      "num_tokens": 1311536.0,
+      "reward": 0.8125,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 72,
+      "step_time": 104.54512037336826
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 850.0,
+      "completions/max_terminated_length": 850.0,
+      "completions/mean_length": 516.25,
+      "completions/mean_terminated_length": 516.25,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "entropy": 0.46295398473739624,
+      "epoch": 0.3862433862433862,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1640625,
+      "kl": 0.0009957690490409732,
+      "learning_rate": 9.98677345729831e-07,
+      "loss": -0.0093,
+      "num_tokens": 1322844.0,
+      "reward": 0.6812499761581421,
+      "reward_std": 0.062321171164512634,
+      "rewards/itbench_correctness/mean": 0.6812499761581421,
+      "rewards/itbench_correctness/std": 0.3400367796421051,
+      "step": 73,
+      "step_time": 635.4008999932557
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 916.0,
+      "completions/max_terminated_length": 916.0,
+      "completions/mean_length": 674.0,
+      "completions/mean_terminated_length": 674.0,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.48961424827575684,
+      "epoch": 0.3915343915343915,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0283203125,
+      "kl": 0.0012715591583400965,
+      "learning_rate": 9.985544309644473e-07,
+      "loss": 0.0,
+      "num_tokens": 1342212.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 1.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 74,
+      "step_time": 107.24268661439419
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 889.125,
+      "completions/mean_terminated_length": 844.1666870117188,
+      "completions/min_length": 570.0,
+      "completions/min_terminated_length": 570.0,
+      "entropy": 0.5128637552261353,
+      "epoch": 0.3968253968253968,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.640625,
+      "kl": 0.0014187946217134595,
+      "learning_rate": 9.98426064087682e-07,
+      "loss": 0.0213,
+      "num_tokens": 1371870.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 75,
+      "step_time": 283.4586225701496
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1012.0,
+      "completions/mean_length": 880.9375,
+      "completions/mean_terminated_length": 566.2000122070312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.6129833459854126,
+      "epoch": 0.4021164021164021,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2421875,
+      "kl": 0.0010527537669986486,
+      "learning_rate": 9.982922465033348e-07,
+      "loss": 0.0007,
+      "num_tokens": 1399837.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 76,
+      "step_time": 86.58735218271613
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 715.6875,
+      "completions/mean_terminated_length": 715.6875,
+      "completions/min_length": 398.0,
+      "completions/min_terminated_length": 398.0,
+      "entropy": 0.3325473666191101,
+      "epoch": 0.4074074074074074,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2265625,
+      "kl": 0.0010047402465716004,
+      "learning_rate": 9.981529796748134e-07,
+      "loss": 0.0174,
+      "num_tokens": 1423416.0,
+      "reward": 0.25833335518836975,
+      "reward_std": 0.07715167105197906,
+      "rewards/itbench_correctness/mean": 0.25833335518836975,
+      "rewards/itbench_correctness/std": 0.19455552101135254,
+      "step": 77,
+      "step_time": 97.38318173773587
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 476.0,
+      "completions/max_terminated_length": 476.0,
+      "completions/mean_length": 376.1875,
+      "completions/mean_terminated_length": 376.1875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4253198206424713,
+      "epoch": 0.4126984126984127,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.2109375,
+      "kl": 0.0014126732712611556,
+      "learning_rate": 9.980082651251174e-07,
+      "loss": -0.0444,
+      "num_tokens": 1432859.0,
+      "reward": 0.5520833730697632,
+      "reward_std": 0.29967689514160156,
+      "rewards/itbench_correctness/mean": 0.5520833730697632,
+      "rewards/itbench_correctness/std": 0.32185083627700806,
+      "step": 78,
+      "step_time": 62.12770148552954
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1012.0,
+      "completions/mean_length": 779.1875,
+      "completions/mean_terminated_length": 744.2142944335938,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4029838740825653,
+      "epoch": 0.41798941798941797,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1953125,
+      "kl": 0.0012558053713291883,
+      "learning_rate": 9.978581044368217e-07,
+      "loss": -0.0309,
+      "num_tokens": 1461190.0,
+      "reward": 0.1354166716337204,
+      "reward_std": 0.07634378224611282,
+      "rewards/itbench_correctness/mean": 0.1354166716337204,
+      "rewards/itbench_correctness/std": 0.17447009682655334,
+      "step": 79,
+      "step_time": 79.0433895830065
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 958.0,
+      "completions/mean_length": 858.5625,
+      "completions/mean_terminated_length": 729.888916015625,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.36106863617897034,
+      "epoch": 0.42328042328042326,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.6171875,
+      "kl": 0.0013509814161807299,
+      "learning_rate": 9.977024992520601e-07,
+      "loss": 0.0063,
+      "num_tokens": 1483655.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 80,
+      "step_time": 7292.784606534056
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 864.0,
+      "completions/max_terminated_length": 864.0,
+      "completions/mean_length": 627.6875,
+      "completions/mean_terminated_length": 627.6875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5671612024307251,
+      "epoch": 0.42857142857142855,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.515625,
+      "kl": 0.0016267532482743263,
+      "learning_rate": 9.975414512725056e-07,
+      "loss": -0.0913,
+      "num_tokens": 1504522.0,
+      "reward": 0.5,
+      "reward_std": 0.3535533845424652,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 81,
+      "step_time": 89.92690824903548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 961.0,
+      "completions/max_terminated_length": 961.0,
+      "completions/mean_length": 600.375,
+      "completions/mean_terminated_length": 600.375,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "entropy": 0.264834463596344,
+      "epoch": 0.43386243386243384,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.515625,
+      "kl": 0.0011798108462244272,
+      "learning_rate": 9.973749622593532e-07,
+      "loss": -0.0018,
+      "num_tokens": 1519384.0,
+      "reward": 0.5625,
+      "reward_std": 0.1462520956993103,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.19364917278289795,
+      "step": 82,
+      "step_time": 92.88886137399822
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 584.0,
+      "completions/max_terminated_length": 584.0,
+      "completions/mean_length": 491.9375,
+      "completions/mean_terminated_length": 491.9375,
+      "completions/min_length": 362.0,
+      "completions/min_terminated_length": 362.0,
+      "entropy": 0.35370346903800964,
+      "epoch": 0.43915343915343913,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1484375,
+      "kl": 0.001843614736571908,
+      "learning_rate": 9.972030340333e-07,
+      "loss": 0.0148,
+      "num_tokens": 1531063.0,
+      "reward": 0.3020833134651184,
+      "reward_std": 0.1386406421661377,
+      "rewards/itbench_correctness/mean": 0.3020833134651184,
+      "rewards/itbench_correctness/std": 0.36498987674713135,
+      "step": 83,
+      "step_time": 1134.5993446996436
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 674.0,
+      "completions/max_terminated_length": 674.0,
+      "completions/mean_length": 475.75,
+      "completions/mean_terminated_length": 475.75,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.41828691959381104,
+      "epoch": 0.4444444444444444,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.046875,
+      "kl": 0.001323950826190412,
+      "learning_rate": 9.970256684745255e-07,
+      "loss": -0.0128,
+      "num_tokens": 1542371.0,
+      "reward": 0.75,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 84,
+      "step_time": 89.19195851124823
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 975.0,
+      "completions/mean_length": 987.0,
+      "completions/mean_terminated_length": 876.0,
+      "completions/min_length": 824.0,
+      "completions/min_terminated_length": 824.0,
+      "entropy": 0.3343465030193329,
+      "epoch": 0.4497354497354497,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.140625,
+      "kl": 0.0010419428581371903,
+      "learning_rate": 9.968428675226713e-07,
+      "loss": 0.0338,
+      "num_tokens": 1576531.0,
+      "reward": 0.6875,
+      "reward_std": 0.32618680596351624,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.42108768224716187,
+      "step": 85,
+      "step_time": 85.11601546406746
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 729.0,
+      "completions/mean_length": 709.0625,
+      "completions/mean_terminated_length": 520.1000366210938,
+      "completions/min_length": 420.0,
+      "completions/min_terminated_length": 420.0,
+      "entropy": 0.33565446734428406,
+      "epoch": 0.455026455026455,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.98828125,
+      "kl": 0.0012508188374340534,
+      "learning_rate": 9.966546331768192e-07,
+      "loss": -0.0029,
+      "num_tokens": 1595508.0,
+      "reward": 0.5104166865348816,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.5104166865348816,
+      "rewards/itbench_correctness/std": 0.2543601393699646,
+      "step": 86,
+      "step_time": 110.2943638684228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 754.0,
+      "completions/max_terminated_length": 754.0,
+      "completions/mean_length": 485.75,
+      "completions/mean_terminated_length": 485.75,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "entropy": 0.5352547764778137,
+      "epoch": 0.4603174603174603,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4296875,
+      "kl": 0.0011995767708867788,
+      "learning_rate": 9.964609674954695e-07,
+      "loss": 0.0036,
+      "num_tokens": 1608696.0,
+      "reward": 0.3125,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 87,
+      "step_time": 85.32795084360987
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 900.0,
+      "completions/mean_length": 802.3125,
+      "completions/mean_terminated_length": 751.1538696289062,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.40632545948028564,
+      "epoch": 0.4656084656084656,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.75,
+      "kl": 0.0009902386227622628,
+      "learning_rate": 9.962618725965194e-07,
+      "loss": -0.0316,
+      "num_tokens": 1627885.0,
+      "reward": 0.4479166865348816,
+      "reward_std": 0.3577525019645691,
+      "rewards/itbench_correctness/mean": 0.4479166865348816,
+      "rewards/itbench_correctness/std": 0.420399934053421,
+      "step": 88,
+      "step_time": 81.01259941980243
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 698.0,
+      "completions/max_terminated_length": 698.0,
+      "completions/mean_length": 542.0625,
+      "completions/mean_terminated_length": 542.0625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4243053197860718,
+      "epoch": 0.4708994708994709,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.2421875,
+      "kl": 0.0011555891251191497,
+      "learning_rate": 9.960573506572389e-07,
+      "loss": -0.0988,
+      "num_tokens": 1640238.0,
+      "reward": 0.53515625,
+      "reward_std": 0.2504205107688904,
+      "rewards/itbench_correctness/mean": 0.53515625,
+      "rewards/itbench_correctness/std": 0.43777894973754883,
+      "step": 89,
+      "step_time": 97.55466525349766
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 963.0,
+      "completions/mean_length": 807.875,
+      "completions/mean_terminated_length": 709.6364135742188,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.48522359132766724,
+      "epoch": 0.47619047619047616,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5,
+      "kl": 0.001208159956149757,
+      "learning_rate": 9.958474039142469e-07,
+      "loss": -0.1015,
+      "num_tokens": 1668412.0,
+      "reward": 0.10625000298023224,
+      "reward_std": 0.1334051787853241,
+      "rewards/itbench_correctness/mean": 0.10625000298023224,
+      "rewards/itbench_correctness/std": 0.16111589968204498,
+      "step": 90,
+      "step_time": 459.5639867214486
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 543.0625,
+      "completions/mean_terminated_length": 543.0625,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.5929335951805115,
+      "epoch": 0.48148148148148145,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.703125,
+      "kl": 0.0014933362836018205,
+      "learning_rate": 9.956320346634875e-07,
+      "loss": -0.0536,
+      "num_tokens": 1681853.0,
+      "reward": 0.8125,
+      "reward_std": 0.32946425676345825,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.3256048858165741,
+      "step": 91,
+      "step_time": 78.2018728973344
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 705.0,
+      "completions/mean_length": 682.625,
+      "completions/mean_terminated_length": 527.45458984375,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "entropy": 0.38381248712539673,
+      "epoch": 0.48677248677248675,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.06689453125,
+      "kl": 0.001028747414238751,
+      "learning_rate": 9.954112452602043e-07,
+      "loss": 0.0,
+      "num_tokens": 1707895.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 92,
+      "step_time": 160.40463780704886
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 984.0625,
+      "completions/mean_terminated_length": 704.5,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.30079391598701477,
+      "epoch": 0.49206349206349204,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.890625,
+      "kl": 0.0009068697690963745,
+      "learning_rate": 9.95185038118915e-07,
+      "loss": -0.0136,
+      "num_tokens": 1733104.0,
+      "reward": 0.53125,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.53125,
+      "rewards/itbench_correctness/std": 0.4989572763442993,
+      "step": 93,
+      "step_time": 135.90597889758646
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 833.0,
+      "completions/mean_length": 646.875,
+      "completions/mean_terminated_length": 475.4545593261719,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5750724673271179,
+      "epoch": 0.4973544973544973,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5625,
+      "kl": 0.001511996379122138,
+      "learning_rate": 9.949534157133844e-07,
+      "loss": -0.1351,
+      "num_tokens": 1762622.0,
+      "reward": 0.4765625,
+      "reward_std": 0.32506585121154785,
+      "rewards/itbench_correctness/mean": 0.4765625,
+      "rewards/itbench_correctness/std": 0.3958607614040375,
+      "step": 94,
+      "step_time": 178.96230245847255
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 802.5625,
+      "completions/mean_terminated_length": 751.4615478515625,
+      "completions/min_length": 542.0,
+      "completions/min_terminated_length": 542.0,
+      "entropy": 0.5806401371955872,
+      "epoch": 0.5026455026455027,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.125,
+      "kl": 0.0013606835855171084,
+      "learning_rate": 9.947163805765979e-07,
+      "loss": 0.0764,
+      "num_tokens": 1795879.0,
+      "reward": 0.48124998807907104,
+      "reward_std": 0.1944543570280075,
+      "rewards/itbench_correctness/mean": 0.48124998807907104,
+      "rewards/itbench_correctness/std": 0.47359442710876465,
+      "step": 95,
+      "step_time": 182.67914429306984
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 956.0,
+      "completions/max_terminated_length": 956.0,
+      "completions/mean_length": 667.5,
+      "completions/mean_terminated_length": 667.5,
+      "completions/min_length": 441.0,
+      "completions/min_terminated_length": 441.0,
+      "entropy": 0.32958802580833435,
+      "epoch": 0.5079365079365079,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6015625,
+      "kl": 0.0008195637492462993,
+      "learning_rate": 9.944739353007341e-07,
+      "loss": 0.0178,
+      "num_tokens": 1811303.0,
+      "reward": 0.8374999761581421,
+      "reward_std": 0.09672200679779053,
+      "rewards/itbench_correctness/mean": 0.8374999761581421,
+      "rewards/itbench_correctness/std": 0.1031898632645607,
+      "step": 96,
+      "step_time": 74.22002993617207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 877.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 519.4375,
+      "completions/mean_terminated_length": 519.4375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3407532274723053,
+      "epoch": 0.5132275132275133,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.1171875,
+      "kl": 0.0011502447305247188,
+      "learning_rate": 9.942260825371357e-07,
+      "loss": -0.1158,
+      "num_tokens": 1824454.0,
+      "reward": 0.5687500238418579,
+      "reward_std": 0.23231291770935059,
+      "rewards/itbench_correctness/mean": 0.5687500238418579,
+      "rewards/itbench_correctness/std": 0.2676284909248352,
+      "step": 97,
+      "step_time": 72.25101596303284
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 685.0,
+      "completions/max_terminated_length": 685.0,
+      "completions/mean_length": 519.75,
+      "completions/mean_terminated_length": 519.75,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "entropy": 0.49639248847961426,
+      "epoch": 0.5185185185185185,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6640625,
+      "kl": 0.0014472255716100335,
+      "learning_rate": 9.939728249962806e-07,
+      "loss": -0.0098,
+      "num_tokens": 1844642.0,
+      "reward": 0.8500000238418579,
+      "reward_std": 0.2121320366859436,
+      "rewards/itbench_correctness/mean": 0.8500000238418579,
+      "rewards/itbench_correctness/std": 0.24765567481517792,
+      "step": 98,
+      "step_time": 68.29791031684726
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 853.0,
+      "completions/mean_length": 692.9375,
+      "completions/mean_terminated_length": 542.45458984375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5685938596725464,
+      "epoch": 0.5238095238095238,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.6171875,
+      "kl": 0.0014234319096431136,
+      "learning_rate": 9.937141654477528e-07,
+      "loss": -0.1176,
+      "num_tokens": 1866377.0,
+      "reward": 0.375,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 99,
+      "step_time": 99.10520203411579
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 755.0,
+      "completions/max_terminated_length": 755.0,
+      "completions/mean_length": 337.6875,
+      "completions/mean_terminated_length": 337.6875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3746066987514496,
+      "epoch": 0.5291005291005291,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.97265625,
+      "kl": 0.0013704805169254541,
+      "learning_rate": 9.934501067202117e-07,
+      "loss": -0.0118,
+      "num_tokens": 1874500.0,
+      "reward": 0.3125,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.3095695972442627,
+      "step": 100,
+      "step_time": 831.8933219816536
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 823.0,
+      "completions/mean_length": 611.1875,
+      "completions/mean_terminated_length": 473.5833435058594,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "entropy": 0.5955619215965271,
+      "epoch": 0.5343915343915344,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0012612693244591355,
+      "learning_rate": 9.931806517013612e-07,
+      "loss": 0.0328,
+      "num_tokens": 1899799.0,
+      "reward": 0.125,
+      "reward_std": 0.2925041913986206,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.28867512941360474,
+      "step": 101,
+      "step_time": 185.49466035328805
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 743.0625,
+      "completions/mean_terminated_length": 724.3333740234375,
+      "completions/min_length": 528.0,
+      "completions/min_terminated_length": 528.0,
+      "entropy": 0.4737151861190796,
+      "epoch": 0.5396825396825397,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.609375,
+      "kl": 0.0012106020003557205,
+      "learning_rate": 9.929058033379181e-07,
+      "loss": 0.0185,
+      "num_tokens": 1915728.0,
+      "reward": 0.8194444179534912,
+      "reward_std": 0.20520132780075073,
+      "rewards/itbench_correctness/mean": 0.8194444179534912,
+      "rewards/itbench_correctness/std": 0.3367112874984741,
+      "step": 102,
+      "step_time": 418.81876328215003
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 887.0,
+      "completions/mean_length": 761.3125,
+      "completions/mean_terminated_length": 557.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5017650723457336,
+      "epoch": 0.544973544973545,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8125,
+      "kl": 0.0011779210763052106,
+      "learning_rate": 9.926255646355803e-07,
+      "loss": -0.1277,
+      "num_tokens": 1953421.0,
+      "reward": 0.2708333432674408,
+      "reward_std": 0.4082317352294922,
+      "rewards/itbench_correctness/mean": 0.2708333432674408,
+      "rewards/itbench_correctness/std": 0.4254627227783203,
+      "step": 103,
+      "step_time": 131.8819383457303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 631.1875,
+      "completions/mean_terminated_length": 605.0000610351562,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "entropy": 0.41825923323631287,
+      "epoch": 0.5502645502645502,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3046875,
+      "kl": 0.001125396229326725,
+      "learning_rate": 9.923399386589932e-07,
+      "loss": 0.0027,
+      "num_tokens": 1967568.0,
+      "reward": 0.967524528503418,
+      "reward_std": 0.0356326624751091,
+      "rewards/itbench_correctness/mean": 0.967524528503418,
+      "rewards/itbench_correctness/std": 0.059118952602148056,
+      "step": 104,
+      "step_time": 237.89590667374432
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 975.0,
+      "completions/mean_length": 437.25,
+      "completions/mean_terminated_length": 398.13336181640625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.41623786091804504,
+      "epoch": 0.5555555555555556,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3671875,
+      "kl": 0.001291301567107439,
+      "learning_rate": 9.92048928531717e-07,
+      "loss": -0.0479,
+      "num_tokens": 1981084.0,
+      "reward": 0.46875,
+      "reward_std": 0.1883128434419632,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.2525334656238556,
+      "step": 105,
+      "step_time": 178.8811132274568
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 492.0,
+      "completions/mean_length": 725.9375,
+      "completions/mean_terminated_length": 427.875,
+      "completions/min_length": 380.0,
+      "completions/min_terminated_length": 380.0,
+      "entropy": 0.5179509520530701,
+      "epoch": 0.5608465608465608,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6796875,
+      "kl": 0.0009696544148027897,
+      "learning_rate": 9.917525374361911e-07,
+      "loss": 0.0018,
+      "num_tokens": 1999387.0,
+      "reward": 0.546875,
+      "reward_std": 0.22097086906433105,
+      "rewards/itbench_correctness/mean": 0.546875,
+      "rewards/itbench_correctness/std": 0.5018196105957031,
+      "step": 106,
+      "step_time": 493.8660353682935
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 911.375,
+      "completions/mean_terminated_length": 843.7999877929688,
+      "completions/min_length": 538.0,
+      "completions/min_terminated_length": 538.0,
+      "entropy": 0.34014537930488586,
+      "epoch": 0.5661375661375662,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4453125,
+      "kl": 0.001088326214812696,
+      "learning_rate": 9.914507686137017e-07,
+      "loss": 0.0167,
+      "num_tokens": 2022945.0,
+      "reward": 0.35624998807907104,
+      "reward_std": 0.11475905776023865,
+      "rewards/itbench_correctness/mean": 0.35624998807907104,
+      "rewards/itbench_correctness/std": 0.3999479115009308,
+      "step": 107,
+      "step_time": 235.87840359471738
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 700.0,
+      "completions/max_terminated_length": 700.0,
+      "completions/mean_length": 542.4375,
+      "completions/mean_terminated_length": 542.4375,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.4701002538204193,
+      "epoch": 0.5714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7421875,
+      "kl": 0.0014570436906069517,
+      "learning_rate": 9.911436253643443e-07,
+      "loss": 0.0162,
+      "num_tokens": 2036592.0,
+      "reward": 0.8567708134651184,
+      "reward_std": 0.19427995383739471,
+      "rewards/itbench_correctness/mean": 0.8567708134651184,
+      "rewards/itbench_correctness/std": 0.24054758250713348,
+      "step": 108,
+      "step_time": 129.46329625695944
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 1023.4375,
+      "completions/mean_terminated_length": 1015.0,
+      "completions/min_length": 1015.0,
+      "completions/min_terminated_length": 1015.0,
+      "entropy": 0.5901679396629333,
+      "epoch": 0.5767195767195767,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.046875,
+      "kl": 0.0010806693462654948,
+      "learning_rate": 9.90831111046988e-07,
+      "loss": 0.0009,
+      "num_tokens": 2060871.0,
+      "reward": 0.15625,
+      "reward_std": 0.3198433816432953,
+      "rewards/itbench_correctness/mean": 0.15625,
+      "rewards/itbench_correctness/std": 0.3520771861076355,
+      "step": 109,
+      "step_time": 73.70483169332147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 885.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 689.5,
+      "completions/mean_terminated_length": 689.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.39158812165260315,
+      "epoch": 0.582010582010582,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2109375,
+      "kl": 0.001037560636177659,
+      "learning_rate": 9.905132290792392e-07,
+      "loss": -0.0033,
+      "num_tokens": 2076943.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 110,
+      "step_time": 73.8764311010018
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 369.0,
+      "completions/mean_length": 513.0,
+      "completions/mean_terminated_length": 206.40000915527344,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5146198868751526,
+      "epoch": 0.5873015873015873,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5,
+      "kl": 0.0013582897372543812,
+      "learning_rate": 9.901899829374047e-07,
+      "loss": -0.1464,
+      "num_tokens": 2089871.0,
+      "reward": 0.3740079402923584,
+      "reward_std": 0.34763163328170776,
+      "rewards/itbench_correctness/mean": 0.3740079402923584,
+      "rewards/itbench_correctness/std": 0.3568885028362274,
+      "step": 111,
+      "step_time": 695.7899582823738
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 603.0,
+      "completions/max_terminated_length": 603.0,
+      "completions/mean_length": 466.9375,
+      "completions/mean_terminated_length": 466.9375,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "entropy": 0.3662160336971283,
+      "epoch": 0.5925925925925926,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01226806640625,
+      "kl": 0.0011427823919802904,
+      "learning_rate": 9.89861376156452e-07,
+      "loss": 0.0,
+      "num_tokens": 2100646.0,
+      "reward": 0.4166666865348816,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.4166666865348816,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 112,
+      "step_time": 65.8763862894848
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1004.0,
+      "completions/mean_length": 998.875,
+      "completions/mean_terminated_length": 823.0,
+      "completions/min_length": 642.0,
+      "completions/min_terminated_length": 642.0,
+      "entropy": 0.3023401200771332,
+      "epoch": 0.5978835978835979,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3046875,
+      "kl": 0.0008460183744318783,
+      "learning_rate": 9.895274123299722e-07,
+      "loss": 0.0013,
+      "num_tokens": 2126916.0,
+      "reward": 0.28125,
+      "reward_std": 0.2086307406425476,
+      "rewards/itbench_correctness/mean": 0.28125,
+      "rewards/itbench_correctness/std": 0.4069705307483673,
+      "step": 113,
+      "step_time": 870.3144110767171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 986.0,
+      "completions/mean_length": 618.875,
+      "completions/mean_terminated_length": 525.3846435546875,
+      "completions/min_length": 288.0,
+      "completions/min_terminated_length": 288.0,
+      "entropy": 0.47182387113571167,
+      "epoch": 0.6031746031746031,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4609375,
+      "kl": 0.0011947272578254342,
+      "learning_rate": 9.891880951101407e-07,
+      "loss": -0.0027,
+      "num_tokens": 2140634.0,
+      "reward": 0.15416666865348816,
+      "reward_std": 0.21283237636089325,
+      "rewards/itbench_correctness/mean": 0.15416666865348816,
+      "rewards/itbench_correctness/std": 0.3315228819847107,
+      "step": 114,
+      "step_time": 111.45921329036355
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 748.0,
+      "completions/mean_length": 831.6875,
+      "completions/mean_terminated_length": 639.375,
+      "completions/min_length": 546.0,
+      "completions/min_terminated_length": 546.0,
+      "entropy": 0.3174269199371338,
+      "epoch": 0.6084656084656085,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.296875,
+      "kl": 0.0009422831353731453,
+      "learning_rate": 9.888434282076757e-07,
+      "loss": 0.0093,
+      "num_tokens": 2159877.0,
+      "reward": 0.10546875,
+      "reward_std": 0.07999982684850693,
+      "rewards/itbench_correctness/mean": 0.10546875,
+      "rewards/itbench_correctness/std": 0.1543108969926834,
+      "step": 115,
+      "step_time": 162.2415656549856
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 849.0,
+      "completions/mean_length": 833.0,
+      "completions/mean_terminated_length": 718.4000244140625,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "entropy": 0.5162065029144287,
+      "epoch": 0.6137566137566137,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2421875,
+      "kl": 0.0014166326727718115,
+      "learning_rate": 9.884934153917996e-07,
+      "loss": 0.0456,
+      "num_tokens": 2190885.0,
+      "reward": 0.21875,
+      "reward_std": 0.1735912710428238,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.3275540769100189,
+      "step": 116,
+      "step_time": 763.6827120250091
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 999.0,
+      "completions/mean_length": 753.6875,
+      "completions/mean_terminated_length": 591.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.41927191615104675,
+      "epoch": 0.6190476190476191,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.734375,
+      "kl": 0.0011654142290353775,
+      "learning_rate": 9.881380604901963e-07,
+      "loss": -0.1407,
+      "num_tokens": 2212584.0,
+      "reward": 0.2708333134651184,
+      "reward_std": 0.3443610668182373,
+      "rewards/itbench_correctness/mean": 0.2708333134651184,
+      "rewards/itbench_correctness/std": 0.33471935987472534,
+      "step": 117,
+      "step_time": 234.95893322955817
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 748.8125,
+      "completions/mean_terminated_length": 623.727294921875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4727485179901123,
+      "epoch": 0.6243386243386243,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.09375,
+      "kl": 0.0017693624831736088,
+      "learning_rate": 9.8777736738897e-07,
+      "loss": -0.09,
+      "num_tokens": 2236157.0,
+      "reward": 0.2291666716337204,
+      "reward_std": 0.3471825420856476,
+      "rewards/itbench_correctness/mean": 0.2291666716337204,
+      "rewards/itbench_correctness/std": 0.35420751571655273,
+      "step": 118,
+      "step_time": 141.18642224557698
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 832.25,
+      "completions/mean_terminated_length": 683.1111450195312,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "entropy": 0.4133373498916626,
+      "epoch": 0.6296296296296297,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5,
+      "kl": 0.0010757006239145994,
+      "learning_rate": 9.87411340032603e-07,
+      "loss": 0.0049,
+      "num_tokens": 2259913.0,
+      "reward": 0.46875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.4989572763442993,
+      "step": 119,
+      "step_time": 577.6952238306403
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 707.6875,
+      "completions/mean_terminated_length": 461.6666564941406,
+      "completions/min_length": 336.0,
+      "completions/min_terminated_length": 336.0,
+      "entropy": 0.5736995339393616,
+      "epoch": 0.6349206349206349,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8125,
+      "kl": 0.0011166096664965153,
+      "learning_rate": 9.870399824239114e-07,
+      "loss": -0.0078,
+      "num_tokens": 2278228.0,
+      "reward": 0.3671875,
+      "reward_std": 0.2785572409629822,
+      "rewards/itbench_correctness/mean": 0.3671875,
+      "rewards/itbench_correctness/std": 0.2793920040130615,
+      "step": 120,
+      "step_time": 203.33785133063793
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 958.0,
+      "completions/mean_length": 780.75,
+      "completions/mean_terminated_length": 670.1818237304688,
+      "completions/min_length": 452.0,
+      "completions/min_terminated_length": 452.0,
+      "entropy": 0.40473902225494385,
+      "epoch": 0.6402116402116402,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8515625,
+      "kl": 0.0012949309311807156,
+      "learning_rate": 9.866632986240029e-07,
+      "loss": 0.0027,
+      "num_tokens": 2296336.0,
+      "reward": 0.4776785671710968,
+      "reward_std": 0.2322283834218979,
+      "rewards/itbench_correctness/mean": 0.4776785671710968,
+      "rewards/itbench_correctness/std": 0.4821428656578064,
+      "step": 121,
+      "step_time": 101.13796862587333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 877.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 618.75,
+      "completions/mean_terminated_length": 618.75,
+      "completions/min_length": 381.0,
+      "completions/min_terminated_length": 381.0,
+      "entropy": 0.5333333611488342,
+      "epoch": 0.6455026455026455,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1328125,
+      "kl": 0.0013620926765725017,
+      "learning_rate": 9.862812927522308e-07,
+      "loss": 0.0167,
+      "num_tokens": 2314388.0,
+      "reward": 0.6145833134651184,
+      "reward_std": 0.043129097670316696,
+      "rewards/itbench_correctness/mean": 0.6145833134651184,
+      "rewards/itbench_correctness/std": 0.40239447355270386,
+      "step": 122,
+      "step_time": 715.118090393953
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 574.0,
+      "completions/mean_length": 589.375,
+      "completions/mean_terminated_length": 444.5,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "entropy": 0.38006362318992615,
+      "epoch": 0.6507936507936508,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0010308035416528583,
+      "learning_rate": 9.858939689861506e-07,
+      "loss": 0.0628,
+      "num_tokens": 2330282.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.17097428441047668,
+      "rewards/itbench_correctness/mean": 0.5416666865348816,
+      "rewards/itbench_correctness/std": 0.197202667593956,
+      "step": 123,
+      "step_time": 104.44047453720123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 429.0,
+      "completions/mean_length": 700.125,
+      "completions/mean_terminated_length": 376.25,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.43992143869400024,
+      "epoch": 0.656084656084656,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0233154296875,
+      "kl": 0.0014201418962329626,
+      "learning_rate": 9.855013315614725e-07,
+      "loss": 0.0,
+      "num_tokens": 2353412.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 124,
+      "step_time": 91.80700621567667
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 996.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 658.0,
+      "completions/mean_terminated_length": 658.0,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.3844984769821167,
+      "epoch": 0.6613756613756614,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0859375,
+      "kl": 0.0017445363337174058,
+      "learning_rate": 9.851033847720164e-07,
+      "loss": 0.0,
+      "num_tokens": 2368164.0,
+      "reward": 0.25,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 125,
+      "step_time": 84.32240361534059
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 704.0,
+      "completions/mean_length": 700.6875,
+      "completions/mean_terminated_length": 449.22222900390625,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.6279546618461609,
+      "epoch": 0.6666666666666666,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.001191388233564794,
+      "learning_rate": 9.847001329696652e-07,
+      "loss": -0.0546,
+      "num_tokens": 2386335.0,
+      "reward": 0.41874998807907104,
+      "reward_std": 0.2509503960609436,
+      "rewards/itbench_correctness/mean": 0.41874998807907104,
+      "rewards/itbench_correctness/std": 0.3046172559261322,
+      "step": 126,
+      "step_time": 192.25429659802467
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 589.0,
+      "completions/mean_length": 673.125,
+      "completions/mean_terminated_length": 400.22222900390625,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.6596100330352783,
+      "epoch": 0.671957671957672,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.0013828243827447295,
+      "learning_rate": 9.842915805643156e-07,
+      "loss": -0.0019,
+      "num_tokens": 2410073.0,
+      "reward": 0.453125,
+      "reward_std": 0.13258251547813416,
+      "rewards/itbench_correctness/mean": 0.453125,
+      "rewards/itbench_correctness/std": 0.5018196105957031,
+      "step": 127,
+      "step_time": 370.60414741840214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 608.0,
+      "completions/mean_length": 675.125,
+      "completions/mean_terminated_length": 465.8000183105469,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.6161822080612183,
+      "epoch": 0.6772486772486772,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0390625,
+      "kl": 0.0011551063507795334,
+      "learning_rate": 9.838777320238312e-07,
+      "loss": -0.0151,
+      "num_tokens": 2430699.0,
+      "reward": 0.34375,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.34375,
+      "rewards/itbench_correctness/std": 0.3966001570224762,
+      "step": 128,
+      "step_time": 101.63996140938252
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 655.9375,
+      "completions/mean_terminated_length": 655.9375,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "entropy": 0.27136731147766113,
+      "epoch": 0.6825396825396826,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4765625,
+      "kl": 0.0009132423438131809,
+      "learning_rate": 9.834585918739934e-07,
+      "loss": 0.0035,
+      "num_tokens": 2448146.0,
+      "reward": 0.34375,
+      "reward_std": 0.0578637570142746,
+      "rewards/itbench_correctness/mean": 0.34375,
+      "rewards/itbench_correctness/std": 0.36371922492980957,
+      "step": 129,
+      "step_time": 926.4854553686455
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 755.0,
+      "completions/max_terminated_length": 755.0,
+      "completions/mean_length": 507.75,
+      "completions/mean_terminated_length": 507.75,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.486459881067276,
+      "epoch": 0.6878306878306878,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.2421875,
+      "kl": 0.0013310650829225779,
+      "learning_rate": 9.83034164698452e-07,
+      "loss": -0.08,
+      "num_tokens": 2459726.0,
+      "reward": 0.8690476417541504,
+      "reward_std": 0.28752756118774414,
+      "rewards/itbench_correctness/mean": 0.8690476417541504,
+      "rewards/itbench_correctness/std": 0.2865068316459656,
+      "step": 130,
+      "step_time": 497.3244105326012
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 601.0,
+      "completions/max_terminated_length": 601.0,
+      "completions/mean_length": 458.625,
+      "completions/mean_terminated_length": 458.625,
+      "completions/min_length": 381.0,
+      "completions/min_terminated_length": 381.0,
+      "entropy": 0.40119925141334534,
+      "epoch": 0.6931216931216931,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.328125,
+      "kl": 0.0013875841395929456,
+      "learning_rate": 9.826044551386742e-07,
+      "loss": 0.0024,
+      "num_tokens": 2469992.0,
+      "reward": 0.4791666865348816,
+      "reward_std": 0.19795583188533783,
+      "rewards/itbench_correctness/mean": 0.4791666865348816,
+      "rewards/itbench_correctness/std": 0.27131369709968567,
+      "step": 131,
+      "step_time": 64.11436599586159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 889.0,
+      "completions/mean_length": 632.5625,
+      "completions/mean_terminated_length": 606.4666748046875,
+      "completions/min_length": 457.0,
+      "completions/min_terminated_length": 457.0,
+      "entropy": 0.3730856776237488,
+      "epoch": 0.6984126984126984,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0234375,
+      "kl": 0.0012992192059755325,
+      "learning_rate": 9.821694678938952e-07,
+      "loss": -0.0026,
+      "num_tokens": 2484161.0,
+      "reward": 0.9255682229995728,
+      "reward_std": 0.17330622673034668,
+      "rewards/itbench_correctness/mean": 0.9255682229995728,
+      "rewards/itbench_correctness/std": 0.24894750118255615,
+      "step": 132,
+      "step_time": 782.2131289467216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 888.0,
+      "completions/mean_length": 779.25,
+      "completions/mean_terminated_length": 762.933349609375,
+      "completions/min_length": 639.0,
+      "completions/min_terminated_length": 639.0,
+      "entropy": 0.6185434460639954,
+      "epoch": 0.7037037037037037,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.25,
+      "kl": 0.0011128331534564495,
+      "learning_rate": 9.817292077210656e-07,
+      "loss": 0.0277,
+      "num_tokens": 2503445.0,
+      "reward": 0.59375,
+      "reward_std": 0.3061639666557312,
+      "rewards/itbench_correctness/mean": 0.59375,
+      "rewards/itbench_correctness/std": 0.41708314418792725,
+      "step": 133,
+      "step_time": 234.19261386059225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 657.4375,
+      "completions/mean_terminated_length": 535.25,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "entropy": 0.31942200660705566,
+      "epoch": 0.708994708994709,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.109375,
+      "kl": 0.0009678892092779279,
+      "learning_rate": 9.812836794348002e-07,
+      "loss": 0.0316,
+      "num_tokens": 2520980.0,
+      "reward": 0.78125,
+      "reward_std": 0.1085391715168953,
+      "rewards/itbench_correctness/mean": 0.78125,
+      "rewards/itbench_correctness/std": 0.27024510502815247,
+      "step": 134,
+      "step_time": 130.00603658426553
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 942.0,
+      "completions/max_terminated_length": 942.0,
+      "completions/mean_length": 696.0,
+      "completions/mean_terminated_length": 696.0,
+      "completions/min_length": 458.0,
+      "completions/min_terminated_length": 458.0,
+      "entropy": 0.36063218116760254,
+      "epoch": 0.7142857142857143,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1015625,
+      "kl": 0.0012922929599881172,
+      "learning_rate": 9.808328879073251e-07,
+      "loss": -0.024,
+      "num_tokens": 2537100.0,
+      "reward": 0.6875,
+      "reward_std": 0.0862581878900528,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.3435921370983124,
+      "step": 135,
+      "step_time": 191.46370885893703
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 370.0,
+      "completions/max_terminated_length": 370.0,
+      "completions/mean_length": 310.375,
+      "completions/mean_terminated_length": 310.375,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "entropy": 0.3898509740829468,
+      "epoch": 0.7195767195767195,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.1953125,
+      "kl": 0.0024722313974052668,
+      "learning_rate": 9.803768380684242e-07,
+      "loss": -0.0114,
+      "num_tokens": 2544442.0,
+      "reward": 0.21875,
+      "reward_std": 0.3061639666557312,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 136,
+      "step_time": 65.17159292474389
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 9.0,
+      "completions/mean_length": 960.5625,
+      "completions/mean_terminated_length": 9.0,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.4580649435520172,
+      "epoch": 0.7248677248677249,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.078125,
+      "kl": 0.0012847312027588487,
+      "learning_rate": 9.79915534905385e-07,
+      "loss": -0.0218,
+      "num_tokens": 2571915.0,
+      "reward": 0.3541666865348816,
+      "reward_std": 0.349293053150177,
+      "rewards/itbench_correctness/mean": 0.3541666865348816,
+      "rewards/itbench_correctness/std": 0.4121982753276825,
+      "step": 137,
+      "step_time": 95.23527884297073
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 992.0,
+      "completions/mean_length": 655.5,
+      "completions/mean_terminated_length": 630.933349609375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5095347166061401,
+      "epoch": 0.7301587301587301,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2109375,
+      "kl": 0.0019625271670520306,
+      "learning_rate": 9.794489834629454e-07,
+      "loss": 0.0004,
+      "num_tokens": 2596083.0,
+      "reward": 0.296875,
+      "reward_std": 0.24944134056568146,
+      "rewards/itbench_correctness/mean": 0.296875,
+      "rewards/itbench_correctness/std": 0.4584280252456665,
+      "step": 138,
+      "step_time": 73.28423386160284
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 568.3125,
+      "completions/mean_terminated_length": 568.3125,
+      "completions/min_length": 372.0,
+      "completions/min_terminated_length": 372.0,
+      "entropy": 0.47509074211120605,
+      "epoch": 0.7354497354497355,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7421875,
+      "kl": 0.001536766067147255,
+      "learning_rate": 9.789771888432373e-07,
+      "loss": 0.0225,
+      "num_tokens": 2617728.0,
+      "reward": 0.5104166865348816,
+      "reward_std": 0.43504026532173157,
+      "rewards/itbench_correctness/mean": 0.5104166865348816,
+      "rewards/itbench_correctness/std": 0.43127182126045227,
+      "step": 139,
+      "step_time": 116.2228917106986
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 954.0,
+      "completions/mean_length": 554.75,
+      "completions/mean_terminated_length": 487.71429443359375,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "entropy": 0.3929698169231415,
+      "epoch": 0.7407407407407407,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1875,
+      "kl": 0.001249143504537642,
+      "learning_rate": 9.78500156205731e-07,
+      "loss": -0.0021,
+      "num_tokens": 2630956.0,
+      "reward": 0.19062501192092896,
+      "reward_std": 0.0265165064483881,
+      "rewards/itbench_correctness/mean": 0.19062501192092896,
+      "rewards/itbench_correctness/std": 0.2001822143793106,
+      "step": 140,
+      "step_time": 416.3123774584383
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 697.0,
+      "completions/max_terminated_length": 697.0,
+      "completions/mean_length": 462.125,
+      "completions/mean_terminated_length": 462.125,
+      "completions/min_length": 292.0,
+      "completions/min_terminated_length": 292.0,
+      "entropy": 0.36353799700737,
+      "epoch": 0.746031746031746,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.125,
+      "kl": 0.0016349649522453547,
+      "learning_rate": 9.780178907671788e-07,
+      "loss": 0.0084,
+      "num_tokens": 2641358.0,
+      "reward": 0.375,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 141,
+      "step_time": 87.9296273579821
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 708.0,
+      "completions/max_terminated_length": 708.0,
+      "completions/mean_length": 552.0,
+      "completions/mean_terminated_length": 552.0,
+      "completions/min_length": 405.0,
+      "completions/min_terminated_length": 405.0,
+      "entropy": 0.5036231875419617,
+      "epoch": 0.7513227513227513,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.828125,
+      "kl": 0.0012071933597326279,
+      "learning_rate": 9.775303978015585e-07,
+      "loss": -0.0368,
+      "num_tokens": 2652918.0,
+      "reward": 0.65625,
+      "reward_std": 0.4532671868801117,
+      "rewards/itbench_correctness/mean": 0.65625,
+      "rewards/itbench_correctness/std": 0.4732423722743988,
+      "step": 142,
+      "step_time": 125.74961478449404
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 690.5,
+      "completions/mean_terminated_length": 490.3999938964844,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.483707457780838,
+      "epoch": 0.7566137566137566,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4609375,
+      "kl": 0.0013088560663163662,
+      "learning_rate": 9.77037682640015e-07,
+      "loss": -0.1116,
+      "num_tokens": 2668606.0,
+      "reward": 0.5104166269302368,
+      "reward_std": 0.39774924516677856,
+      "rewards/itbench_correctness/mean": 0.5104166269302368,
+      "rewards/itbench_correctness/std": 0.4732423722743988,
+      "step": 143,
+      "step_time": 81.490906807594
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1007.0,
+      "completions/mean_length": 869.1875,
+      "completions/mean_terminated_length": 817.5833740234375,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.36585891246795654,
+      "epoch": 0.7619047619047619,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.375,
+      "kl": 0.001102715963497758,
+      "learning_rate": 9.76539750670802e-07,
+      "loss": 0.0248,
+      "num_tokens": 2688489.0,
+      "reward": 0.29411765933036804,
+      "reward_std": 0.1618601679801941,
+      "rewards/itbench_correctness/mean": 0.29411765933036804,
+      "rewards/itbench_correctness/std": 0.3757345974445343,
+      "step": 144,
+      "step_time": 625.0967052578926
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 753.125,
+      "completions/mean_terminated_length": 714.4285888671875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.6479668021202087,
+      "epoch": 0.7671957671957672,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.59375,
+      "kl": 0.0020443897228688,
+      "learning_rate": 9.760366073392244e-07,
+      "loss": -0.1502,
+      "num_tokens": 2719323.0,
+      "reward": 0.6875,
+      "reward_std": 0.44403791427612305,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 145,
+      "step_time": 126.37558931391686
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1020.0,
+      "completions/max_terminated_length": 1020.0,
+      "completions/mean_length": 751.5,
+      "completions/mean_terminated_length": 751.5,
+      "completions/min_length": 443.0,
+      "completions/min_terminated_length": 443.0,
+      "entropy": 0.39387890696525574,
+      "epoch": 0.7724867724867724,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9375,
+      "kl": 0.0012558232992887497,
+      "learning_rate": 9.755282581475767e-07,
+      "loss": -0.0169,
+      "num_tokens": 2736931.0,
+      "reward": 0.84375,
+      "reward_std": 0.32239729166030884,
+      "rewards/itbench_correctness/mean": 0.84375,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 146,
+      "step_time": 86.20990402065217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 764.1875,
+      "completions/mean_terminated_length": 677.5833740234375,
+      "completions/min_length": 456.0,
+      "completions/min_terminated_length": 456.0,
+      "entropy": 0.5051116347312927,
+      "epoch": 0.7777777777777778,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1015625,
+      "kl": 0.0011690112296491861,
+      "learning_rate": 9.750147086550842e-07,
+      "loss": 0.0162,
+      "num_tokens": 2773926.0,
+      "reward": 0.4734848737716675,
+      "reward_std": 0.05882110819220543,
+      "rewards/itbench_correctness/mean": 0.4734848737716675,
+      "rewards/itbench_correctness/std": 0.4955727159976959,
+      "step": 147,
+      "step_time": 137.13953017815948
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 628.0,
+      "completions/max_terminated_length": 628.0,
+      "completions/mean_length": 518.0,
+      "completions/mean_terminated_length": 518.0,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.3146718144416809,
+      "epoch": 0.783068783068783,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.028564453125,
+      "kl": 0.001314603490754962,
+      "learning_rate": 9.744959644778421e-07,
+      "loss": 0.0,
+      "num_tokens": 2787054.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 148,
+      "step_time": 1022.448972039856
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 749.0,
+      "completions/mean_length": 704.8125,
+      "completions/mean_terminated_length": 513.2999877929688,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4767225384712219,
+      "epoch": 0.7883597883597884,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.3984375,
+      "kl": 0.0012462595477700233,
+      "learning_rate": 9.739720312887533e-07,
+      "loss": -0.0812,
+      "num_tokens": 2813323.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 149,
+      "step_time": 102.6073711141944
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 666.75,
+      "completions/mean_terminated_length": 584.3077392578125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4049493670463562,
+      "epoch": 0.7936507936507936,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6640625,
+      "kl": 0.0011266146320849657,
+      "learning_rate": 9.734429148174674e-07,
+      "loss": -0.0562,
+      "num_tokens": 2830087.0,
+      "reward": 0.453125,
+      "reward_std": 0.15026018023490906,
+      "rewards/itbench_correctness/mean": 0.453125,
+      "rewards/itbench_correctness/std": 0.413710355758667,
+      "step": 150,
+      "step_time": 72.91534078493714
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 764.9375,
+      "completions/mean_terminated_length": 609.5,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.3294386863708496,
+      "epoch": 0.798941798941799,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2734375,
+      "kl": 0.0012626381358131766,
+      "learning_rate": 9.729086208503173e-07,
+      "loss": -0.0019,
+      "num_tokens": 2847998.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 151,
+      "step_time": 135.860564914532
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 789.375,
+      "completions/mean_terminated_length": 755.857177734375,
+      "completions/min_length": 504.0,
+      "completions/min_terminated_length": 504.0,
+      "entropy": 0.6638163328170776,
+      "epoch": 0.8042328042328042,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4140625,
+      "kl": 0.0012647550320252776,
+      "learning_rate": 9.723691552302562e-07,
+      "loss": 0.006,
+      "num_tokens": 2892140.0,
+      "reward": 0.375,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 152,
+      "step_time": 128.31029498856515
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 568.0,
+      "completions/max_terminated_length": 568.0,
+      "completions/mean_length": 438.8125,
+      "completions/mean_terminated_length": 438.8125,
+      "completions/min_length": 313.0,
+      "completions/min_terminated_length": 313.0,
+      "entropy": 0.35778379440307617,
+      "epoch": 0.8095238095238095,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4140625,
+      "kl": 0.0011610172223299742,
+      "learning_rate": 9.718245238567938e-07,
+      "loss": -0.0117,
+      "num_tokens": 2901465.0,
+      "reward": 0.5062500238418579,
+      "reward_std": 0.1627907156944275,
+      "rewards/itbench_correctness/mean": 0.5062500238418579,
+      "rewards/itbench_correctness/std": 0.17308476567268372,
+      "step": 153,
+      "step_time": 53.513846694491804
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 529.3125,
+      "completions/mean_terminated_length": 529.3125,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 0.44775062799453735,
+      "epoch": 0.8148148148148148,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.0016965508693829179,
+      "learning_rate": 9.712747326859315e-07,
+      "loss": 0.0038,
+      "num_tokens": 2931910.0,
+      "reward": 0.40625,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.4552929699420929,
+      "step": 154,
+      "step_time": 79.1174840349704
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1012.0,
+      "completions/mean_length": 713.0,
+      "completions/mean_terminated_length": 609.3333740234375,
+      "completions/min_length": 409.0,
+      "completions/min_terminated_length": 409.0,
+      "entropy": 0.41234222054481506,
+      "epoch": 0.8201058201058201,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.59375,
+      "kl": 0.001036427216604352,
+      "learning_rate": 9.707197877300973e-07,
+      "loss": -0.0351,
+      "num_tokens": 2949046.0,
+      "reward": 0.47181373834609985,
+      "reward_std": 0.2768261134624481,
+      "rewards/itbench_correctness/mean": 0.47181373834609985,
+      "rewards/itbench_correctness/std": 0.45311903953552246,
+      "step": 155,
+      "step_time": 1143.2126589166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 868.0,
+      "completions/max_terminated_length": 868.0,
+      "completions/mean_length": 572.9375,
+      "completions/mean_terminated_length": 572.9375,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.4380931556224823,
+      "epoch": 0.8253968253968254,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.28125,
+      "kl": 0.0014040054520592093,
+      "learning_rate": 9.701596950580807e-07,
+      "loss": 0.0116,
+      "num_tokens": 2961597.0,
+      "reward": 0.953125,
+      "reward_std": 0.13258251547813416,
+      "rewards/itbench_correctness/mean": 0.953125,
+      "rewards/itbench_correctness/std": 0.1875,
+      "step": 156,
+      "step_time": 101.3859726889059
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 950.0,
+      "completions/max_terminated_length": 950.0,
+      "completions/mean_length": 650.625,
+      "completions/mean_terminated_length": 650.625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.40268972516059875,
+      "epoch": 0.8306878306878307,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.375,
+      "kl": 0.0015219611814245582,
+      "learning_rate": 9.695944607949648e-07,
+      "loss": -0.0258,
+      "num_tokens": 2981207.0,
+      "reward": 0.78125,
+      "reward_std": 0.2086307406425476,
+      "rewards/itbench_correctness/mean": 0.78125,
+      "rewards/itbench_correctness/std": 0.36371922492980957,
+      "step": 157,
+      "step_time": 316.9267311077565
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 864.0,
+      "completions/max_terminated_length": 864.0,
+      "completions/mean_length": 514.25,
+      "completions/mean_terminated_length": 514.25,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5833738446235657,
+      "epoch": 0.8359788359788359,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.80078125,
+      "kl": 0.0016437954036518931,
+      "learning_rate": 9.690240911220617e-07,
+      "loss": -0.0919,
+      "num_tokens": 2994235.0,
+      "reward": 0.84375,
+      "reward_std": 0.15866193175315857,
+      "rewards/itbench_correctness/mean": 0.84375,
+      "rewards/itbench_correctness/std": 0.27024510502815247,
+      "step": 158,
+      "step_time": 80.80979425925761
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 818.0,
+      "completions/max_terminated_length": 818.0,
+      "completions/mean_length": 591.0,
+      "completions/mean_terminated_length": 591.0,
+      "completions/min_length": 452.0,
+      "completions/min_terminated_length": 452.0,
+      "entropy": 0.46023687720298767,
+      "epoch": 0.8412698412698413,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.34375,
+      "kl": 0.001493943389505148,
+      "learning_rate": 9.684485922768421e-07,
+      "loss": -0.0018,
+      "num_tokens": 3009803.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 159,
+      "step_time": 92.85486916080117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 999.0,
+      "completions/mean_length": 839.375,
+      "completions/mean_terminated_length": 654.75,
+      "completions/min_length": 547.0,
+      "completions/min_terminated_length": 547.0,
+      "entropy": 0.3979151248931885,
+      "epoch": 0.8465608465608465,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.140625,
+      "kl": 0.0009861888829618692,
+      "learning_rate": 9.678679705528698e-07,
+      "loss": 0.0391,
+      "num_tokens": 3033361.0,
+      "reward": 0.4520833492279053,
+      "reward_std": 0.2401251643896103,
+      "rewards/itbench_correctness/mean": 0.4520833492279053,
+      "rewards/itbench_correctness/std": 0.3798574209213257,
+      "step": 160,
+      "step_time": 113.24223164469004
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 866.9375,
+      "completions/mean_terminated_length": 772.7000122070312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4936918616294861,
+      "epoch": 0.8518518518518519,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.046875,
+      "kl": 0.0016467805253341794,
+      "learning_rate": 9.672822322997304e-07,
+      "loss": -0.0031,
+      "num_tokens": 3052032.0,
+      "reward": 0.535937488079071,
+      "reward_std": 0.39822056889533997,
+      "rewards/itbench_correctness/mean": 0.535937488079071,
+      "rewards/itbench_correctness/std": 0.4591630697250366,
+      "step": 161,
+      "step_time": 73.70399552583694
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 842.0,
+      "completions/max_terminated_length": 842.0,
+      "completions/mean_length": 524.125,
+      "completions/mean_terminated_length": 524.125,
+      "completions/min_length": 347.0,
+      "completions/min_terminated_length": 347.0,
+      "entropy": 0.503696620464325,
+      "epoch": 0.8571428571428571,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1015625,
+      "kl": 0.0013778236461803317,
+      "learning_rate": 9.666913839229637e-07,
+      "loss": -0.0048,
+      "num_tokens": 3063106.0,
+      "reward": 0.5,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.3651483952999115,
+      "step": 162,
+      "step_time": 143.48580626491457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 771.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 587.75,
+      "completions/mean_terminated_length": 587.75,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 0.3862186372280121,
+      "epoch": 0.8624338624338624,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0010811445536091924,
+      "learning_rate": 9.660954318839932e-07,
+      "loss": 0.0044,
+      "num_tokens": 3076070.0,
+      "reward": 0.643750011920929,
+      "reward_std": 0.20177768170833588,
+      "rewards/itbench_correctness/mean": 0.643750011920929,
+      "rewards/itbench_correctness/std": 0.36142081022262573,
+      "step": 163,
+      "step_time": 79.29910835064948
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 921.0,
+      "completions/mean_length": 771.5625,
+      "completions/mean_terminated_length": 620.1000366210938,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 0.6091535091400146,
+      "epoch": 0.8677248677248677,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.53125,
+      "kl": 0.0012378692626953125,
+      "learning_rate": 9.654943827000546e-07,
+      "loss": 0.0099,
+      "num_tokens": 3094839.0,
+      "reward": 0.609375,
+      "reward_std": 0.1043153703212738,
+      "rewards/itbench_correctness/mean": 0.609375,
+      "rewards/itbench_correctness/std": 0.4278702139854431,
+      "step": 164,
+      "step_time": 97.59095096122473
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 990.5625,
+      "completions/mean_terminated_length": 934.8333740234375,
+      "completions/min_length": 858.0,
+      "completions/min_terminated_length": 858.0,
+      "entropy": 0.5027446746826172,
+      "epoch": 0.873015873015873,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.390625,
+      "kl": 0.0011952045606449246,
+      "learning_rate": 9.648882429441256e-07,
+      "loss": 0.0129,
+      "num_tokens": 3138016.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 165,
+      "step_time": 121.71588209550828
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 965.0,
+      "completions/max_terminated_length": 965.0,
+      "completions/mean_length": 771.75,
+      "completions/mean_terminated_length": 771.75,
+      "completions/min_length": 571.0,
+      "completions/min_terminated_length": 571.0,
+      "entropy": 0.39909297227859497,
+      "epoch": 0.8783068783068783,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.46875,
+      "kl": 0.0013839628081768751,
+      "learning_rate": 9.642770192448535e-07,
+      "loss": 0.0037,
+      "num_tokens": 3161268.0,
+      "reward": 0.47187501192092896,
+      "reward_std": 0.08010874688625336,
+      "rewards/itbench_correctness/mean": 0.47187501192092896,
+      "rewards/itbench_correctness/std": 0.3993614614009857,
+      "step": 166,
+      "step_time": 104.31897877063602
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 805.625,
+      "completions/mean_terminated_length": 755.2307739257812,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "entropy": 0.5784329175949097,
+      "epoch": 0.8835978835978836,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.00185630121268332,
+      "learning_rate": 9.636607182864826e-07,
+      "loss": -0.0227,
+      "num_tokens": 3196606.0,
+      "reward": 0.25,
+      "reward_std": 0.4355512857437134,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 167,
+      "step_time": 113.68263853341341
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 824.0,
+      "completions/max_terminated_length": 824.0,
+      "completions/mean_length": 573.625,
+      "completions/mean_terminated_length": 573.625,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "entropy": 0.4166485071182251,
+      "epoch": 0.8888888888888888,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6796875,
+      "kl": 0.0012832505162805319,
+      "learning_rate": 9.630393468087817e-07,
+      "loss": -0.0249,
+      "num_tokens": 3209872.0,
+      "reward": 0.2291666716337204,
+      "reward_std": 0.14026343822479248,
+      "rewards/itbench_correctness/mean": 0.2291666716337204,
+      "rewards/itbench_correctness/std": 0.1787301003932953,
+      "step": 168,
+      "step_time": 417.7054488658905
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 966.0,
+      "completions/mean_length": 928.125,
+      "completions/mean_terminated_length": 804.857177734375,
+      "completions/min_length": 718.0,
+      "completions/min_terminated_length": 718.0,
+      "entropy": 0.5990572571754456,
+      "epoch": 0.8941798941798942,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.03857421875,
+      "kl": 0.0013898048782721162,
+      "learning_rate": 9.624129116069694e-07,
+      "loss": 0.0001,
+      "num_tokens": 3258930.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 169,
+      "step_time": 225.11859526112676
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 959.5,
+      "completions/mean_terminated_length": 876.5714721679688,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 0.4815007746219635,
+      "epoch": 0.8994708994708994,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3515625,
+      "kl": 0.001060598180629313,
+      "learning_rate": 9.61781419531641e-07,
+      "loss": 0.0041,
+      "num_tokens": 3282762.0,
+      "reward": 0.625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.4564354717731476,
+      "step": 170,
+      "step_time": 735.476375034079
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 507.0,
+      "completions/mean_length": 735.125,
+      "completions/mean_terminated_length": 446.25,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.5223601460456848,
+      "epoch": 0.9047619047619048,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9921875,
+      "kl": 0.0013405587524175644,
+      "learning_rate": 9.611448774886923e-07,
+      "loss": 0.0105,
+      "num_tokens": 3301500.0,
+      "reward": 0.6875,
+      "reward_std": 0.22201895713806152,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.33850160241127014,
+      "step": 171,
+      "step_time": 763.4565976867452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 692.0,
+      "completions/max_terminated_length": 692.0,
+      "completions/mean_length": 626.875,
+      "completions/mean_terminated_length": 626.875,
+      "completions/min_length": 567.0,
+      "completions/min_terminated_length": 567.0,
+      "entropy": 0.4658025801181793,
+      "epoch": 0.91005291005291,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 35.0,
+      "kl": 0.0011788216652348638,
+      "learning_rate": 9.605032924392455e-07,
+      "loss": -0.0153,
+      "num_tokens": 3315410.0,
+      "reward": 0.7395833134651184,
+      "reward_std": 0.16796313226222992,
+      "rewards/itbench_correctness/mean": 0.7395833134651184,
+      "rewards/itbench_correctness/std": 0.19924628734588623,
+      "step": 172,
+      "step_time": 103.72399638220668
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 873.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 573.3125,
+      "completions/mean_terminated_length": 573.3125,
+      "completions/min_length": 445.0,
+      "completions/min_terminated_length": 445.0,
+      "entropy": 0.3575711250305176,
+      "epoch": 0.9153439153439153,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2734375,
+      "kl": 0.0011289599351584911,
+      "learning_rate": 9.598566713995717e-07,
+      "loss": 0.0046,
+      "num_tokens": 3328047.0,
+      "reward": 0.3333333432674408,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.3333333432674408,
+      "rewards/itbench_correctness/std": 0.4036867320537567,
+      "step": 173,
+      "step_time": 597.9741206569597
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 885.25,
+      "completions/mean_terminated_length": 802.0,
+      "completions/min_length": 537.0,
+      "completions/min_terminated_length": 537.0,
+      "entropy": 0.4992939829826355,
+      "epoch": 0.9206349206349206,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.484375,
+      "kl": 0.0012533684493973851,
+      "learning_rate": 9.59205021441015e-07,
+      "loss": 0.0048,
+      "num_tokens": 3350707.0,
+      "reward": 0.03750000149011612,
+      "reward_std": 0.1060660183429718,
+      "rewards/itbench_correctness/mean": 0.03750000149011612,
+      "rewards/itbench_correctness/std": 0.15000000596046448,
+      "step": 174,
+      "step_time": 158.50677568931133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 715.8125,
+      "completions/mean_terminated_length": 476.1111145019531,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 0.5113070607185364,
+      "epoch": 0.9259259259259259,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.59375,
+      "kl": 0.0011880681850016117,
+      "learning_rate": 9.585483496899149e-07,
+      "loss": -0.0041,
+      "num_tokens": 3367576.0,
+      "reward": 0.6875,
+      "reward_std": 0.20044593513011932,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.42328083515167236,
+      "step": 175,
+      "step_time": 881.939713913016
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 470.0,
+      "completions/mean_length": 832.5625,
+      "completions/mean_terminated_length": 411.3999938964844,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.5116732716560364,
+      "epoch": 0.9312169312169312,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7421875,
+      "kl": 0.0010343582835048437,
+      "learning_rate": 9.578866633275286e-07,
+      "loss": 0.0285,
+      "num_tokens": 3392569.0,
+      "reward": 0.5052083730697632,
+      "reward_std": 0.1857735514640808,
+      "rewards/itbench_correctness/mean": 0.5052083730697632,
+      "rewards/itbench_correctness/std": 0.2930029034614563,
+      "step": 176,
+      "step_time": 269.2045645285398
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 568.8125,
+      "completions/mean_terminated_length": 568.8125,
+      "completions/min_length": 370.0,
+      "completions/min_terminated_length": 370.0,
+      "entropy": 0.5203824043273926,
+      "epoch": 0.9365079365079365,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.017578125,
+      "kl": 0.0011270169634371996,
+      "learning_rate": 9.572199695899521e-07,
+      "loss": 0.0,
+      "num_tokens": 3405782.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 177,
+      "step_time": 226.17386937886477
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1001.0,
+      "completions/mean_length": 958.875,
+      "completions/mean_terminated_length": 850.3333740234375,
+      "completions/min_length": 740.0,
+      "completions/min_terminated_length": 740.0,
+      "entropy": 0.39004039764404297,
+      "epoch": 0.9417989417989417,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.0013279912527650595,
+      "learning_rate": 9.565482757680414e-07,
+      "loss": -0.0199,
+      "num_tokens": 3432116.0,
+      "reward": 0.625,
+      "reward_std": 0.28324785828590393,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.3626037836074829,
+      "step": 178,
+      "step_time": 150.1005060262978
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 558.1875,
+      "completions/mean_terminated_length": 491.64288330078125,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.4765423834323883,
+      "epoch": 0.9470899470899471,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.296875,
+      "kl": 0.0016712526557967067,
+      "learning_rate": 9.558715892073323e-07,
+      "loss": 0.0807,
+      "num_tokens": 3467055.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 179,
+      "step_time": 91.84085294324905
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1020.0,
+      "completions/mean_length": 847.5625,
+      "completions/mean_terminated_length": 822.357177734375,
+      "completions/min_length": 608.0,
+      "completions/min_terminated_length": 608.0,
+      "entropy": 0.3374382555484772,
+      "epoch": 0.9523809523809523,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04931640625,
+      "kl": 0.002606587251648307,
+      "learning_rate": 9.551899173079606e-07,
+      "loss": 0.0001,
+      "num_tokens": 3486896.0,
+      "reward": 0.4375,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.4518480598926544,
+      "step": 180,
+      "step_time": 250.4422083152458
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 860.0,
+      "completions/mean_length": 560.0,
+      "completions/mean_terminated_length": 529.0667114257812,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5392857193946838,
+      "epoch": 0.9576719576719577,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0206298828125,
+      "kl": 0.0013467188691720366,
+      "learning_rate": 9.545032675245813e-07,
+      "loss": 0.0,
+      "num_tokens": 3501360.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 181,
+      "step_time": 231.51458043325692
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 722.3125,
+      "completions/mean_terminated_length": 702.2000122070312,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "entropy": 0.41533270478248596,
+      "epoch": 0.9629629629629629,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.96875,
+      "kl": 0.0013854112476110458,
+      "learning_rate": 9.538116473662861e-07,
+      "loss": -0.0126,
+      "num_tokens": 3528605.0,
+      "reward": 0.765625,
+      "reward_std": 0.4136722683906555,
+      "rewards/itbench_correctness/mean": 0.765625,
+      "rewards/itbench_correctness/std": 0.40278977155685425,
+      "step": 182,
+      "step_time": 96.76111165247858
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 662.0,
+      "completions/mean_length": 632.6875,
+      "completions/mean_terminated_length": 576.7857666015625,
+      "completions/min_length": 424.0,
+      "completions/min_terminated_length": 424.0,
+      "entropy": 0.46152326464653015,
+      "epoch": 0.9682539682539683,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0078125,
+      "kl": 0.001549158594571054,
+      "learning_rate": 9.531150643965222e-07,
+      "loss": 0.005,
+      "num_tokens": 3549936.0,
+      "reward": 0.3125,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 183,
+      "step_time": 141.31063493527472
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 989.0,
+      "completions/mean_length": 763.9375,
+      "completions/mean_terminated_length": 607.9000244140625,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "entropy": 0.3272519111633301,
+      "epoch": 0.9735449735449735,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1171875,
+      "kl": 0.0013428620295599103,
+      "learning_rate": 9.524135262330098e-07,
+      "loss": -0.0182,
+      "num_tokens": 3567807.0,
+      "reward": 0.3645833432674408,
+      "reward_std": 0.01928791031241417,
+      "rewards/itbench_correctness/mean": 0.3645833432674408,
+      "rewards/itbench_correctness/std": 0.3774610757827759,
+      "step": 184,
+      "step_time": 143.63786490540951
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 544.0,
+      "completions/max_terminated_length": 544.0,
+      "completions/mean_length": 427.9375,
+      "completions/mean_terminated_length": 427.9375,
+      "completions/min_length": 271.0,
+      "completions/min_terminated_length": 271.0,
+      "entropy": 0.4136117994785309,
+      "epoch": 0.9788359788359788,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.359375,
+      "kl": 0.0012134211137890816,
+      "learning_rate": 9.517070405476574e-07,
+      "loss": -0.0009,
+      "num_tokens": 3577486.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 185,
+      "step_time": 170.13555748201907
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 964.0,
+      "completions/mean_length": 573.375,
+      "completions/mean_terminated_length": 543.3333740234375,
+      "completions/min_length": 297.0,
+      "completions/min_terminated_length": 297.0,
+      "entropy": 0.2842816710472107,
+      "epoch": 0.9841269841269841,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8515625,
+      "kl": 0.0015454581007361412,
+      "learning_rate": 9.509956150664795e-07,
+      "loss": -0.0815,
+      "num_tokens": 3591212.0,
+      "reward": 0.3984375,
+      "reward_std": 0.28348496556282043,
+      "rewards/itbench_correctness/mean": 0.3984375,
+      "rewards/itbench_correctness/std": 0.2954002320766449,
+      "step": 186,
+      "step_time": 82.30455144122243
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 894.0,
+      "completions/max_terminated_length": 894.0,
+      "completions/mean_length": 573.4375,
+      "completions/mean_terminated_length": 573.4375,
+      "completions/min_length": 353.0,
+      "completions/min_terminated_length": 353.0,
+      "entropy": 0.5754768252372742,
+      "epoch": 0.9894179894179894,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0546875,
+      "kl": 0.002303932560607791,
+      "learning_rate": 9.502792575695111e-07,
+      "loss": 0.0049,
+      "num_tokens": 3614019.0,
+      "reward": 0.40625,
+      "reward_std": 0.1735912710428238,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.48196646571159363,
+      "step": 187,
+      "step_time": 89.72001887392253
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 801.25,
+      "completions/mean_terminated_length": 667.6000366210938,
+      "completions/min_length": 511.0,
+      "completions/min_terminated_length": 511.0,
+      "entropy": 0.44430577754974365,
+      "epoch": 0.9947089947089947,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9453125,
+      "kl": 0.0012758101802319288,
+      "learning_rate": 9.495579758907229e-07,
+      "loss": 0.0478,
+      "num_tokens": 3631471.0,
+      "reward": 0.4765625,
+      "reward_std": 0.23403453826904297,
+      "rewards/itbench_correctness/mean": 0.4765625,
+      "rewards/itbench_correctness/std": 0.4835174083709717,
+      "step": 188,
+      "step_time": 79.22767079528421
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 878.0,
+      "completions/mean_length": 716.1875,
+      "completions/mean_terminated_length": 613.5833740234375,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "entropy": 0.6143642663955688,
+      "epoch": 1.0,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5234375,
+      "kl": 0.001404265291057527,
+      "learning_rate": 9.488317779179361e-07,
+      "loss": 0.0008,
+      "num_tokens": 3658762.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 189,
+      "step_time": 153.7122633298859
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 800.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 575.1875,
+      "completions/mean_terminated_length": 575.1875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.46245789527893066,
+      "epoch": 1.0052910052910053,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.828125,
+      "kl": 0.001671693054959178,
+      "learning_rate": 9.481006715927351e-07,
+      "loss": -0.0487,
+      "num_tokens": 3673277.0,
+      "reward": 0.6145833730697632,
+      "reward_std": 0.2882373631000519,
+      "rewards/itbench_correctness/mean": 0.6145833730697632,
+      "rewards/itbench_correctness/std": 0.43341347575187683,
+      "step": 190,
+      "step_time": 71.11040670704097
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 617.0,
+      "completions/max_terminated_length": 617.0,
+      "completions/mean_length": 503.75,
+      "completions/mean_terminated_length": 503.75,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "entropy": 0.35334986448287964,
+      "epoch": 1.0105820105820107,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.453125,
+      "kl": 0.0010900463676080108,
+      "learning_rate": 9.473646649103817e-07,
+      "loss": 0.012,
+      "num_tokens": 3684537.0,
+      "reward": 0.875,
+      "reward_std": 0.16866441071033478,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.18257419764995575,
+      "step": 191,
+      "step_time": 796.484293489717
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 985.0,
+      "completions/mean_length": 665.6875,
+      "completions/mean_terminated_length": 502.8182067871094,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.43263542652130127,
+      "epoch": 1.0158730158730158,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4765625,
+      "kl": 0.0013347232015803456,
+      "learning_rate": 9.466237659197269e-07,
+      "loss": -0.1131,
+      "num_tokens": 3704212.0,
+      "reward": 0.5625,
+      "reward_std": 0.3535533845424652,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.4699290990829468,
+      "step": 192,
+      "step_time": 630.9985243473202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 506.0,
+      "completions/max_terminated_length": 506.0,
+      "completions/mean_length": 414.125,
+      "completions/mean_terminated_length": 414.125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3936009705066681,
+      "epoch": 1.0211640211640212,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.46875,
+      "kl": 0.0016619176603853703,
+      "learning_rate": 9.458779827231236e-07,
+      "loss": -0.0404,
+      "num_tokens": 3713654.0,
+      "reward": 0.65625,
+      "reward_std": 0.1735912710428238,
+      "rewards/itbench_correctness/mean": 0.65625,
+      "rewards/itbench_correctness/std": 0.3400367796421051,
+      "step": 193,
+      "step_time": 692.6463372064754
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 777.0,
+      "completions/mean_length": 944.5625,
+      "completions/mean_terminated_length": 600.3333740234375,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "entropy": 0.44041553139686584,
+      "epoch": 1.0264550264550265,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.484375,
+      "kl": 0.0013270394410938025,
+      "learning_rate": 9.451273234763371e-07,
+      "loss": 0.0,
+      "num_tokens": 3736343.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 194,
+      "step_time": 4224.783679332584
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 762.0,
+      "completions/mean_length": 861.5,
+      "completions/mean_terminated_length": 374.0,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.5571677088737488,
+      "epoch": 1.0317460317460316,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.03125,
+      "kl": 0.0012215389870107174,
+      "learning_rate": 9.443717963884568e-07,
+      "loss": -0.0105,
+      "num_tokens": 3761855.0,
+      "reward": 0.375,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 195,
+      "step_time": 750.6342479139566
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 776.3125,
+      "completions/mean_terminated_length": 740.9285888671875,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 0.3168826997280121,
+      "epoch": 1.037037037037037,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.78125,
+      "kl": 0.0012352862395346165,
+      "learning_rate": 9.436114097218058e-07,
+      "loss": 0.0208,
+      "num_tokens": 3779892.0,
+      "reward": 0.578125,
+      "reward_std": 0.25282490253448486,
+      "rewards/itbench_correctness/mean": 0.578125,
+      "rewards/itbench_correctness/std": 0.32556042075157166,
+      "step": 196,
+      "step_time": 167.4240329694003
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 642.0,
+      "completions/mean_length": 712.4375,
+      "completions/mean_terminated_length": 525.5,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 0.30879902839660645,
+      "epoch": 1.0423280423280423,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.421875,
+      "kl": 0.001269629574380815,
+      "learning_rate": 9.42846171791851e-07,
+      "loss": -0.0334,
+      "num_tokens": 3798771.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 197,
+      "step_time": 1535.9739540033042
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 593.0,
+      "completions/mean_length": 782.875,
+      "completions/mean_terminated_length": 541.75,
+      "completions/min_length": 465.0,
+      "completions/min_terminated_length": 465.0,
+      "entropy": 0.38831230998039246,
+      "epoch": 1.0476190476190477,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.421875,
+      "kl": 0.001182721694931388,
+      "learning_rate": 9.420760909671118e-07,
+      "loss": 0.0,
+      "num_tokens": 3818961.0,
+      "reward": 0.3333333432674408,
+      "reward_std": 0.17817413806915283,
+      "rewards/itbench_correctness/mean": 0.3333333432674408,
+      "rewards/itbench_correctness/std": 0.42163702845573425,
+      "step": 198,
+      "step_time": 118.89587634429336
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 873.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 746.625,
+      "completions/mean_terminated_length": 746.625,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.409844309091568,
+      "epoch": 1.052910052910053,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6328125,
+      "kl": 0.001056072534993291,
+      "learning_rate": 9.413011756690684e-07,
+      "loss": 0.0058,
+      "num_tokens": 3839107.0,
+      "reward": 0.46875,
+      "reward_std": 0.1944543570280075,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.43006783723831177,
+      "step": 199,
+      "step_time": 86.62848719768226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 775.25,
+      "completions/mean_terminated_length": 581.7777709960938,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "entropy": 0.611415684223175,
+      "epoch": 1.0582010582010581,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.515625,
+      "kl": 0.0012188013643026352,
+      "learning_rate": 9.405214343720706e-07,
+      "loss": 0.0098,
+      "num_tokens": 3858671.0,
+      "reward": 0.1302083432674408,
+      "reward_std": 0.09300297498703003,
+      "rewards/itbench_correctness/mean": 0.1302083432674408,
+      "rewards/itbench_correctness/std": 0.13252796232700348,
+      "step": 200,
+      "step_time": 82.59393281675875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 843.9375,
+      "completions/mean_terminated_length": 703.888916015625,
+      "completions/min_length": 547.0,
+      "completions/min_terminated_length": 547.0,
+      "entropy": 0.46211951971054077,
+      "epoch": 1.0634920634920635,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3359375,
+      "kl": 0.000934995652642101,
+      "learning_rate": 9.397368756032444e-07,
+      "loss": -0.001,
+      "num_tokens": 3879470.0,
+      "reward": 0.71875,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.34308648109436035,
+      "step": 201,
+      "step_time": 210.63156687188894
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 872.0,
+      "completions/max_terminated_length": 872.0,
+      "completions/mean_length": 485.8125,
+      "completions/mean_terminated_length": 485.8125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.2428920567035675,
+      "epoch": 1.0687830687830688,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.703125,
+      "kl": 0.002602006308734417,
+      "learning_rate": 9.389475079423988e-07,
+      "loss": -0.0939,
+      "num_tokens": 3892331.0,
+      "reward": 0.3125,
+      "reward_std": 0.25763458013534546,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.25730079412460327,
+      "step": 202,
+      "step_time": 79.75068347156048
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 597.0,
+      "completions/max_terminated_length": 597.0,
+      "completions/mean_length": 439.5,
+      "completions/mean_terminated_length": 439.5,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "entropy": 0.457337886095047,
+      "epoch": 1.074074074074074,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.15625,
+      "kl": 0.0015101981116458774,
+      "learning_rate": 9.381533400219317e-07,
+      "loss": -0.0691,
+      "num_tokens": 3902267.0,
+      "reward": 0.5852272510528564,
+      "reward_std": 0.252642422914505,
+      "rewards/itbench_correctness/mean": 0.5852272510528564,
+      "rewards/itbench_correctness/std": 0.3983004689216614,
+      "step": 203,
+      "step_time": 112.7746303929016
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 692.0,
+      "completions/max_terminated_length": 692.0,
+      "completions/mean_length": 522.6875,
+      "completions/mean_terminated_length": 522.6875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.47829726338386536,
+      "epoch": 1.0793650793650793,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2109375,
+      "kl": 0.0021134628914296627,
+      "learning_rate": 9.373543805267367e-07,
+      "loss": -0.0106,
+      "num_tokens": 3916214.0,
+      "reward": 0.4910714328289032,
+      "reward_std": 0.02525380812585354,
+      "rewards/itbench_correctness/mean": 0.4910714328289032,
+      "rewards/itbench_correctness/std": 0.5083487033843994,
+      "step": 204,
+      "step_time": 118.03211208153516
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 754.0,
+      "completions/mean_length": 762.875,
+      "completions/mean_terminated_length": 606.2000122070312,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "entropy": 0.3617892861366272,
+      "epoch": 1.0846560846560847,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.53125,
+      "kl": 0.0010750198271125555,
+      "learning_rate": 9.365506381941065e-07,
+      "loss": -0.0237,
+      "num_tokens": 3933452.0,
+      "reward": 0.4270833134651184,
+      "reward_std": 0.053405821323394775,
+      "rewards/itbench_correctness/mean": 0.4270833134651184,
+      "rewards/itbench_correctness/std": 0.4470841884613037,
+      "step": 205,
+      "step_time": 255.03229981381446
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 665.0,
+      "completions/mean_length": 652.625,
+      "completions/mean_terminated_length": 363.77777099609375,
+      "completions/min_length": 292.0,
+      "completions/min_terminated_length": 292.0,
+      "entropy": 0.4872629642486572,
+      "epoch": 1.08994708994709,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7890625,
+      "kl": 0.0011171969817951322,
+      "learning_rate": 9.357421218136386e-07,
+      "loss": -0.0152,
+      "num_tokens": 3953614.0,
+      "reward": 0.5843750238418579,
+      "reward_std": 0.21464183926582336,
+      "rewards/itbench_correctness/mean": 0.5843750238418579,
+      "rewards/itbench_correctness/std": 0.29686442017555237,
+      "step": 206,
+      "step_time": 127.40266931243241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 842.0,
+      "completions/max_terminated_length": 842.0,
+      "completions/mean_length": 552.6875,
+      "completions/mean_terminated_length": 552.6875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4143390357494354,
+      "epoch": 1.0952380952380953,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6328125,
+      "kl": 0.001612935564480722,
+      "learning_rate": 9.349288402271387e-07,
+      "loss": -0.021,
+      "num_tokens": 3966409.0,
+      "reward": 0.71875,
+      "reward_std": 0.35564959049224854,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.44604745507240295,
+      "step": 207,
+      "step_time": 76.78453262429684
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 884.0,
+      "completions/max_terminated_length": 884.0,
+      "completions/mean_length": 522.875,
+      "completions/mean_terminated_length": 522.875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.2907004654407501,
+      "epoch": 1.1005291005291005,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0625,
+      "kl": 0.001368207624182105,
+      "learning_rate": 9.341108023285237e-07,
+      "loss": -0.1923,
+      "num_tokens": 3980703.0,
+      "reward": 0.5208333134651184,
+      "reward_std": 0.25392836332321167,
+      "rewards/itbench_correctness/mean": 0.5208333134651184,
+      "rewards/itbench_correctness/std": 0.45082229375839233,
+      "step": 208,
+      "step_time": 87.08768197055906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 712.4375,
+      "completions/mean_terminated_length": 712.4375,
+      "completions/min_length": 595.0,
+      "completions/min_terminated_length": 595.0,
+      "entropy": 0.4154750406742096,
+      "epoch": 1.1058201058201058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.15625,
+      "kl": 0.0015189462574198842,
+      "learning_rate": 9.332880170637252e-07,
+      "loss": 0.0093,
+      "num_tokens": 3996494.0,
+      "reward": 0.8671875,
+      "reward_std": 0.07790146768093109,
+      "rewards/itbench_correctness/mean": 0.8671875,
+      "rewards/itbench_correctness/std": 0.17361806333065033,
+      "step": 209,
+      "step_time": 73.19756223168224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 686.0,
+      "completions/max_terminated_length": 686.0,
+      "completions/mean_length": 527.75,
+      "completions/mean_terminated_length": 527.75,
+      "completions/min_length": 425.0,
+      "completions/min_terminated_length": 425.0,
+      "entropy": 0.43202275037765503,
+      "epoch": 1.1111111111111112,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4609375,
+      "kl": 0.0011430936865508556,
+      "learning_rate": 9.32460493430591e-07,
+      "loss": -0.0004,
+      "num_tokens": 4008082.0,
+      "reward": 0.9166666865348816,
+      "reward_std": 0.235702246427536,
+      "rewards/itbench_correctness/mean": 0.9166666865348816,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 210,
+      "step_time": 7547.0997234797105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 695.0,
+      "completions/mean_length": 756.1875,
+      "completions/mean_terminated_length": 547.888916015625,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.4284651577472687,
+      "epoch": 1.1164021164021163,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9609375,
+      "kl": 0.0010993402684107423,
+      "learning_rate": 9.316282404787869e-07,
+      "loss": -0.0121,
+      "num_tokens": 4028837.0,
+      "reward": 0.7395833730697632,
+      "reward_std": 0.28634417057037354,
+      "rewards/itbench_correctness/mean": 0.7395833730697632,
+      "rewards/itbench_correctness/std": 0.35988038778305054,
+      "step": 211,
+      "step_time": 127.47603439353406
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 974.0,
+      "completions/mean_length": 988.5,
+      "completions/mean_terminated_length": 740.0,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "entropy": 0.3925139009952545,
+      "epoch": 1.1216931216931216,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.6640625,
+      "kl": 0.0009769725147634745,
+      "learning_rate": 9.307912673096979e-07,
+      "loss": 0.0022,
+      "num_tokens": 4061109.0,
+      "reward": 0.375,
+      "reward_std": 0.1725163757801056,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.45338237285614014,
+      "step": 212,
+      "step_time": 153.4806991070509
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1007.0,
+      "completions/mean_length": 641.0625,
+      "completions/mean_terminated_length": 586.357177734375,
+      "completions/min_length": 344.0,
+      "completions/min_terminated_length": 344.0,
+      "entropy": 0.4180559515953064,
+      "epoch": 1.126984126984127,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.75,
+      "kl": 0.00120700488332659,
+      "learning_rate": 9.299495830763284e-07,
+      "loss": -0.0587,
+      "num_tokens": 4076166.0,
+      "reward": 0.3768939673900604,
+      "reward_std": 0.29744255542755127,
+      "rewards/itbench_correctness/mean": 0.3768939673900604,
+      "rewards/itbench_correctness/std": 0.3607577383518219,
+      "step": 213,
+      "step_time": 132.46722139418125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 841.0,
+      "completions/mean_length": 566.4375,
+      "completions/mean_terminated_length": 535.933349609375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "entropy": 0.4042811393737793,
+      "epoch": 1.1322751322751323,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.171875,
+      "kl": 0.00115107255987823,
+      "learning_rate": 9.291031969832025e-07,
+      "loss": 0.0001,
+      "num_tokens": 4089029.0,
+      "reward": 0.38786762952804565,
+      "reward_std": 0.16254664957523346,
+      "rewards/itbench_correctness/mean": 0.38786762952804565,
+      "rewards/itbench_correctness/std": 0.458029180765152,
+      "step": 214,
+      "step_time": 364.9181332997978
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 791.0,
+      "completions/max_terminated_length": 791.0,
+      "completions/mean_length": 452.25,
+      "completions/mean_terminated_length": 452.25,
+      "completions/min_length": 288.0,
+      "completions/min_terminated_length": 288.0,
+      "entropy": 0.4731896221637726,
+      "epoch": 1.1375661375661377,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0016687542665749788,
+      "learning_rate": 9.282521182862629e-07,
+      "loss": 0.0181,
+      "num_tokens": 4103865.0,
+      "reward": 0.587890625,
+      "reward_std": 0.31902575492858887,
+      "rewards/itbench_correctness/mean": 0.587890625,
+      "rewards/itbench_correctness/std": 0.37728795409202576,
+      "step": 215,
+      "step_time": 78.70893874578178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1018.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 621.3125,
+      "completions/mean_terminated_length": 621.3125,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "entropy": 0.4731918275356293,
+      "epoch": 1.1428571428571428,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.0012563822092488408,
+      "learning_rate": 9.273963562927694e-07,
+      "loss": 0.0316,
+      "num_tokens": 4116998.0,
+      "reward": 0.875,
+      "reward_std": 0.2630348801612854,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.2687419056892395,
+      "step": 216,
+      "step_time": 189.60282021015882
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1013.0,
+      "completions/mean_length": 981.875,
+      "completions/mean_terminated_length": 927.71435546875,
+      "completions/min_length": 883.0,
+      "completions/min_terminated_length": 883.0,
+      "entropy": 0.4297899305820465,
+      "epoch": 1.1481481481481481,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5078125,
+      "kl": 0.0011373235611245036,
+      "learning_rate": 9.265359203611987e-07,
+      "loss": 0.0,
+      "num_tokens": 4144004.0,
+      "reward": 0.02500000037252903,
+      "reward_std": 0.04629100486636162,
+      "rewards/itbench_correctness/mean": 0.02500000037252903,
+      "rewards/itbench_correctness/std": 0.06831301003694534,
+      "step": 217,
+      "step_time": 195.92587360646576
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 956.875,
+      "completions/mean_terminated_length": 845.0,
+      "completions/min_length": 667.0,
+      "completions/min_terminated_length": 667.0,
+      "entropy": 0.5434356331825256,
+      "epoch": 1.1534391534391535,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.03125,
+      "kl": 0.0011996476678177714,
+      "learning_rate": 9.2567081990114e-07,
+      "loss": 0.0111,
+      "num_tokens": 4186818.0,
+      "reward": 0.16249999403953552,
+      "reward_std": 0.25583362579345703,
+      "rewards/itbench_correctness/mean": 0.16249999403953552,
+      "rewards/itbench_correctness/std": 0.2673948407173157,
+      "step": 218,
+      "step_time": 182.73709686659276
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 949.0,
+      "completions/mean_length": 886.0625,
+      "completions/mean_terminated_length": 708.7142944335938,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.6590957045555115,
+      "epoch": 1.1587301587301586,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.578125,
+      "kl": 0.0019986084662377834,
+      "learning_rate": 9.248010643731934e-07,
+      "loss": 0.0001,
+      "num_tokens": 4218627.0,
+      "reward": 0.171875,
+      "reward_std": 0.16952534019947052,
+      "rewards/itbench_correctness/mean": 0.171875,
+      "rewards/itbench_correctness/std": 0.29181545972824097,
+      "step": 219,
+      "step_time": 215.92658524494618
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 826.0,
+      "completions/max_terminated_length": 826.0,
+      "completions/mean_length": 586.3125,
+      "completions/mean_terminated_length": 586.3125,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "entropy": 0.48438331484794617,
+      "epoch": 1.164021164021164,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0286865234375,
+      "kl": 0.0012342262780293822,
+      "learning_rate": 9.239266632888658e-07,
+      "loss": 0.0,
+      "num_tokens": 4232136.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 220,
+      "step_time": 87.92167458124459
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 912.0,
+      "completions/max_terminated_length": 912.0,
+      "completions/mean_length": 588.4375,
+      "completions/mean_terminated_length": 588.4375,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.3789697289466858,
+      "epoch": 1.1693121693121693,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5625,
+      "kl": 0.0008935832884162664,
+      "learning_rate": 9.230476262104676e-07,
+      "loss": 0.0133,
+      "num_tokens": 4245863.0,
+      "reward": 0.6875,
+      "reward_std": 0.09531004726886749,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.15000000596046448,
+      "step": 221,
+      "step_time": 73.37130374461412
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 957.1875,
+      "completions/mean_terminated_length": 489.5,
+      "completions/min_length": 7.0,
+      "completions/min_terminated_length": 7.0,
+      "entropy": 0.35938623547554016,
+      "epoch": 1.1746031746031746,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.203125,
+      "kl": 0.0009425441385246813,
+      "learning_rate": 9.221639627510075e-07,
+      "loss": -0.1453,
+      "num_tokens": 4274026.0,
+      "reward": 0.2772817611694336,
+      "reward_std": 0.13795886933803558,
+      "rewards/itbench_correctness/mean": 0.2772817611694336,
+      "rewards/itbench_correctness/std": 0.22852860391139984,
+      "step": 222,
+      "step_time": 114.55098836030811
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 747.1875,
+      "completions/mean_terminated_length": 683.3077392578125,
+      "completions/min_length": 396.0,
+      "completions/min_terminated_length": 396.0,
+      "entropy": 0.36135509610176086,
+      "epoch": 1.17989417989418,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8359375,
+      "kl": 0.00118353555444628,
+      "learning_rate": 9.212756825740872e-07,
+      "loss": -0.0024,
+      "num_tokens": 4290805.0,
+      "reward": 0.4583333432674408,
+      "reward_std": 0.2527993321418762,
+      "rewards/itbench_correctness/mean": 0.4583333432674408,
+      "rewards/itbench_correctness/std": 0.30804041028022766,
+      "step": 223,
+      "step_time": 133.77505498286337
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 501.3125,
+      "completions/mean_terminated_length": 466.4667053222656,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4587956666946411,
+      "epoch": 1.1851851851851851,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.515625,
+      "kl": 0.0022395735140889883,
+      "learning_rate": 9.203827953937968e-07,
+      "loss": -0.0998,
+      "num_tokens": 4302938.0,
+      "reward": 0.35208332538604736,
+      "reward_std": 0.3391679525375366,
+      "rewards/itbench_correctness/mean": 0.35208332538604736,
+      "rewards/itbench_correctness/std": 0.3392188847064972,
+      "step": 224,
+      "step_time": 86.85009481851012
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 717.4375,
+      "completions/mean_terminated_length": 578.0909423828125,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 0.36240091919898987,
+      "epoch": 1.1904761904761905,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 15.875,
+      "kl": 0.0009973702253773808,
+      "learning_rate": 9.194853109746072e-07,
+      "loss": -0.0269,
+      "num_tokens": 4321705.0,
+      "reward": 0.4114583432674408,
+      "reward_std": 0.17598573863506317,
+      "rewards/itbench_correctness/mean": 0.4114583432674408,
+      "rewards/itbench_correctness/std": 0.3488987386226654,
+      "step": 225,
+      "step_time": 695.8362277401611
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 425.0,
+      "completions/max_terminated_length": 425.0,
+      "completions/mean_length": 332.375,
+      "completions/mean_terminated_length": 332.375,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.3760812282562256,
+      "epoch": 1.1957671957671958,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.09375,
+      "kl": 0.0032024469692260027,
+      "learning_rate": 9.185832391312642e-07,
+      "loss": -0.0008,
+      "num_tokens": 4329399.0,
+      "reward": 0.3125,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.3095695972442627,
+      "step": 226,
+      "step_time": 71.38310491386801
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 922.0,
+      "completions/mean_length": 811.25,
+      "completions/mean_terminated_length": 537.7142944335938,
+      "completions/min_length": 409.0,
+      "completions/min_terminated_length": 409.0,
+      "entropy": 0.4955315887928009,
+      "epoch": 1.201058201058201,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.21875,
+      "kl": 0.0015454755630344152,
+      "learning_rate": 9.176765897286811e-07,
+      "loss": 0.0,
+      "num_tokens": 4367643.0,
+      "reward": 0.171875,
+      "reward_std": 0.13258251547813416,
+      "rewards/itbench_correctness/mean": 0.171875,
+      "rewards/itbench_correctness/std": 0.25361964106559753,
+      "step": 227,
+      "step_time": 732.0901973983273
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 962.0,
+      "completions/mean_length": 692.625,
+      "completions/mean_terminated_length": 434.8888854980469,
+      "completions/min_length": 330.0,
+      "completions/min_terminated_length": 330.0,
+      "entropy": 0.4908861219882965,
+      "epoch": 1.2063492063492063,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01904296875,
+      "kl": 0.0013219286920502782,
+      "learning_rate": 9.167653726818304e-07,
+      "loss": 0.0,
+      "num_tokens": 4388877.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 228,
+      "step_time": 883.394539824687
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 780.0,
+      "completions/mean_length": 532.375,
+      "completions/mean_terminated_length": 368.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.33810752630233765,
+      "epoch": 1.2116402116402116,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8203125,
+      "kl": 0.0016971830045804381,
+      "learning_rate": 9.158495979556358e-07,
+      "loss": -0.1874,
+      "num_tokens": 4406811.0,
+      "reward": 0.4947916567325592,
+      "reward_std": 0.33243152499198914,
+      "rewards/itbench_correctness/mean": 0.4947916567325592,
+      "rewards/itbench_correctness/std": 0.3914227783679962,
+      "step": 229,
+      "step_time": 297.6574033163488
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 653.0,
+      "completions/max_terminated_length": 653.0,
+      "completions/mean_length": 516.5625,
+      "completions/mean_terminated_length": 516.5625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5071990489959717,
+      "epoch": 1.216931216931217,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1640625,
+      "kl": 0.0015578863676637411,
+      "learning_rate": 9.14929275564863e-07,
+      "loss": -0.0018,
+      "num_tokens": 4418316.0,
+      "reward": 0.359375,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.359375,
+      "rewards/itbench_correctness/std": 0.3760402202606201,
+      "step": 230,
+      "step_time": 98.72435673046857
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 836.0,
+      "completions/mean_length": 711.5625,
+      "completions/mean_terminated_length": 468.5555725097656,
+      "completions/min_length": 366.0,
+      "completions/min_terminated_length": 366.0,
+      "entropy": 0.34571805596351624,
+      "epoch": 1.2222222222222223,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3671875,
+      "kl": 0.0011883811093866825,
+      "learning_rate": 9.1400441557401e-07,
+      "loss": 0.0284,
+      "num_tokens": 4450277.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 231,
+      "step_time": 109.3037657784298
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 506.375,
+      "completions/mean_terminated_length": 506.375,
+      "completions/min_length": 299.0,
+      "completions/min_terminated_length": 299.0,
+      "entropy": 0.5213527679443359,
+      "epoch": 1.2275132275132274,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.003350482787936926,
+      "learning_rate": 9.130750280971977e-07,
+      "loss": -0.0084,
+      "num_tokens": 4470851.0,
+      "reward": 0.5249999761581421,
+      "reward_std": 0.34211215376853943,
+      "rewards/itbench_correctness/mean": 0.5249999761581421,
+      "rewards/itbench_correctness/std": 0.3803507089614868,
+      "step": 232,
+      "step_time": 117.60546538699418
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 876.0,
+      "completions/max_terminated_length": 876.0,
+      "completions/mean_length": 696.25,
+      "completions/mean_terminated_length": 696.25,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "entropy": 0.5285457968711853,
+      "epoch": 1.2328042328042328,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.020263671875,
+      "kl": 0.0014563931617885828,
+      "learning_rate": 9.121411232980587e-07,
+      "loss": 0.0,
+      "num_tokens": 4490551.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 1.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 233,
+      "step_time": 93.69822262041271
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 886.25,
+      "completions/mean_terminated_length": 854.4615478515625,
+      "completions/min_length": 672.0,
+      "completions/min_terminated_length": 672.0,
+      "entropy": 0.2617771625518799,
+      "epoch": 1.2380952380952381,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.234375,
+      "kl": 0.00089176808251068,
+      "learning_rate": 9.112027113896261e-07,
+      "loss": 0.0017,
+      "num_tokens": 4513339.0,
+      "reward": 0.375,
+      "reward_std": 0.15669579803943634,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.4425306022167206,
+      "step": 234,
+      "step_time": 236.43063350580633
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 650.0,
+      "completions/max_terminated_length": 650.0,
+      "completions/mean_length": 487.125,
+      "completions/mean_terminated_length": 487.125,
+      "completions/min_length": 334.0,
+      "completions/min_terminated_length": 334.0,
+      "entropy": 0.4331537187099457,
+      "epoch": 1.2433862433862433,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.03125,
+      "kl": 0.0012799181276932359,
+      "learning_rate": 9.102598026342222e-07,
+      "loss": -0.0038,
+      "num_tokens": 4523829.0,
+      "reward": 0.3333333432674408,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.3333333432674408,
+      "rewards/itbench_correctness/std": 0.4036867320537567,
+      "step": 235,
+      "step_time": 690.8860946493223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 579.0,
+      "completions/max_terminated_length": 579.0,
+      "completions/mean_length": 448.0,
+      "completions/mean_terminated_length": 448.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5066964030265808,
+      "epoch": 1.2486772486772486,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.034423828125,
+      "kl": 0.0018882593140006065,
+      "learning_rate": 9.093124073433462e-07,
+      "loss": 0.0,
+      "num_tokens": 4552069.0,
+      "reward": 0.3125,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.3227486312389374,
+      "step": 236,
+      "step_time": 151.43605288118124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 726.0,
+      "completions/max_terminated_length": 726.0,
+      "completions/mean_length": 564.9375,
+      "completions/mean_terminated_length": 564.9375,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "entropy": 0.4885496199131012,
+      "epoch": 1.253968253968254,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3515625,
+      "kl": 0.001722036860883236,
+      "learning_rate": 9.083605358775611e-07,
+      "loss": -0.0206,
+      "num_tokens": 4567172.0,
+      "reward": 0.6875,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 237,
+      "step_time": 79.38015065714717
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 528.6875,
+      "completions/mean_terminated_length": 528.6875,
+      "completions/min_length": 365.0,
+      "completions/min_terminated_length": 365.0,
+      "entropy": 0.4085589349269867,
+      "epoch": 1.2592592592592593,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5078125,
+      "kl": 0.0010910117998719215,
+      "learning_rate": 9.074041986463808e-07,
+      "loss": 0.0075,
+      "num_tokens": 4578575.0,
+      "reward": 0.9047619104385376,
+      "reward_std": 0.19606643915176392,
+      "rewards/itbench_correctness/mean": 0.9047619104385376,
+      "rewards/itbench_correctness/std": 0.2161296308040619,
+      "step": 238,
+      "step_time": 126.61944894865155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 795.0,
+      "completions/mean_length": 696.9375,
+      "completions/mean_terminated_length": 500.70001220703125,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "entropy": 0.5710698366165161,
+      "epoch": 1.2645502645502646,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.453125,
+      "kl": 0.0016381683526560664,
+      "learning_rate": 9.064434061081561e-07,
+      "loss": 0.017,
+      "num_tokens": 4602870.0,
+      "reward": 0.625,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 239,
+      "step_time": 184.73187920358032
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 933.0,
+      "completions/max_terminated_length": 933.0,
+      "completions/mean_length": 644.0625,
+      "completions/mean_terminated_length": 644.0625,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "entropy": 0.4098981022834778,
+      "epoch": 1.2698412698412698,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.021240234375,
+      "kl": 0.001283331774175167,
+      "learning_rate": 9.0547816876996e-07,
+      "loss": 0.0,
+      "num_tokens": 4623511.0,
+      "reward": 0.550000011920929,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.550000011920929,
+      "rewards/itbench_correctness/std": 0.4647580087184906,
+      "step": 240,
+      "step_time": 119.32252531778067
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 842.0,
+      "completions/mean_length": 804.875,
+      "completions/mean_terminated_length": 705.2727661132812,
+      "completions/min_length": 579.0,
+      "completions/min_terminated_length": 579.0,
+      "entropy": 0.4646684229373932,
+      "epoch": 1.2751322751322751,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2109375,
+      "kl": 0.0014155855169519782,
+      "learning_rate": 9.045084971874737e-07,
+      "loss": 0.0092,
+      "num_tokens": 4644965.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 241,
+      "step_time": 253.224197126925
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 864.0,
+      "completions/mean_length": 761.9375,
+      "completions/mean_terminated_length": 604.7000122070312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5197276473045349,
+      "epoch": 1.2804232804232805,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.390625,
+      "kl": 0.0014071539044380188,
+      "learning_rate": 9.0353440196487e-07,
+      "loss": -0.189,
+      "num_tokens": 4670188.0,
+      "reward": 0.609375,
+      "reward_std": 0.3135034143924713,
+      "rewards/itbench_correctness/mean": 0.609375,
+      "rewards/itbench_correctness/std": 0.41047483682632446,
+      "step": 242,
+      "step_time": 248.56613456085324
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 806.9375,
+      "completions/mean_terminated_length": 756.84619140625,
+      "completions/min_length": 579.0,
+      "completions/min_terminated_length": 579.0,
+      "entropy": 0.4957013428211212,
+      "epoch": 1.2857142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.140625,
+      "kl": 0.0014731662813574076,
+      "learning_rate": 9.025558937546987e-07,
+      "loss": 0.0259,
+      "num_tokens": 4690107.0,
+      "reward": 0.6979166269302368,
+      "reward_std": 0.18552666902542114,
+      "rewards/itbench_correctness/mean": 0.6979166269302368,
+      "rewards/itbench_correctness/std": 0.18225695192813873,
+      "step": 243,
+      "step_time": 141.42184507194906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 865.0,
+      "completions/max_terminated_length": 865.0,
+      "completions/mean_length": 674.5625,
+      "completions/mean_terminated_length": 674.5625,
+      "completions/min_length": 510.0,
+      "completions/min_terminated_length": 510.0,
+      "entropy": 0.5336792469024658,
+      "epoch": 1.291005291005291,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.035400390625,
+      "kl": 0.0016851610271260142,
+      "learning_rate": 9.015729832577681e-07,
+      "loss": 0.0,
+      "num_tokens": 4710412.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 1.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 244,
+      "step_time": 105.10308491624892
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 527.75,
+      "completions/mean_terminated_length": 527.75,
+      "completions/min_length": 381.0,
+      "completions/min_terminated_length": 381.0,
+      "entropy": 0.5949786901473999,
+      "epoch": 1.2962962962962963,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.21875,
+      "kl": 0.001679896144196391,
+      "learning_rate": 9.005856812230304e-07,
+      "loss": -0.0189,
+      "num_tokens": 4723320.0,
+      "reward": 0.4322916865348816,
+      "reward_std": 0.031000997871160507,
+      "rewards/itbench_correctness/mean": 0.4322916865348816,
+      "rewards/itbench_correctness/std": 0.4484735131263733,
+      "step": 245,
+      "step_time": 98.61767490487546
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 966.0,
+      "completions/mean_length": 661.1875,
+      "completions/mean_terminated_length": 577.4615478515625,
+      "completions/min_length": 327.0,
+      "completions/min_terminated_length": 327.0,
+      "entropy": 0.3705454170703888,
+      "epoch": 1.3015873015873016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.84375,
+      "kl": 0.0013237885432317853,
+      "learning_rate": 8.995939984474623e-07,
+      "loss": 0.0171,
+      "num_tokens": 4739299.0,
+      "reward": 0.6763392686843872,
+      "reward_std": 0.17046323418617249,
+      "rewards/itbench_correctness/mean": 0.6763392686843872,
+      "rewards/itbench_correctness/std": 0.29665619134902954,
+      "step": 246,
+      "step_time": 81.29718050733209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 945.125,
+      "completions/mean_terminated_length": 909.2727661132812,
+      "completions/min_length": 807.0,
+      "completions/min_terminated_length": 807.0,
+      "entropy": 0.31318607926368713,
+      "epoch": 1.306878306878307,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.515625,
+      "kl": 0.001092436257749796,
+      "learning_rate": 8.98597945775948e-07,
+      "loss": 0.0178,
+      "num_tokens": 4762901.0,
+      "reward": 0.41874998807907104,
+      "reward_std": 0.17100021243095398,
+      "rewards/itbench_correctness/mean": 0.41874998807907104,
+      "rewards/itbench_correctness/std": 0.4915536642074585,
+      "step": 247,
+      "step_time": 379.86342859547585
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 628.0,
+      "completions/max_terminated_length": 628.0,
+      "completions/mean_length": 466.1875,
+      "completions/mean_terminated_length": 466.1875,
+      "completions/min_length": 368.0,
+      "completions/min_terminated_length": 368.0,
+      "entropy": 0.49336370825767517,
+      "epoch": 1.312169312169312,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2578125,
+      "kl": 0.0015457995468750596,
+      "learning_rate": 8.975975341011595e-07,
+      "loss": -0.0118,
+      "num_tokens": 4772808.0,
+      "reward": 0.5645833611488342,
+      "reward_std": 0.01928791031241417,
+      "rewards/itbench_correctness/mean": 0.5645833611488342,
+      "rewards/itbench_correctness/std": 0.17201152443885803,
+      "step": 248,
+      "step_time": 83.39602283388376
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 749.0,
+      "completions/max_terminated_length": 749.0,
+      "completions/mean_length": 502.6875,
+      "completions/mean_terminated_length": 502.6875,
+      "completions/min_length": 386.0,
+      "completions/min_terminated_length": 386.0,
+      "entropy": 0.3978614807128906,
+      "epoch": 1.3174603174603174,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0164794921875,
+      "kl": 0.001136748120188713,
+      "learning_rate": 8.965927743634389e-07,
+      "loss": 0.0,
+      "num_tokens": 4783827.0,
+      "reward": 0.5833333134651184,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5833333134651184,
+      "rewards/itbench_correctness/std": 0.4303314983844757,
+      "step": 249,
+      "step_time": 808.8291652789339
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 590.0,
+      "completions/max_terminated_length": 590.0,
+      "completions/mean_length": 488.375,
+      "completions/mean_terminated_length": 488.375,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.3910928964614868,
+      "epoch": 1.3227513227513228,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.05224609375,
+      "kl": 0.0019690291956067085,
+      "learning_rate": 8.955836775506775e-07,
+      "loss": 0.0,
+      "num_tokens": 4795977.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 250,
+      "step_time": 1037.5402492322028
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 702.0,
+      "completions/max_terminated_length": 702.0,
+      "completions/mean_length": 415.9375,
+      "completions/mean_terminated_length": 415.9375,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "entropy": 0.40631103515625,
+      "epoch": 1.328042328042328,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5,
+      "kl": 0.0014389187563210726,
+      "learning_rate": 8.945702546981968e-07,
+      "loss": -0.0199,
+      "num_tokens": 4805024.0,
+      "reward": 0.546875,
+      "reward_std": 0.16521647572517395,
+      "rewards/itbench_correctness/mean": 0.546875,
+      "rewards/itbench_correctness/std": 0.24714809656143188,
+      "step": 251,
+      "step_time": 89.62413766887039
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 816.0,
+      "completions/max_terminated_length": 816.0,
+      "completions/mean_length": 614.9375,
+      "completions/mean_terminated_length": 614.9375,
+      "completions/min_length": 418.0,
+      "completions/min_terminated_length": 418.0,
+      "entropy": 0.5171257257461548,
+      "epoch": 1.3333333333333333,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.046875,
+      "kl": 0.0015879496932029724,
+      "learning_rate": 8.935525168886262e-07,
+      "loss": 0.0103,
+      "num_tokens": 4827879.0,
+      "reward": 0.359375,
+      "reward_std": 0.05866191163659096,
+      "rewards/itbench_correctness/mean": 0.359375,
+      "rewards/itbench_correctness/std": 0.3797157406806946,
+      "step": 252,
+      "step_time": 79.69811306335032
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 967.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 657.75,
+      "completions/mean_terminated_length": 657.75,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 0.3618395924568176,
+      "epoch": 1.3386243386243386,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8046875,
+      "kl": 0.001461725914850831,
+      "learning_rate": 8.925304752517839e-07,
+      "loss": -0.0421,
+      "num_tokens": 4842899.0,
+      "reward": 0.40937501192092896,
+      "reward_std": 0.30845823884010315,
+      "rewards/itbench_correctness/mean": 0.40937501192092896,
+      "rewards/itbench_correctness/std": 0.3658055067062378,
+      "step": 253,
+      "step_time": 131.8951225792989
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 995.0,
+      "completions/mean_length": 725.875,
+      "completions/mean_terminated_length": 626.5,
+      "completions/min_length": 488.0,
+      "completions/min_terminated_length": 488.0,
+      "entropy": 0.5731014013290405,
+      "epoch": 1.343915343915344,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.1787109375,
+      "kl": 0.0015030583599582314,
+      "learning_rate": 8.91504140964553e-07,
+      "loss": 0.0,
+      "num_tokens": 4870481.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 254,
+      "step_time": 419.8384141791612
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 585.0,
+      "completions/mean_length": 756.4375,
+      "completions/mean_terminated_length": 488.875,
+      "completions/min_length": 392.0,
+      "completions/min_terminated_length": 392.0,
+      "entropy": 0.4759150743484497,
+      "epoch": 1.3492063492063493,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.953125,
+      "kl": 0.0011384043609723449,
+      "learning_rate": 8.904735252507609e-07,
+      "loss": -0.0058,
+      "num_tokens": 4889368.0,
+      "reward": 0.2916666865348816,
+      "reward_std": 0.1178511381149292,
+      "rewards/itbench_correctness/mean": 0.2916666865348816,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 255,
+      "step_time": 75.48513688519597
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1024.0,
+      "completions/mean_length": 621.25,
+      "completions/mean_terminated_length": 563.7142944335938,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.36378270387649536,
+      "epoch": 1.3544973544973544,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.171875,
+      "kl": 0.002011555014178157,
+      "learning_rate": 8.894386393810562e-07,
+      "loss": 0.014,
+      "num_tokens": 4904140.0,
+      "reward": 0.71875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 256,
+      "step_time": 448.9436140609905
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 1007.375,
+      "completions/mean_terminated_length": 935.3333740234375,
+      "completions/min_length": 788.0,
+      "completions/min_terminated_length": 788.0,
+      "entropy": 0.4546469748020172,
+      "epoch": 1.3597883597883598,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.6484375,
+      "kl": 0.0013795711565762758,
+      "learning_rate": 8.883994946727847e-07,
+      "loss": 0.0001,
+      "num_tokens": 4929690.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 257,
+      "step_time": 249.96061486005783
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 705.5,
+      "completions/mean_terminated_length": 632.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5301204919815063,
+      "epoch": 1.3650793650793651,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4609375,
+      "kl": 0.0015691749285906553,
+      "learning_rate": 8.873561024898667e-07,
+      "loss": -0.0308,
+      "num_tokens": 4954970.0,
+      "reward": 0.75,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 258,
+      "step_time": 128.34479956980795
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 708.0,
+      "completions/max_terminated_length": 708.0,
+      "completions/mean_length": 485.875,
+      "completions/mean_terminated_length": 485.875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3889889419078827,
+      "epoch": 1.3703703703703702,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0859375,
+      "kl": 0.002027226844802499,
+      "learning_rate": 8.863084742426718e-07,
+      "loss": -0.0592,
+      "num_tokens": 4980176.0,
+      "reward": 0.75,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 259,
+      "step_time": 110.6389656001702
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 744.0,
+      "completions/max_terminated_length": 744.0,
+      "completions/mean_length": 611.4375,
+      "completions/mean_terminated_length": 611.4375,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "entropy": 0.4971890151500702,
+      "epoch": 1.3756613756613756,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2421875,
+      "kl": 0.0010566740529611707,
+      "learning_rate": 8.852566213878946e-07,
+      "loss": -0.0071,
+      "num_tokens": 4992951.0,
+      "reward": 0.84375,
+      "reward_std": 0.22903135418891907,
+      "rewards/itbench_correctness/mean": 0.84375,
+      "rewards/itbench_correctness/std": 0.3520771861076355,
+      "step": 260,
+      "step_time": 502.41869831830263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 499.0,
+      "completions/max_terminated_length": 499.0,
+      "completions/mean_length": 419.0,
+      "completions/mean_terminated_length": 419.0,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 0.4797135889530182,
+      "epoch": 1.380952380952381,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0223388671875,
+      "kl": 0.0018066860502585769,
+      "learning_rate": 8.842005554284295e-07,
+      "loss": 0.0,
+      "num_tokens": 5002711.0,
+      "reward": 0.2083333283662796,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.2083333283662796,
+      "rewards/itbench_correctness/std": 0.21516574919223785,
+      "step": 261,
+      "step_time": 94.77580868080258
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 769.0,
+      "completions/mean_length": 758.8125,
+      "completions/mean_terminated_length": 599.7000122070312,
+      "completions/min_length": 387.0,
+      "completions/min_terminated_length": 387.0,
+      "entropy": 0.4796968996524811,
+      "epoch": 1.3862433862433863,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.234375,
+      "kl": 0.0017543428111821413,
+      "learning_rate": 8.831402879132445e-07,
+      "loss": -0.0144,
+      "num_tokens": 5039356.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 262,
+      "step_time": 486.13853998761624
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 912.0,
+      "completions/mean_length": 641.75,
+      "completions/mean_terminated_length": 514.3333740234375,
+      "completions/min_length": 334.0,
+      "completions/min_terminated_length": 334.0,
+      "entropy": 0.49863654375076294,
+      "epoch": 1.3915343915343916,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0019419684540480375,
+      "learning_rate": 8.820758304372555e-07,
+      "loss": 0.0274,
+      "num_tokens": 5056128.0,
+      "reward": 0.5,
+      "reward_std": 0.3535533845424652,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 263,
+      "step_time": 1132.2881942698732
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1001.0,
+      "completions/mean_length": 859.75,
+      "completions/mean_terminated_length": 785.0909423828125,
+      "completions/min_length": 476.0,
+      "completions/min_terminated_length": 476.0,
+      "entropy": 0.47223028540611267,
+      "epoch": 1.3968253968253967,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0234375,
+      "kl": 0.0012077168794348836,
+      "learning_rate": 8.810071946411988e-07,
+      "loss": 0.0,
+      "num_tokens": 5077620.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 264,
+      "step_time": 98.81888623256236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 616.875,
+      "completions/mean_terminated_length": 589.7333374023438,
+      "completions/min_length": 328.0,
+      "completions/min_terminated_length": 328.0,
+      "entropy": 0.40688955783843994,
+      "epoch": 1.402116402116402,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.125,
+      "kl": 0.0010585308773443103,
+      "learning_rate": 8.799343922115043e-07,
+      "loss": -0.0001,
+      "num_tokens": 5090890.0,
+      "reward": 0.6937500238418579,
+      "reward_std": 0.13999362289905548,
+      "rewards/itbench_correctness/mean": 0.6937500238418579,
+      "rewards/itbench_correctness/std": 0.3696281909942627,
+      "step": 265,
+      "step_time": 779.0254694251344
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1006.0,
+      "completions/max_terminated_length": 1006.0,
+      "completions/mean_length": 599.3125,
+      "completions/mean_terminated_length": 599.3125,
+      "completions/min_length": 323.0,
+      "completions/min_terminated_length": 323.0,
+      "entropy": 0.5172593593597412,
+      "epoch": 1.4074074074074074,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8515625,
+      "kl": 0.0015477510169148445,
+      "learning_rate": 8.788574348801674e-07,
+      "loss": -0.0177,
+      "num_tokens": 5103495.0,
+      "reward": 0.46875,
+      "reward_std": 0.1552036553621292,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.15478479862213135,
+      "step": 266,
+      "step_time": 430.513926978223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 666.0,
+      "completions/mean_length": 705.0625,
+      "completions/mean_terminated_length": 513.7000122070312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3616700768470764,
+      "epoch": 1.4126984126984126,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 2.140625,
+      "kl": 0.0013228055322542787,
+      "learning_rate": 8.777763344246208e-07,
+      "loss": 0.0035,
+      "num_tokens": 5126072.0,
+      "reward": 0.34375,
+      "reward_std": 0.1735912710428238,
+      "rewards/itbench_correctness/mean": 0.34375,
+      "rewards/itbench_correctness/std": 0.42695629596710205,
+      "step": 267,
+      "step_time": 154.13704107049853
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 726.0,
+      "completions/max_terminated_length": 726.0,
+      "completions/mean_length": 493.375,
+      "completions/mean_terminated_length": 493.375,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 0.3830757439136505,
+      "epoch": 1.417989417989418,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5546875,
+      "kl": 0.0012491599190980196,
+      "learning_rate": 8.766911026676063e-07,
+      "loss": 0.0118,
+      "num_tokens": 5137798.0,
+      "reward": 0.796875,
+      "reward_std": 0.26196980476379395,
+      "rewards/itbench_correctness/mean": 0.796875,
+      "rewards/itbench_correctness/std": 0.27716949582099915,
+      "step": 268,
+      "step_time": 78.56690625380725
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 899.0,
+      "completions/max_terminated_length": 899.0,
+      "completions/mean_length": 639.6875,
+      "completions/mean_terminated_length": 639.6875,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 0.41582804918289185,
+      "epoch": 1.4232804232804233,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0703125,
+      "kl": 0.0016272872453555465,
+      "learning_rate": 8.756017514770442e-07,
+      "loss": 0.0292,
+      "num_tokens": 5150993.0,
+      "reward": 0.78125,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.78125,
+      "rewards/itbench_correctness/std": 0.286865234375,
+      "step": 269,
+      "step_time": 87.11506285239011
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 740.0,
+      "completions/mean_length": 907.8125,
+      "completions/mean_terminated_length": 559.25,
+      "completions/min_length": 480.0,
+      "completions/min_terminated_length": 480.0,
+      "entropy": 0.4648537039756775,
+      "epoch": 1.4285714285714286,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 2.046875,
+      "kl": 0.0014328653924167156,
+      "learning_rate": 8.745082927659046e-07,
+      "loss": 0.0103,
+      "num_tokens": 5182310.0,
+      "reward": 0.3958333432674408,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.3958333432674408,
+      "rewards/itbench_correctness/std": 0.47482940554618835,
+      "step": 270,
+      "step_time": 148.08130174782127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1003.0,
+      "completions/mean_length": 821.0,
+      "completions/mean_terminated_length": 663.1111450195312,
+      "completions/min_length": 525.0,
+      "completions/min_terminated_length": 525.0,
+      "entropy": 0.38246041536331177,
+      "epoch": 1.433862433862434,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5703125,
+      "kl": 0.0012811021879315376,
+      "learning_rate": 8.734107384920769e-07,
+      "loss": 0.045,
+      "num_tokens": 5206270.0,
+      "reward": 0.4124999940395355,
+      "reward_std": 0.172688826918602,
+      "rewards/itbench_correctness/mean": 0.4124999940395355,
+      "rewards/itbench_correctness/std": 0.4869976043701172,
+      "step": 271,
+      "step_time": 145.7680284064263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 895.0,
+      "completions/mean_length": 584.0,
+      "completions/mean_terminated_length": 482.4615478515625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3476027250289917,
+      "epoch": 1.439153439153439,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.578125,
+      "kl": 0.002004146808758378,
+      "learning_rate": 8.723091006582388e-07,
+      "loss": 0.0194,
+      "num_tokens": 5224206.0,
+      "reward": 0.4375,
+      "reward_std": 0.38298875093460083,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 272,
+      "step_time": 309.5534623619169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 534.0,
+      "completions/mean_length": 656.25,
+      "completions/mean_terminated_length": 370.22222900390625,
+      "completions/min_length": 315.0,
+      "completions/min_terminated_length": 315.0,
+      "entropy": 0.7039999961853027,
+      "epoch": 1.4444444444444444,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.421875,
+      "kl": 0.0013335734838619828,
+      "learning_rate": 8.712033913117249e-07,
+      "loss": 0.074,
+      "num_tokens": 5243858.0,
+      "reward": 0.015625,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.015625,
+      "rewards/itbench_correctness/std": 0.0625,
+      "step": 273,
+      "step_time": 103.5548415929079
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 982.0,
+      "completions/max_terminated_length": 982.0,
+      "completions/mean_length": 646.75,
+      "completions/mean_terminated_length": 646.75,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "entropy": 0.5102435350418091,
+      "epoch": 1.4497354497354498,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2265625,
+      "kl": 0.0015617223689332604,
+      "learning_rate": 8.700936225443958e-07,
+      "loss": 0.0166,
+      "num_tokens": 5259974.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 274,
+      "step_time": 187.742013909854
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 678.0,
+      "completions/max_terminated_length": 678.0,
+      "completions/mean_length": 469.75,
+      "completions/mean_terminated_length": 469.75,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "entropy": 0.42575839161872864,
+      "epoch": 1.455026455026455,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.296875,
+      "kl": 0.0016291391802951694,
+      "learning_rate": 8.689798064925048e-07,
+      "loss": 0.012,
+      "num_tokens": 5270194.0,
+      "reward": 0.4375,
+      "reward_std": 0.0862581878900528,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.4669642150402069,
+      "step": 275,
+      "step_time": 89.66884011216462
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 821.375,
+      "completions/mean_terminated_length": 807.86669921875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4723786413669586,
+      "epoch": 1.4603174603174602,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.001486663124524057,
+      "learning_rate": 8.678619553365658e-07,
+      "loss": -0.0841,
+      "num_tokens": 5292856.0,
+      "reward": 0.8402777910232544,
+      "reward_std": 0.21910008788108826,
+      "rewards/itbench_correctness/mean": 0.8402777910232544,
+      "rewards/itbench_correctness/std": 0.3417908549308777,
+      "step": 276,
+      "step_time": 421.1291719619185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 749.0,
+      "completions/max_terminated_length": 749.0,
+      "completions/mean_length": 588.8125,
+      "completions/mean_terminated_length": 588.8125,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "entropy": 0.43817004561424255,
+      "epoch": 1.4656084656084656,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.25,
+      "kl": 0.001539916731417179,
+      "learning_rate": 8.667400813012199e-07,
+      "loss": -0.002,
+      "num_tokens": 5312117.0,
+      "reward": 0.43541669845581055,
+      "reward_std": 0.1833198070526123,
+      "rewards/itbench_correctness/mean": 0.43541669845581055,
+      "rewards/itbench_correctness/std": 0.4274764358997345,
+      "step": 277,
+      "step_time": 134.6105333585292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 812.3125,
+      "completions/mean_terminated_length": 741.75,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "entropy": 0.4210202395915985,
+      "epoch": 1.470899470899471,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.828125,
+      "kl": 0.0012785056605935097,
+      "learning_rate": 8.656141966551018e-07,
+      "loss": -0.0175,
+      "num_tokens": 5336450.0,
+      "reward": 0.125,
+      "reward_std": 0.2925041913986206,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.28867512941360474,
+      "step": 278,
+      "step_time": 588.067256687209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 688.0,
+      "completions/max_terminated_length": 688.0,
+      "completions/mean_length": 479.125,
+      "completions/mean_terminated_length": 479.125,
+      "completions/min_length": 313.0,
+      "completions/min_terminated_length": 313.0,
+      "entropy": 0.4654317796230316,
+      "epoch": 1.4761904761904763,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.039794921875,
+      "kl": 0.001857173629105091,
+      "learning_rate": 8.644843137107057e-07,
+      "loss": 0.0,
+      "num_tokens": 5347140.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 279,
+      "step_time": 93.42977315280586
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 902.1875,
+      "completions/mean_terminated_length": 807.4444580078125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4633183181285858,
+      "epoch": 1.4814814814814814,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.046875,
+      "kl": 0.0013472916325554252,
+      "learning_rate": 8.633504448242504e-07,
+      "loss": -0.0543,
+      "num_tokens": 5367903.0,
+      "reward": 0.3077380955219269,
+      "reward_std": 0.26785334944725037,
+      "rewards/itbench_correctness/mean": 0.3077380955219269,
+      "rewards/itbench_correctness/std": 0.3630719482898712,
+      "step": 280,
+      "step_time": 90.29017782397568
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 777.25,
+      "completions/mean_terminated_length": 665.0909423828125,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 0.3731103241443634,
+      "epoch": 1.4867724867724867,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.09375,
+      "kl": 0.0013979077339172363,
+      "learning_rate": 8.622126023955445e-07,
+      "loss": 0.0109,
+      "num_tokens": 5393947.0,
+      "reward": 0.31041666865348816,
+      "reward_std": 0.18059369921684265,
+      "rewards/itbench_correctness/mean": 0.31041666865348816,
+      "rewards/itbench_correctness/std": 0.32879552245140076,
+      "step": 281,
+      "step_time": 111.45759059861302
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 635.0,
+      "completions/mean_length": 780.375,
+      "completions/mean_terminated_length": 536.75,
+      "completions/min_length": 392.0,
+      "completions/min_terminated_length": 392.0,
+      "entropy": 0.6048374176025391,
+      "epoch": 1.492063492063492,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.53125,
+      "kl": 0.0014561447314918041,
+      "learning_rate": 8.610707988678503e-07,
+      "loss": 0.0,
+      "num_tokens": 5413105.0,
+      "reward": 0.609375,
+      "reward_std": 0.12387890368700027,
+      "rewards/itbench_correctness/mean": 0.609375,
+      "rewards/itbench_correctness/std": 0.4375000298023224,
+      "step": 282,
+      "step_time": 87.52544206380844
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 871.0,
+      "completions/max_terminated_length": 871.0,
+      "completions/mean_length": 567.25,
+      "completions/mean_terminated_length": 567.25,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.39841338992118835,
+      "epoch": 1.4973544973544972,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1328125,
+      "kl": 0.0016809444641694427,
+      "learning_rate": 8.599250467277483e-07,
+      "loss": -0.1189,
+      "num_tokens": 5431333.0,
+      "reward": 0.3125,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 283,
+      "step_time": 285.17205636110157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 790.9375,
+      "completions/mean_terminated_length": 651.1000366210938,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 0.3767680823802948,
+      "epoch": 1.5026455026455028,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.59375,
+      "kl": 0.001456994330510497,
+      "learning_rate": 8.587753585050004e-07,
+      "loss": -0.0216,
+      "num_tokens": 5450876.0,
+      "reward": 0.03125,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.03125,
+      "rewards/itbench_correctness/std": 0.125,
+      "step": 284,
+      "step_time": 7258.076673376374
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 711.0,
+      "completions/max_terminated_length": 711.0,
+      "completions/mean_length": 572.8125,
+      "completions/mean_terminated_length": 572.8125,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "entropy": 0.36661210656166077,
+      "epoch": 1.507936507936508,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5625,
+      "kl": 0.001243554288521409,
+      "learning_rate": 8.576217467724127e-07,
+      "loss": -0.0147,
+      "num_tokens": 5463649.0,
+      "reward": 0.875,
+      "reward_std": 0.2177756428718567,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.22360680997371674,
+      "step": 285,
+      "step_time": 65.26539001893252
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 785.0,
+      "completions/mean_length": 674.0,
+      "completions/mean_terminated_length": 464.0,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.4094955623149872,
+      "epoch": 1.5132275132275133,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4375,
+      "kl": 0.0017368828412145376,
+      "learning_rate": 8.564642241456986e-07,
+      "loss": 0.0128,
+      "num_tokens": 5481657.0,
+      "reward": 0.1640625,
+      "reward_std": 0.06629125773906708,
+      "rewards/itbench_correctness/mean": 0.1640625,
+      "rewards/itbench_correctness/std": 0.19213032722473145,
+      "step": 286,
+      "step_time": 1010.0560459299013
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 539.0,
+      "completions/mean_length": 677.5625,
+      "completions/mean_terminated_length": 469.70001220703125,
+      "completions/min_length": 398.0,
+      "completions/min_terminated_length": 398.0,
+      "entropy": 0.30107924342155457,
+      "epoch": 1.5185185185185186,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1953125,
+      "kl": 0.0014942490961402655,
+      "learning_rate": 8.553028032833396e-07,
+      "loss": 0.0399,
+      "num_tokens": 5500474.0,
+      "reward": 0.34375,
+      "reward_std": 0.12938730418682098,
+      "rewards/itbench_correctness/mean": 0.34375,
+      "rewards/itbench_correctness/std": 0.3966001570224762,
+      "step": 287,
+      "step_time": 967.4249309562147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 980.0,
+      "completions/mean_length": 861.8125,
+      "completions/mean_terminated_length": 851.0000610351562,
+      "completions/min_length": 747.0,
+      "completions/min_terminated_length": 747.0,
+      "entropy": 0.3689897656440735,
+      "epoch": 1.5238095238095237,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.001182331470772624,
+      "learning_rate": 8.541374968864485e-07,
+      "loss": 0.0161,
+      "num_tokens": 5520511.0,
+      "reward": 0.6545138955116272,
+      "reward_std": 0.2626494765281677,
+      "rewards/itbench_correctness/mean": 0.6545138955116272,
+      "rewards/itbench_correctness/std": 0.284541517496109,
+      "step": 288,
+      "step_time": 416.6262904284522
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 753.5625,
+      "completions/mean_terminated_length": 591.2999877929688,
+      "completions/min_length": 448.0,
+      "completions/min_terminated_length": 448.0,
+      "entropy": 0.6183959245681763,
+      "epoch": 1.529100529100529,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8359375,
+      "kl": 0.0017281738109886646,
+      "learning_rate": 8.529683176986295e-07,
+      "loss": 0.0359,
+      "num_tokens": 5542376.0,
+      "reward": 0.5255681872367859,
+      "reward_std": 0.07393435388803482,
+      "rewards/itbench_correctness/mean": 0.5255681872367859,
+      "rewards/itbench_correctness/std": 0.48524191975593567,
+      "step": 289,
+      "step_time": 100.21258049272001
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 867.0,
+      "completions/mean_length": 957.125,
+      "completions/mean_terminated_length": 756.5,
+      "completions/min_length": 689.0,
+      "completions/min_terminated_length": 689.0,
+      "entropy": 0.5600104331970215,
+      "epoch": 1.5343915343915344,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.224609375,
+      "kl": 0.002044468652456999,
+      "learning_rate": 8.517952785058384e-07,
+      "loss": 0.0001,
+      "num_tokens": 5578490.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 290,
+      "step_time": 204.50664361845702
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 846.8125,
+      "completions/mean_terminated_length": 805.923095703125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.49125397205352783,
+      "epoch": 1.5396825396825395,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.625,
+      "kl": 0.001689074793830514,
+      "learning_rate": 8.506183921362442e-07,
+      "loss": -0.1079,
+      "num_tokens": 5598255.0,
+      "reward": 0.7552083730697632,
+      "reward_std": 0.3175256550312042,
+      "rewards/itbench_correctness/mean": 0.7552083730697632,
+      "rewards/itbench_correctness/std": 0.3325946629047394,
+      "step": 291,
+      "step_time": 141.62990444898605
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 697.3125,
+      "completions/mean_terminated_length": 443.22222900390625,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 0.5621582865715027,
+      "epoch": 1.544973544973545,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.140625,
+      "kl": 0.0017327865352854133,
+      "learning_rate": 8.494376714600877e-07,
+      "loss": -0.0076,
+      "num_tokens": 5618796.0,
+      "reward": 0.21250000596046448,
+      "reward_std": 0.02314549870789051,
+      "rewards/itbench_correctness/mean": 0.21250000596046448,
+      "rewards/itbench_correctness/std": 0.22173558175563812,
+      "step": 292,
+      "step_time": 116.08490148931742
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 948.0,
+      "completions/mean_length": 701.75,
+      "completions/mean_terminated_length": 680.2667236328125,
+      "completions/min_length": 507.0,
+      "completions/min_terminated_length": 507.0,
+      "entropy": 0.36052724719047546,
+      "epoch": 1.5502645502645502,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3203125,
+      "kl": 0.0012737837387248874,
+      "learning_rate": 8.48253129389541e-07,
+      "loss": 0.0105,
+      "num_tokens": 5634952.0,
+      "reward": 0.10546875,
+      "reward_std": 0.06008155643939972,
+      "rewards/itbench_correctness/mean": 0.10546875,
+      "rewards/itbench_correctness/std": 0.1363947093486786,
+      "step": 293,
+      "step_time": 391.972909046337
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 706.0,
+      "completions/max_terminated_length": 706.0,
+      "completions/mean_length": 493.25,
+      "completions/mean_terminated_length": 493.25,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4176381230354309,
+      "epoch": 1.5555555555555556,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3125,
+      "kl": 0.0017929230816662312,
+      "learning_rate": 8.470647788785664e-07,
+      "loss": -0.0618,
+      "num_tokens": 5646612.0,
+      "reward": 0.671875,
+      "reward_std": 0.2810920476913452,
+      "rewards/itbench_correctness/mean": 0.671875,
+      "rewards/itbench_correctness/std": 0.3502231538295746,
+      "step": 294,
+      "step_time": 417.62054439727217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 522.0,
+      "completions/max_terminated_length": 522.0,
+      "completions/mean_length": 440.1875,
+      "completions/mean_terminated_length": 440.1875,
+      "completions/min_length": 383.0,
+      "completions/min_terminated_length": 383.0,
+      "entropy": 0.3566662073135376,
+      "epoch": 1.560846560846561,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.296875,
+      "kl": 0.0015034499811008573,
+      "learning_rate": 8.458726329227747e-07,
+      "loss": 0.0048,
+      "num_tokens": 5656511.0,
+      "reward": 0.7447916865348816,
+      "reward_std": 0.11666134744882584,
+      "rewards/itbench_correctness/mean": 0.7447916865348816,
+      "rewards/itbench_correctness/std": 0.17864912748336792,
+      "step": 295,
+      "step_time": 839.189504972659
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 840.0,
+      "completions/max_terminated_length": 840.0,
+      "completions/mean_length": 655.875,
+      "completions/mean_terminated_length": 655.875,
+      "completions/min_length": 488.0,
+      "completions/min_terminated_length": 488.0,
+      "entropy": 0.5641318559646606,
+      "epoch": 1.566137566137566,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1953125,
+      "kl": 0.002510908292606473,
+      "learning_rate": 8.446767045592829e-07,
+      "loss": 0.0185,
+      "num_tokens": 5680541.0,
+      "reward": 0.265625,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.265625,
+      "rewards/itbench_correctness/std": 0.28090256452560425,
+      "step": 296,
+      "step_time": 371.1978498548269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 985.0,
+      "completions/mean_length": 673.5,
+      "completions/mean_terminated_length": 650.1333618164062,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 0.5404602885246277,
+      "epoch": 1.5714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.828125,
+      "kl": 0.001535046030767262,
+      "learning_rate": 8.434770068665722e-07,
+      "loss": 0.0315,
+      "num_tokens": 5695821.0,
+      "reward": 0.5397727489471436,
+      "reward_std": 0.29765012860298157,
+      "rewards/itbench_correctness/mean": 0.5397727489471436,
+      "rewards/itbench_correctness/std": 0.45329374074935913,
+      "step": 297,
+      "step_time": 163.53786495421082
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 764.0625,
+      "completions/mean_terminated_length": 677.4166870117188,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 0.48425358533859253,
+      "epoch": 1.5767195767195767,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9609375,
+      "kl": 0.001875088200904429,
+      "learning_rate": 8.422735529643443e-07,
+      "loss": -0.0003,
+      "num_tokens": 5711494.0,
+      "reward": 0.390625,
+      "reward_std": 0.0794283002614975,
+      "rewards/itbench_correctness/mean": 0.390625,
+      "rewards/itbench_correctness/std": 0.3492303192615509,
+      "step": 298,
+      "step_time": 102.2513699810952
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 646.0,
+      "completions/mean_length": 742.1875,
+      "completions/mean_terminated_length": 523.0,
+      "completions/min_length": 472.0,
+      "completions/min_terminated_length": 472.0,
+      "entropy": 0.36648422479629517,
+      "epoch": 1.5820105820105819,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0281982421875,
+      "kl": 0.0010558166541159153,
+      "learning_rate": 8.410663560133783e-07,
+      "loss": 0.0,
+      "num_tokens": 5738233.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 299,
+      "step_time": 170.24059600010514
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 829.0,
+      "completions/mean_length": 844.5625,
+      "completions/mean_terminated_length": 705.0,
+      "completions/min_length": 596.0,
+      "completions/min_terminated_length": 596.0,
+      "entropy": 0.5233479142189026,
+      "epoch": 1.5873015873015874,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.015625,
+      "kl": 0.001366324140690267,
+      "learning_rate": 8.398554292153865e-07,
+      "loss": 0.0093,
+      "num_tokens": 5760042.0,
+      "reward": 0.4791666865348816,
+      "reward_std": 0.16512766480445862,
+      "rewards/itbench_correctness/mean": 0.4791666865348816,
+      "rewards/itbench_correctness/std": 0.3019995093345642,
+      "step": 300,
+      "step_time": 285.300213762559
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 851.0,
+      "completions/mean_length": 753.6875,
+      "completions/mean_terminated_length": 543.4444580078125,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "entropy": 0.536031186580658,
+      "epoch": 1.5925925925925926,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5625,
+      "kl": 0.0015036487020552158,
+      "learning_rate": 8.386407858128706e-07,
+      "loss": 0.0005,
+      "num_tokens": 5783901.0,
+      "reward": 0.28437501192092896,
+      "reward_std": 0.07841908931732178,
+      "rewards/itbench_correctness/mean": 0.28437501192092896,
+      "rewards/itbench_correctness/std": 0.14226588606834412,
+      "step": 301,
+      "step_time": 148.27129491977394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 792.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 530.0625,
+      "completions/mean_terminated_length": 530.0625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.42825138568878174,
+      "epoch": 1.597883597883598,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3125,
+      "kl": 0.0016496152384206653,
+      "learning_rate": 8.374224390889759e-07,
+      "loss": -0.0793,
+      "num_tokens": 5802318.0,
+      "reward": 0.296875,
+      "reward_std": 0.13962560892105103,
+      "rewards/itbench_correctness/mean": 0.296875,
+      "rewards/itbench_correctness/std": 0.1434326171875,
+      "step": 302,
+      "step_time": 129.92639573384076
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 964.0,
+      "completions/mean_length": 1009.8125,
+      "completions/mean_terminated_length": 948.3333740234375,
+      "completions/min_length": 926.0,
+      "completions/min_terminated_length": 926.0,
+      "entropy": 0.43770501017570496,
+      "epoch": 1.6031746031746033,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.1875,
+      "kl": 0.0010520732030272484,
+      "learning_rate": 8.362004023673472e-07,
+      "loss": 0.0025,
+      "num_tokens": 5825731.0,
+      "reward": 0.5,
+      "reward_std": 0.1356339007616043,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.47214052081108093,
+      "step": 303,
+      "step_time": 78.02016614936292
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 812.0,
+      "completions/mean_length": 747.8125,
+      "completions/mean_terminated_length": 582.1000366210938,
+      "completions/min_length": 467.0,
+      "completions/min_terminated_length": 467.0,
+      "entropy": 0.4011700749397278,
+      "epoch": 1.6084656084656084,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1875,
+      "kl": 0.0011779264314100146,
+      "learning_rate": 8.349746890119824e-07,
+      "loss": -0.0198,
+      "num_tokens": 5864376.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 304,
+      "step_time": 941.6569038927555
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 718.0,
+      "completions/max_terminated_length": 718.0,
+      "completions/mean_length": 496.3125,
+      "completions/mean_terminated_length": 496.3125,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "entropy": 0.40297192335128784,
+      "epoch": 1.6137566137566137,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6640625,
+      "kl": 0.00194979936350137,
+      "learning_rate": 8.337453124270862e-07,
+      "loss": -0.0047,
+      "num_tokens": 5884285.0,
+      "reward": 0.5625,
+      "reward_std": 0.3471825420856476,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 305,
+      "step_time": 107.52555268164724
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 926.0,
+      "completions/mean_length": 965.625,
+      "completions/mean_terminated_length": 837.2000122070312,
+      "completions/min_length": 659.0,
+      "completions/min_terminated_length": 659.0,
+      "entropy": 0.41838186979293823,
+      "epoch": 1.619047619047619,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.125,
+      "kl": 0.0014495647046715021,
+      "learning_rate": 8.325122860569241e-07,
+      "loss": -0.0125,
+      "num_tokens": 5910599.0,
+      "reward": 0.5354166626930237,
+      "reward_std": 0.05260828882455826,
+      "rewards/itbench_correctness/mean": 0.5354166626930237,
+      "rewards/itbench_correctness/std": 0.4056031107902527,
+      "step": 306,
+      "step_time": 113.85000483132899
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 630.75,
+      "completions/mean_terminated_length": 574.5714721679688,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 0.3202536702156067,
+      "epoch": 1.6243386243386242,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.078125,
+      "kl": 0.001761075691320002,
+      "learning_rate": 8.312756233856748e-07,
+      "loss": -0.0057,
+      "num_tokens": 5925579.0,
+      "reward": 0.1875,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 307,
+      "step_time": 252.66727325879037
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 994.0,
+      "completions/mean_length": 831.3125,
+      "completions/mean_terminated_length": 681.4444580078125,
+      "completions/min_length": 517.0,
+      "completions/min_terminated_length": 517.0,
+      "entropy": 0.4017743170261383,
+      "epoch": 1.6296296296296298,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.984375,
+      "kl": 0.0010056800674647093,
+      "learning_rate": 8.300353379372833e-07,
+      "loss": 0.0837,
+      "num_tokens": 5954544.0,
+      "reward": 0.5625,
+      "reward_std": 0.4082317352294922,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 308,
+      "step_time": 91.5687418980524
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 589.0,
+      "completions/mean_length": 438.6875,
+      "completions/mean_terminated_length": 399.66668701171875,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.36244478821754456,
+      "epoch": 1.6349206349206349,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.171875,
+      "kl": 0.0014679917367175221,
+      "learning_rate": 8.287914432753123e-07,
+      "loss": -0.1423,
+      "num_tokens": 5971995.0,
+      "reward": 0.78125,
+      "reward_std": 0.2706093192100525,
+      "rewards/itbench_correctness/mean": 0.78125,
+      "rewards/itbench_correctness/std": 0.29007503390312195,
+      "step": 309,
+      "step_time": 103.0185587760061
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 951.5625,
+      "completions/mean_terminated_length": 830.8333740234375,
+      "completions/min_length": 641.0,
+      "completions/min_terminated_length": 641.0,
+      "entropy": 0.5422660112380981,
+      "epoch": 1.6402116402116402,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.0625,
+      "kl": 0.00137800769880414,
+      "learning_rate": 8.275439530027947e-07,
+      "loss": -0.0189,
+      "num_tokens": 5994244.0,
+      "reward": 0.4236111044883728,
+      "reward_std": 0.3206467628479004,
+      "rewards/itbench_correctness/mean": 0.4236111044883728,
+      "rewards/itbench_correctness/std": 0.4552505910396576,
+      "step": 310,
+      "step_time": 408.57210523914546
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 718.8125,
+      "completions/mean_terminated_length": 648.3846435546875,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "entropy": 0.4062255322933197,
+      "epoch": 1.6455026455026456,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6640625,
+      "kl": 0.0012174441944807768,
+      "learning_rate": 8.262928807620843e-07,
+      "loss": 0.0105,
+      "num_tokens": 6010465.0,
+      "reward": 0.25,
+      "reward_std": 0.3745020925998688,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.40824830532073975,
+      "step": 311,
+      "step_time": 475.81261223275214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 839.0,
+      "completions/max_terminated_length": 839.0,
+      "completions/mean_length": 547.4375,
+      "completions/mean_terminated_length": 547.4375,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "entropy": 0.3781253695487976,
+      "epoch": 1.6507936507936507,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5390625,
+      "kl": 0.0014938174281269312,
+      "learning_rate": 8.250382402347064e-07,
+      "loss": 0.0148,
+      "num_tokens": 6023008.0,
+      "reward": 0.7837010025978088,
+      "reward_std": 0.2962387502193451,
+      "rewards/itbench_correctness/mean": 0.7837010025978088,
+      "rewards/itbench_correctness/std": 0.3930458724498749,
+      "step": 312,
+      "step_time": 166.25692852959037
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 740.0,
+      "completions/max_terminated_length": 740.0,
+      "completions/mean_length": 492.6875,
+      "completions/mean_terminated_length": 492.6875,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "entropy": 0.3694024980068207,
+      "epoch": 1.656084656084656,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5546875,
+      "kl": 0.0012153934221714735,
+      "learning_rate": 8.237800451412094e-07,
+      "loss": -0.0061,
+      "num_tokens": 6034795.0,
+      "reward": 0.671875,
+      "reward_std": 0.25043365359306335,
+      "rewards/itbench_correctness/mean": 0.671875,
+      "rewards/itbench_correctness/std": 0.3353670835494995,
+      "step": 313,
+      "step_time": 146.2452635196969
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 831.0,
+      "completions/max_terminated_length": 831.0,
+      "completions/mean_length": 496.75,
+      "completions/mean_terminated_length": 496.75,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "entropy": 0.5234020948410034,
+      "epoch": 1.6613756613756614,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6171875,
+      "kl": 0.0013322219019755721,
+      "learning_rate": 8.225183092410127e-07,
+      "loss": -0.0344,
+      "num_tokens": 6045567.0,
+      "reward": 0.5625,
+      "reward_std": 0.3471825420856476,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 314,
+      "step_time": 998.2186722587794
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 660.6875,
+      "completions/mean_terminated_length": 636.4666748046875,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.44499102234840393,
+      "epoch": 1.6666666666666665,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0257568359375,
+      "kl": 0.001278597628697753,
+      "learning_rate": 8.212530463322582e-07,
+      "loss": 0.0,
+      "num_tokens": 6061834.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 315,
+      "step_time": 1148.7406483720988
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 588.0,
+      "completions/max_terminated_length": 588.0,
+      "completions/mean_length": 454.25,
+      "completions/mean_terminated_length": 454.25,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 0.41607043147087097,
+      "epoch": 1.671957671957672,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.078125,
+      "kl": 0.0012671623844653368,
+      "learning_rate": 8.199842702516582e-07,
+      "loss": 0.0057,
+      "num_tokens": 6072118.0,
+      "reward": 0.953125,
+      "reward_std": 0.13258251547813416,
+      "rewards/itbench_correctness/mean": 0.953125,
+      "rewards/itbench_correctness/std": 0.1875,
+      "step": 316,
+      "step_time": 864.3541559455916
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 723.0,
+      "completions/max_terminated_length": 723.0,
+      "completions/mean_length": 566.625,
+      "completions/mean_terminated_length": 566.625,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "entropy": 0.4729759395122528,
+      "epoch": 1.6772486772486772,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.703125,
+      "kl": 0.001966112991794944,
+      "learning_rate": 8.187119948743449e-07,
+      "loss": 0.0121,
+      "num_tokens": 6086384.0,
+      "reward": 0.6979166269302368,
+      "reward_std": 0.3103903532028198,
+      "rewards/itbench_correctness/mean": 0.6979166269302368,
+      "rewards/itbench_correctness/std": 0.42259669303894043,
+      "step": 317,
+      "step_time": 80.52064239047468
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 691.0,
+      "completions/mean_length": 786.5,
+      "completions/mean_terminated_length": 549.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4755244851112366,
+      "epoch": 1.6825396825396826,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.494140625,
+      "kl": 0.0016143879620358348,
+      "learning_rate": 8.174362341137176e-07,
+      "loss": -0.0945,
+      "num_tokens": 6105360.0,
+      "reward": 0.9375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 318,
+      "step_time": 496.4877818999812
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1002.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 753.0,
+      "completions/mean_terminated_length": 753.0,
+      "completions/min_length": 557.0,
+      "completions/min_terminated_length": 557.0,
+      "entropy": 0.2895086407661438,
+      "epoch": 1.687830687830688,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.0012620992492884398,
+      "learning_rate": 8.16157001921292e-07,
+      "loss": -0.0084,
+      "num_tokens": 6123976.0,
+      "reward": 0.8125,
+      "reward_std": 0.1462520956993103,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.19364917278289795,
+      "step": 319,
+      "step_time": 96.52025901339948
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 1001.3125,
+      "completions/mean_terminated_length": 903.0,
+      "completions/min_length": 728.0,
+      "completions/min_terminated_length": 728.0,
+      "entropy": 0.39947569370269775,
+      "epoch": 1.693121693121693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.59375,
+      "kl": 0.0011935190996155143,
+      "learning_rate": 8.148743122865463e-07,
+      "loss": -0.0069,
+      "num_tokens": 6166669.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 320,
+      "step_time": 7083.372859461233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 981.0,
+      "completions/mean_length": 675.4375,
+      "completions/mean_terminated_length": 652.2000122070312,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "entropy": 0.37753307819366455,
+      "epoch": 1.6984126984126984,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0014044505078345537,
+      "learning_rate": 8.135881792367685e-07,
+      "loss": -0.0313,
+      "num_tokens": 6182420.0,
+      "reward": 0.5208333730697632,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5208333730697632,
+      "rewards/itbench_correctness/std": 0.48638883233070374,
+      "step": 321,
+      "step_time": 248.25969803985208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 685.5,
+      "completions/mean_terminated_length": 572.6666870117188,
+      "completions/min_length": 274.0,
+      "completions/min_terminated_length": 274.0,
+      "entropy": 0.6068562865257263,
+      "epoch": 1.7037037037037037,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.0625,
+      "kl": 0.0015854442026466131,
+      "learning_rate": 8.122986168369039e-07,
+      "loss": -0.0155,
+      "num_tokens": 6206140.0,
+      "reward": 0.234375,
+      "reward_std": 0.3006556034088135,
+      "rewards/itbench_correctness/mean": 0.234375,
+      "rewards/itbench_correctness/std": 0.3158157467842102,
+      "step": 322,
+      "step_time": 145.6991236684844
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 628.0,
+      "completions/mean_length": 787.6875,
+      "completions/mean_terminated_length": 551.375,
+      "completions/min_length": 476.0,
+      "completions/min_terminated_length": 476.0,
+      "entropy": 0.4164087772369385,
+      "epoch": 1.7089947089947088,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.59375,
+      "kl": 0.0013617142103612423,
+      "learning_rate": 8.110056391894003e-07,
+      "loss": 0.0,
+      "num_tokens": 6227303.0,
+      "reward": 0.5520833134651184,
+      "reward_std": 0.06200198456645012,
+      "rewards/itbench_correctness/mean": 0.5520833134651184,
+      "rewards/itbench_correctness/std": 0.4702983796596527,
+      "step": 323,
+      "step_time": 140.916482466273
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 824.0,
+      "completions/max_terminated_length": 824.0,
+      "completions/mean_length": 699.0,
+      "completions/mean_terminated_length": 699.0,
+      "completions/min_length": 592.0,
+      "completions/min_terminated_length": 592.0,
+      "entropy": 0.3690987229347229,
+      "epoch": 1.7142857142857144,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.59375,
+      "kl": 0.0010593783808872104,
+      "learning_rate": 8.097092604340541e-07,
+      "loss": 0.0083,
+      "num_tokens": 6244119.0,
+      "reward": 0.84375,
+      "reward_std": 0.14777101576328278,
+      "rewards/itbench_correctness/mean": 0.84375,
+      "rewards/itbench_correctness/std": 0.16720746457576752,
+      "step": 324,
+      "step_time": 67.42365125380456
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 919.0,
+      "completions/mean_length": 972.25,
+      "completions/mean_terminated_length": 748.0,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "entropy": 0.5759835243225098,
+      "epoch": 1.7195767195767195,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.06103515625,
+      "kl": 0.001520626712590456,
+      "learning_rate": 8.084094947478554e-07,
+      "loss": 0.0001,
+      "num_tokens": 6288875.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 325,
+      "step_time": 162.16153999976814
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 785.0,
+      "completions/mean_length": 747.1875,
+      "completions/mean_terminated_length": 621.3636474609375,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.40150564908981323,
+      "epoch": 1.7248677248677249,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0517578125,
+      "kl": 0.0012618422042578459,
+      "learning_rate": 8.071063563448339e-07,
+      "loss": 0.0,
+      "num_tokens": 6307974.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 1.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 326,
+      "step_time": 658.432337153703
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 972.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 683.375,
+      "completions/mean_terminated_length": 683.375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4097311198711395,
+      "epoch": 1.7301587301587302,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5546875,
+      "kl": 0.0016633245395496488,
+      "learning_rate": 8.057998594759022e-07,
+      "loss": -0.0924,
+      "num_tokens": 6323180.0,
+      "reward": 0.45098039507865906,
+      "reward_std": 0.24446815252304077,
+      "rewards/itbench_correctness/mean": 0.45098039507865906,
+      "rewards/itbench_correctness/std": 0.37708234786987305,
+      "step": 327,
+      "step_time": 183.20953813474625
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 992.0625,
+      "completions/mean_terminated_length": 938.8333740234375,
+      "completions/min_length": 773.0,
+      "completions/min_terminated_length": 773.0,
+      "entropy": 0.4838404953479767,
+      "epoch": 1.7354497354497354,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.203125,
+      "kl": 0.0014802846126258373,
+      "learning_rate": 8.044900184287006e-07,
+      "loss": -0.0402,
+      "num_tokens": 6345661.0,
+      "reward": 0.7080707550048828,
+      "reward_std": 0.3341723680496216,
+      "rewards/itbench_correctness/mean": 0.7080707550048828,
+      "rewards/itbench_correctness/std": 0.33529266715049744,
+      "step": 328,
+      "step_time": 849.2058257460594
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 852.4375,
+      "completions/mean_terminated_length": 827.9285888671875,
+      "completions/min_length": 456.0,
+      "completions/min_terminated_length": 456.0,
+      "entropy": 0.4762812554836273,
+      "epoch": 1.7407407407407407,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8046875,
+      "kl": 0.0015186754753813148,
+      "learning_rate": 8.031768475274412e-07,
+      "loss": 0.0195,
+      "num_tokens": 6375004.0,
+      "reward": 0.4166666865348816,
+      "reward_std": 0.3177001476287842,
+      "rewards/itbench_correctness/mean": 0.4166666865348816,
+      "rewards/itbench_correctness/std": 0.42163705825805664,
+      "step": 329,
+      "step_time": 127.72529877442867
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 600.0,
+      "completions/mean_length": 461.375,
+      "completions/mean_terminated_length": 423.86669921875,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.353291779756546,
+      "epoch": 1.746031746031746,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.125,
+      "kl": 0.0018113987753167748,
+      "learning_rate": 8.018603611327504e-07,
+      "loss": -0.0122,
+      "num_tokens": 6384650.0,
+      "reward": 0.53125,
+      "reward_std": 0.0578637570142746,
+      "rewards/itbench_correctness/mean": 0.53125,
+      "rewards/itbench_correctness/std": 0.08539126068353653,
+      "step": 330,
+      "step_time": 139.3600283851847
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 869.3125,
+      "completions/mean_terminated_length": 611.5,
+      "completions/min_length": 427.0,
+      "completions/min_terminated_length": 427.0,
+      "entropy": 0.39111366868019104,
+      "epoch": 1.7513227513227512,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8046875,
+      "kl": 0.0013434779830276966,
+      "learning_rate": 8.005405736415125e-07,
+      "loss": -0.0235,
+      "num_tokens": 6410383.0,
+      "reward": 0.6458333730697632,
+      "reward_std": 0.37862008810043335,
+      "rewards/itbench_correctness/mean": 0.6458333730697632,
+      "rewards/itbench_correctness/std": 0.36704525351524353,
+      "step": 331,
+      "step_time": 760.4793853284791
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 822.4375,
+      "completions/mean_terminated_length": 730.8181762695312,
+      "completions/min_length": 566.0,
+      "completions/min_terminated_length": 566.0,
+      "entropy": 0.2966790795326233,
+      "epoch": 1.7566137566137567,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.734375,
+      "kl": 0.0013310646172612906,
+      "learning_rate": 7.992174994867123e-07,
+      "loss": 0.0379,
+      "num_tokens": 6431670.0,
+      "reward": 0.4851190745830536,
+      "reward_std": 0.19627538323402405,
+      "rewards/itbench_correctness/mean": 0.4851190745830536,
+      "rewards/itbench_correctness/std": 0.2754608690738678,
+      "step": 332,
+      "step_time": 100.20586761180311
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 902.0,
+      "completions/max_terminated_length": 902.0,
+      "completions/mean_length": 540.0,
+      "completions/mean_terminated_length": 540.0,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "entropy": 0.5,
+      "epoch": 1.7619047619047619,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.640625,
+      "kl": 0.0020073342602699995,
+      "learning_rate": 7.978911531372764e-07,
+      "loss": -0.007,
+      "num_tokens": 6452334.0,
+      "reward": 0.25,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 333,
+      "step_time": 130.50346516724676
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 982.0,
+      "completions/mean_length": 894.25,
+      "completions/mean_terminated_length": 727.4285888671875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5636007785797119,
+      "epoch": 1.7671957671957672,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.03125,
+      "kl": 0.0019210126483812928,
+      "learning_rate": 7.965615490979163e-07,
+      "loss": -0.0575,
+      "num_tokens": 6485010.0,
+      "reward": 0.3125,
+      "reward_std": 0.3924052119255066,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 334,
+      "step_time": 287.8023096676916
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 883.0,
+      "completions/mean_length": 860.6875,
+      "completions/mean_terminated_length": 697.375,
+      "completions/min_length": 606.0,
+      "completions/min_terminated_length": 606.0,
+      "entropy": 0.5019243359565735,
+      "epoch": 1.7724867724867726,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3203125,
+      "kl": 0.0012930968077853322,
+      "learning_rate": 7.952287019089685e-07,
+      "loss": 0.001,
+      "num_tokens": 6506909.0,
+      "reward": 0.956250011920929,
+      "reward_std": 0.086344413459301,
+      "rewards/itbench_correctness/mean": 0.956250011920929,
+      "rewards/itbench_correctness/std": 0.1263262927532196,
+      "step": 335,
+      "step_time": 81.09772168658674
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 778.0,
+      "completions/max_terminated_length": 778.0,
+      "completions/mean_length": 642.625,
+      "completions/mean_terminated_length": 642.625,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.38280490040779114,
+      "epoch": 1.7777777777777777,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.025146484375,
+      "kl": 0.0013457894092425704,
+      "learning_rate": 7.938926261462365e-07,
+      "loss": 0.0,
+      "num_tokens": 6521703.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 1.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 336,
+      "step_time": 95.01154231280088
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 756.6875,
+      "completions/mean_terminated_length": 548.7777709960938,
+      "completions/min_length": 356.0,
+      "completions/min_terminated_length": 356.0,
+      "entropy": 0.306599497795105,
+      "epoch": 1.783068783068783,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8046875,
+      "kl": 0.0012715982738882303,
+      "learning_rate": 7.925533364208308e-07,
+      "loss": -0.0117,
+      "num_tokens": 6541690.0,
+      "reward": 0.25,
+      "reward_std": 0.4355512857437134,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 337,
+      "step_time": 142.65012488793582
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 985.0,
+      "completions/mean_length": 928.625,
+      "completions/mean_terminated_length": 769.6666870117188,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3445954918861389,
+      "epoch": 1.7883597883597884,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.0012255455367267132,
+      "learning_rate": 7.912108473790091e-07,
+      "loss": -0.0833,
+      "num_tokens": 6563700.0,
+      "reward": 0.36250001192092896,
+      "reward_std": 0.22638463973999023,
+      "rewards/itbench_correctness/mean": 0.36250001192092896,
+      "rewards/itbench_correctness/std": 0.4856267273426056,
+      "step": 338,
+      "step_time": 146.54249787330627
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 711.0,
+      "completions/max_terminated_length": 711.0,
+      "completions/mean_length": 537.0625,
+      "completions/mean_terminated_length": 537.0625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.333294540643692,
+      "epoch": 1.7936507936507935,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.140625,
+      "kl": 0.0017877722857519984,
+      "learning_rate": 7.898651737020166e-07,
+      "loss": -0.092,
+      "num_tokens": 6577333.0,
+      "reward": 0.875568151473999,
+      "reward_std": 0.17176605761051178,
+      "rewards/itbench_correctness/mean": 0.875568151473999,
+      "rewards/itbench_correctness/std": 0.24365714192390442,
+      "step": 339,
+      "step_time": 63.54152914788574
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 745.0,
+      "completions/max_terminated_length": 745.0,
+      "completions/mean_length": 494.625,
+      "completions/mean_terminated_length": 494.625,
+      "completions/min_length": 288.0,
+      "completions/min_terminated_length": 288.0,
+      "entropy": 0.414455384016037,
+      "epoch": 1.798941798941799,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.59375,
+      "kl": 0.0016418452141806483,
+      "learning_rate": 7.88516330105925e-07,
+      "loss": 0.0023,
+      "num_tokens": 6588839.0,
+      "reward": 0.46875,
+      "reward_std": 0.23779192566871643,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.23935678601264954,
+      "step": 340,
+      "step_time": 421.9616943122819
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 895.0,
+      "completions/mean_length": 897.6875,
+      "completions/mean_terminated_length": 771.375,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "entropy": 0.4611849784851074,
+      "epoch": 1.8042328042328042,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3984375,
+      "kl": 0.0015902061713859439,
+      "learning_rate": 7.871643313414718e-07,
+      "loss": 0.0001,
+      "num_tokens": 6620194.0,
+      "reward": 0.4583333432674408,
+      "reward_std": 0.07715166360139847,
+      "rewards/itbench_correctness/mean": 0.4583333432674408,
+      "rewards/itbench_correctness/std": 0.4849589467048645,
+      "step": 341,
+      "step_time": 85.41169494390488
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 788.5625,
+      "completions/mean_terminated_length": 710.0833740234375,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "entropy": 0.5148609280586243,
+      "epoch": 1.8095238095238095,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7890625,
+      "kl": 0.0013109652791172266,
+      "learning_rate": 7.858091921938987e-07,
+      "loss": 0.0209,
+      "num_tokens": 6637355.0,
+      "reward": 0.49609375,
+      "reward_std": 0.31232208013534546,
+      "rewards/itbench_correctness/mean": 0.49609375,
+      "rewards/itbench_correctness/std": 0.42540958523750305,
+      "step": 342,
+      "step_time": 104.21269215922803
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 819.0,
+      "completions/mean_length": 768.9375,
+      "completions/mean_terminated_length": 513.875,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.3641388416290283,
+      "epoch": 1.8148148148148149,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1640625,
+      "kl": 0.0012282090028747916,
+      "learning_rate": 7.844509274827906e-07,
+      "loss": 0.0,
+      "num_tokens": 6656498.0,
+      "reward": 0.84375,
+      "reward_std": 0.08258593082427979,
+      "rewards/itbench_correctness/mean": 0.84375,
+      "rewards/itbench_correctness/std": 0.19690898060798645,
+      "step": 343,
+      "step_time": 250.37473237421364
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 728.8125,
+      "completions/mean_terminated_length": 551.7000122070312,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.4253494441509247,
+      "epoch": 1.82010582010582,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.09375,
+      "kl": 0.001717406790703535,
+      "learning_rate": 7.830895520619128e-07,
+      "loss": 0.0331,
+      "num_tokens": 6673527.0,
+      "reward": 0.3839285671710968,
+      "reward_std": 0.24145764112472534,
+      "rewards/itbench_correctness/mean": 0.3839285671710968,
+      "rewards/itbench_correctness/std": 0.41063666343688965,
+      "step": 344,
+      "step_time": 111.02232545148581
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 734.6875,
+      "completions/mean_terminated_length": 509.6666564941406,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "entropy": 0.43555933237075806,
+      "epoch": 1.8253968253968254,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04296875,
+      "kl": 0.001631794380955398,
+      "learning_rate": 7.817250808190483e-07,
+      "loss": 0.0,
+      "num_tokens": 6696034.0,
+      "reward": 0.4285714328289032,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.4285714328289032,
+      "rewards/itbench_correctness/std": 0.4426266849040985,
+      "step": 345,
+      "step_time": 991.8486757231876
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 815.0,
+      "completions/max_terminated_length": 815.0,
+      "completions/mean_length": 619.3125,
+      "completions/mean_terminated_length": 619.3125,
+      "completions/min_length": 426.0,
+      "completions/min_terminated_length": 426.0,
+      "entropy": 0.4714905619621277,
+      "epoch": 1.8306878306878307,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0546875,
+      "kl": 0.001351330429315567,
+      "learning_rate": 7.803575286758363e-07,
+      "loss": 0.0057,
+      "num_tokens": 6714223.0,
+      "reward": 0.875,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.22360680997371674,
+      "step": 346,
+      "step_time": 114.43643134180456
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 865.0,
+      "completions/max_terminated_length": 865.0,
+      "completions/mean_length": 641.6875,
+      "completions/mean_terminated_length": 641.6875,
+      "completions/min_length": 420.0,
+      "completions/min_terminated_length": 420.0,
+      "entropy": 0.3397292196750641,
+      "epoch": 1.8359788359788358,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.15625,
+      "kl": 0.000967301893979311,
+      "learning_rate": 7.789869105876082e-07,
+      "loss": 0.0277,
+      "num_tokens": 6727946.0,
+      "reward": 0.9375,
+      "reward_std": 0.03857583552598953,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.08333335071802139,
+      "step": 347,
+      "step_time": 799.3260576492175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 975.0,
+      "completions/mean_length": 940.0625,
+      "completions/mean_terminated_length": 688.25,
+      "completions/min_length": 11.0,
+      "completions/min_terminated_length": 11.0,
+      "entropy": 0.4978392422199249,
+      "epoch": 1.8412698412698414,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.09375,
+      "kl": 0.00129983713850379,
+      "learning_rate": 7.776132415432232e-07,
+      "loss": -0.0185,
+      "num_tokens": 6748563.0,
+      "reward": 0.2109375,
+      "reward_std": 0.37981581687927246,
+      "rewards/itbench_correctness/mean": 0.2109375,
+      "rewards/itbench_correctness/std": 0.3783702850341797,
+      "step": 348,
+      "step_time": 79.58014123514295
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1000.0,
+      "completions/mean_length": 987.375,
+      "completions/mean_terminated_length": 877.5,
+      "completions/min_length": 806.0,
+      "completions/min_terminated_length": 806.0,
+      "entropy": 0.5266489386558533,
+      "epoch": 1.8465608465608465,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.0,
+      "kl": 0.001141308806836605,
+      "learning_rate": 7.762365365649067e-07,
+      "loss": 0.0426,
+      "num_tokens": 6769345.0,
+      "reward": 0.28125,
+      "reward_std": 0.32512497901916504,
+      "rewards/itbench_correctness/mean": 0.28125,
+      "rewards/itbench_correctness/std": 0.4366062581539154,
+      "step": 349,
+      "step_time": 74.42612945474684
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 600.4375,
+      "completions/mean_terminated_length": 572.2000122070312,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "entropy": 0.2631414532661438,
+      "epoch": 1.8518518518518519,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3984375,
+      "kl": 0.0013918590266257524,
+      "learning_rate": 7.74856810708083e-07,
+      "loss": -0.0166,
+      "num_tokens": 6783368.0,
+      "reward": 0.4817708432674408,
+      "reward_std": 0.2546592652797699,
+      "rewards/itbench_correctness/mean": 0.4817708432674408,
+      "rewards/itbench_correctness/std": 0.2613040506839752,
+      "step": 350,
+      "step_time": 84.01541598606855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 569.0,
+      "completions/max_terminated_length": 569.0,
+      "completions/mean_length": 432.25,
+      "completions/mean_terminated_length": 432.25,
+      "completions/min_length": 334.0,
+      "completions/min_terminated_length": 334.0,
+      "entropy": 0.37478309869766235,
+      "epoch": 1.8571428571428572,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4140625,
+      "kl": 0.0021659955382347107,
+      "learning_rate": 7.734740790612136e-07,
+      "loss": -0.0002,
+      "num_tokens": 6793332.0,
+      "reward": 0.2847222089767456,
+      "reward_std": 0.1159602552652359,
+      "rewards/itbench_correctness/mean": 0.2847222089767456,
+      "rewards/itbench_correctness/std": 0.19016453623771667,
+      "step": 351,
+      "step_time": 52.17074024025351
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 744.5625,
+      "completions/mean_terminated_length": 527.2222290039062,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.44052714109420776,
+      "epoch": 1.8624338624338623,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.625,
+      "kl": 0.0016061851056292653,
+      "learning_rate": 7.720883567456298e-07,
+      "loss": 0.0084,
+      "num_tokens": 6812589.0,
+      "reward": 0.78125,
+      "reward_std": 0.23543904721736908,
+      "rewards/itbench_correctness/mean": 0.78125,
+      "rewards/itbench_correctness/std": 0.39308255910873413,
+      "step": 352,
+      "step_time": 82.14024385716766
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 766.0,
+      "completions/max_terminated_length": 766.0,
+      "completions/mean_length": 459.25,
+      "completions/mean_terminated_length": 459.25,
+      "completions/min_length": 365.0,
+      "completions/min_terminated_length": 365.0,
+      "entropy": 0.5182362794876099,
+      "epoch": 1.8677248677248677,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.75,
+      "kl": 0.0017120328266173601,
+      "learning_rate": 7.706996589153689e-07,
+      "loss": 0.0197,
+      "num_tokens": 6822457.0,
+      "reward": 0.53125,
+      "reward_std": 0.38816186785697937,
+      "rewards/itbench_correctness/mean": 0.53125,
+      "rewards/itbench_correctness/std": 0.4069705307483673,
+      "step": 353,
+      "step_time": 141.23254205007106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 991.625,
+      "completions/mean_terminated_length": 920.4000244140625,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.3872431516647339,
+      "epoch": 1.873015873015873,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2734375,
+      "kl": 0.0011720954207703471,
+      "learning_rate": 7.693080007570083e-07,
+      "loss": -0.0084,
+      "num_tokens": 6850571.0,
+      "reward": 0.03125,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.03125,
+      "rewards/itbench_correctness/std": 0.125,
+      "step": 354,
+      "step_time": 552.564743893221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 813.0,
+      "completions/max_terminated_length": 813.0,
+      "completions/mean_length": 522.0,
+      "completions/mean_terminated_length": 522.0,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "entropy": 0.517241358757019,
+      "epoch": 1.8783068783068781,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5859375,
+      "kl": 0.0022999588400125504,
+      "learning_rate": 7.679133974894982e-07,
+      "loss": -0.0017,
+      "num_tokens": 6861683.0,
+      "reward": 0.40625,
+      "reward_std": 0.2882373631000519,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.42149975895881653,
+      "step": 355,
+      "step_time": 615.0445311861113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 993.8125,
+      "completions/mean_terminated_length": 927.4000244140625,
+      "completions/min_length": 802.0,
+      "completions/min_terminated_length": 802.0,
+      "entropy": 0.4246273934841156,
+      "epoch": 1.8835978835978837,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.890625,
+      "kl": 0.001227955799549818,
+      "learning_rate": 7.665158643639969e-07,
+      "loss": 0.022,
+      "num_tokens": 6892520.0,
+      "reward": 0.1294642835855484,
+      "reward_std": 0.2322283834218979,
+      "rewards/itbench_correctness/mean": 0.1294642835855484,
+      "rewards/itbench_correctness/std": 0.2531687021255493,
+      "step": 356,
+      "step_time": 94.74817245267332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 985.0,
+      "completions/max_terminated_length": 985.0,
+      "completions/mean_length": 760.8125,
+      "completions/mean_terminated_length": 760.8125,
+      "completions/min_length": 506.0,
+      "completions/min_terminated_length": 506.0,
+      "entropy": 0.316766619682312,
+      "epoch": 1.8888888888888888,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7890625,
+      "kl": 0.0009881765581667423,
+      "learning_rate": 7.651154166637024e-07,
+      "loss": 0.0684,
+      "num_tokens": 6910381.0,
+      "reward": 0.7250000238418579,
+      "reward_std": 0.28192007541656494,
+      "rewards/itbench_correctness/mean": 0.7250000238418579,
+      "rewards/itbench_correctness/std": 0.3696845769882202,
+      "step": 357,
+      "step_time": 73.87886378820986
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 956.0,
+      "completions/mean_length": 723.5,
+      "completions/mean_terminated_length": 680.5714721679688,
+      "completions/min_length": 420.0,
+      "completions/min_terminated_length": 420.0,
+      "entropy": 0.43676573038101196,
+      "epoch": 1.8941798941798942,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.25,
+      "kl": 0.00232238182798028,
+      "learning_rate": 7.637120697036865e-07,
+      "loss": -0.0125,
+      "num_tokens": 6932685.0,
+      "reward": 0.9479166865348816,
+      "reward_std": 0.019287927076220512,
+      "rewards/itbench_correctness/mean": 0.9479166865348816,
+      "rewards/itbench_correctness/std": 0.05989960953593254,
+      "step": 358,
+      "step_time": 316.1507151676342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 922.0,
+      "completions/mean_length": 980.3125,
+      "completions/mean_terminated_length": 849.25,
+      "completions/min_length": 761.0,
+      "completions/min_terminated_length": 761.0,
+      "entropy": 0.5059611201286316,
+      "epoch": 1.8994708994708995,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4375,
+      "kl": 0.0012296068016439676,
+      "learning_rate": 7.623058388307268e-07,
+      "loss": 0.0303,
+      "num_tokens": 6960266.0,
+      "reward": 0.015625,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.015625,
+      "rewards/itbench_correctness/std": 0.0625,
+      "step": 359,
+      "step_time": 104.70608714781702
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 735.0,
+      "completions/max_terminated_length": 735.0,
+      "completions/mean_length": 538.4375,
+      "completions/mean_terminated_length": 538.4375,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.3045850396156311,
+      "epoch": 1.9047619047619047,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.75,
+      "kl": 0.001858119503594935,
+      "learning_rate": 7.608967394231386e-07,
+      "loss": 0.0023,
+      "num_tokens": 6972577.0,
+      "reward": 0.8125,
+      "reward_std": 0.2982703447341919,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.3354102075099945,
+      "step": 360,
+      "step_time": 63.012714352458715
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 353.0,
+      "completions/mean_length": 982.0625,
+      "completions/mean_terminated_length": 353.0,
+      "completions/min_length": 353.0,
+      "completions/min_terminated_length": 353.0,
+      "entropy": 0.42563483119010925,
+      "epoch": 1.91005291005291,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.421875,
+      "kl": 0.0017107086023315787,
+      "learning_rate": 7.594847868906076e-07,
+      "loss": -0.0493,
+      "num_tokens": 6998698.0,
+      "reward": 0.6770833730697632,
+      "reward_std": 0.2745841145515442,
+      "rewards/itbench_correctness/mean": 0.6770833730697632,
+      "rewards/itbench_correctness/std": 0.3303687572479248,
+      "step": 361,
+      "step_time": 104.44509523361921
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 735.0,
+      "completions/max_terminated_length": 735.0,
+      "completions/mean_length": 527.0625,
+      "completions/mean_terminated_length": 527.0625,
+      "completions/min_length": 354.0,
+      "completions/min_terminated_length": 354.0,
+      "entropy": 0.46863511204719543,
+      "epoch": 1.9153439153439153,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.390625,
+      "kl": 0.0021068877540528774,
+      "learning_rate": 7.5806999667402e-07,
+      "loss": 0.0109,
+      "num_tokens": 7018539.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 362,
+      "step_time": 93.05886326078326
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 899.0,
+      "completions/max_terminated_length": 899.0,
+      "completions/mean_length": 664.125,
+      "completions/mean_terminated_length": 664.125,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "entropy": 0.41859591007232666,
+      "epoch": 1.9206349206349205,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0654296875,
+      "kl": 0.001816941425204277,
+      "learning_rate": 7.566523842452956e-07,
+      "loss": 0.0,
+      "num_tokens": 7034125.0,
+      "reward": 0.25,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 363,
+      "step_time": 467.18558633420616
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 837.0,
+      "completions/mean_length": 868.125,
+      "completions/mean_terminated_length": 712.25,
+      "completions/min_length": 621.0,
+      "completions/min_terminated_length": 621.0,
+      "entropy": 0.49992799758911133,
+      "epoch": 1.925925925925926,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.6953125,
+      "kl": 0.0011530888732522726,
+      "learning_rate": 7.552319651072163e-07,
+      "loss": 0.0,
+      "num_tokens": 7056823.0,
+      "reward": 0.5546875,
+      "reward_std": 0.09704047441482544,
+      "rewards/itbench_correctness/mean": 0.5546875,
+      "rewards/itbench_correctness/std": 0.3563961982727051,
+      "step": 364,
+      "step_time": 246.65094076655805
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 862.0,
+      "completions/mean_length": 850.6875,
+      "completions/mean_terminated_length": 677.375,
+      "completions/min_length": 563.0,
+      "completions/min_terminated_length": 563.0,
+      "entropy": 0.46785688400268555,
+      "epoch": 1.9312169312169312,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4921875,
+      "kl": 0.001800206839106977,
+      "learning_rate": 7.538087547932584e-07,
+      "loss": 0.0001,
+      "num_tokens": 7084386.0,
+      "reward": 0.6041666865348816,
+      "reward_std": 0.07386711239814758,
+      "rewards/itbench_correctness/mean": 0.6041666865348816,
+      "rewards/itbench_correctness/std": 0.42108768224716187,
+      "step": 365,
+      "step_time": 140.03905525244772
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 775.0,
+      "completions/mean_length": 627.0625,
+      "completions/mean_terminated_length": 600.6000366210938,
+      "completions/min_length": 485.0,
+      "completions/min_terminated_length": 485.0,
+      "entropy": 0.5358317494392395,
+      "epoch": 1.9365079365079365,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.328125,
+      "kl": 0.0013238092651590705,
+      "learning_rate": 7.523827688674219e-07,
+      "loss": 0.0181,
+      "num_tokens": 7100155.0,
+      "reward": 0.65625,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.65625,
+      "rewards/itbench_correctness/std": 0.3966001570224762,
+      "step": 366,
+      "step_time": 214.51375654805452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 813.6875,
+      "completions/mean_terminated_length": 650.1111450195312,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "entropy": 0.7029725909233093,
+      "epoch": 1.9417989417989419,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01507568359375,
+      "kl": 0.0012179253390058875,
+      "learning_rate": 7.509540229240601e-07,
+      "loss": 0.0,
+      "num_tokens": 7147734.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 367,
+      "step_time": 142.35515129286796
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 582.0,
+      "completions/max_terminated_length": 582.0,
+      "completions/mean_length": 393.875,
+      "completions/mean_terminated_length": 393.875,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "entropy": 0.35036495327949524,
+      "epoch": 1.947089947089947,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01324462890625,
+      "kl": 0.0013395985588431358,
+      "learning_rate": 7.495225325877103e-07,
+      "loss": 0.0,
+      "num_tokens": 7157244.0,
+      "reward": 0.0833333358168602,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0833333358168602,
+      "rewards/itbench_correctness/std": 0.08606629818677902,
+      "step": 368,
+      "step_time": 50.80347699671984
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 622.8125,
+      "completions/mean_terminated_length": 530.2307739257812,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 0.34520822763442993,
+      "epoch": 1.9523809523809523,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.734375,
+      "kl": 0.002235216787084937,
+      "learning_rate": 7.480883135129211e-07,
+      "loss": -0.0501,
+      "num_tokens": 7173433.0,
+      "reward": 0.5,
+      "reward_std": 0.4629100561141968,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 369,
+      "step_time": 573.6945302598178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 812.0,
+      "completions/mean_length": 703.75,
+      "completions/mean_terminated_length": 558.1818237304688,
+      "completions/min_length": 367.0,
+      "completions/min_terminated_length": 367.0,
+      "entropy": 0.47744226455688477,
+      "epoch": 1.9576719576719577,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5078125,
+      "kl": 0.0014034686610102654,
+      "learning_rate": 7.466513813840824e-07,
+      "loss": 0.0179,
+      "num_tokens": 7199237.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 370,
+      "step_time": 871.0688090631738
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 721.0,
+      "completions/mean_length": 799.0,
+      "completions/mean_terminated_length": 574.0,
+      "completions/min_length": 479.0,
+      "completions/min_terminated_length": 479.0,
+      "entropy": 0.5306633114814758,
+      "epoch": 1.9629629629629628,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.0,
+      "kl": 0.0018356168875470757,
+      "learning_rate": 7.452117519152541e-07,
+      "loss": 0.0105,
+      "num_tokens": 7218325.0,
+      "reward": 0.21250000596046448,
+      "reward_std": 0.21977336704730988,
+      "rewards/itbench_correctness/mean": 0.21250000596046448,
+      "rewards/itbench_correctness/std": 0.239095538854599,
+      "step": 371,
+      "step_time": 154.47280544694513
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 946.0,
+      "completions/mean_length": 673.0625,
+      "completions/mean_terminated_length": 649.6666870117188,
+      "completions/min_length": 386.0,
+      "completions/min_terminated_length": 386.0,
+      "entropy": 0.3461788594722748,
+      "epoch": 1.9682539682539684,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.671875,
+      "kl": 0.0013453871943056583,
+      "learning_rate": 7.437694408499932e-07,
+      "loss": -0.0357,
+      "num_tokens": 7233958.0,
+      "reward": 0.8125,
+      "reward_std": 0.13908715546131134,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.2713136672973633,
+      "step": 372,
+      "step_time": 89.71556733455509
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 855.0,
+      "completions/max_terminated_length": 855.0,
+      "completions/mean_length": 590.9375,
+      "completions/mean_terminated_length": 590.9375,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.3553675413131714,
+      "epoch": 1.9735449735449735,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.98046875,
+      "kl": 0.0020165969617664814,
+      "learning_rate": 7.423244639611826e-07,
+      "loss": -0.0436,
+      "num_tokens": 7253629.0,
+      "reward": 0.125,
+      "reward_std": 0.10564428567886353,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.19364917278289795,
+      "step": 373,
+      "step_time": 113.8127332655713
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 714.0,
+      "completions/mean_length": 715.875,
+      "completions/mean_terminated_length": 407.75,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5308189392089844,
+      "epoch": 1.9788359788359788,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7734375,
+      "kl": 0.0015742821851745248,
+      "learning_rate": 7.408768370508576e-07,
+      "loss": -0.0489,
+      "num_tokens": 7276483.0,
+      "reward": 0.4375,
+      "reward_std": 0.4082317352294922,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 374,
+      "step_time": 63.292789563536644
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 916.0,
+      "completions/max_terminated_length": 916.0,
+      "completions/mean_length": 612.8125,
+      "completions/mean_terminated_length": 612.8125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.40632331371307373,
+      "epoch": 1.9841269841269842,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.89453125,
+      "kl": 0.0012454271782189608,
+      "learning_rate": 7.394265759500347e-07,
+      "loss": -0.015,
+      "num_tokens": 7296752.0,
+      "reward": 0.40937501192092896,
+      "reward_std": 0.16952534019947052,
+      "rewards/itbench_correctness/mean": 0.40937501192092896,
+      "rewards/itbench_correctness/std": 0.3946385681629181,
+      "step": 375,
+      "step_time": 100.02406275831163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 414.0,
+      "completions/mean_length": 618.0625,
+      "completions/mean_terminated_length": 302.3333435058594,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4271412789821625,
+      "epoch": 1.9894179894179893,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.390625,
+      "kl": 0.0022359276190400124,
+      "learning_rate": 7.379736965185368e-07,
+      "loss": -0.0336,
+      "num_tokens": 7315561.0,
+      "reward": 0.02083333395421505,
+      "reward_std": 0.03857583925127983,
+      "rewards/itbench_correctness/mean": 0.02083333395421505,
+      "rewards/itbench_correctness/std": 0.05692750960588455,
+      "step": 376,
+      "step_time": 119.51777216419578
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 578.0,
+      "completions/mean_length": 737.875,
+      "completions/mean_terminated_length": 451.75,
+      "completions/min_length": 360.0,
+      "completions/min_terminated_length": 360.0,
+      "entropy": 0.4445197284221649,
+      "epoch": 1.9947089947089947,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.234375,
+      "kl": 0.0014217497082427144,
+      "learning_rate": 7.365182146448204e-07,
+      "loss": 0.006,
+      "num_tokens": 7339351.0,
+      "reward": 0.46875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.4989572763442993,
+      "step": 377,
+      "step_time": 101.3526473660022
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 957.8125,
+      "completions/mean_terminated_length": 872.7142944335938,
+      "completions/min_length": 811.0,
+      "completions/min_terminated_length": 811.0,
+      "entropy": 0.5074061751365662,
+      "epoch": 2.0,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4765625,
+      "kl": 0.0012597291497513652,
+      "learning_rate": 7.350601462458024e-07,
+      "loss": 0.0172,
+      "num_tokens": 7379628.0,
+      "reward": 0.40625,
+      "reward_std": 0.08210401982069016,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.43430978059768677,
+      "step": 378,
+      "step_time": 126.26444634236395
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 871.0,
+      "completions/max_terminated_length": 871.0,
+      "completions/mean_length": 574.6875,
+      "completions/mean_terminated_length": 574.6875,
+      "completions/min_length": 448.0,
+      "completions/min_terminated_length": 448.0,
+      "entropy": 0.4402392506599426,
+      "epoch": 2.005291005291005,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0262451171875,
+      "kl": 0.0012956882128491998,
+      "learning_rate": 7.335995072666847e-07,
+      "loss": 0.0,
+      "num_tokens": 7393239.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 379,
+      "step_time": 1022.2069364916533
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 919.0,
+      "completions/mean_length": 945.5625,
+      "completions/mean_terminated_length": 844.7142944335938,
+      "completions/min_length": 713.0,
+      "completions/min_terminated_length": 713.0,
+      "entropy": 0.5710886120796204,
+      "epoch": 2.0105820105820107,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.125,
+      "kl": 0.0013581872917711735,
+      "learning_rate": 7.321363136807818e-07,
+      "loss": 0.0597,
+      "num_tokens": 7419288.0,
+      "reward": 0.49687498807907104,
+      "reward_std": 0.14920906722545624,
+      "rewards/itbench_correctness/mean": 0.49687498807907104,
+      "rewards/itbench_correctness/std": 0.4410097301006317,
+      "step": 380,
+      "step_time": 104.49046333320439
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 859.0,
+      "completions/mean_length": 816.75,
+      "completions/mean_terminated_length": 747.6666870117188,
+      "completions/min_length": 626.0,
+      "completions/min_terminated_length": 626.0,
+      "entropy": 0.38934803009033203,
+      "epoch": 2.015873015873016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.84375,
+      "kl": 0.001262744888663292,
+      "learning_rate": 7.306705814893439e-07,
+      "loss": -0.0128,
+      "num_tokens": 7437700.0,
+      "reward": 0.4375,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.4281744360923767,
+      "step": 381,
+      "step_time": 86.5950373802334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 804.875,
+      "completions/mean_terminated_length": 790.2667236328125,
+      "completions/min_length": 570.0,
+      "completions/min_terminated_length": 570.0,
+      "entropy": 0.47212299704551697,
+      "epoch": 2.0211640211640214,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.484375,
+      "kl": 0.0021258334163576365,
+      "learning_rate": 7.292023267213835e-07,
+      "loss": -0.0265,
+      "num_tokens": 7457698.0,
+      "reward": 0.75,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 382,
+      "step_time": 89.82852033432573
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 909.0,
+      "completions/mean_length": 692.875,
+      "completions/mean_terminated_length": 670.800048828125,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "entropy": 0.26556017994880676,
+      "epoch": 2.0264550264550265,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.625,
+      "kl": 0.0009645685204304755,
+      "learning_rate": 7.277315654334996e-07,
+      "loss": -0.0005,
+      "num_tokens": 7474384.0,
+      "reward": 0.796875,
+      "reward_std": 0.11100947856903076,
+      "rewards/itbench_correctness/mean": 0.796875,
+      "rewards/itbench_correctness/std": 0.1359764039516449,
+      "step": 383,
+      "step_time": 838.6957097211853
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 654.375,
+      "completions/mean_terminated_length": 601.5714721679688,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.39732569456100464,
+      "epoch": 2.0317460317460316,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.796875,
+      "kl": 0.0012180046178400517,
+      "learning_rate": 7.262583137097018e-07,
+      "loss": 0.0288,
+      "num_tokens": 7490990.0,
+      "reward": 0.6614583134651184,
+      "reward_std": 0.30096644163131714,
+      "rewards/itbench_correctness/mean": 0.6614583134651184,
+      "rewards/itbench_correctness/std": 0.2926076054573059,
+      "step": 384,
+      "step_time": 613.9163551460952
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 685.25,
+      "completions/mean_terminated_length": 662.6666870117188,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.41152864694595337,
+      "epoch": 2.037037037037037,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.609375,
+      "kl": 0.0014390156138688326,
+      "learning_rate": 7.247825876612352e-07,
+      "loss": -0.0331,
+      "num_tokens": 7505778.0,
+      "reward": 0.3324652910232544,
+      "reward_std": 0.2620442807674408,
+      "rewards/itbench_correctness/mean": 0.3324652910232544,
+      "rewards/itbench_correctness/std": 0.35357046127319336,
+      "step": 385,
+      "step_time": 686.7393183000386
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 932.0,
+      "completions/mean_length": 706.1875,
+      "completions/mean_terminated_length": 561.727294921875,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "entropy": 0.4871227443218231,
+      "epoch": 2.0423280423280423,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.125,
+      "kl": 0.0018794104689732194,
+      "learning_rate": 7.233044034264033e-07,
+      "loss": 0.0135,
+      "num_tokens": 7523709.0,
+      "reward": 0.9829545617103577,
+      "reward_std": 0.023524951189756393,
+      "rewards/itbench_correctness/mean": 0.9829545617103577,
+      "rewards/itbench_correctness/std": 0.03664661943912506,
+      "step": 386,
+      "step_time": 79.07936265133321
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1012.0,
+      "completions/mean_length": 968.625,
+      "completions/mean_terminated_length": 913.25,
+      "completions/min_length": 798.0,
+      "completions/min_terminated_length": 798.0,
+      "entropy": 0.35927215218544006,
+      "epoch": 2.0476190476190474,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.34375,
+      "kl": 0.001336527056992054,
+      "learning_rate": 7.21823777170392e-07,
+      "loss": -0.0067,
+      "num_tokens": 7549655.0,
+      "reward": 0.5,
+      "reward_std": 0.3729091286659241,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.38490018248558044,
+      "step": 387,
+      "step_time": 140.9922649441287
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 605.0,
+      "completions/mean_length": 678.0625,
+      "completions/mean_terminated_length": 470.5,
+      "completions/min_length": 334.0,
+      "completions/min_terminated_length": 334.0,
+      "entropy": 0.45128583908081055,
+      "epoch": 2.052910052910053,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.328125,
+      "kl": 0.0015049315989017487,
+      "learning_rate": 7.203407250850928e-07,
+      "loss": -0.0119,
+      "num_tokens": 7567664.0,
+      "reward": 0.859375,
+      "reward_std": 0.19408094882965088,
+      "rewards/itbench_correctness/mean": 0.859375,
+      "rewards/itbench_correctness/std": 0.30233466625213623,
+      "step": 388,
+      "step_time": 802.1390054896474
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 900.0,
+      "completions/mean_length": 659.5625,
+      "completions/mean_terminated_length": 635.2667236328125,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 0.36842605471611023,
+      "epoch": 2.058201058201058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0078125,
+      "kl": 0.000988945015706122,
+      "learning_rate": 7.188552633889259e-07,
+      "loss": 0.0073,
+      "num_tokens": 7583641.0,
+      "reward": 0.40625,
+      "reward_std": 0.01767767034471035,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.420267790555954,
+      "step": 389,
+      "step_time": 102.60351053066552
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 906.0,
+      "completions/mean_length": 627.5,
+      "completions/mean_terminated_length": 536.0,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "entropy": 0.4270916283130646,
+      "epoch": 2.0634920634920633,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.0013897939352318645,
+      "learning_rate": 7.173674083266623e-07,
+      "loss": -0.0168,
+      "num_tokens": 7597833.0,
+      "reward": 0.4437499940395355,
+      "reward_std": 0.08210402727127075,
+      "rewards/itbench_correctness/mean": 0.4437499940395355,
+      "rewards/itbench_correctness/std": 0.4718315303325653,
+      "step": 390,
+      "step_time": 941.484922320582
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 739.0,
+      "completions/max_terminated_length": 739.0,
+      "completions/mean_length": 509.375,
+      "completions/mean_terminated_length": 509.375,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "entropy": 0.4750920236110687,
+      "epoch": 2.068783068783069,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.98828125,
+      "kl": 0.001085254829376936,
+      "learning_rate": 7.158771761692464e-07,
+      "loss": -0.0065,
+      "num_tokens": 7608767.0,
+      "reward": 0.6937500238418579,
+      "reward_std": 0.1399936079978943,
+      "rewards/itbench_correctness/mean": 0.6937500238418579,
+      "rewards/itbench_correctness/std": 0.3696281909942627,
+      "step": 391,
+      "step_time": 484.2868151040748
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 686.0,
+      "completions/mean_length": 783.5,
+      "completions/mean_terminated_length": 543.0,
+      "completions/min_length": 427.0,
+      "completions/min_terminated_length": 427.0,
+      "entropy": 0.5437141060829163,
+      "epoch": 2.074074074074074,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9609375,
+      "kl": 0.0014096169034019113,
+      "learning_rate": 7.143845832136187e-07,
+      "loss": 0.0198,
+      "num_tokens": 7629503.0,
+      "reward": 0.390625,
+      "reward_std": 0.35405686497688293,
+      "rewards/itbench_correctness/mean": 0.390625,
+      "rewards/itbench_correctness/std": 0.40019527077674866,
+      "step": 392,
+      "step_time": 247.14675129018724
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 573.0,
+      "completions/max_terminated_length": 573.0,
+      "completions/mean_length": 411.0,
+      "completions/mean_terminated_length": 411.0,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "entropy": 0.4428223967552185,
+      "epoch": 2.0793650793650795,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8203125,
+      "kl": 0.002443905221298337,
+      "learning_rate": 7.128896457825363e-07,
+      "loss": -0.0199,
+      "num_tokens": 7638431.0,
+      "reward": 0.42500001192092896,
+      "reward_std": 0.1060660183429718,
+      "rewards/itbench_correctness/mean": 0.42500001192092896,
+      "rewards/itbench_correctness/std": 0.12382783740758896,
+      "step": 393,
+      "step_time": 81.88051935099065
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 983.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 575.75,
+      "completions/mean_terminated_length": 575.75,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.33174121379852295,
+      "epoch": 2.0846560846560847,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.703125,
+      "kl": 0.002629755064845085,
+      "learning_rate": 7.113923802243956e-07,
+      "loss": -0.0546,
+      "num_tokens": 7652163.0,
+      "reward": 0.3479166626930237,
+      "reward_std": 0.15140824019908905,
+      "rewards/itbench_correctness/mean": 0.3479166626930237,
+      "rewards/itbench_correctness/std": 0.14930394291877747,
+      "step": 394,
+      "step_time": 82.69239473901689
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 954.0,
+      "completions/mean_length": 812.25,
+      "completions/mean_terminated_length": 741.6666870117188,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5195444822311401,
+      "epoch": 2.0899470899470898,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3203125,
+      "kl": 0.0016354058170691133,
+      "learning_rate": 7.098928029130528e-07,
+      "loss": -0.0397,
+      "num_tokens": 7684783.0,
+      "reward": 0.8125,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 395,
+      "step_time": 280.40570612065494
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 868.0,
+      "completions/mean_length": 763.375,
+      "completions/mean_terminated_length": 726.1428833007812,
+      "completions/min_length": 392.0,
+      "completions/min_terminated_length": 392.0,
+      "entropy": 0.5344686508178711,
+      "epoch": 2.0952380952380953,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.0014837183989584446,
+      "learning_rate": 7.083909302476452e-07,
+      "loss": 0.0739,
+      "num_tokens": 7704229.0,
+      "reward": 0.2749999761581421,
+      "reward_std": 0.17728103697299957,
+      "rewards/itbench_correctness/mean": 0.2749999761581421,
+      "rewards/itbench_correctness/std": 0.3732738196849823,
+      "step": 396,
+      "step_time": 85.86852881591767
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 741.0,
+      "completions/max_terminated_length": 741.0,
+      "completions/mean_length": 552.5625,
+      "completions/mean_terminated_length": 552.5625,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "entropy": 0.46148625016212463,
+      "epoch": 2.1005291005291005,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.984375,
+      "kl": 0.0017779659247025847,
+      "learning_rate": 7.068867786524115e-07,
+      "loss": 0.0021,
+      "num_tokens": 7725262.0,
+      "reward": 0.6354166269302368,
+      "reward_std": 0.36623916029930115,
+      "rewards/itbench_correctness/mean": 0.6354166269302368,
+      "rewards/itbench_correctness/std": 0.41373974084854126,
+      "step": 397,
+      "step_time": 108.55369200650603
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 638.5,
+      "completions/mean_terminated_length": 463.2727355957031,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "entropy": 0.46045419573783875,
+      "epoch": 2.105820105820106,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.90625,
+      "kl": 0.0019145008409395814,
+      "learning_rate": 7.053803645765127e-07,
+      "loss": -0.0524,
+      "num_tokens": 7743718.0,
+      "reward": 0.3229166567325592,
+      "reward_std": 0.18293291330337524,
+      "rewards/itbench_correctness/mean": 0.3229166567325592,
+      "rewards/itbench_correctness/std": 0.25783106684684753,
+      "step": 398,
+      "step_time": 124.76364956516773
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 982.0,
+      "completions/mean_length": 718.875,
+      "completions/mean_terminated_length": 481.5555725097656,
+      "completions/min_length": 372.0,
+      "completions/min_terminated_length": 372.0,
+      "entropy": 0.3672404885292053,
+      "epoch": 2.111111111111111,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3125,
+      "kl": 0.001165916328318417,
+      "learning_rate": 7.038717044938518e-07,
+      "loss": 0.0019,
+      "num_tokens": 7766796.0,
+      "reward": 0.4196428656578064,
+      "reward_std": 0.11090338230133057,
+      "rewards/itbench_correctness/mean": 0.4196428656578064,
+      "rewards/itbench_correctness/std": 0.45912888646125793,
+      "step": 399,
+      "step_time": 92.30510796047747
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 742.4375,
+      "completions/mean_terminated_length": 702.2142944335938,
+      "completions/min_length": 472.0,
+      "completions/min_terminated_length": 472.0,
+      "entropy": 0.4337065517902374,
+      "epoch": 2.1164021164021163,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.25,
+      "kl": 0.0014574953820556402,
+      "learning_rate": 7.023608149028936e-07,
+      "loss": -0.0175,
+      "num_tokens": 7783843.0,
+      "reward": 0.8125,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 400,
+      "step_time": 163.5410966835916
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 705.0,
+      "completions/max_terminated_length": 705.0,
+      "completions/mean_length": 410.75,
+      "completions/mean_terminated_length": 410.75,
+      "completions/min_length": 293.0,
+      "completions/min_terminated_length": 293.0,
+      "entropy": 0.41387704014778137,
+      "epoch": 2.121693121693122,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.91796875,
+      "kl": 0.001556264702230692,
+      "learning_rate": 7.008477123264847e-07,
+      "loss": -0.0122,
+      "num_tokens": 7792695.0,
+      "reward": 0.625,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.4281744360923767,
+      "step": 401,
+      "step_time": 68.41044199559838
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 720.375,
+      "completions/mean_terminated_length": 538.2000122070312,
+      "completions/min_length": 329.0,
+      "completions/min_terminated_length": 329.0,
+      "entropy": 0.3942390978336334,
+      "epoch": 2.126984126984127,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5234375,
+      "kl": 0.0016359214205294847,
+      "learning_rate": 6.993324133116725e-07,
+      "loss": 0.026,
+      "num_tokens": 7820917.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 402,
+      "step_time": 80.19952200446278
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 813.4375,
+      "completions/mean_terminated_length": 649.6666870117188,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "entropy": 0.4278140664100647,
+      "epoch": 2.132275132275132,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.125,
+      "kl": 0.0014102754648774862,
+      "learning_rate": 6.978149344295241e-07,
+      "loss": 0.0034,
+      "num_tokens": 7839988.0,
+      "reward": 0.8035714626312256,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.8035714626312256,
+      "rewards/itbench_correctness/std": 0.1907735913991928,
+      "step": 403,
+      "step_time": 899.5258836848661
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 697.0,
+      "completions/max_terminated_length": 697.0,
+      "completions/mean_length": 411.5625,
+      "completions/mean_terminated_length": 411.5625,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.4130599796772003,
+      "epoch": 2.1375661375661377,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.7421875,
+      "kl": 0.0012595870066434145,
+      "learning_rate": 6.962952922749457e-07,
+      "loss": -0.051,
+      "num_tokens": 7853909.0,
+      "reward": 0.39008620381355286,
+      "reward_std": 0.2060610055923462,
+      "rewards/itbench_correctness/mean": 0.39008620381355286,
+      "rewards/itbench_correctness/std": 0.4915003180503845,
+      "step": 404,
+      "step_time": 101.09288472961634
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 870.0,
+      "completions/mean_length": 735.25,
+      "completions/mean_terminated_length": 604.0,
+      "completions/min_length": 469.0,
+      "completions/min_terminated_length": 469.0,
+      "entropy": 0.5277116894721985,
+      "epoch": 2.142857142857143,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.609375,
+      "kl": 0.0016202793922275305,
+      "learning_rate": 6.947735034665001e-07,
+      "loss": 0.0106,
+      "num_tokens": 7879449.0,
+      "reward": 0.9375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 405,
+      "step_time": 199.16461731493473
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 836.0,
+      "completions/max_terminated_length": 836.0,
+      "completions/mean_length": 484.25,
+      "completions/mean_terminated_length": 484.25,
+      "completions/min_length": 282.0,
+      "completions/min_terminated_length": 282.0,
+      "entropy": 0.35105833411216736,
+      "epoch": 2.148148148148148,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8125,
+      "kl": 0.0018317453796043992,
+      "learning_rate": 6.932495846462261e-07,
+      "loss": 0.0256,
+      "num_tokens": 7889989.0,
+      "reward": 0.5866477489471436,
+      "reward_std": 0.1253555417060852,
+      "rewards/itbench_correctness/mean": 0.5866477489471436,
+      "rewards/itbench_correctness/std": 0.4179156720638275,
+      "step": 406,
+      "step_time": 137.3452343745157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 651.0,
+      "completions/max_terminated_length": 651.0,
+      "completions/mean_length": 487.5,
+      "completions/mean_terminated_length": 487.5,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "entropy": 0.38564103841781616,
+      "epoch": 2.1534391534391535,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5859375,
+      "kl": 0.0013662968995049596,
+      "learning_rate": 6.917235524794558e-07,
+      "loss": 0.009,
+      "num_tokens": 7901029.0,
+      "reward": 0.890625,
+      "reward_std": 0.22707363963127136,
+      "rewards/itbench_correctness/mean": 0.890625,
+      "rewards/itbench_correctness/std": 0.22302372753620148,
+      "step": 407,
+      "step_time": 68.24169243406504
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 953.25,
+      "completions/mean_terminated_length": 862.2857666015625,
+      "completions/min_length": 584.0,
+      "completions/min_terminated_length": 584.0,
+      "entropy": 0.36926305294036865,
+      "epoch": 2.1587301587301586,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8359375,
+      "kl": 0.0011503396090120077,
+      "learning_rate": 6.901954236546324e-07,
+      "loss": -0.0107,
+      "num_tokens": 7924657.0,
+      "reward": 0.5640318393707275,
+      "reward_std": 0.34312185645103455,
+      "rewards/itbench_correctness/mean": 0.5640318393707275,
+      "rewards/itbench_correctness/std": 0.37077954411506653,
+      "step": 408,
+      "step_time": 846.7672138344496
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 717.0,
+      "completions/mean_length": 768.3125,
+      "completions/mean_terminated_length": 569.4444580078125,
+      "completions/min_length": 464.0,
+      "completions/min_terminated_length": 464.0,
+      "entropy": 0.41389408707618713,
+      "epoch": 2.164021164021164,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1875,
+      "kl": 0.0032037936616688967,
+      "learning_rate": 6.886652148831279e-07,
+      "loss": 0.0154,
+      "num_tokens": 7947958.0,
+      "reward": 0.1041666716337204,
+      "reward_std": 0.19795581698417664,
+      "rewards/itbench_correctness/mean": 0.1041666716337204,
+      "rewards/itbench_correctness/std": 0.291070818901062,
+      "step": 409,
+      "step_time": 180.43269913457334
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 858.0,
+      "completions/max_terminated_length": 858.0,
+      "completions/mean_length": 513.1875,
+      "completions/mean_terminated_length": 513.1875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.378029465675354,
+      "epoch": 2.1693121693121693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.458984375,
+      "kl": 0.0019971805159002542,
+      "learning_rate": 6.871329428990601e-07,
+      "loss": -0.0628,
+      "num_tokens": 7959761.0,
+      "reward": 0.71875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 410,
+      "step_time": 105.00821590330452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 707.0,
+      "completions/max_terminated_length": 707.0,
+      "completions/mean_length": 519.75,
+      "completions/mean_terminated_length": 519.75,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.4290524423122406,
+      "epoch": 2.1746031746031744,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.037841796875,
+      "kl": 0.0021021170541644096,
+      "learning_rate": 6.855986244591103e-07,
+      "loss": 0.0,
+      "num_tokens": 7976613.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 411,
+      "step_time": 872.8376494199038
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 883.0625,
+      "completions/mean_terminated_length": 773.4444580078125,
+      "completions/min_length": 588.0,
+      "completions/min_terminated_length": 588.0,
+      "entropy": 0.5888597965240479,
+      "epoch": 2.17989417989418,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.7109375,
+      "kl": 0.002118014730513096,
+      "learning_rate": 6.840622763423391e-07,
+      "loss": 0.0001,
+      "num_tokens": 8008030.0,
+      "reward": 0.25,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 412,
+      "step_time": 87.85907210037112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 618.0,
+      "completions/mean_length": 798.0625,
+      "completions/mean_terminated_length": 572.125,
+      "completions/min_length": 490.0,
+      "completions/min_terminated_length": 490.0,
+      "entropy": 0.3759104013442993,
+      "epoch": 2.185185185185185,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.78125,
+      "kl": 0.0015699933283030987,
+      "learning_rate": 6.825239153500029e-07,
+      "loss": 0.0035,
+      "num_tokens": 8027279.0,
+      "reward": 0.4518229365348816,
+      "reward_std": 0.22612185776233673,
+      "rewards/itbench_correctness/mean": 0.4518229365348816,
+      "rewards/itbench_correctness/std": 0.41147592663764954,
+      "step": 413,
+      "step_time": 98.12107760738581
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 596.0,
+      "completions/mean_length": 754.125,
+      "completions/mean_terminated_length": 484.25,
+      "completions/min_length": 362.0,
+      "completions/min_terminated_length": 362.0,
+      "entropy": 0.5648930668830872,
+      "epoch": 2.1904761904761907,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5234375,
+      "kl": 0.0014875817578285933,
+      "learning_rate": 6.809835583053715e-07,
+      "loss": 0.0,
+      "num_tokens": 8047225.0,
+      "reward": 0.6000000238418579,
+      "reward_std": 0.13887301087379456,
+      "rewards/itbench_correctness/mean": 0.6000000238418579,
+      "rewards/itbench_correctness/std": 0.4546060562133789,
+      "step": 414,
+      "step_time": 132.7436649715528
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 491.0,
+      "completions/max_terminated_length": 491.0,
+      "completions/mean_length": 362.0625,
+      "completions/mean_terminated_length": 362.0625,
+      "completions/min_length": 301.0,
+      "completions/min_terminated_length": 301.0,
+      "entropy": 0.4474365711212158,
+      "epoch": 2.195767195767196,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0546875,
+      "kl": 0.0019536821637302637,
+      "learning_rate": 6.794412220535425e-07,
+      "loss": 0.0011,
+      "num_tokens": 8055514.0,
+      "reward": 0.21875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.2561737895011902,
+      "step": 415,
+      "step_time": 80.36343740858138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 892.25,
+      "completions/mean_terminated_length": 760.5,
+      "completions/min_length": 508.0,
+      "completions/min_terminated_length": 508.0,
+      "entropy": 0.542448878288269,
+      "epoch": 2.201058201058201,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4921875,
+      "kl": 0.001819648896344006,
+      "learning_rate": 6.778969234612583e-07,
+      "loss": 0.0001,
+      "num_tokens": 8082102.0,
+      "reward": 0.640625,
+      "reward_std": 0.0867956355214119,
+      "rewards/itbench_correctness/mean": 0.640625,
+      "rewards/itbench_correctness/std": 0.3896446228027344,
+      "step": 416,
+      "step_time": 243.75771763175726
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1000.0,
+      "completions/mean_length": 1010.1875,
+      "completions/mean_terminated_length": 913.5,
+      "completions/min_length": 827.0,
+      "completions/min_terminated_length": 827.0,
+      "entropy": 0.6018684506416321,
+      "epoch": 2.2063492063492065,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.046875,
+      "kl": 0.0012090131640434265,
+      "learning_rate": 6.763506794167206e-07,
+      "loss": 0.0067,
+      "num_tokens": 8108297.0,
+      "reward": 0.46875,
+      "reward_std": 0.2346404492855072,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.4905354380607605,
+      "step": 417,
+      "step_time": 88.06844450253993
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 861.75,
+      "completions/mean_terminated_length": 764.4000244140625,
+      "completions/min_length": 686.0,
+      "completions/min_terminated_length": 686.0,
+      "entropy": 0.5616478323936462,
+      "epoch": 2.2116402116402116,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.375,
+      "kl": 0.0016445523360744119,
+      "learning_rate": 6.748025068294067e-07,
+      "loss": -0.0042,
+      "num_tokens": 8138485.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 418,
+      "step_time": 170.92238603066653
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 770.0,
+      "completions/mean_length": 637.4375,
+      "completions/mean_terminated_length": 611.6666870117188,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.5302480459213257,
+      "epoch": 2.2169312169312168,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3359375,
+      "kl": 0.0013207794399932027,
+      "learning_rate": 6.732524226298841e-07,
+      "loss": -0.0056,
+      "num_tokens": 8171140.0,
+      "reward": 0.21875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.2561737895011902,
+      "step": 419,
+      "step_time": 438.8370144786313
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 663.0,
+      "completions/max_terminated_length": 663.0,
+      "completions/mean_length": 541.875,
+      "completions/mean_terminated_length": 541.875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.44844290614128113,
+      "epoch": 2.2222222222222223,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.2734375,
+      "kl": 0.001750895637087524,
+      "learning_rate": 6.717004437696249e-07,
+      "loss": -0.0712,
+      "num_tokens": 8183074.0,
+      "reward": 0.796875,
+      "reward_std": 0.1886717677116394,
+      "rewards/itbench_correctness/mean": 0.796875,
+      "rewards/itbench_correctness/std": 0.25634312629699707,
+      "step": 420,
+      "step_time": 80.88844703137875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 954.0,
+      "completions/mean_length": 709.4375,
+      "completions/mean_terminated_length": 520.7000122070312,
+      "completions/min_length": 356.0,
+      "completions/min_terminated_length": 356.0,
+      "entropy": 0.28755176067352295,
+      "epoch": 2.2275132275132274,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.75,
+      "kl": 0.001124854083172977,
+      "learning_rate": 6.701465872208216e-07,
+      "loss": 0.0004,
+      "num_tokens": 8200969.0,
+      "reward": 0.59375,
+      "reward_std": 0.36201947927474976,
+      "rewards/itbench_correctness/mean": 0.59375,
+      "rewards/itbench_correctness/std": 0.3598804175853729,
+      "step": 421,
+      "step_time": 135.98304109089077
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 629.0,
+      "completions/max_terminated_length": 629.0,
+      "completions/mean_length": 456.5625,
+      "completions/mean_terminated_length": 456.5625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.37234771251678467,
+      "epoch": 2.2328042328042326,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.7578125,
+      "kl": 0.001798869576305151,
+      "learning_rate": 6.685908699762001e-07,
+      "loss": -0.0517,
+      "num_tokens": 8211418.0,
+      "reward": 0.3270833492279053,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.3270833492279053,
+      "rewards/itbench_correctness/std": 0.24227891862392426,
+      "step": 422,
+      "step_time": 66.6123378733173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 997.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 658.6875,
+      "completions/mean_terminated_length": 658.6875,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.30059778690338135,
+      "epoch": 2.238095238095238,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.34375,
+      "kl": 0.00184349634218961,
+      "learning_rate": 6.670333090488356e-07,
+      "loss": -0.0076,
+      "num_tokens": 8227349.0,
+      "reward": 0.65625,
+      "reward_std": 0.0578637570142746,
+      "rewards/itbench_correctness/mean": 0.65625,
+      "rewards/itbench_correctness/std": 0.08539126068353653,
+      "step": 423,
+      "step_time": 148.7072524903342
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1004.0,
+      "completions/mean_length": 991.8125,
+      "completions/mean_terminated_length": 766.5,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.25408029556274414,
+      "epoch": 2.2433862433862433,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.296875,
+      "kl": 0.001193932956084609,
+      "learning_rate": 6.654739214719641e-07,
+      "loss": -0.0169,
+      "num_tokens": 8252114.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 424,
+      "step_time": 569.5789128560573
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 791.0,
+      "completions/mean_length": 811.375,
+      "completions/mean_terminated_length": 598.75,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.4436912536621094,
+      "epoch": 2.248677248677249,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8203125,
+      "kl": 0.0011822007363662124,
+      "learning_rate": 6.639127242987987e-07,
+      "loss": -0.0078,
+      "num_tokens": 8271904.0,
+      "reward": 0.65625,
+      "reward_std": 0.3243582546710968,
+      "rewards/itbench_correctness/mean": 0.65625,
+      "rewards/itbench_correctness/std": 0.3966001570224762,
+      "step": 425,
+      "step_time": 137.0775876250118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 954.0,
+      "completions/mean_length": 793.625,
+      "completions/mean_terminated_length": 688.9091186523438,
+      "completions/min_length": 507.0,
+      "completions/min_terminated_length": 507.0,
+      "entropy": 0.44857457280158997,
+      "epoch": 2.253968253968254,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.859375,
+      "kl": 0.0012536750873550773,
+      "learning_rate": 6.623497346023417e-07,
+      "loss": 0.0162,
+      "num_tokens": 8290466.0,
+      "reward": 0.2544642686843872,
+      "reward_std": 0.20485526323318481,
+      "rewards/itbench_correctness/mean": 0.2544642686843872,
+      "rewards/itbench_correctness/std": 0.2508065104484558,
+      "step": 426,
+      "step_time": 381.0265443623066
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 882.0,
+      "completions/max_terminated_length": 882.0,
+      "completions/mean_length": 668.5625,
+      "completions/mean_terminated_length": 668.5625,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "entropy": 0.4487239420413971,
+      "epoch": 2.259259259259259,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8125,
+      "kl": 0.0014325481606647372,
+      "learning_rate": 6.607849694751977e-07,
+      "loss": -0.0007,
+      "num_tokens": 8311259.0,
+      "reward": 0.800000011920929,
+      "reward_std": 0.09974324703216553,
+      "rewards/itbench_correctness/mean": 0.800000011920929,
+      "rewards/itbench_correctness/std": 0.10954451560974121,
+      "step": 427,
+      "step_time": 78.88445997610688
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 777.0,
+      "completions/max_terminated_length": 777.0,
+      "completions/mean_length": 581.875,
+      "completions/mean_terminated_length": 581.875,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.36090224981307983,
+      "epoch": 2.2645502645502646,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 13.25,
+      "kl": 0.0013826474314555526,
+      "learning_rate": 6.592184460293877e-07,
+      "loss": 0.0109,
+      "num_tokens": 8325825.0,
+      "reward": 0.4765625,
+      "reward_std": 0.3056884706020355,
+      "rewards/itbench_correctness/mean": 0.4765625,
+      "rewards/itbench_correctness/std": 0.4062500298023224,
+      "step": 428,
+      "step_time": 880.099565721117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 642.125,
+      "completions/mean_terminated_length": 587.5714721679688,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3862176239490509,
+      "epoch": 2.2698412698412698,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.21875,
+      "kl": 0.0022476972080767155,
+      "learning_rate": 6.576501813961608e-07,
+      "loss": -0.0962,
+      "num_tokens": 8351099.0,
+      "reward": 0.75,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 429,
+      "step_time": 397.64244225714356
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 982.0,
+      "completions/mean_length": 775.5,
+      "completions/mean_terminated_length": 718.1538696289062,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "entropy": 0.5390070676803589,
+      "epoch": 2.2751322751322753,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.0013746700715273619,
+      "learning_rate": 6.560801927258079e-07,
+      "loss": 0.0052,
+      "num_tokens": 8370627.0,
+      "reward": 0.6875,
+      "reward_std": 0.44403791427612305,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 430,
+      "step_time": 87.13009965512902
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 923.0,
+      "completions/mean_length": 880.875,
+      "completions/mean_terminated_length": 737.75,
+      "completions/min_length": 619.0,
+      "completions/min_terminated_length": 619.0,
+      "entropy": 0.5494536757469177,
+      "epoch": 2.2804232804232805,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.546875,
+      "kl": 0.0012544175842776895,
+      "learning_rate": 6.545084971874736e-07,
+      "loss": 0.0,
+      "num_tokens": 8397801.0,
+      "reward": 0.11249999701976776,
+      "reward_std": 0.09449111670255661,
+      "rewards/itbench_correctness/mean": 0.11249999701976776,
+      "rewards/itbench_correctness/std": 0.12974333763122559,
+      "step": 431,
+      "step_time": 98.64099729061127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 961.0,
+      "completions/mean_length": 733.5625,
+      "completions/mean_terminated_length": 559.2999877929688,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.42532163858413696,
+      "epoch": 2.2857142857142856,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.0014970600605010986,
+      "learning_rate": 6.529351119689687e-07,
+      "loss": 0.0263,
+      "num_tokens": 8413058.0,
+      "reward": 0.4921875,
+      "reward_std": 0.10436524450778961,
+      "rewards/itbench_correctness/mean": 0.4921875,
+      "rewards/itbench_correctness/std": 0.47096699476242065,
+      "step": 432,
+      "step_time": 579.7829250898212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1007.0,
+      "completions/mean_length": 803.1875,
+      "completions/mean_terminated_length": 729.5833740234375,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 0.3336705267429352,
+      "epoch": 2.291005291005291,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.0011378336930647492,
+      "learning_rate": 6.513600542765816e-07,
+      "loss": -0.0063,
+      "num_tokens": 8433925.0,
+      "reward": 0.78125,
+      "reward_std": 0.13837619125843048,
+      "rewards/itbench_correctness/mean": 0.78125,
+      "rewards/itbench_correctness/std": 0.19924625754356384,
+      "step": 433,
+      "step_time": 182.92570608016104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 910.25,
+      "completions/mean_terminated_length": 796.5,
+      "completions/min_length": 618.0,
+      "completions/min_terminated_length": 618.0,
+      "entropy": 0.3559461832046509,
+      "epoch": 2.2962962962962963,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4140625,
+      "kl": 0.0012446820037439466,
+      "learning_rate": 6.497833413348909e-07,
+      "loss": -0.0338,
+      "num_tokens": 8462945.0,
+      "reward": 0.5625,
+      "reward_std": 0.03857583925127983,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.454911470413208,
+      "step": 434,
+      "step_time": 330.48511962778866
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 862.0,
+      "completions/mean_length": 839.0625,
+      "completions/mean_terminated_length": 654.125,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "entropy": 0.464804470539093,
+      "epoch": 2.3015873015873014,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8984375,
+      "kl": 0.0014192892704159021,
+      "learning_rate": 6.482049903865768e-07,
+      "loss": 0.008,
+      "num_tokens": 8487602.0,
+      "reward": 0.8541666865348816,
+      "reward_std": 0.3027648329734802,
+      "rewards/itbench_correctness/mean": 0.8541666865348816,
+      "rewards/itbench_correctness/std": 0.2973649799823761,
+      "step": 435,
+      "step_time": 126.97447157558054
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 963.5,
+      "completions/mean_terminated_length": 916.4444580078125,
+      "completions/min_length": 747.0,
+      "completions/min_terminated_length": 747.0,
+      "entropy": 0.39647120237350464,
+      "epoch": 2.306878306878307,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.140625,
+      "kl": 0.0012867730110883713,
+      "learning_rate": 6.466250186922324e-07,
+      "loss": 0.0023,
+      "num_tokens": 8509778.0,
+      "reward": 0.697578489780426,
+      "reward_std": 0.291044145822525,
+      "rewards/itbench_correctness/mean": 0.697578489780426,
+      "rewards/itbench_correctness/std": 0.3142254650592804,
+      "step": 436,
+      "step_time": 449.84700517356396
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 681.0,
+      "completions/max_terminated_length": 681.0,
+      "completions/mean_length": 460.5625,
+      "completions/mean_terminated_length": 460.5625,
+      "completions/min_length": 294.0,
+      "completions/min_terminated_length": 294.0,
+      "entropy": 0.4451078772544861,
+      "epoch": 2.312169312169312,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.02685546875,
+      "kl": 0.0015331042231991887,
+      "learning_rate": 6.450434435301751e-07,
+      "loss": 0.0,
+      "num_tokens": 8519963.0,
+      "reward": 0.3333333432674408,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.3333333432674408,
+      "rewards/itbench_correctness/std": 0.17213259637355804,
+      "step": 437,
+      "step_time": 796.3606786699966
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 565.0,
+      "completions/mean_length": 611.5,
+      "completions/mean_terminated_length": 424.0,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 0.4153720438480377,
+      "epoch": 2.317460317460317,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.9453125,
+      "kl": 0.0015486880438402295,
+      "learning_rate": 6.43460282196257e-07,
+      "loss": 0.008,
+      "num_tokens": 8541179.0,
+      "reward": 0.2395833432674408,
+      "reward_std": 0.0883883535861969,
+      "rewards/itbench_correctness/mean": 0.2395833432674408,
+      "rewards/itbench_correctness/std": 0.27533650398254395,
+      "step": 438,
+      "step_time": 145.90149160753936
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 912.0,
+      "completions/max_terminated_length": 912.0,
+      "completions/mean_length": 577.0625,
+      "completions/mean_terminated_length": 577.0625,
+      "completions/min_length": 368.0,
+      "completions/min_terminated_length": 368.0,
+      "entropy": 0.5198743343353271,
+      "epoch": 2.322751322751323,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1171875,
+      "kl": 0.0017036347417160869,
+      "learning_rate": 6.418755520036774e-07,
+      "loss": 0.0056,
+      "num_tokens": 8558452.0,
+      "reward": 0.765625,
+      "reward_std": 0.09300297498703003,
+      "rewards/itbench_correctness/mean": 0.765625,
+      "rewards/itbench_correctness/std": 0.2733854353427887,
+      "step": 439,
+      "step_time": 154.49192036502063
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 522.0,
+      "completions/max_terminated_length": 522.0,
+      "completions/mean_length": 362.25,
+      "completions/mean_terminated_length": 362.25,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "entropy": 0.4306418299674988,
+      "epoch": 2.328042328042328,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.94140625,
+      "kl": 0.0017033821204677224,
+      "learning_rate": 6.402892702827916e-07,
+      "loss": -0.0083,
+      "num_tokens": 8566496.0,
+      "reward": 0.1953125,
+      "reward_std": 0.08956430107355118,
+      "rewards/itbench_correctness/mean": 0.1953125,
+      "rewards/itbench_correctness/std": 0.2359323352575302,
+      "step": 440,
+      "step_time": 84.76937860064209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 722.875,
+      "completions/mean_terminated_length": 622.5,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.40670931339263916,
+      "epoch": 2.3333333333333335,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.90625,
+      "kl": 0.0013241568813100457,
+      "learning_rate": 6.387014543809223e-07,
+      "loss": 0.0764,
+      "num_tokens": 8586822.0,
+      "reward": 0.4375,
+      "reward_std": 0.3339453935623169,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.35420751571655273,
+      "step": 441,
+      "step_time": 146.5532330982387
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.48046875,
+      "epoch": 2.3386243386243386,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0238037109375,
+      "kl": 0.0011454581981524825,
+      "learning_rate": 6.371121216621697e-07,
+      "loss": 0.0,
+      "num_tokens": 8615486.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 442,
+      "step_time": 117.81919787544757
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 771.0,
+      "completions/max_terminated_length": 771.0,
+      "completions/mean_length": 531.1875,
+      "completions/mean_terminated_length": 531.1875,
+      "completions/min_length": 356.0,
+      "completions/min_terminated_length": 356.0,
+      "entropy": 0.3595717251300812,
+      "epoch": 2.3439153439153437,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0625,
+      "kl": 0.001713753561489284,
+      "learning_rate": 6.355212895072222e-07,
+      "loss": -0.0025,
+      "num_tokens": 8627873.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 443,
+      "step_time": 1035.1622464098036
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 728.5,
+      "completions/mean_terminated_length": 594.1818237304688,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 0.3816060423851013,
+      "epoch": 2.3492063492063493,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.0015201023779809475,
+      "learning_rate": 6.339289753131648e-07,
+      "loss": 0.0442,
+      "num_tokens": 8645001.0,
+      "reward": 0.109375,
+      "reward_std": 0.30935919284820557,
+      "rewards/itbench_correctness/mean": 0.109375,
+      "rewards/itbench_correctness/std": 0.30233466625213623,
+      "step": 444,
+      "step_time": 933.9612167160958
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 916.4375,
+      "completions/mean_terminated_length": 808.875,
+      "completions/min_length": 648.0,
+      "completions/min_terminated_length": 648.0,
+      "entropy": 0.49976131319999695,
+      "epoch": 2.3544973544973544,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.96875,
+      "kl": 0.0014402419328689575,
+      "learning_rate": 6.323351964932908e-07,
+      "loss": 0.0092,
+      "num_tokens": 8666528.0,
+      "reward": 0.44999998807907104,
+      "reward_std": 0.09669842571020126,
+      "rewards/itbench_correctness/mean": 0.44999998807907104,
+      "rewards/itbench_correctness/std": 0.41733282804489136,
+      "step": 445,
+      "step_time": 141.34655232075602
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 564.0,
+      "completions/mean_length": 649.5,
+      "completions/mean_terminated_length": 358.22222900390625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4064665138721466,
+      "epoch": 2.35978835978836,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 6.28125,
+      "kl": 0.0020702362526208162,
+      "learning_rate": 6.307399704769098e-07,
+      "loss": -0.1482,
+      "num_tokens": 8692528.0,
+      "reward": 0.2447916716337204,
+      "reward_std": 0.1262161284685135,
+      "rewards/itbench_correctness/mean": 0.2447916716337204,
+      "rewards/itbench_correctness/std": 0.16020458936691284,
+      "step": 446,
+      "step_time": 118.47124487534165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 942.625,
+      "completions/mean_terminated_length": 861.25,
+      "completions/min_length": 709.0,
+      "completions/min_terminated_length": 709.0,
+      "entropy": 0.50921630859375,
+      "epoch": 2.365079365079365,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.0014145068125799298,
+      "learning_rate": 6.291433147091583e-07,
+      "loss": 0.0104,
+      "num_tokens": 8714466.0,
+      "reward": 0.4854166805744171,
+      "reward_std": 0.3144327402114868,
+      "rewards/itbench_correctness/mean": 0.4854166805744171,
+      "rewards/itbench_correctness/std": 0.393459290266037,
+      "step": 447,
+      "step_time": 369.4135863818228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 899.375,
+      "completions/mean_terminated_length": 691.6666870117188,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.5092425346374512,
+      "epoch": 2.3703703703703702,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.484375,
+      "kl": 0.00447537936270237,
+      "learning_rate": 6.275452466508075e-07,
+      "loss": -0.0622,
+      "num_tokens": 8750192.0,
+      "reward": 0.09375,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.09375,
+      "rewards/itbench_correctness/std": 0.20155644416809082,
+      "step": 448,
+      "step_time": 469.61162946000695
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 773.0,
+      "completions/max_terminated_length": 773.0,
+      "completions/mean_length": 550.5,
+      "completions/mean_terminated_length": 550.5,
+      "completions/min_length": 430.0,
+      "completions/min_terminated_length": 430.0,
+      "entropy": 0.4396003484725952,
+      "epoch": 2.375661375661376,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.0023843871895223856,
+      "learning_rate": 6.259457837780741e-07,
+      "loss": 0.0034,
+      "num_tokens": 8762320.0,
+      "reward": 0.75,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 449,
+      "step_time": 616.3559736898169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 776.0,
+      "completions/mean_length": 805.8125,
+      "completions/mean_terminated_length": 587.625,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "entropy": 0.6304196119308472,
+      "epoch": 2.380952380952381,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.03125,
+      "kl": 0.0017924520652741194,
+      "learning_rate": 6.243449435824276e-07,
+      "loss": -0.0157,
+      "num_tokens": 8783789.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.2629520893096924,
+      "rewards/itbench_correctness/mean": 0.5416666865348816,
+      "rewards/itbench_correctness/std": 0.43885374069213867,
+      "step": 450,
+      "step_time": 88.87441652361304
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1004.0,
+      "completions/mean_length": 751.1875,
+      "completions/mean_terminated_length": 539.0,
+      "completions/min_length": 393.0,
+      "completions/min_terminated_length": 393.0,
+      "entropy": 0.4606040418148041,
+      "epoch": 2.386243386243386,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.875,
+      "kl": 0.0016598474467173219,
+      "learning_rate": 6.227427435703995e-07,
+      "loss": -0.0021,
+      "num_tokens": 8807136.0,
+      "reward": 0.5546875,
+      "reward_std": 0.2294243574142456,
+      "rewards/itbench_correctness/mean": 0.5546875,
+      "rewards/itbench_correctness/std": 0.3080184757709503,
+      "step": 451,
+      "step_time": 986.2314578304067
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 932.0,
+      "completions/mean_length": 817.625,
+      "completions/mean_terminated_length": 770.0,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 0.5087907314300537,
+      "epoch": 2.3915343915343916,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.359375,
+      "kl": 0.001573887187987566,
+      "learning_rate": 6.211392012633931e-07,
+      "loss": -0.0123,
+      "num_tokens": 8831826.0,
+      "reward": 0.59375,
+      "reward_std": 0.18600594997406006,
+      "rewards/itbench_correctness/mean": 0.59375,
+      "rewards/itbench_correctness/std": 0.4905354380607605,
+      "step": 452,
+      "step_time": 369.9421289321035
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 602.0,
+      "completions/mean_terminated_length": 602.0,
+      "completions/min_length": 417.0,
+      "completions/min_terminated_length": 417.0,
+      "entropy": 0.49501660466194153,
+      "epoch": 2.3968253968253967,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.796875,
+      "kl": 0.0018204387743026018,
+      "learning_rate": 6.1953433419749e-07,
+      "loss": 0.0146,
+      "num_tokens": 8844602.0,
+      "reward": 0.4765625,
+      "reward_std": 0.10795740783214569,
+      "rewards/itbench_correctness/mean": 0.4765625,
+      "rewards/itbench_correctness/std": 0.2781464755535126,
+      "step": 453,
+      "step_time": 103.37166160158813
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 552.0,
+      "completions/max_terminated_length": 552.0,
+      "completions/mean_length": 449.25,
+      "completions/mean_terminated_length": 449.25,
+      "completions/min_length": 387.0,
+      "completions/min_terminated_length": 387.0,
+      "entropy": 0.5052865743637085,
+      "epoch": 2.402116402116402,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.546875,
+      "kl": 0.0016118157655000687,
+      "learning_rate": 6.17928159923259e-07,
+      "loss": 0.0095,
+      "num_tokens": 8854590.0,
+      "reward": 0.71875,
+      "reward_std": 0.35564959049224854,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.44604745507240295,
+      "step": 454,
+      "step_time": 980.6566639961675
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 832.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 527.625,
+      "completions/mean_terminated_length": 527.625,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "entropy": 0.48140251636505127,
+      "epoch": 2.4074074074074074,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0078125,
+      "kl": 0.0013566534034907818,
+      "learning_rate": 6.163206960055652e-07,
+      "loss": -0.0056,
+      "num_tokens": 8868488.0,
+      "reward": 0.21875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.2561737895011902,
+      "step": 455,
+      "step_time": 94.81741558108479
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 701.0,
+      "completions/mean_length": 838.3125,
+      "completions/mean_terminated_length": 652.625,
+      "completions/min_length": 561.0,
+      "completions/min_terminated_length": 561.0,
+      "entropy": 0.37217625975608826,
+      "epoch": 2.4126984126984126,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3515625,
+      "kl": 0.0014726277440786362,
+      "learning_rate": 6.147119600233758e-07,
+      "loss": -0.0124,
+      "num_tokens": 8892141.0,
+      "reward": 0.5520833134651184,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.5520833134651184,
+      "rewards/itbench_correctness/std": 0.4781087338924408,
+      "step": 456,
+      "step_time": 774.9578263629228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 805.3125,
+      "completions/mean_terminated_length": 732.4166870117188,
+      "completions/min_length": 492.0,
+      "completions/min_terminated_length": 492.0,
+      "entropy": 0.4619324803352356,
+      "epoch": 2.417989417989418,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.09326171875,
+      "kl": 0.0020939442329108715,
+      "learning_rate": 6.131019695695702e-07,
+      "loss": 0.0001,
+      "num_tokens": 8917394.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 457,
+      "step_time": 160.94475755654275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 509.0,
+      "completions/max_terminated_length": 509.0,
+      "completions/mean_length": 415.625,
+      "completions/mean_terminated_length": 415.625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "entropy": 0.4980451166629791,
+      "epoch": 2.4232804232804233,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2421875,
+      "kl": 0.002369649475440383,
+      "learning_rate": 6.114907422507459e-07,
+      "loss": 0.0046,
+      "num_tokens": 8926548.0,
+      "reward": 0.625,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.3535533845424652,
+      "step": 458,
+      "step_time": 130.5550601184368
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 761.1875,
+      "completions/mean_terminated_length": 641.727294921875,
+      "completions/min_length": 491.0,
+      "completions/min_terminated_length": 491.0,
+      "entropy": 0.3757287263870239,
+      "epoch": 2.4285714285714284,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4765625,
+      "kl": 0.0018944862531498075,
+      "learning_rate": 6.098782956870265e-07,
+      "loss": 0.0028,
+      "num_tokens": 8944471.0,
+      "reward": 0.1875,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 459,
+      "step_time": 351.48624353297055
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 730.5,
+      "completions/mean_terminated_length": 502.22222900390625,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.5448322892189026,
+      "epoch": 2.433862433862434,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.390625,
+      "kl": 0.0037839075084775686,
+      "learning_rate": 6.082646475118699e-07,
+      "loss": 0.0032,
+      "num_tokens": 8962455.0,
+      "reward": 0.10880681872367859,
+      "reward_std": 0.1294855922460556,
+      "rewards/itbench_correctness/mean": 0.10880681872367859,
+      "rewards/itbench_correctness/std": 0.12925373017787933,
+      "step": 460,
+      "step_time": 137.66368599049747
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 908.0,
+      "completions/mean_length": 897.5,
+      "completions/mean_terminated_length": 799.1111450195312,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.41671308875083923,
+      "epoch": 2.439153439153439,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.041015625,
+      "kl": 0.0014645819319412112,
+      "learning_rate": 6.066498153718734e-07,
+      "loss": 0.0001,
+      "num_tokens": 8993351.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 461,
+      "step_time": 331.1069351742044
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 950.0,
+      "completions/max_terminated_length": 950.0,
+      "completions/mean_length": 638.3125,
+      "completions/mean_terminated_length": 638.3125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.333692342042923,
+      "epoch": 2.4444444444444446,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.9453125,
+      "kl": 0.0018866208847612143,
+      "learning_rate": 6.05033816926583e-07,
+      "loss": -0.0577,
+      "num_tokens": 9008684.0,
+      "reward": 0.0520833358168602,
+      "reward_std": 0.043129097670316696,
+      "rewards/itbench_correctness/mean": 0.0520833358168602,
+      "rewards/itbench_correctness/std": 0.07978560030460358,
+      "step": 462,
+      "step_time": 83.101976220496
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 937.0,
+      "completions/mean_length": 986.3125,
+      "completions/mean_terminated_length": 873.25,
+      "completions/min_length": 793.0,
+      "completions/min_terminated_length": 793.0,
+      "entropy": 0.42785629630088806,
+      "epoch": 2.4497354497354498,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3828125,
+      "kl": 0.0011906343279406428,
+      "learning_rate": 6.034166698482983e-07,
+      "loss": 0.0241,
+      "num_tokens": 9031329.0,
+      "reward": 0.1875,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 463,
+      "step_time": 6071.444487111643
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 857.0,
+      "completions/mean_length": 587.5,
+      "completions/mean_terminated_length": 442.0,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "entropy": 0.5753191709518433,
+      "epoch": 2.455026455026455,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.484375,
+      "kl": 0.0014363499358296394,
+      "learning_rate": 6.017983918218811e-07,
+      "loss": -0.0264,
+      "num_tokens": 9064257.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 464,
+      "step_time": 156.23662452865392
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 617.0,
+      "completions/mean_length": 489.0,
+      "completions/mean_terminated_length": 453.3333435058594,
+      "completions/min_length": 362.0,
+      "completions/min_terminated_length": 362.0,
+      "entropy": 0.5644171833992004,
+      "epoch": 2.4603174603174605,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.09375,
+      "kl": 0.0021656756289303303,
+      "learning_rate": 6.001790005445606e-07,
+      "loss": 0.0027,
+      "num_tokens": 9093129.0,
+      "reward": 0.359375,
+      "reward_std": 0.19408094882965088,
+      "rewards/itbench_correctness/mean": 0.359375,
+      "rewards/itbench_correctness/std": 0.4561501145362854,
+      "step": 465,
+      "step_time": 114.18916879687458
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 609.0,
+      "completions/max_terminated_length": 609.0,
+      "completions/mean_length": 454.1875,
+      "completions/mean_terminated_length": 454.1875,
+      "completions/min_length": 10.0,
+      "completions/min_terminated_length": 10.0,
+      "entropy": 0.45355716347694397,
+      "epoch": 2.4656084656084656,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.7421875,
+      "kl": 0.0023789138067513704,
+      "learning_rate": 5.985585137257401e-07,
+      "loss": -0.0632,
+      "num_tokens": 9103764.0,
+      "reward": 0.71875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 466,
+      "step_time": 111.49137642700225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 558.0,
+      "completions/max_terminated_length": 558.0,
+      "completions/mean_length": 459.8125,
+      "completions/mean_terminated_length": 459.8125,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "entropy": 0.3784151077270508,
+      "epoch": 2.4708994708994707,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4375,
+      "kl": 0.0017901118844747543,
+      "learning_rate": 5.969369490868042e-07,
+      "loss": 0.0151,
+      "num_tokens": 9114825.0,
+      "reward": 0.6927083730697632,
+      "reward_std": 0.1530819982290268,
+      "rewards/itbench_correctness/mean": 0.6927083730697632,
+      "rewards/itbench_correctness/std": 0.26652559638023376,
+      "step": 467,
+      "step_time": 45.78145207092166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 984.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 631.375,
+      "completions/mean_terminated_length": 631.375,
+      "completions/min_length": 379.0,
+      "completions/min_terminated_length": 379.0,
+      "entropy": 0.46565037965774536,
+      "epoch": 2.4761904761904763,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.1806640625,
+      "kl": 0.0021383543498814106,
+      "learning_rate": 5.953143243609234e-07,
+      "loss": 0.0001,
+      "num_tokens": 9128071.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 1.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 468,
+      "step_time": 75.04701119381934
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 632.0,
+      "completions/max_terminated_length": 632.0,
+      "completions/mean_length": 431.6875,
+      "completions/mean_terminated_length": 431.6875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5073114037513733,
+      "epoch": 2.4814814814814814,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.1953125,
+      "kl": 0.0021981364116072655,
+      "learning_rate": 5.936906572928624e-07,
+      "loss": -0.0696,
+      "num_tokens": 9143002.0,
+      "reward": 0.871874988079071,
+      "reward_std": 0.20840224623680115,
+      "rewards/itbench_correctness/mean": 0.871874988079071,
+      "rewards/itbench_correctness/std": 0.2529616057872772,
+      "step": 469,
+      "step_time": 78.34151318110526
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 646.0625,
+      "completions/mean_terminated_length": 419.3000183105469,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "entropy": 0.5819870233535767,
+      "epoch": 2.4867724867724865,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.78125,
+      "kl": 0.002163654426112771,
+      "learning_rate": 5.920659656387836e-07,
+      "loss": 0.072,
+      "num_tokens": 9169283.0,
+      "reward": 0.28125,
+      "reward_std": 0.2651650309562683,
+      "rewards/itbench_correctness/mean": 0.28125,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 470,
+      "step_time": 197.0865554632619
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 707.25,
+      "completions/mean_terminated_length": 563.2727661132812,
+      "completions/min_length": 324.0,
+      "completions/min_terminated_length": 324.0,
+      "entropy": 0.41003888845443726,
+      "epoch": 2.492063492063492,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.001396682346239686,
+      "learning_rate": 5.90440267166055e-07,
+      "loss": -0.0062,
+      "num_tokens": 9187327.0,
+      "reward": 0.875,
+      "reward_std": 0.2619796097278595,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.273861289024353,
+      "step": 471,
+      "step_time": 86.19406038243324
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 684.0,
+      "completions/max_terminated_length": 684.0,
+      "completions/mean_length": 556.1875,
+      "completions/mean_terminated_length": 556.1875,
+      "completions/min_length": 431.0,
+      "completions/min_terminated_length": 431.0,
+      "entropy": 0.3613889217376709,
+      "epoch": 2.497354497354497,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0625,
+      "kl": 0.001245600637048483,
+      "learning_rate": 5.888135796530544e-07,
+      "loss": 0.0086,
+      "num_tokens": 9200090.0,
+      "reward": 0.5729166865348816,
+      "reward_std": 0.0294627845287323,
+      "rewards/itbench_correctness/mean": 0.5729166865348816,
+      "rewards/itbench_correctness/std": 0.4429227113723755,
+      "step": 472,
+      "step_time": 72.5158723751083
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 559.0,
+      "completions/max_terminated_length": 559.0,
+      "completions/mean_length": 477.25,
+      "completions/mean_terminated_length": 477.25,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.45049765706062317,
+      "epoch": 2.502645502645503,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.15625,
+      "kl": 0.001709087984636426,
+      "learning_rate": 5.871859208889758e-07,
+      "loss": -0.0137,
+      "num_tokens": 9210406.0,
+      "reward": 0.640625,
+      "reward_std": 0.11451567709445953,
+      "rewards/itbench_correctness/mean": 0.640625,
+      "rewards/itbench_correctness/std": 0.40278977155685425,
+      "step": 473,
+      "step_time": 132.74163577985018
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 835.9375,
+      "completions/mean_terminated_length": 773.25,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.521570086479187,
+      "epoch": 2.507936507936508,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5546875,
+      "kl": 0.0017986115999519825,
+      "learning_rate": 5.855573086736349e-07,
+      "loss": -0.1102,
+      "num_tokens": 9243021.0,
+      "reward": 0.875,
+      "reward_std": 0.3535533845424652,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 474,
+      "step_time": 366.73758555483073
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1013.0,
+      "completions/mean_length": 830.5,
+      "completions/mean_terminated_length": 742.5454711914062,
+      "completions/min_length": 596.0,
+      "completions/min_terminated_length": 596.0,
+      "entropy": 0.2576760947704315,
+      "epoch": 2.5132275132275135,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.0009246127447113395,
+      "learning_rate": 5.839277608172738e-07,
+      "loss": 0.0089,
+      "num_tokens": 9265781.0,
+      "reward": 0.42500001192092896,
+      "reward_std": 0.026726115494966507,
+      "rewards/itbench_correctness/mean": 0.42500001192092896,
+      "rewards/itbench_correctness/std": 0.44045430421829224,
+      "step": 475,
+      "step_time": 242.08502481784672
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1004.0,
+      "completions/mean_length": 789.1875,
+      "completions/mean_terminated_length": 648.2999877929688,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.3066444993019104,
+      "epoch": 2.5185185185185186,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.296875,
+      "kl": 0.0016043871873989701,
+      "learning_rate": 5.82297295140367e-07,
+      "loss": 0.0033,
+      "num_tokens": 9285040.0,
+      "reward": 0.75,
+      "reward_std": 0.15430335700511932,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.3333333432674408,
+      "step": 476,
+      "step_time": 477.0931530073285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1020.0,
+      "completions/mean_length": 922.125,
+      "completions/mean_terminated_length": 820.25,
+      "completions/min_length": 486.0,
+      "completions/min_terminated_length": 486.0,
+      "entropy": 0.37738919258117676,
+      "epoch": 2.5238095238095237,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.859375,
+      "kl": 0.0015391347697004676,
+      "learning_rate": 5.806659294734255e-07,
+      "loss": -0.0201,
+      "num_tokens": 9310586.0,
+      "reward": 0.6581439971923828,
+      "reward_std": 0.30061405897140503,
+      "rewards/itbench_correctness/mean": 0.6581439971923828,
+      "rewards/itbench_correctness/std": 0.3035711646080017,
+      "step": 477,
+      "step_time": 111.43491127341986
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 827.0,
+      "completions/mean_length": 737.75,
+      "completions/mean_terminated_length": 718.6666870117188,
+      "completions/min_length": 599.0,
+      "completions/min_terminated_length": 599.0,
+      "entropy": 0.21823111176490784,
+      "epoch": 2.5291005291005293,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6875,
+      "kl": 0.0010663650464266539,
+      "learning_rate": 5.790336816568032e-07,
+      "loss": -0.0021,
+      "num_tokens": 9327982.0,
+      "reward": 0.4713541865348816,
+      "reward_std": 0.4539119601249695,
+      "rewards/itbench_correctness/mean": 0.4713541865348816,
+      "rewards/itbench_correctness/std": 0.4463635981082916,
+      "step": 478,
+      "step_time": 79.33199557475746
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 704.0,
+      "completions/max_terminated_length": 704.0,
+      "completions/mean_length": 493.125,
+      "completions/mean_terminated_length": 493.125,
+      "completions/min_length": 382.0,
+      "completions/min_terminated_length": 382.0,
+      "entropy": 0.559695839881897,
+      "epoch": 2.5343915343915344,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.08935546875,
+      "kl": 0.0020555509254336357,
+      "learning_rate": 5.774005695405007e-07,
+      "loss": 0.0,
+      "num_tokens": 9340472.0,
+      "reward": 0.6666666269302368,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.6666666269302368,
+      "rewards/itbench_correctness/std": 0.17213258147239685,
+      "step": 479,
+      "step_time": 60.24873013421893
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 764.0,
+      "completions/max_terminated_length": 764.0,
+      "completions/mean_length": 519.625,
+      "completions/mean_terminated_length": 519.625,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "entropy": 0.49651190638542175,
+      "epoch": 2.5396825396825395,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.03466796875,
+      "kl": 0.0015541493194177747,
+      "learning_rate": 5.757666109839702e-07,
+      "loss": 0.0,
+      "num_tokens": 9370882.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 480,
+      "step_time": 1150.014605092816
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 790.0,
+      "completions/max_terminated_length": 790.0,
+      "completions/mean_length": 575.5,
+      "completions/mean_terminated_length": 575.5,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.4587315320968628,
+      "epoch": 2.544973544973545,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3515625,
+      "kl": 0.0013555011246353388,
+      "learning_rate": 5.741318238559209e-07,
+      "loss": -0.0073,
+      "num_tokens": 9383698.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 481,
+      "step_time": 187.38160399720073
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 990.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 647.4375,
+      "completions/mean_terminated_length": 647.4375,
+      "completions/min_length": 419.0,
+      "completions/min_terminated_length": 419.0,
+      "entropy": 0.3753257989883423,
+      "epoch": 2.5502645502645502,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.037841796875,
+      "kl": 0.0015313706826418638,
+      "learning_rate": 5.724962260341229e-07,
+      "loss": 0.0,
+      "num_tokens": 9398977.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 482,
+      "step_time": 765.347533389926
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 859.0,
+      "completions/max_terminated_length": 859.0,
+      "completions/mean_length": 583.375,
+      "completions/mean_terminated_length": 583.375,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "entropy": 0.5416755676269531,
+      "epoch": 2.5555555555555554,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.15625,
+      "kl": 0.0014298075111582875,
+      "learning_rate": 5.708598354052121e-07,
+      "loss": 0.0256,
+      "num_tokens": 9415967.0,
+      "reward": 0.8541666865348816,
+      "reward_std": 0.049801189452409744,
+      "rewards/itbench_correctness/mean": 0.8541666865348816,
+      "rewards/itbench_correctness/std": 0.16527193784713745,
+      "step": 483,
+      "step_time": 99.65433174744248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 569.0,
+      "completions/max_terminated_length": 569.0,
+      "completions/mean_length": 448.9375,
+      "completions/mean_terminated_length": 448.9375,
+      "completions/min_length": 353.0,
+      "completions/min_terminated_length": 353.0,
+      "entropy": 0.4521787464618683,
+      "epoch": 2.560846560846561,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.15625,
+      "kl": 0.0014567336766049266,
+      "learning_rate": 5.692226698644937e-07,
+      "loss": -0.0088,
+      "num_tokens": 9425990.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 484,
+      "step_time": 159.2589992955327
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 837.875,
+      "completions/mean_terminated_length": 775.8333740234375,
+      "completions/min_length": 585.0,
+      "completions/min_terminated_length": 585.0,
+      "entropy": 0.38430553674697876,
+      "epoch": 2.566137566137566,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.033935546875,
+      "kl": 0.0012451084330677986,
+      "learning_rate": 5.675847473157485e-07,
+      "loss": 0.0,
+      "num_tokens": 9444988.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 485,
+      "step_time": 265.1822805535048
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 847.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 574.375,
+      "completions/mean_terminated_length": 574.375,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "entropy": 0.4108814001083374,
+      "epoch": 2.571428571428571,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5859375,
+      "kl": 0.0019665309228003025,
+      "learning_rate": 5.659460856710345e-07,
+      "loss": -0.008,
+      "num_tokens": 9458426.0,
+      "reward": 0.8883928656578064,
+      "reward_std": 0.16245228052139282,
+      "rewards/itbench_correctness/mean": 0.8883928656578064,
+      "rewards/itbench_correctness/std": 0.1985812783241272,
+      "step": 486,
+      "step_time": 129.54102603532374
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 794.0,
+      "completions/mean_length": 775.9375,
+      "completions/mean_terminated_length": 583.0,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.32863470911979675,
+      "epoch": 2.5767195767195767,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.125,
+      "kl": 0.0015423308359459043,
+      "learning_rate": 5.643067028504931e-07,
+      "loss": -0.0219,
+      "num_tokens": 9485297.0,
+      "reward": 0.375,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.4281744360923767,
+      "step": 487,
+      "step_time": 71.69435486476868
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 765.0,
+      "completions/max_terminated_length": 765.0,
+      "completions/mean_length": 509.4375,
+      "completions/mean_terminated_length": 509.4375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3533308804035187,
+      "epoch": 2.582010582010582,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6015625,
+      "kl": 0.0020819108467549086,
+      "learning_rate": 5.626666167821521e-07,
+      "loss": 0.0039,
+      "num_tokens": 9497008.0,
+      "reward": 0.78125,
+      "reward_std": 0.3471629321575165,
+      "rewards/itbench_correctness/mean": 0.78125,
+      "rewards/itbench_correctness/std": 0.4069705307483673,
+      "step": 488,
+      "step_time": 89.01397905871272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 845.0,
+      "completions/max_terminated_length": 845.0,
+      "completions/mean_length": 567.0625,
+      "completions/mean_terminated_length": 567.0625,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "entropy": 0.5466769337654114,
+      "epoch": 2.5873015873015874,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.035400390625,
+      "kl": 0.0019230879843235016,
+      "learning_rate": 5.6102584540173e-07,
+      "loss": 0.0,
+      "num_tokens": 9519441.0,
+      "reward": 0.25,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 489,
+      "step_time": 71.94888481497765
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 997.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 768.8125,
+      "completions/mean_terminated_length": 768.8125,
+      "completions/min_length": 575.0,
+      "completions/min_terminated_length": 575.0,
+      "entropy": 0.2939598262310028,
+      "epoch": 2.5925925925925926,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.203125,
+      "kl": 0.0013061386998742819,
+      "learning_rate": 5.5938440665244e-07,
+      "loss": 0.0038,
+      "num_tokens": 9537934.0,
+      "reward": 0.84375,
+      "reward_std": 0.0578637570142746,
+      "rewards/itbench_correctness/mean": 0.84375,
+      "rewards/itbench_correctness/std": 0.17969882488250732,
+      "step": 490,
+      "step_time": 106.79387213569134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1006.0,
+      "completions/mean_length": 944.9375,
+      "completions/mean_terminated_length": 771.0,
+      "completions/min_length": 584.0,
+      "completions/min_terminated_length": 584.0,
+      "entropy": 0.2941993474960327,
+      "epoch": 2.597883597883598,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.043212890625,
+      "kl": 0.0013492838479578495,
+      "learning_rate": 5.577423184847931e-07,
+      "loss": 0.0,
+      "num_tokens": 9561381.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 491,
+      "step_time": 823.8002629633993
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 980.0,
+      "completions/mean_length": 816.3125,
+      "completions/mean_terminated_length": 786.6428833007812,
+      "completions/min_length": 570.0,
+      "completions/min_terminated_length": 570.0,
+      "entropy": 0.4140571057796478,
+      "epoch": 2.6031746031746033,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9140625,
+      "kl": 0.001417625928297639,
+      "learning_rate": 5.560995988564023e-07,
+      "loss": 0.0223,
+      "num_tokens": 9581962.0,
+      "reward": 0.109375,
+      "reward_std": 0.2414703369140625,
+      "rewards/itbench_correctness/mean": 0.109375,
+      "rewards/itbench_correctness/std": 0.2576940953731537,
+      "step": 492,
+      "step_time": 97.65742574445903
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 588.0,
+      "completions/mean_length": 701.125,
+      "completions/mean_terminated_length": 450.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4649670124053955,
+      "epoch": 2.6084656084656084,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1953125,
+      "kl": 0.0015433584339916706,
+      "learning_rate": 5.544562657317863e-07,
+      "loss": -0.014,
+      "num_tokens": 9616972.0,
+      "reward": 0.3125,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.35939764976501465,
+      "step": 493,
+      "step_time": 147.00880005117506
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 735.75,
+      "completions/mean_terminated_length": 694.5714721679688,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "entropy": 0.4131838381290436,
+      "epoch": 2.613756613756614,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8125,
+      "kl": 0.001086304779164493,
+      "learning_rate": 5.528123370821729e-07,
+      "loss": 0.0469,
+      "num_tokens": 9632520.0,
+      "reward": 0.6505681872367859,
+      "reward_std": 0.23257695138454437,
+      "rewards/itbench_correctness/mean": 0.6505681872367859,
+      "rewards/itbench_correctness/std": 0.45813921093940735,
+      "step": 494,
+      "step_time": 73.31144659873098
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 2.0,
+      "completions/mean_length": 768.3125,
+      "completions/mean_terminated_length": 1.25,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.640364408493042,
+      "epoch": 2.619047619047619,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.46875,
+      "kl": 0.0018688439158722758,
+      "learning_rate": 5.511678308853025e-07,
+      "loss": -0.1102,
+      "num_tokens": 9660781.0,
+      "reward": 0.0520833358168602,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.0520833358168602,
+      "rewards/itbench_correctness/std": 0.13220004737377167,
+      "step": 495,
+      "step_time": 85.44449219666421
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 750.5,
+      "completions/mean_terminated_length": 659.3333740234375,
+      "completions/min_length": 447.0,
+      "completions/min_terminated_length": 447.0,
+      "entropy": 0.46902066469192505,
+      "epoch": 2.624338624338624,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.265625,
+      "kl": 0.0013028380926698446,
+      "learning_rate": 5.495227651252315e-07,
+      "loss": 0.0103,
+      "num_tokens": 9677765.0,
+      "reward": 0.59375,
+      "reward_std": 0.1735912710428238,
+      "rewards/itbench_correctness/mean": 0.59375,
+      "rewards/itbench_correctness/std": 0.48196646571159363,
+      "step": 496,
+      "step_time": 781.4977411162108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 677.0,
+      "completions/max_terminated_length": 677.0,
+      "completions/mean_length": 390.75,
+      "completions/mean_terminated_length": 390.75,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4273832440376282,
+      "epoch": 2.6296296296296298,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.2109375,
+      "kl": 0.0021745527628809214,
+      "learning_rate": 5.478771577921351e-07,
+      "loss": -0.0118,
+      "num_tokens": 9686945.0,
+      "reward": 0.627480149269104,
+      "reward_std": 0.2220388650894165,
+      "rewards/itbench_correctness/mean": 0.627480149269104,
+      "rewards/itbench_correctness/std": 0.3782695233821869,
+      "step": 497,
+      "step_time": 134.51860492676497
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 800.0,
+      "completions/mean_length": 764.5,
+      "completions/mean_terminated_length": 505.0,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "entropy": 0.3531720042228699,
+      "epoch": 2.634920634920635,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2421875,
+      "kl": 0.0014397338964045048,
+      "learning_rate": 5.462310268821117e-07,
+      "loss": 0.0163,
+      "num_tokens": 9713425.0,
+      "reward": 0.265625,
+      "reward_std": 0.1724265068769455,
+      "rewards/itbench_correctness/mean": 0.265625,
+      "rewards/itbench_correctness/std": 0.3616048991680145,
+      "step": 498,
+      "step_time": 152.6961117470637
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 576.0625,
+      "completions/mean_terminated_length": 576.0625,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "entropy": 0.28642725944519043,
+      "epoch": 2.64021164021164,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5625,
+      "kl": 0.0037036272697150707,
+      "learning_rate": 5.445843903969854e-07,
+      "loss": -0.0221,
+      "num_tokens": 9727450.0,
+      "reward": 0.2291666716337204,
+      "reward_std": 0.2048145830631256,
+      "rewards/itbench_correctness/mean": 0.2291666716337204,
+      "rewards/itbench_correctness/std": 0.22669117152690887,
+      "step": 499,
+      "step_time": 78.76068393606693
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1006.0,
+      "completions/mean_length": 794.4375,
+      "completions/mean_terminated_length": 690.0909423828125,
+      "completions/min_length": 480.0,
+      "completions/min_terminated_length": 480.0,
+      "entropy": 0.3776256740093231,
+      "epoch": 2.6455026455026456,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.40625,
+      "kl": 0.0019214467611163855,
+      "learning_rate": 5.429372663441085e-07,
+      "loss": 0.0057,
+      "num_tokens": 9745281.0,
+      "reward": 0.40625,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.20155644416809082,
+      "step": 500,
+      "step_time": 542.0401397850364
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 759.0,
+      "completions/max_terminated_length": 759.0,
+      "completions/mean_length": 570.9375,
+      "completions/mean_terminated_length": 570.9375,
+      "completions/min_length": 425.0,
+      "completions/min_terminated_length": 425.0,
+      "entropy": 0.3800766170024872,
+      "epoch": 2.6507936507936507,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.546875,
+      "kl": 0.0011877365177497268,
+      "learning_rate": 5.412896727361662e-07,
+      "loss": 0.0347,
+      "num_tokens": 9757936.0,
+      "reward": 0.8323863744735718,
+      "reward_std": 0.1946326196193695,
+      "rewards/itbench_correctness/mean": 0.8323863744735718,
+      "rewards/itbench_correctness/std": 0.28190067410469055,
+      "step": 501,
+      "step_time": 66.27912161499262
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 658.25,
+      "completions/mean_terminated_length": 492.0,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.30991265177726746,
+      "epoch": 2.656084656084656,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.65625,
+      "kl": 0.001489053014665842,
+      "learning_rate": 5.396416275909779e-07,
+      "loss": 0.0148,
+      "num_tokens": 9773348.0,
+      "reward": 0.21875,
+      "reward_std": 0.3061639666557312,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 502,
+      "step_time": 1172.296461245045
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 723.0,
+      "completions/max_terminated_length": 723.0,
+      "completions/mean_length": 537.6875,
+      "completions/mean_terminated_length": 537.6875,
+      "completions/min_length": 398.0,
+      "completions/min_terminated_length": 398.0,
+      "entropy": 0.3291874825954437,
+      "epoch": 2.6613756613756614,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6875,
+      "kl": 0.0015724250115454197,
+      "learning_rate": 5.379931489313015e-07,
+      "loss": 0.0265,
+      "num_tokens": 9786871.0,
+      "reward": 0.45625001192092896,
+      "reward_std": 0.05625351518392563,
+      "rewards/itbench_correctness/mean": 0.45625001192092896,
+      "rewards/itbench_correctness/std": 0.34699106216430664,
+      "step": 503,
+      "step_time": 66.85712667554617
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 512.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 372.875,
+      "completions/mean_terminated_length": 372.875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4425075352191925,
+      "epoch": 2.6666666666666665,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4296875,
+      "kl": 0.0019720701966434717,
+      "learning_rate": 5.363442547846355e-07,
+      "loss": -0.043,
+      "num_tokens": 9800949.0,
+      "reward": 0.3671875,
+      "reward_std": 0.3643017113208771,
+      "rewards/itbench_correctness/mean": 0.3671875,
+      "rewards/itbench_correctness/std": 0.4119788408279419,
+      "step": 504,
+      "step_time": 72.04895468428731
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 854.0,
+      "completions/mean_length": 784.4375,
+      "completions/mean_terminated_length": 704.5833740234375,
+      "completions/min_length": 449.0,
+      "completions/min_terminated_length": 449.0,
+      "entropy": 0.47422516345977783,
+      "epoch": 2.671957671957672,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9140625,
+      "kl": 0.001217540237121284,
+      "learning_rate": 5.34694963183022e-07,
+      "loss": -0.0013,
+      "num_tokens": 9817452.0,
+      "reward": 0.8229166269302368,
+      "reward_std": 0.2745841145515442,
+      "rewards/itbench_correctness/mean": 0.8229166269302368,
+      "rewards/itbench_correctness/std": 0.3303687572479248,
+      "step": 505,
+      "step_time": 82.16243282984942
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 677.0,
+      "completions/max_terminated_length": 677.0,
+      "completions/mean_length": 512.4375,
+      "completions/mean_terminated_length": 512.4375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5346993803977966,
+      "epoch": 2.677248677248677,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.7890625,
+      "kl": 0.0038210356142371893,
+      "learning_rate": 5.330452921628497e-07,
+      "loss": -0.1759,
+      "num_tokens": 9828603.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.21967849135398865,
+      "rewards/itbench_correctness/mean": 0.5416666865348816,
+      "rewards/itbench_correctness/std": 0.24152295291423798,
+      "step": 506,
+      "step_time": 130.50164964888245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 873.9375,
+      "completions/mean_terminated_length": 757.2222290039062,
+      "completions/min_length": 608.0,
+      "completions/min_terminated_length": 608.0,
+      "entropy": 0.446256160736084,
+      "epoch": 2.682539682539683,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.234375,
+      "kl": 0.0013827559305354953,
+      "learning_rate": 5.313952597646567e-07,
+      "loss": -0.0479,
+      "num_tokens": 9851170.0,
+      "reward": 0.1875,
+      "reward_std": 0.4082317352294922,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 507,
+      "step_time": 549.6535976743326
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 572.0,
+      "completions/max_terminated_length": 572.0,
+      "completions/mean_length": 426.5,
+      "completions/mean_terminated_length": 426.5,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.46189919114112854,
+      "epoch": 2.687830687830688,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7421875,
+      "kl": 0.0022915503941476345,
+      "learning_rate": 5.297448840329328e-07,
+      "loss": -0.0217,
+      "num_tokens": 9860442.0,
+      "reward": 0.2698863744735718,
+      "reward_std": 0.15896323323249817,
+      "rewards/itbench_correctness/mean": 0.2698863744735718,
+      "rewards/itbench_correctness/std": 0.2320520281791687,
+      "step": 508,
+      "step_time": 67.97736590728164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 781.4375,
+      "completions/mean_terminated_length": 746.7857666015625,
+      "completions/min_length": 478.0,
+      "completions/min_terminated_length": 478.0,
+      "entropy": 0.2853715121746063,
+      "epoch": 2.693121693121693,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.203125,
+      "kl": 0.0011138498084619641,
+      "learning_rate": 5.280941830159227e-07,
+      "loss": 0.0167,
+      "num_tokens": 9880081.0,
+      "reward": 0.34375,
+      "reward_std": 0.18600594997406006,
+      "rewards/itbench_correctness/mean": 0.34375,
+      "rewards/itbench_correctness/std": 0.4366062581539154,
+      "step": 509,
+      "step_time": 240.96920191589743
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 877.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 581.5625,
+      "completions/mean_terminated_length": 581.5625,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "entropy": 0.5089736580848694,
+      "epoch": 2.6984126984126986,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0546875,
+      "kl": 0.001449383096769452,
+      "learning_rate": 5.264431747654283e-07,
+      "loss": 0.0141,
+      "num_tokens": 9910954.0,
+      "reward": 0.34375,
+      "reward_std": 0.18600594997406006,
+      "rewards/itbench_correctness/mean": 0.34375,
+      "rewards/itbench_correctness/std": 0.4366062581539154,
+      "step": 510,
+      "step_time": 138.8211078811437
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 759.0,
+      "completions/max_terminated_length": 759.0,
+      "completions/mean_length": 600.4375,
+      "completions/mean_terminated_length": 600.4375,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "entropy": 0.3697304129600525,
+      "epoch": 2.7037037037037037,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1015625,
+      "kl": 0.001372107770293951,
+      "learning_rate": 5.247918773366111e-07,
+      "loss": 0.0158,
+      "num_tokens": 9925225.0,
+      "reward": 0.875,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 511,
+      "step_time": 78.60712255910039
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 521.0,
+      "completions/max_terminated_length": 521.0,
+      "completions/mean_length": 443.125,
+      "completions/mean_terminated_length": 443.125,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "entropy": 0.36784201860427856,
+      "epoch": 2.708994708994709,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.625,
+      "kl": 0.001885003293864429,
+      "learning_rate": 5.231403087877955e-07,
+      "loss": -0.0034,
+      "num_tokens": 9936491.0,
+      "reward": 0.5052083730697632,
+      "reward_std": 0.29658451676368713,
+      "rewards/itbench_correctness/mean": 0.5052083730697632,
+      "rewards/itbench_correctness/std": 0.4892064332962036,
+      "step": 512,
+      "step_time": 1101.7817776547745
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 630.0,
+      "completions/max_terminated_length": 630.0,
+      "completions/mean_length": 506.5,
+      "completions/mean_terminated_length": 506.5,
+      "completions/min_length": 346.0,
+      "completions/min_terminated_length": 346.0,
+      "entropy": 0.5034551024436951,
+      "epoch": 2.7142857142857144,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.296875,
+      "kl": 0.0027517788112163544,
+      "learning_rate": 5.214884871802703e-07,
+      "loss": -0.0104,
+      "num_tokens": 9958027.0,
+      "reward": 0.5333333015441895,
+      "reward_std": 0.24348656833171844,
+      "rewards/itbench_correctness/mean": 0.5333333015441895,
+      "rewards/itbench_correctness/std": 0.3538151979446411,
+      "step": 513,
+      "step_time": 115.8853734144941
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 503.375,
+      "completions/mean_terminated_length": 468.66668701171875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3277874290943146,
+      "epoch": 2.7195767195767195,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3125,
+      "kl": 0.005129899829626083,
+      "learning_rate": 5.198364305780921e-07,
+      "loss": -0.1331,
+      "num_tokens": 9970817.0,
+      "reward": 0.4270833432674408,
+      "reward_std": 0.3061639964580536,
+      "rewards/itbench_correctness/mean": 0.4270833432674408,
+      "rewards/itbench_correctness/std": 0.32185083627700806,
+      "step": 514,
+      "step_time": 81.30817873775959
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 763.0,
+      "completions/max_terminated_length": 763.0,
+      "completions/mean_length": 523.0,
+      "completions/mean_terminated_length": 523.0,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.41108986735343933,
+      "epoch": 2.7248677248677247,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.34375,
+      "kl": 0.0014420569641515613,
+      "learning_rate": 5.181841570478872e-07,
+      "loss": 0.0416,
+      "num_tokens": 9982881.0,
+      "reward": 0.9943181872367859,
+      "reward_std": 0.016070598736405373,
+      "rewards/itbench_correctness/mean": 0.9943181872367859,
+      "rewards/itbench_correctness/std": 0.02272726595401764,
+      "step": 515,
+      "step_time": 76.78445727284998
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 978.5,
+      "completions/mean_terminated_length": 878.4000244140625,
+      "completions/min_length": 560.0,
+      "completions/min_terminated_length": 560.0,
+      "entropy": 0.33316299319267273,
+      "epoch": 2.7301587301587302,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.875,
+      "kl": 0.0013756535481661558,
+      "learning_rate": 5.165316846586541e-07,
+      "loss": -0.0065,
+      "num_tokens": 10008745.0,
+      "reward": 0.2053571343421936,
+      "reward_std": 0.28105252981185913,
+      "rewards/itbench_correctness/mean": 0.2053571343421936,
+      "rewards/itbench_correctness/std": 0.32667672634124756,
+      "step": 516,
+      "step_time": 816.1864080894738
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 882.0,
+      "completions/max_terminated_length": 882.0,
+      "completions/mean_length": 624.875,
+      "completions/mean_terminated_length": 624.875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5921184420585632,
+      "epoch": 2.7354497354497354,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7890625,
+      "kl": 0.0018502407474443316,
+      "learning_rate": 5.148790314815662e-07,
+      "loss": -0.0101,
+      "num_tokens": 10025191.0,
+      "reward": 0.5062500238418579,
+      "reward_std": 0.25888073444366455,
+      "rewards/itbench_correctness/mean": 0.5062500238418579,
+      "rewards/itbench_correctness/std": 0.4753507673740387,
+      "step": 517,
+      "step_time": 72.83533152658492
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 836.0,
+      "completions/mean_length": 823.0625,
+      "completions/mean_terminated_length": 622.125,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.5637481808662415,
+      "epoch": 2.7407407407407405,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.609375,
+      "kl": 0.0017002089880406857,
+      "learning_rate": 5.132262155897738e-07,
+      "loss": -0.1006,
+      "num_tokens": 10054440.0,
+      "reward": 0.4895833134651184,
+      "reward_std": 0.3517908453941345,
+      "rewards/itbench_correctness/mean": 0.4895833134651184,
+      "rewards/itbench_correctness/std": 0.5072392821311951,
+      "step": 518,
+      "step_time": 96.34473600052297
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1003.0,
+      "completions/mean_length": 994.25,
+      "completions/mean_terminated_length": 786.0,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "entropy": 0.3198390603065491,
+      "epoch": 2.746031746031746,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.043212890625,
+      "kl": 0.0014571960782632232,
+      "learning_rate": 5.115732550582069e-07,
+      "loss": 0.0001,
+      "num_tokens": 10079516.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 519,
+      "step_time": 8666.9965882916
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 863.0,
+      "completions/mean_length": 863.5,
+      "completions/mean_terminated_length": 703.0,
+      "completions/min_length": 565.0,
+      "completions/min_terminated_length": 565.0,
+      "entropy": 0.5211349129676819,
+      "epoch": 2.751322751322751,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3515625,
+      "kl": 0.0015487850178033113,
+      "learning_rate": 5.099201679633768e-07,
+      "loss": 0.0001,
+      "num_tokens": 10101236.0,
+      "reward": 0.46875,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.185404971241951,
+      "step": 520,
+      "step_time": 312.3351803580299
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 725.0,
+      "completions/max_terminated_length": 725.0,
+      "completions/mean_length": 511.4375,
+      "completions/mean_terminated_length": 511.4375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4203837215900421,
+      "epoch": 2.7566137566137567,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.56640625,
+      "kl": 0.0017837980994954705,
+      "learning_rate": 5.082669723831793e-07,
+      "loss": -0.0992,
+      "num_tokens": 10119339.0,
+      "reward": 0.37708336114883423,
+      "reward_std": 0.0176776684820652,
+      "rewards/itbench_correctness/mean": 0.37708336114883423,
+      "rewards/itbench_correctness/std": 0.30005404353141785,
+      "step": 521,
+      "step_time": 109.74898790102452
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 950.0,
+      "completions/mean_length": 974.6875,
+      "completions/mean_terminated_length": 866.2000122070312,
+      "completions/min_length": 794.0,
+      "completions/min_terminated_length": 794.0,
+      "entropy": 0.3508816957473755,
+      "epoch": 2.761904761904762,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.984375,
+      "kl": 0.0012084582122042775,
+      "learning_rate": 5.066136863966962e-07,
+      "loss": 0.0043,
+      "num_tokens": 10142334.0,
+      "reward": 0.46875,
+      "reward_std": 0.28270021080970764,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.4035433530807495,
+      "step": 522,
+      "step_time": 106.33708533085883
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 889.0,
+      "completions/mean_length": 875.5625,
+      "completions/mean_terminated_length": 760.1111450195312,
+      "completions/min_length": 601.0,
+      "completions/min_terminated_length": 601.0,
+      "entropy": 0.5824826955795288,
+      "epoch": 2.7671957671957674,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.484375,
+      "kl": 0.0019700033590197563,
+      "learning_rate": 5.049603280839982e-07,
+      "loss": -0.0028,
+      "num_tokens": 10170263.0,
+      "reward": 0.015625,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.015625,
+      "rewards/itbench_correctness/std": 0.0625,
+      "step": 523,
+      "step_time": 73.45079297944903
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 543.125,
+      "completions/mean_terminated_length": 543.125,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "entropy": 0.5413118600845337,
+      "epoch": 2.7724867724867726,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.46875,
+      "kl": 0.00216845516115427,
+      "learning_rate": 5.033069155259471e-07,
+      "loss": -0.0028,
+      "num_tokens": 10185017.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 524,
+      "step_time": 89.7467988235876
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1024.0,
+      "completions/mean_length": 734.1875,
+      "completions/mean_terminated_length": 692.7857666015625,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.6020260453224182,
+      "epoch": 2.7777777777777777,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9609375,
+      "kl": 0.0015728548169136047,
+      "learning_rate": 5.016534668039976e-07,
+      "loss": -0.0002,
+      "num_tokens": 10209820.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.235702246427536,
+      "rewards/itbench_correctness/mean": 0.5416666865348816,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 525,
+      "step_time": 208.850717083551
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 974.0,
+      "completions/mean_length": 1010.375,
+      "completions/mean_terminated_length": 951.3333740234375,
+      "completions/min_length": 938.0,
+      "completions/min_terminated_length": 938.0,
+      "entropy": 0.522578239440918,
+      "epoch": 2.7830687830687832,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.484375,
+      "kl": 0.00148448022082448,
+      "learning_rate": 5e-07,
+      "loss": 0.0001,
+      "num_tokens": 10238170.0,
+      "reward": 0.171875,
+      "reward_std": 0.188242569565773,
+      "rewards/itbench_correctness/mean": 0.171875,
+      "rewards/itbench_correctness/std": 0.3125,
+      "step": 526,
+      "step_time": 942.4442430688068
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 950.0,
+      "completions/mean_length": 841.5,
+      "completions/mean_terminated_length": 659.0,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 0.45632797479629517,
+      "epoch": 2.7883597883597884,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.21875,
+      "kl": 0.001308864215388894,
+      "learning_rate": 4.983465331960023e-07,
+      "loss": -0.0154,
+      "num_tokens": 10258074.0,
+      "reward": 0.5531250238418579,
+      "reward_std": 0.24483326077461243,
+      "rewards/itbench_correctness/mean": 0.5531250238418579,
+      "rewards/itbench_correctness/std": 0.421295166015625,
+      "step": 527,
+      "step_time": 177.91924435272813
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 511.0,
+      "completions/max_terminated_length": 511.0,
+      "completions/mean_length": 325.75,
+      "completions/mean_terminated_length": 325.75,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.33614733815193176,
+      "epoch": 2.7936507936507935,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0859375,
+      "kl": 0.00386231392621994,
+      "learning_rate": 4.96693084474053e-07,
+      "loss": -0.0765,
+      "num_tokens": 10269622.0,
+      "reward": 0.5625,
+      "reward_std": 0.3471825420856476,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 528,
+      "step_time": 59.13615032006055
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 936.0,
+      "completions/mean_length": 629.375,
+      "completions/mean_terminated_length": 450.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.46077457070350647,
+      "epoch": 2.798941798941799,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9765625,
+      "kl": 0.0026442273519933224,
+      "learning_rate": 4.950396719160018e-07,
+      "loss": -0.0582,
+      "num_tokens": 10287964.0,
+      "reward": 0.3645833432674408,
+      "reward_std": 0.3061639964580536,
+      "rewards/itbench_correctness/mean": 0.3645833432674408,
+      "rewards/itbench_correctness/std": 0.3507597744464874,
+      "step": 529,
+      "step_time": 269.48390776105225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 828.0,
+      "completions/mean_length": 713.5,
+      "completions/mean_terminated_length": 527.2000122070312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.6839523315429688,
+      "epoch": 2.804232804232804,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.65234375,
+      "kl": 0.0018830286571756005,
+      "learning_rate": 4.933863136033039e-07,
+      "loss": -0.1144,
+      "num_tokens": 10317900.0,
+      "reward": 0.3072916567325592,
+      "reward_std": 0.19150808453559875,
+      "rewards/itbench_correctness/mean": 0.3072916567325592,
+      "rewards/itbench_correctness/std": 0.4113198518753052,
+      "step": 530,
+      "step_time": 151.5169429546222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 462.0,
+      "completions/mean_length": 717.125,
+      "completions/mean_terminated_length": 410.25,
+      "completions/min_length": 339.0,
+      "completions/min_terminated_length": 339.0,
+      "entropy": 0.5047934651374817,
+      "epoch": 2.8095238095238093,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0322265625,
+      "kl": 0.0017540534026920795,
+      "learning_rate": 4.917330276168208e-07,
+      "loss": 0.0,
+      "num_tokens": 10342214.0,
+      "reward": 0.699999988079071,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.699999988079071,
+      "rewards/itbench_correctness/std": 0.3098386824131012,
+      "step": 531,
+      "step_time": 203.65224741771817
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 872.0,
+      "completions/max_terminated_length": 872.0,
+      "completions/mean_length": 629.5625,
+      "completions/mean_terminated_length": 629.5625,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "entropy": 0.3589794635772705,
+      "epoch": 2.814814814814815,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5703125,
+      "kl": 0.0014826505212113261,
+      "learning_rate": 4.900798320366232e-07,
+      "loss": -0.017,
+      "num_tokens": 10355927.0,
+      "reward": 0.7332720756530762,
+      "reward_std": 0.21437928080558777,
+      "rewards/itbench_correctness/mean": 0.7332720756530762,
+      "rewards/itbench_correctness/std": 0.37859466671943665,
+      "step": 532,
+      "step_time": 348.23073250520974
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.392578125,
+      "epoch": 2.82010582010582,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.21875,
+      "kl": 0.0014328202232718468,
+      "learning_rate": 4.88426744941793e-07,
+      "loss": 0.0001,
+      "num_tokens": 10384407.0,
+      "reward": 0.8541666269302368,
+      "reward_std": 0.2482243776321411,
+      "rewards/itbench_correctness/mean": 0.8541666269302368,
+      "rewards/itbench_correctness/std": 0.26440009474754333,
+      "step": 533,
+      "step_time": 118.49520284496248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 630.0,
+      "completions/max_terminated_length": 630.0,
+      "completions/mean_length": 498.25,
+      "completions/mean_terminated_length": 498.25,
+      "completions/min_length": 418.0,
+      "completions/min_terminated_length": 418.0,
+      "entropy": 0.38534870743751526,
+      "epoch": 2.825396825396825,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4921875,
+      "kl": 0.002330408664420247,
+      "learning_rate": 4.86773784410226e-07,
+      "loss": 0.0162,
+      "num_tokens": 10399339.0,
+      "reward": 0.46875,
+      "reward_std": 0.2609178125858307,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.4312717914581299,
+      "step": 534,
+      "step_time": 567.9226626912132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 921.625,
+      "completions/mean_terminated_length": 790.0000610351562,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.2951308786869049,
+      "epoch": 2.8306878306878307,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0556640625,
+      "kl": 0.0014564162120223045,
+      "learning_rate": 4.851209685184338e-07,
+      "loss": 0.0001,
+      "num_tokens": 10424093.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 535,
+      "step_time": 216.71312026213855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1005.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 733.0,
+      "completions/mean_terminated_length": 733.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.45839017629623413,
+      "epoch": 2.835978835978836,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0234375,
+      "kl": 0.0023340799380093813,
+      "learning_rate": 4.834683153413459e-07,
+      "loss": -0.1065,
+      "num_tokens": 10440501.0,
+      "reward": 0.856249988079071,
+      "reward_std": 0.17614421248435974,
+      "rewards/itbench_correctness/mean": 0.856249988079071,
+      "rewards/itbench_correctness/std": 0.2827690541744232,
+      "step": 536,
+      "step_time": 74.85195223800838
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 987.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 563.875,
+      "completions/mean_terminated_length": 563.875,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "entropy": 0.48237642645835876,
+      "epoch": 2.8412698412698414,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01373291015625,
+      "kl": 0.001148638199083507,
+      "learning_rate": 4.818158429521129e-07,
+      "loss": 0.0,
+      "num_tokens": 10452811.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 1.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 537,
+      "step_time": 238.9646631795913
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 900.0,
+      "completions/max_terminated_length": 900.0,
+      "completions/mean_length": 613.0,
+      "completions/mean_terminated_length": 613.0,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "entropy": 0.48939642310142517,
+      "epoch": 2.8465608465608465,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.25,
+      "kl": 0.0019839375745505095,
+      "learning_rate": 4.801635694219079e-07,
+      "loss": 0.0307,
+      "num_tokens": 10466963.0,
+      "reward": 0.484375,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.484375,
+      "rewards/itbench_correctness/std": 0.503891110420227,
+      "step": 538,
+      "step_time": 778.2056272830814
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 965.0,
+      "completions/mean_length": 868.4375,
+      "completions/mean_terminated_length": 747.4444580078125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.27981287240982056,
+      "epoch": 2.851851851851852,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.62890625,
+      "kl": 0.0014373651938512921,
+      "learning_rate": 4.785115128197298e-07,
+      "loss": -0.1155,
+      "num_tokens": 10487634.0,
+      "reward": 0.3897058963775635,
+      "reward_std": 0.16693422198295593,
+      "rewards/itbench_correctness/mean": 0.3897058963775635,
+      "rewards/itbench_correctness/std": 0.46261632442474365,
+      "step": 539,
+      "step_time": 163.97635082527995
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 985.0,
+      "completions/mean_length": 695.375,
+      "completions/mean_terminated_length": 546.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5637246370315552,
+      "epoch": 2.857142857142857,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.734375,
+      "kl": 0.0018984549678862095,
+      "learning_rate": 4.768596912122045e-07,
+      "loss": -0.1365,
+      "num_tokens": 10528456.0,
+      "reward": 0.375,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 540,
+      "step_time": 148.45136263035238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 971.5,
+      "completions/mean_terminated_length": 856.0,
+      "completions/min_length": 558.0,
+      "completions/min_terminated_length": 558.0,
+      "entropy": 0.4776119291782379,
+      "epoch": 2.8624338624338623,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.84375,
+      "kl": 0.0011911564506590366,
+      "learning_rate": 4.752081226633888e-07,
+      "loss": 0.0568,
+      "num_tokens": 10571880.0,
+      "reward": 0.5,
+      "reward_std": 0.3535533845424652,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 541,
+      "step_time": 150.04778977762908
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 988.6875,
+      "completions/mean_terminated_length": 882.75,
+      "completions/min_length": 781.0,
+      "completions/min_terminated_length": 781.0,
+      "entropy": 0.5259498357772827,
+      "epoch": 2.867724867724868,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04052734375,
+      "kl": 0.001533015980385244,
+      "learning_rate": 4.7355682523457173e-07,
+      "loss": 0.0001,
+      "num_tokens": 10605523.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 542,
+      "step_time": 117.57136417739093
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 840.0,
+      "completions/max_terminated_length": 840.0,
+      "completions/mean_length": 550.3125,
+      "completions/mean_terminated_length": 550.3125,
+      "completions/min_length": 337.0,
+      "completions/min_terminated_length": 337.0,
+      "entropy": 0.523339033126831,
+      "epoch": 2.873015873015873,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.05419921875,
+      "kl": 0.002573953941464424,
+      "learning_rate": 4.719058169840772e-07,
+      "loss": 0.0001,
+      "num_tokens": 10628208.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 543,
+      "step_time": 99.59436613786966
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 718.0,
+      "completions/mean_length": 829.5,
+      "completions/mean_terminated_length": 635.0,
+      "completions/min_length": 556.0,
+      "completions/min_terminated_length": 556.0,
+      "entropy": 0.3833634853363037,
+      "epoch": 2.878306878306878,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8359375,
+      "kl": 0.0010798868024721742,
+      "learning_rate": 4.702551159670672e-07,
+      "loss": -0.0133,
+      "num_tokens": 10649920.0,
+      "reward": 0.4388020932674408,
+      "reward_std": 0.24691906571388245,
+      "rewards/itbench_correctness/mean": 0.4388020932674408,
+      "rewards/itbench_correctness/std": 0.3513753414154053,
+      "step": 544,
+      "step_time": 112.22929359227419
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 786.3125,
+      "completions/mean_terminated_length": 643.7000122070312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5061600804328918,
+      "epoch": 2.8835978835978837,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.56640625,
+      "kl": 0.001771087758243084,
+      "learning_rate": 4.686047402353433e-07,
+      "loss": -0.1004,
+      "num_tokens": 10672789.0,
+      "reward": 0.3177083432674408,
+      "reward_std": 0.129746213555336,
+      "rewards/itbench_correctness/mean": 0.3177083432674408,
+      "rewards/itbench_correctness/std": 0.37294963002204895,
+      "step": 545,
+      "step_time": 111.85205744486302
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 842.0,
+      "completions/max_terminated_length": 842.0,
+      "completions/mean_length": 618.625,
+      "completions/mean_terminated_length": 618.625,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.2796524465084076,
+      "epoch": 2.888888888888889,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.203125,
+      "kl": 0.0013817870058119297,
+      "learning_rate": 4.669547078371503e-07,
+      "loss": -0.0018,
+      "num_tokens": 10688151.0,
+      "reward": 0.6770833730697632,
+      "reward_std": 0.08258593082427979,
+      "rewards/itbench_correctness/mean": 0.6770833730697632,
+      "rewards/itbench_correctness/std": 0.3520771563053131,
+      "step": 546,
+      "step_time": 128.49444034136832
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 718.5,
+      "completions/mean_terminated_length": 698.1333618164062,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.5622825622558594,
+      "epoch": 2.894179894179894,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3203125,
+      "kl": 0.0019919220358133316,
+      "learning_rate": 4.6530503681697796e-07,
+      "loss": 0.0318,
+      "num_tokens": 10714559.0,
+      "reward": 0.17499999701976776,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.17499999701976776,
+      "rewards/itbench_correctness/std": 0.19832633435726166,
+      "step": 547,
+      "step_time": 282.7451619775966
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 745.0,
+      "completions/max_terminated_length": 745.0,
+      "completions/mean_length": 453.4375,
+      "completions/mean_terminated_length": 453.4375,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "entropy": 0.4057891070842743,
+      "epoch": 2.8994708994708995,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.140625,
+      "kl": 0.0013302209554240108,
+      "learning_rate": 4.6365574521536446e-07,
+      "loss": 0.0067,
+      "num_tokens": 10725190.0,
+      "reward": 0.234375,
+      "reward_std": 0.12387890368700027,
+      "rewards/itbench_correctness/mean": 0.234375,
+      "rewards/itbench_correctness/std": 0.29536348581314087,
+      "step": 548,
+      "step_time": 440.6484692748636
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 796.0,
+      "completions/mean_length": 755.5625,
+      "completions/mean_terminated_length": 594.5,
+      "completions/min_length": 406.0,
+      "completions/min_terminated_length": 406.0,
+      "entropy": 0.5479361414909363,
+      "epoch": 2.9047619047619047,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.28125,
+      "kl": 0.002523271832615137,
+      "learning_rate": 4.620068510686984e-07,
+      "loss": 0.0625,
+      "num_tokens": 10743759.0,
+      "reward": 0.4166666865348816,
+      "reward_std": 0.3667176067829132,
+      "rewards/itbench_correctness/mean": 0.4166666865348816,
+      "rewards/itbench_correctness/std": 0.38005849719047546,
+      "step": 549,
+      "step_time": 73.56386850681156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 692.0,
+      "completions/mean_terminated_length": 433.77777099609375,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.3309248685836792,
+      "epoch": 2.91005291005291,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.001961898524314165,
+      "learning_rate": 4.60358372409022e-07,
+      "loss": 0.0117,
+      "num_tokens": 10761847.0,
+      "reward": 0.5104166865348816,
+      "reward_std": 0.2609047293663025,
+      "rewards/itbench_correctness/mean": 0.5104166865348816,
+      "rewards/itbench_correctness/std": 0.27533650398254395,
+      "step": 550,
+      "step_time": 100.34413592051715
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 671.0,
+      "completions/max_terminated_length": 671.0,
+      "completions/mean_length": 539.6875,
+      "completions/mean_terminated_length": 539.6875,
+      "completions/min_length": 418.0,
+      "completions/min_terminated_length": 418.0,
+      "entropy": 0.4928778111934662,
+      "epoch": 2.9153439153439153,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.002205328783020377,
+      "learning_rate": 4.5871032726383385e-07,
+      "loss": -0.008,
+      "num_tokens": 10785042.0,
+      "reward": 0.3125,
+      "reward_std": 0.44403791427612305,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 551,
+      "step_time": 102.3933826405555
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 950.9375,
+      "completions/mean_terminated_length": 894.1111450195312,
+      "completions/min_length": 639.0,
+      "completions/min_terminated_length": 639.0,
+      "entropy": 0.3764705955982208,
+      "epoch": 2.9206349206349205,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5859375,
+      "kl": 0.0018417153041809797,
+      "learning_rate": 4.5706273365589144e-07,
+      "loss": -0.0023,
+      "num_tokens": 10806801.0,
+      "reward": 0.2447916567325592,
+      "reward_std": 0.20343953371047974,
+      "rewards/itbench_correctness/mean": 0.2447916567325592,
+      "rewards/itbench_correctness/std": 0.37573233246803284,
+      "step": 552,
+      "step_time": 254.7854423839599
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 981.0,
+      "completions/max_terminated_length": 981.0,
+      "completions/mean_length": 713.875,
+      "completions/mean_terminated_length": 713.875,
+      "completions/min_length": 475.0,
+      "completions/min_terminated_length": 475.0,
+      "entropy": 0.4454561471939087,
+      "epoch": 2.925925925925926,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.921875,
+      "kl": 0.0015814114594832063,
+      "learning_rate": 4.554156096030148e-07,
+      "loss": 0.0134,
+      "num_tokens": 10823391.0,
+      "reward": 0.859375,
+      "reward_std": 0.17926117777824402,
+      "rewards/itbench_correctness/mean": 0.859375,
+      "rewards/itbench_correctness/std": 0.17405499517917633,
+      "step": 553,
+      "step_time": 135.78249835129827
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 1012.5625,
+      "completions/mean_terminated_length": 978.25,
+      "completions/min_length": 935.0,
+      "completions/min_terminated_length": 935.0,
+      "entropy": 0.30615395307540894,
+      "epoch": 2.931216931216931,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.6328125,
+      "kl": 0.001216786215081811,
+      "learning_rate": 4.5376897311788825e-07,
+      "loss": -0.002,
+      "num_tokens": 10847512.0,
+      "reward": 0.1041666716337204,
+      "reward_std": 0.12400396913290024,
+      "rewards/itbench_correctness/mean": 0.1041666716337204,
+      "rewards/itbench_correctness/std": 0.2006932497024536,
+      "step": 554,
+      "step_time": 7353.796993748285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 888.0,
+      "completions/mean_length": 883.5625,
+      "completions/mean_terminated_length": 743.125,
+      "completions/min_length": 659.0,
+      "completions/min_terminated_length": 659.0,
+      "entropy": 0.6835962533950806,
+      "epoch": 2.9365079365079367,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5625,
+      "kl": 0.001729651470668614,
+      "learning_rate": 4.521228422078649e-07,
+      "loss": 0.0001,
+      "num_tokens": 10879377.0,
+      "reward": 0.1484375,
+      "reward_std": 0.17971175909042358,
+      "rewards/itbench_correctness/mean": 0.1484375,
+      "rewards/itbench_correctness/std": 0.2894634008407593,
+      "step": 555,
+      "step_time": 211.0395448282361
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 859.25,
+      "completions/mean_terminated_length": 731.1111450195312,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.4469013810157776,
+      "epoch": 2.941798941798942,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.001122955116443336,
+      "learning_rate": 4.5047723487476864e-07,
+      "loss": 0.0002,
+      "num_tokens": 10909653.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 556,
+      "step_time": 91.0195178175345
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 774.0,
+      "completions/max_terminated_length": 774.0,
+      "completions/mean_length": 452.875,
+      "completions/mean_terminated_length": 452.875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.47474467754364014,
+      "epoch": 2.947089947089947,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5390625,
+      "kl": 0.0027064280584454536,
+      "learning_rate": 4.488321691146975e-07,
+      "loss": -0.0516,
+      "num_tokens": 10919539.0,
+      "reward": 0.75,
+      "reward_std": 0.4629100561141968,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 557,
+      "step_time": 96.5944811757654
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 785.0,
+      "completions/max_terminated_length": 785.0,
+      "completions/mean_length": 443.125,
+      "completions/mean_terminated_length": 443.125,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "entropy": 0.4874471127986908,
+      "epoch": 2.9523809523809526,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4375,
+      "kl": 0.0014341843780130148,
+      "learning_rate": 4.4718766291782723e-07,
+      "loss": 0.0231,
+      "num_tokens": 10928989.0,
+      "reward": 0.1875,
+      "reward_std": 0.4082317352294922,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 558,
+      "step_time": 76.11392251215875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1012.0,
+      "completions/mean_length": 683.625,
+      "completions/mean_terminated_length": 528.9091186523438,
+      "completions/min_length": 279.0,
+      "completions/min_terminated_length": 279.0,
+      "entropy": 0.38032546639442444,
+      "epoch": 2.9576719576719577,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4375,
+      "kl": 0.0016363576287403703,
+      "learning_rate": 4.4554373426821367e-07,
+      "loss": -0.0025,
+      "num_tokens": 10944967.0,
+      "reward": 0.4375,
+      "reward_std": 0.2077372521162033,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.291070818901062,
+      "step": 559,
+      "step_time": 134.31548726093024
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 887.0,
+      "completions/max_terminated_length": 887.0,
+      "completions/mean_length": 681.75,
+      "completions/mean_terminated_length": 681.75,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.3021635413169861,
+      "epoch": 2.962962962962963,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.609375,
+      "kl": 0.0013805434573441744,
+      "learning_rate": 4.439004011435979e-07,
+      "loss": 0.0081,
+      "num_tokens": 10961347.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.21535253524780273,
+      "rewards/itbench_correctness/mean": 0.5416666865348816,
+      "rewards/itbench_correctness/std": 0.4238273799419403,
+      "step": 560,
+      "step_time": 93.65936294849962
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 731.3125,
+      "completions/mean_terminated_length": 633.75,
+      "completions/min_length": 435.0,
+      "completions/min_terminated_length": 435.0,
+      "entropy": 0.43756943941116333,
+      "epoch": 2.9682539682539684,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.734375,
+      "kl": 0.0011807921109721065,
+      "learning_rate": 4.4225768151520694e-07,
+      "loss": 0.0115,
+      "num_tokens": 10977016.0,
+      "reward": 0.5282738208770752,
+      "reward_std": 0.09272660315036774,
+      "rewards/itbench_correctness/mean": 0.5282738208770752,
+      "rewards/itbench_correctness/std": 0.390090674161911,
+      "step": 561,
+      "step_time": 83.44914623722434
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 991.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 700.75,
+      "completions/mean_terminated_length": 700.75,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "entropy": 0.54513019323349,
+      "epoch": 2.9735449735449735,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.98828125,
+      "kl": 0.0026250071823596954,
+      "learning_rate": 4.406155933475599e-07,
+      "loss": 0.0199,
+      "num_tokens": 11001308.0,
+      "reward": 0.9479166269302368,
+      "reward_std": 0.043129097670316696,
+      "rewards/itbench_correctness/mean": 0.9479166269302368,
+      "rewards/itbench_correctness/std": 0.07978560030460358,
+      "step": 562,
+      "step_time": 114.21156205888838
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 927.0,
+      "completions/mean_length": 708.8125,
+      "completions/mean_terminated_length": 636.0769653320312,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.2962701618671417,
+      "epoch": 2.9788359788359786,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.890625,
+      "kl": 0.001674781320616603,
+      "learning_rate": 4.389741545982699e-07,
+      "loss": -0.0206,
+      "num_tokens": 11017705.0,
+      "reward": 0.2916666865348816,
+      "reward_std": 0.4096291959285736,
+      "rewards/itbench_correctness/mean": 0.2916666865348816,
+      "rewards/itbench_correctness/std": 0.4013864994049072,
+      "step": 563,
+      "step_time": 173.21209927741438
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 679.0,
+      "completions/max_terminated_length": 679.0,
+      "completions/mean_length": 483.6875,
+      "completions/mean_terminated_length": 483.6875,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "entropy": 0.6161002516746521,
+      "epoch": 2.984126984126984,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3203125,
+      "kl": 0.0027410590555518866,
+      "learning_rate": 4.3733338321784777e-07,
+      "loss": 0.0192,
+      "num_tokens": 11029732.0,
+      "reward": 0.40625,
+      "reward_std": 0.08258593827486038,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.43448033928871155,
+      "step": 564,
+      "step_time": 86.51283952593803
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 788.0,
+      "completions/mean_length": 822.0625,
+      "completions/mean_terminated_length": 665.0,
+      "completions/min_length": 388.0,
+      "completions/min_terminated_length": 388.0,
+      "entropy": 0.3941306173801422,
+      "epoch": 2.9894179894179893,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.28125,
+      "kl": 0.0017499164678156376,
+      "learning_rate": 4.3569329714950703e-07,
+      "loss": -0.0189,
+      "num_tokens": 11051933.0,
+      "reward": 0.21875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.20155644416809082,
+      "step": 565,
+      "step_time": 424.9380031451583
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 931.5,
+      "completions/mean_terminated_length": 839.0,
+      "completions/min_length": 676.0,
+      "completions/min_terminated_length": 676.0,
+      "entropy": 0.5625335574150085,
+      "epoch": 2.9947089947089944,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4453125,
+      "kl": 0.0013327021151781082,
+      "learning_rate": 4.340539143289655e-07,
+      "loss": 0.0,
+      "num_tokens": 11079021.0,
+      "reward": 0.11249999701976776,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.11249999701976776,
+      "rewards/itbench_correctness/std": 0.24186775088310242,
+      "step": 566,
+      "step_time": 103.31370590813458
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 806.8125,
+      "completions/mean_terminated_length": 756.6923217773438,
+      "completions/min_length": 516.0,
+      "completions/min_terminated_length": 516.0,
+      "entropy": 0.5825393199920654,
+      "epoch": 3.0,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.203125,
+      "kl": 0.0027419670950621367,
+      "learning_rate": 4.324152526842517e-07,
+      "loss": 0.0188,
+      "num_tokens": 11105082.0,
+      "reward": 0.3854166865348816,
+      "reward_std": 0.46477773785591125,
+      "rewards/itbench_correctness/mean": 0.3854166865348816,
+      "rewards/itbench_correctness/std": 0.4702983796596527,
+      "step": 567,
+      "step_time": 128.40436456073076
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 758.0,
+      "completions/mean_length": 836.625,
+      "completions/mean_terminated_length": 649.25,
+      "completions/min_length": 551.0,
+      "completions/min_terminated_length": 551.0,
+      "entropy": 0.3920513987541199,
+      "epoch": 3.005291005291005,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3359375,
+      "kl": 0.0014088655589148402,
+      "learning_rate": 4.307773301355062e-07,
+      "loss": 0.0,
+      "num_tokens": 11133620.0,
+      "reward": 0.34166669845581055,
+      "reward_std": 0.1725163757801056,
+      "rewards/itbench_correctness/mean": 0.34166669845581055,
+      "rewards/itbench_correctness/std": 0.3432955741882324,
+      "step": 568,
+      "step_time": 109.38008708879352
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 454.0,
+      "completions/mean_length": 684.5625,
+      "completions/mean_terminated_length": 345.125,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "entropy": 0.5375696420669556,
+      "epoch": 3.0105820105820107,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8671875,
+      "kl": 0.0021559942979365587,
+      "learning_rate": 4.2914016459478786e-07,
+      "loss": 0.0188,
+      "num_tokens": 11152141.0,
+      "reward": 0.4140625,
+      "reward_std": 0.24306795001029968,
+      "rewards/itbench_correctness/mean": 0.4140625,
+      "rewards/itbench_correctness/std": 0.2446032166481018,
+      "step": 569,
+      "step_time": 244.70517920982093
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 738.1875,
+      "completions/mean_terminated_length": 608.2727661132812,
+      "completions/min_length": 415.0,
+      "completions/min_terminated_length": 415.0,
+      "entropy": 0.5039370059967041,
+      "epoch": 3.015873015873016,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.25,
+      "kl": 0.0019608521834015846,
+      "learning_rate": 4.275037739658771e-07,
+      "loss": 0.0055,
+      "num_tokens": 11174272.0,
+      "reward": 0.0989583358168602,
+      "reward_std": 0.03100099228322506,
+      "rewards/itbench_correctness/mean": 0.0989583358168602,
+      "rewards/itbench_correctness/std": 0.11063265055418015,
+      "step": 570,
+      "step_time": 694.2117186943069
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 720.5625,
+      "completions/mean_terminated_length": 538.5,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "entropy": 0.5023853182792664,
+      "epoch": 3.0211640211640214,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.026611328125,
+      "kl": 0.0014148685149848461,
+      "learning_rate": 4.258681761440789e-07,
+      "loss": 0.0,
+      "num_tokens": 11210273.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 571,
+      "step_time": 249.2116071432829
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 816.0,
+      "completions/mean_length": 720.6875,
+      "completions/mean_terminated_length": 484.77777099609375,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 0.3996184170246124,
+      "epoch": 3.0264550264550265,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.40625,
+      "kl": 0.0012311309110373259,
+      "learning_rate": 4.2423338901602983e-07,
+      "loss": 0.0147,
+      "num_tokens": 11229236.0,
+      "reward": 0.7708333730697632,
+      "reward_std": 0.19795581698417664,
+      "rewards/itbench_correctness/mean": 0.7708333730697632,
+      "rewards/itbench_correctness/std": 0.35939764976501465,
+      "step": 572,
+      "step_time": 247.85055056307465
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 813.0,
+      "completions/max_terminated_length": 813.0,
+      "completions/mean_length": 520.125,
+      "completions/mean_terminated_length": 520.125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.35183849930763245,
+      "epoch": 3.0317460317460316,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.03125,
+      "kl": 0.002790507161989808,
+      "learning_rate": 4.225994304594993e-07,
+      "loss": -0.0321,
+      "num_tokens": 11242278.0,
+      "reward": 0.1875,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 573,
+      "step_time": 426.5937115754932
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1024.0,
+      "completions/mean_length": 835.625,
+      "completions/mean_terminated_length": 792.1538696289062,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.48107704520225525,
+      "epoch": 3.037037037037037,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9921875,
+      "kl": 0.0021311482414603233,
+      "learning_rate": 4.2096631834319687e-07,
+      "loss": -0.0252,
+      "num_tokens": 11264776.0,
+      "reward": 0.265625,
+      "reward_std": 0.2204262614250183,
+      "rewards/itbench_correctness/mean": 0.265625,
+      "rewards/itbench_correctness/std": 0.2183031290769577,
+      "step": 574,
+      "step_time": 419.11753554455936
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 994.0,
+      "completions/max_terminated_length": 994.0,
+      "completions/mean_length": 638.6875,
+      "completions/mean_terminated_length": 638.6875,
+      "completions/min_length": 485.0,
+      "completions/min_terminated_length": 485.0,
+      "entropy": 0.375770628452301,
+      "epoch": 3.0423280423280423,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.40625,
+      "kl": 0.0015504560433328152,
+      "learning_rate": 4.193340705265745e-07,
+      "loss": 0.0162,
+      "num_tokens": 11285235.0,
+      "reward": 0.47968751192092896,
+      "reward_std": 0.17499202489852905,
+      "rewards/itbench_correctness/mean": 0.47968751192092896,
+      "rewards/itbench_correctness/std": 0.4592764973640442,
+      "step": 575,
+      "step_time": 152.97607036307454
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 951.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 607.8125,
+      "completions/mean_terminated_length": 607.8125,
+      "completions/min_length": 377.0,
+      "completions/min_terminated_length": 377.0,
+      "entropy": 0.39650386571884155,
+      "epoch": 3.0476190476190474,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.0011127422330901027,
+      "learning_rate": 4.1770270485963294e-07,
+      "loss": -0.0156,
+      "num_tokens": 11298640.0,
+      "reward": 0.6644607782363892,
+      "reward_std": 0.04214790090918541,
+      "rewards/itbench_correctness/mean": 0.6644607782363892,
+      "rewards/itbench_correctness/std": 0.2791382670402527,
+      "step": 576,
+      "step_time": 166.16624604724348
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 808.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 698.3125,
+      "completions/mean_terminated_length": 698.3125,
+      "completions/min_length": 625.0,
+      "completions/min_terminated_length": 625.0,
+      "entropy": 0.3837823271751404,
+      "epoch": 3.052910052910053,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.140625,
+      "kl": 0.0011828916613012552,
+      "learning_rate": 4.1607223918272614e-07,
+      "loss": -0.002,
+      "num_tokens": 11313829.0,
+      "reward": 0.625,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 577,
+      "step_time": 81.87011110130697
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 592.0,
+      "completions/max_terminated_length": 592.0,
+      "completions/mean_length": 517.3125,
+      "completions/mean_terminated_length": 517.3125,
+      "completions/min_length": 392.0,
+      "completions/min_terminated_length": 392.0,
+      "entropy": 0.3595505654811859,
+      "epoch": 3.058201058201058,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.125,
+      "kl": 0.001632296247407794,
+      "learning_rate": 4.1444269132636494e-07,
+      "loss": 0.0011,
+      "num_tokens": 11325674.0,
+      "reward": 0.96875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.96875,
+      "rewards/itbench_correctness/std": 0.125,
+      "step": 578,
+      "step_time": 853.527063309215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 936.0,
+      "completions/mean_length": 753.4375,
+      "completions/mean_terminated_length": 630.45458984375,
+      "completions/min_length": 386.0,
+      "completions/min_terminated_length": 386.0,
+      "entropy": 0.5415180325508118,
+      "epoch": 3.0634920634920633,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 2.390625,
+      "kl": 0.016372833400964737,
+      "learning_rate": 4.1281407911102424e-07,
+      "loss": 0.0314,
+      "num_tokens": 11359377.0,
+      "reward": 0.0234375,
+      "reward_std": 0.03234682232141495,
+      "rewards/itbench_correctness/mean": 0.0234375,
+      "rewards/itbench_correctness/std": 0.050389111042022705,
+      "step": 579,
+      "step_time": 93.2713974667713
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 772.0,
+      "completions/mean_length": 592.8125,
+      "completions/mean_terminated_length": 531.2142944335938,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.4048497676849365,
+      "epoch": 3.068783068783069,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.0021097399294376373,
+      "learning_rate": 4.1118642034694565e-07,
+      "loss": 0.0827,
+      "num_tokens": 11371702.0,
+      "reward": 0.34375,
+      "reward_std": 0.21564549207687378,
+      "rewards/itbench_correctness/mean": 0.34375,
+      "rewards/itbench_correctness/std": 0.46135368943214417,
+      "step": 580,
+      "step_time": 78.31074696686119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 749.0,
+      "completions/max_terminated_length": 749.0,
+      "completions/mean_length": 500.0,
+      "completions/mean_terminated_length": 500.0,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "entropy": 0.4180000126361847,
+      "epoch": 3.074074074074074,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.875,
+      "kl": 0.0022473859135061502,
+      "learning_rate": 4.095597328339452e-07,
+      "loss": 0.003,
+      "num_tokens": 11395878.0,
+      "reward": 0.75,
+      "reward_std": 0.4355512857437134,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 581,
+      "step_time": 106.9057395812124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 953.8125,
+      "completions/mean_terminated_length": 462.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5346962809562683,
+      "epoch": 3.0793650793650795,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.25,
+      "kl": 0.0019432251574471593,
+      "learning_rate": 4.079340343612164e-07,
+      "loss": -0.0605,
+      "num_tokens": 11434283.0,
+      "reward": 0.1145833358168602,
+      "reward_std": 0.17747542262077332,
+      "rewards/itbench_correctness/mean": 0.1145833358168602,
+      "rewards/itbench_correctness/std": 0.17969882488250732,
+      "step": 582,
+      "step_time": 153.69186680205166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 930.5,
+      "completions/mean_terminated_length": 810.2857666015625,
+      "completions/min_length": 720.0,
+      "completions/min_terminated_length": 720.0,
+      "entropy": 0.42557764053344727,
+      "epoch": 3.0846560846560847,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8671875,
+      "kl": 0.001963236602023244,
+      "learning_rate": 4.0630934270713755e-07,
+      "loss": 0.015,
+      "num_tokens": 11456923.0,
+      "reward": 0.20416668057441711,
+      "reward_std": 0.2299290895462036,
+      "rewards/itbench_correctness/mean": 0.20416668057441711,
+      "rewards/itbench_correctness/std": 0.2864534258842468,
+      "step": 583,
+      "step_time": 133.70225734543055
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 602.0,
+      "completions/max_terminated_length": 602.0,
+      "completions/mean_length": 510.6875,
+      "completions/mean_terminated_length": 510.6875,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 0.5052013397216797,
+      "epoch": 3.0899470899470898,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0240478515625,
+      "kl": 0.001320964889600873,
+      "learning_rate": 4.046856756390766e-07,
+      "loss": 0.0,
+      "num_tokens": 11468238.0,
+      "reward": 0.5833333134651184,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5833333134651184,
+      "rewards/itbench_correctness/std": 0.4303314983844757,
+      "step": 584,
+      "step_time": 56.649503622204065
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 657.5625,
+      "completions/mean_terminated_length": 657.5625,
+      "completions/min_length": 485.0,
+      "completions/min_terminated_length": 485.0,
+      "entropy": 0.5353103280067444,
+      "epoch": 3.0952380952380953,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.6328125,
+      "kl": 0.0017962680431082845,
+      "learning_rate": 4.030630509131959e-07,
+      "loss": -0.0069,
+      "num_tokens": 11496463.0,
+      "reward": 0.875,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 585,
+      "step_time": 126.4090378023684
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.63671875,
+      "epoch": 3.1005291005291005,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.21875,
+      "kl": 0.0013208300806581974,
+      "learning_rate": 4.0144148627425986e-07,
+      "loss": 0.0001,
+      "num_tokens": 11522143.0,
+      "reward": 0.1875,
+      "reward_std": 0.27381423115730286,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.273861289024353,
+      "step": 586,
+      "step_time": 592.935981715098
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 560.0,
+      "completions/mean_length": 712.0625,
+      "completions/mean_terminated_length": 400.125,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.252786785364151,
+      "epoch": 3.105820105820106,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.85546875,
+      "kl": 0.001402621390298009,
+      "learning_rate": 3.998209994554394e-07,
+      "loss": -0.0316,
+      "num_tokens": 11543664.0,
+      "reward": 0.375,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 587,
+      "step_time": 146.66224777232856
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1012.0,
+      "completions/mean_length": 716.3125,
+      "completions/mean_terminated_length": 695.800048828125,
+      "completions/min_length": 482.0,
+      "completions/min_terminated_length": 482.0,
+      "entropy": 0.5612075924873352,
+      "epoch": 3.111111111111111,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.022705078125,
+      "kl": 0.0013308442430570722,
+      "learning_rate": 3.9820160817811887e-07,
+      "loss": 0.0,
+      "num_tokens": 11582781.0,
+      "reward": 0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.05000000074505806,
+      "rewards/itbench_correctness/std": 0.05163978040218353,
+      "step": 588,
+      "step_time": 122.08641688153148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1018.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 505.375,
+      "completions/mean_terminated_length": 505.375,
+      "completions/min_length": 285.0,
+      "completions/min_terminated_length": 285.0,
+      "entropy": 0.31263911724090576,
+      "epoch": 3.1164021164021163,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.40625,
+      "kl": 0.0014801534125581384,
+      "learning_rate": 3.965833301517016e-07,
+      "loss": 0.0609,
+      "num_tokens": 11594227.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 589,
+      "step_time": 936.1484231920913
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 625.0,
+      "completions/max_terminated_length": 625.0,
+      "completions/mean_length": 464.875,
+      "completions/mean_terminated_length": 464.875,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 0.46248990297317505,
+      "epoch": 3.121693121693122,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.578125,
+      "kl": 0.0018392398487776518,
+      "learning_rate": 3.9496618307341713e-07,
+      "loss": 0.0127,
+      "num_tokens": 11610305.0,
+      "reward": 0.24289773404598236,
+      "reward_std": 0.14991973340511322,
+      "rewards/itbench_correctness/mean": 0.24289773404598236,
+      "rewards/itbench_correctness/std": 0.15188638865947723,
+      "step": 590,
+      "step_time": 92.19220882095397
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 852.0,
+      "completions/max_terminated_length": 852.0,
+      "completions/mean_length": 608.3125,
+      "completions/mean_terminated_length": 608.3125,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 0.5589232444763184,
+      "epoch": 3.126984126984127,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4609375,
+      "kl": 0.0023250363301485777,
+      "learning_rate": 3.9335018462812664e-07,
+      "loss": 0.0092,
+      "num_tokens": 11631278.0,
+      "reward": 0.3999999761581421,
+      "reward_std": 0.16903084516525269,
+      "rewards/itbench_correctness/mean": 0.3999999761581421,
+      "rewards/itbench_correctness/std": 0.47328636050224304,
+      "step": 591,
+      "step_time": 87.29508406948298
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 946.0,
+      "completions/max_terminated_length": 946.0,
+      "completions/mean_length": 611.375,
+      "completions/mean_terminated_length": 611.375,
+      "completions/min_length": 456.0,
+      "completions/min_terminated_length": 456.0,
+      "entropy": 0.5332242846488953,
+      "epoch": 3.132275132275132,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.05029296875,
+      "kl": 0.0023294584825634956,
+      "learning_rate": 3.9173535248813017e-07,
+      "loss": 0.0001,
+      "num_tokens": 11654876.0,
+      "reward": 0.0833333358168602,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0833333358168602,
+      "rewards/itbench_correctness/std": 0.08606629818677902,
+      "step": 592,
+      "step_time": 78.46097278501838
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 586.0,
+      "completions/max_terminated_length": 586.0,
+      "completions/mean_length": 482.6875,
+      "completions/mean_terminated_length": 482.6875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4537096917629242,
+      "epoch": 3.1375661375661377,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.416015625,
+      "kl": 0.0019261433044448495,
+      "learning_rate": 3.901217043129734e-07,
+      "loss": -0.0825,
+      "num_tokens": 11666455.0,
+      "reward": 0.9375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 593,
+      "step_time": 112.39422312192619
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 745.5,
+      "completions/mean_terminated_length": 618.9091186523438,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.4077800214290619,
+      "epoch": 3.142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9140625,
+      "kl": 0.0012227533152326941,
+      "learning_rate": 3.885092577492542e-07,
+      "loss": 0.0307,
+      "num_tokens": 11693759.0,
+      "reward": 0.5625,
+      "reward_std": 0.49022960662841797,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 594,
+      "step_time": 92.09622034989297
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 988.0,
+      "completions/mean_length": 869.4375,
+      "completions/mean_terminated_length": 847.357177734375,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "entropy": 0.33354899287223816,
+      "epoch": 3.148148148148148,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.734375,
+      "kl": 0.0010007465025410056,
+      "learning_rate": 3.8689803043042996e-07,
+      "loss": -0.0018,
+      "num_tokens": 11713342.0,
+      "reward": 0.5208333730697632,
+      "reward_std": 0.4459637701511383,
+      "rewards/itbench_correctness/mean": 0.5208333730697632,
+      "rewards/itbench_correctness/std": 0.438325971364975,
+      "step": 595,
+      "step_time": 515.5870687887073
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 587.0,
+      "completions/max_terminated_length": 587.0,
+      "completions/mean_length": 442.25,
+      "completions/mean_terminated_length": 442.25,
+      "completions/min_length": 319.0,
+      "completions/min_terminated_length": 319.0,
+      "entropy": 0.4250989258289337,
+      "epoch": 3.1534391534391535,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5234375,
+      "kl": 0.001579253701493144,
+      "learning_rate": 3.8528803997662423e-07,
+      "loss": -0.0061,
+      "num_tokens": 11723002.0,
+      "reward": 0.48750001192092896,
+      "reward_std": 0.1636853665113449,
+      "rewards/itbench_correctness/mean": 0.48750001192092896,
+      "rewards/itbench_correctness/std": 0.16683325171470642,
+      "step": 596,
+      "step_time": 60.05524417478591
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 522.0,
+      "completions/mean_length": 720.0,
+      "completions/mean_terminated_length": 416.0,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.3097222149372101,
+      "epoch": 3.1587301587301586,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1484375,
+      "kl": 0.0024181241169571877,
+      "learning_rate": 3.8367930399443486e-07,
+      "loss": -0.0056,
+      "num_tokens": 11740490.0,
+      "reward": 0.2395833432674408,
+      "reward_std": 0.0883883386850357,
+      "rewards/itbench_correctness/mean": 0.2395833432674408,
+      "rewards/itbench_correctness/std": 0.27533650398254395,
+      "step": 597,
+      "step_time": 7237.127478616312
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1004.0,
+      "completions/mean_length": 860.0,
+      "completions/mean_terminated_length": 805.3333740234375,
+      "completions/min_length": 498.0,
+      "completions/min_terminated_length": 498.0,
+      "entropy": 0.43023255467414856,
+      "epoch": 3.164021164021164,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3671875,
+      "kl": 0.001607713638804853,
+      "learning_rate": 3.8207184007674085e-07,
+      "loss": 0.001,
+      "num_tokens": 11764610.0,
+      "reward": 0.7083333730697632,
+      "reward_std": 0.19416078925132751,
+      "rewards/itbench_correctness/mean": 0.7083333730697632,
+      "rewards/itbench_correctness/std": 0.40138646960258484,
+      "step": 598,
+      "step_time": 144.42800151277333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 582.0,
+      "completions/max_terminated_length": 582.0,
+      "completions/mean_length": 423.9375,
+      "completions/mean_terminated_length": 423.9375,
+      "completions/min_length": 295.0,
+      "completions/min_terminated_length": 295.0,
+      "entropy": 0.35382574796676636,
+      "epoch": 3.1693121693121693,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4921875,
+      "kl": 0.003284444333985448,
+      "learning_rate": 3.8046566580250995e-07,
+      "loss": 0.0187,
+      "num_tokens": 11774265.0,
+      "reward": 0.6006944179534912,
+      "reward_std": 0.2023771107196808,
+      "rewards/itbench_correctness/mean": 0.6006944179534912,
+      "rewards/itbench_correctness/std": 0.25677546858787537,
+      "step": 599,
+      "step_time": 91.41469971835613
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 605.0,
+      "completions/mean_length": 745.9375,
+      "completions/mean_terminated_length": 467.875,
+      "completions/min_length": 392.0,
+      "completions/min_terminated_length": 392.0,
+      "entropy": 0.4826141595840454,
+      "epoch": 3.1746031746031744,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.046875,
+      "kl": 0.0022031215485185385,
+      "learning_rate": 3.788607987366069e-07,
+      "loss": -0.0035,
+      "num_tokens": 11791280.0,
+      "reward": 0.6674107313156128,
+      "reward_std": 0.29377472400665283,
+      "rewards/itbench_correctness/mean": 0.6674107313156128,
+      "rewards/itbench_correctness/std": 0.40818971395492554,
+      "step": 600,
+      "step_time": 727.5520837632939
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 846.0,
+      "completions/max_terminated_length": 846.0,
+      "completions/mean_length": 637.4375,
+      "completions/mean_terminated_length": 637.4375,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "entropy": 0.6149622797966003,
+      "epoch": 3.17989417989418,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1953125,
+      "kl": 0.0012068739160895348,
+      "learning_rate": 3.772572564296004e-07,
+      "loss": 0.0218,
+      "num_tokens": 11804407.0,
+      "reward": 0.8541666865348816,
+      "reward_std": 0.022271769121289253,
+      "rewards/itbench_correctness/mean": 0.8541666865348816,
+      "rewards/itbench_correctness/std": 0.15365907549858093,
+      "step": 601,
+      "step_time": 197.79692050255835
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 858.0,
+      "completions/max_terminated_length": 858.0,
+      "completions/mean_length": 553.5,
+      "completions/mean_terminated_length": 553.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4065040647983551,
+      "epoch": 3.185185185185185,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.09375,
+      "kl": 0.0014250976964831352,
+      "learning_rate": 3.7565505641757266e-07,
+      "loss": -0.106,
+      "num_tokens": 11817495.0,
+      "reward": 0.5593750476837158,
+      "reward_std": 0.18626472353935242,
+      "rewards/itbench_correctness/mean": 0.5593750476837158,
+      "rewards/itbench_correctness/std": 0.26154589653015137,
+      "step": 602,
+      "step_time": 91.9841024801135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1007.0,
+      "completions/mean_length": 829.5,
+      "completions/mean_terminated_length": 816.5333862304688,
+      "completions/min_length": 658.0,
+      "completions/min_terminated_length": 658.0,
+      "entropy": 0.44605183601379395,
+      "epoch": 3.1904761904761907,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.296875,
+      "kl": 0.0014674547128379345,
+      "learning_rate": 3.74054216221926e-07,
+      "loss": -0.0149,
+      "num_tokens": 11837095.0,
+      "reward": 0.1666666716337204,
+      "reward_std": 0.2182178944349289,
+      "rewards/itbench_correctness/mean": 0.1666666716337204,
+      "rewards/itbench_correctness/std": 0.3442651927471161,
+      "step": 603,
+      "step_time": 129.12120711896569
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 888.0,
+      "completions/max_terminated_length": 888.0,
+      "completions/mean_length": 551.9375,
+      "completions/mean_terminated_length": 551.9375,
+      "completions/min_length": 278.0,
+      "completions/min_terminated_length": 278.0,
+      "entropy": 0.24821650981903076,
+      "epoch": 3.195767195767196,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.46875,
+      "kl": 0.0011176143307238817,
+      "learning_rate": 3.724547533491924e-07,
+      "loss": -0.0278,
+      "num_tokens": 11850606.0,
+      "reward": 0.2187500149011612,
+      "reward_std": 0.0883883535861969,
+      "rewards/itbench_correctness/mean": 0.2187500149011612,
+      "rewards/itbench_correctness/std": 0.2561737895011902,
+      "step": 604,
+      "step_time": 94.76566615886986
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 838.0,
+      "completions/max_terminated_length": 838.0,
+      "completions/mean_length": 537.6875,
+      "completions/mean_terminated_length": 537.6875,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "entropy": 0.4314773976802826,
+      "epoch": 3.201058201058201,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5234375,
+      "kl": 0.0012784149730578065,
+      "learning_rate": 3.708566852908418e-07,
+      "loss": 0.0069,
+      "num_tokens": 11862497.0,
+      "reward": 0.21875,
+      "reward_std": 0.2609178125858307,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.3204091787338257,
+      "step": 605,
+      "step_time": 816.5052793165669
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 921.0,
+      "completions/max_terminated_length": 921.0,
+      "completions/mean_length": 623.4375,
+      "completions/mean_terminated_length": 623.4375,
+      "completions/min_length": 375.0,
+      "completions/min_terminated_length": 375.0,
+      "entropy": 0.3528822064399719,
+      "epoch": 3.2063492063492065,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6640625,
+      "kl": 0.001327100908383727,
+      "learning_rate": 3.692600295230901e-07,
+      "loss": 0.0131,
+      "num_tokens": 11878112.0,
+      "reward": 0.6583333611488342,
+      "reward_std": 0.11426578462123871,
+      "rewards/itbench_correctness/mean": 0.6583333611488342,
+      "rewards/itbench_correctness/std": 0.22377237677574158,
+      "step": 606,
+      "step_time": 69.29318222776055
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 989.8125,
+      "completions/mean_terminated_length": 887.25,
+      "completions/min_length": 777.0,
+      "completions/min_terminated_length": 777.0,
+      "entropy": 0.6627517938613892,
+      "epoch": 3.2116402116402116,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.609375,
+      "kl": 0.0013202429981902242,
+      "learning_rate": 3.6766480350670925e-07,
+      "loss": 0.0134,
+      "num_tokens": 11904909.0,
+      "reward": 0.3958333432674408,
+      "reward_std": 0.19795581698417664,
+      "rewards/itbench_correctness/mean": 0.3958333432674408,
+      "rewards/itbench_correctness/std": 0.4901813864707947,
+      "step": 607,
+      "step_time": 82.23766458127648
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 1011.5625,
+      "completions/mean_terminated_length": 957.6666870117188,
+      "completions/min_length": 915.0,
+      "completions/min_terminated_length": 915.0,
+      "entropy": 0.29854804277420044,
+      "epoch": 3.2169312169312168,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.015625,
+      "kl": 0.00097948859911412,
+      "learning_rate": 3.660710246868352e-07,
+      "loss": -0.0092,
+      "num_tokens": 11932430.0,
+      "reward": 0.6073908805847168,
+      "reward_std": 0.2576354146003723,
+      "rewards/itbench_correctness/mean": 0.6073908805847168,
+      "rewards/itbench_correctness/std": 0.26400262117385864,
+      "step": 608,
+      "step_time": 116.86947522684932
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 908.0,
+      "completions/mean_length": 644.625,
+      "completions/mean_terminated_length": 619.3333740234375,
+      "completions/min_length": 444.0,
+      "completions/min_terminated_length": 444.0,
+      "entropy": 0.5987977385520935,
+      "epoch": 3.2222222222222223,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.890625,
+      "kl": 0.001607781508937478,
+      "learning_rate": 3.6447871049277796e-07,
+      "loss": 0.0406,
+      "num_tokens": 11946784.0,
+      "reward": 0.6499999761581421,
+      "reward_std": 0.3926178812980652,
+      "rewards/itbench_correctness/mean": 0.6499999761581421,
+      "rewards/itbench_correctness/std": 0.43204939365386963,
+      "step": 609,
+      "step_time": 80.47608442325145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 867.0,
+      "completions/max_terminated_length": 867.0,
+      "completions/mean_length": 678.625,
+      "completions/mean_terminated_length": 678.625,
+      "completions/min_length": 442.0,
+      "completions/min_terminated_length": 442.0,
+      "entropy": 0.6159513592720032,
+      "epoch": 3.2275132275132274,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.045654296875,
+      "kl": 0.001993054524064064,
+      "learning_rate": 3.6288787833783016e-07,
+      "loss": 0.0001,
+      "num_tokens": 11972402.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 610,
+      "step_time": 71.49992215260863
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 725.8125,
+      "completions/mean_terminated_length": 683.2142944335938,
+      "completions/min_length": 452.0,
+      "completions/min_terminated_length": 452.0,
+      "entropy": 0.410574346780777,
+      "epoch": 3.2328042328042326,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.001559928059577942,
+      "learning_rate": 3.612985456190778e-07,
+      "loss": 0.0063,
+      "num_tokens": 11988567.0,
+      "reward": 0.3571428656578064,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.3571428656578064,
+      "rewards/itbench_correctness/std": 0.39382997155189514,
+      "step": 611,
+      "step_time": 165.54018260445446
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 925.25,
+      "completions/mean_terminated_length": 826.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5879492163658142,
+      "epoch": 3.238095238095238,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9453125,
+      "kl": 0.0012924325419589877,
+      "learning_rate": 3.597107297172084e-07,
+      "loss": -0.0471,
+      "num_tokens": 12017675.0,
+      "reward": 0.265625,
+      "reward_std": 0.3114553987979889,
+      "rewards/itbench_correctness/mean": 0.265625,
+      "rewards/itbench_correctness/std": 0.4422362744808197,
+      "step": 612,
+      "step_time": 94.39502456784248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 936.0,
+      "completions/mean_length": 862.375,
+      "completions/mean_terminated_length": 700.75,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.3385998010635376,
+      "epoch": 3.2433862433862433,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8515625,
+      "kl": 0.001366258948110044,
+      "learning_rate": 3.5812444799632247e-07,
+      "loss": -0.0171,
+      "num_tokens": 12043681.0,
+      "reward": 0.5,
+      "reward_std": 0.5175491571426392,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 613,
+      "step_time": 669.0384511752054
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1011.0,
+      "completions/mean_length": 880.625,
+      "completions/mean_terminated_length": 832.8333740234375,
+      "completions/min_length": 580.0,
+      "completions/min_terminated_length": 580.0,
+      "entropy": 0.5132718086242676,
+      "epoch": 3.248677248677249,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.6015625,
+      "kl": 0.0020796628668904305,
+      "learning_rate": 3.565397178037429e-07,
+      "loss": 0.0037,
+      "num_tokens": 12073499.0,
+      "reward": 0.25,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 614,
+      "step_time": 113.26465024612844
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 695.0,
+      "completions/max_terminated_length": 695.0,
+      "completions/mean_length": 495.875,
+      "completions/mean_terminated_length": 495.875,
+      "completions/min_length": 356.0,
+      "completions/min_terminated_length": 356.0,
+      "entropy": 0.45777666568756104,
+      "epoch": 3.253968253968254,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.578125,
+      "kl": 0.0019308277405798435,
+      "learning_rate": 3.5495655646982503e-07,
+      "loss": -0.0023,
+      "num_tokens": 12093601.0,
+      "reward": 0.5227272510528564,
+      "reward_std": 0.32154878973960876,
+      "rewards/itbench_correctness/mean": 0.5227272510528564,
+      "rewards/itbench_correctness/std": 0.41261112689971924,
+      "step": 615,
+      "step_time": 105.98786111921072
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 663.0,
+      "completions/max_terminated_length": 663.0,
+      "completions/mean_length": 480.8125,
+      "completions/mean_terminated_length": 480.8125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.29949304461479187,
+      "epoch": 3.259259259259259,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.453125,
+      "kl": 0.001420930726453662,
+      "learning_rate": 3.533749813077677e-07,
+      "loss": -0.0137,
+      "num_tokens": 12104318.0,
+      "reward": 0.43359375,
+      "reward_std": 0.2698231339454651,
+      "rewards/itbench_correctness/mean": 0.43359375,
+      "rewards/itbench_correctness/std": 0.35901251435279846,
+      "step": 616,
+      "step_time": 130.85422169603407
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 700.875,
+      "completions/mean_terminated_length": 507.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5279115438461304,
+      "epoch": 3.2645502645502646,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.046875,
+      "kl": 0.0030069290660321712,
+      "learning_rate": 3.517950096134232e-07,
+      "loss": -0.0693,
+      "num_tokens": 12120292.0,
+      "reward": 0.5572916269302368,
+      "reward_std": 0.408902645111084,
+      "rewards/itbench_correctness/mean": 0.5572916269302368,
+      "rewards/itbench_correctness/std": 0.4146828055381775,
+      "step": 617,
+      "step_time": 76.69993899855763
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 958.0,
+      "completions/max_terminated_length": 958.0,
+      "completions/mean_length": 681.9375,
+      "completions/mean_terminated_length": 681.9375,
+      "completions/min_length": 533.0,
+      "completions/min_terminated_length": 533.0,
+      "entropy": 0.3578040599822998,
+      "epoch": 3.2698412698412698,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7890625,
+      "kl": 0.001377190463244915,
+      "learning_rate": 3.502166586651092e-07,
+      "loss": 0.0161,
+      "num_tokens": 12136035.0,
+      "reward": 0.78125,
+      "reward_std": 0.2896047830581665,
+      "rewards/itbench_correctness/mean": 0.78125,
+      "rewards/itbench_correctness/std": 0.28321075439453125,
+      "step": 618,
+      "step_time": 78.15438072942197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 711.0,
+      "completions/max_terminated_length": 711.0,
+      "completions/mean_length": 502.5625,
+      "completions/mean_terminated_length": 502.5625,
+      "completions/min_length": 365.0,
+      "completions/min_terminated_length": 365.0,
+      "entropy": 0.39796045422554016,
+      "epoch": 3.2751322751322753,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2734375,
+      "kl": 0.0014053636696189642,
+      "learning_rate": 3.4863994572341843e-07,
+      "loss": -0.0057,
+      "num_tokens": 12146932.0,
+      "reward": 0.875,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 619,
+      "step_time": 814.3009329754859
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 837.0,
+      "completions/mean_length": 677.75,
+      "completions/mean_terminated_length": 408.4444580078125,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "entropy": 0.4455920457839966,
+      "epoch": 3.2804232804232805,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.640625,
+      "kl": 0.0013953611487522721,
+      "learning_rate": 3.470648880310313e-07,
+      "loss": 0.0023,
+      "num_tokens": 12164472.0,
+      "reward": 0.39375001192092896,
+      "reward_std": 0.23028594255447388,
+      "rewards/itbench_correctness/mean": 0.39375001192092896,
+      "rewards/itbench_correctness/std": 0.2535580098628998,
+      "step": 620,
+      "step_time": 774.6395965730771
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.318359375,
+      "epoch": 3.2857142857142856,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.609375,
+      "kl": 0.0011473577469587326,
+      "learning_rate": 3.454915028125263e-07,
+      "loss": 0.0,
+      "num_tokens": 12192136.0,
+      "reward": 0.2708333432674408,
+      "reward_std": 0.19795581698417664,
+      "rewards/itbench_correctness/mean": 0.2708333432674408,
+      "rewards/itbench_correctness/std": 0.3890872597694397,
+      "step": 621,
+      "step_time": 147.70375349000096
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 601.0,
+      "completions/max_terminated_length": 601.0,
+      "completions/mean_length": 504.4375,
+      "completions/mean_terminated_length": 504.4375,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "entropy": 0.4182876944541931,
+      "epoch": 3.291005291005291,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4609375,
+      "kl": 0.002045721746981144,
+      "learning_rate": 3.43919807274192e-07,
+      "loss": -0.0186,
+      "num_tokens": 12203111.0,
+      "reward": 0.47727274894714355,
+      "reward_std": 0.29765012860298157,
+      "rewards/itbench_correctness/mean": 0.47727274894714355,
+      "rewards/itbench_correctness/std": 0.40613409876823425,
+      "step": 622,
+      "step_time": 49.89000040013343
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 767.0,
+      "completions/max_terminated_length": 767.0,
+      "completions/mean_length": 515.8125,
+      "completions/mean_terminated_length": 515.8125,
+      "completions/min_length": 359.0,
+      "completions/min_terminated_length": 359.0,
+      "entropy": 0.4788561761379242,
+      "epoch": 3.2962962962962963,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1875,
+      "kl": 0.0019279435509815812,
+      "learning_rate": 3.4234981860383927e-07,
+      "loss": 0.014,
+      "num_tokens": 12215068.0,
+      "reward": 0.375,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 623,
+      "step_time": 93.9447353342548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 831.125,
+      "completions/mean_terminated_length": 681.1111450195312,
+      "completions/min_length": 532.0,
+      "completions/min_terminated_length": 532.0,
+      "entropy": 0.4283351004123688,
+      "epoch": 3.3015873015873014,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.234375,
+      "kl": 0.0011250257957726717,
+      "learning_rate": 3.407815539706124e-07,
+      "loss": 0.003,
+      "num_tokens": 12235334.0,
+      "reward": 0.5703125,
+      "reward_std": 0.05964459478855133,
+      "rewards/itbench_correctness/mean": 0.5703125,
+      "rewards/itbench_correctness/std": 0.4511992335319519,
+      "step": 624,
+      "step_time": 102.94410282652825
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 969.0,
+      "completions/mean_length": 864.875,
+      "completions/mean_terminated_length": 741.1111450195312,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "entropy": 0.3584333062171936,
+      "epoch": 3.306878306878307,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7421875,
+      "kl": 0.0012643268564715981,
+      "learning_rate": 3.3921503052480236e-07,
+      "loss": 0.0147,
+      "num_tokens": 12256380.0,
+      "reward": 0.5694444179534912,
+      "reward_std": 0.30291885137557983,
+      "rewards/itbench_correctness/mean": 0.5694444179534912,
+      "rewards/itbench_correctness/std": 0.3557291328907013,
+      "step": 625,
+      "step_time": 484.2877219989896
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1005.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 779.9375,
+      "completions/mean_terminated_length": 779.9375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.41028928756713867,
+      "epoch": 3.312169312169312,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.484375,
+      "kl": 0.0014163429150357842,
+      "learning_rate": 3.3765026539765827e-07,
+      "loss": -0.0162,
+      "num_tokens": 12274915.0,
+      "reward": 0.30000001192092896,
+      "reward_std": 0.2507132589817047,
+      "rewards/itbench_correctness/mean": 0.30000001192092896,
+      "rewards/itbench_correctness/std": 0.46188023686408997,
+      "step": 626,
+      "step_time": 97.47150356322527
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 638.0,
+      "completions/max_terminated_length": 638.0,
+      "completions/mean_length": 437.3125,
+      "completions/mean_terminated_length": 437.3125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.541946530342102,
+      "epoch": 3.317460317460317,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5625,
+      "kl": 0.0028036432340741158,
+      "learning_rate": 3.360872757012011e-07,
+      "loss": 0.0022,
+      "num_tokens": 12284448.0,
+      "reward": 0.3958333432674408,
+      "reward_std": 0.10767625272274017,
+      "rewards/itbench_correctness/mean": 0.3958333432674408,
+      "rewards/itbench_correctness/std": 0.3657817840576172,
+      "step": 627,
+      "step_time": 93.49457087833434
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 943.0,
+      "completions/mean_length": 665.375,
+      "completions/mean_terminated_length": 545.8333740234375,
+      "completions/min_length": 385.0,
+      "completions/min_terminated_length": 385.0,
+      "entropy": 0.3877512812614441,
+      "epoch": 3.322751322751323,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.0014740098267793655,
+      "learning_rate": 3.345260785280358e-07,
+      "loss": 0.0046,
+      "num_tokens": 12301662.0,
+      "reward": 0.609375,
+      "reward_std": 0.11860001087188721,
+      "rewards/itbench_correctness/mean": 0.609375,
+      "rewards/itbench_correctness/std": 0.19654129445552826,
+      "step": 628,
+      "step_time": 155.64716604631394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 985.625,
+      "completions/mean_terminated_length": 921.6666870117188,
+      "completions/min_length": 835.0,
+      "completions/min_terminated_length": 835.0,
+      "entropy": 0.3896005153656006,
+      "epoch": 3.328042328042328,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4375,
+      "kl": 0.0012615715386345983,
+      "learning_rate": 3.329666909511645e-07,
+      "loss": 0.0,
+      "num_tokens": 12328376.0,
+      "reward": 0.07500000298023224,
+      "reward_std": 0.09161254018545151,
+      "rewards/itbench_correctness/mean": 0.07500000298023224,
+      "rewards/itbench_correctness/std": 0.14719600975513458,
+      "step": 629,
+      "step_time": 188.39432869665325
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 999.0,
+      "completions/mean_length": 747.5,
+      "completions/mean_terminated_length": 621.8181762695312,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.32107022404670715,
+      "epoch": 3.3333333333333335,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.08935546875,
+      "kl": 0.0018245604587718844,
+      "learning_rate": 3.314091300237999e-07,
+      "loss": 0.0001,
+      "num_tokens": 12347128.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 630,
+      "step_time": 1010.9434453165159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 588.0,
+      "completions/max_terminated_length": 588.0,
+      "completions/mean_length": 508.3125,
+      "completions/mean_terminated_length": 508.3125,
+      "completions/min_length": 416.0,
+      "completions/min_terminated_length": 416.0,
+      "entropy": 0.38755688071250916,
+      "epoch": 3.3386243386243386,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.0018675376195460558,
+      "learning_rate": 3.2985341277917846e-07,
+      "loss": -0.0092,
+      "num_tokens": 12363957.0,
+      "reward": 0.640625,
+      "reward_std": 0.1446593999862671,
+      "rewards/itbench_correctness/mean": 0.640625,
+      "rewards/itbench_correctness/std": 0.29181545972824097,
+      "step": 631,
+      "step_time": 96.36415668576956
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1024.0,
+      "completions/mean_length": 822.125,
+      "completions/mean_terminated_length": 822.125,
+      "completions/min_length": 479.0,
+      "completions/min_terminated_length": 479.0,
+      "entropy": 0.29314276576042175,
+      "epoch": 3.3439153439153437,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.921875,
+      "kl": 0.002633322263136506,
+      "learning_rate": 3.282995562303753e-07,
+      "loss": 0.025,
+      "num_tokens": 12384663.0,
+      "reward": 0.6741071343421936,
+      "reward_std": 0.29771745204925537,
+      "rewards/itbench_correctness/mean": 0.6741071343421936,
+      "rewards/itbench_correctness/std": 0.3393692374229431,
+      "step": 632,
+      "step_time": 81.67576451133937
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 661.0,
+      "completions/mean_length": 741.4375,
+      "completions/mean_terminated_length": 521.6666870117188,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.49633312225341797,
+      "epoch": 3.3492063492063493,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.0015913519309833646,
+      "learning_rate": 3.2674757737011606e-07,
+      "loss": 0.0059,
+      "num_tokens": 12407326.0,
+      "reward": 0.4937500059604645,
+      "reward_std": 0.01767767034471035,
+      "rewards/itbench_correctness/mean": 0.4937500059604645,
+      "rewards/itbench_correctness/std": 0.510514497756958,
+      "step": 633,
+      "step_time": 96.86188104748726
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 776.0625,
+      "completions/mean_terminated_length": 740.6428833007812,
+      "completions/min_length": 527.0,
+      "completions/min_terminated_length": 527.0,
+      "entropy": 0.6081984639167786,
+      "epoch": 3.3544973544973544,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.390625,
+      "kl": 0.0018657725304365158,
+      "learning_rate": 3.2519749317059327e-07,
+      "loss": 0.0253,
+      "num_tokens": 12442815.0,
+      "reward": 0.4375,
+      "reward_std": 0.49022960662841797,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 634,
+      "step_time": 115.92930174898356
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 700.625,
+      "completions/mean_terminated_length": 654.4285888671875,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.3297056257724762,
+      "epoch": 3.35978835978836,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.07666015625,
+      "kl": 0.0020810659043490887,
+      "learning_rate": 3.236493205832794e-07,
+      "loss": 0.0001,
+      "num_tokens": 12459393.0,
+      "reward": 0.0833333358168602,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0833333358168602,
+      "rewards/itbench_correctness/std": 0.08606629818677902,
+      "step": 635,
+      "step_time": 450.1781229842454
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 885.0,
+      "completions/mean_length": 707.625,
+      "completions/mean_terminated_length": 563.8181762695312,
+      "completions/min_length": 412.0,
+      "completions/min_terminated_length": 412.0,
+      "entropy": 0.48048049211502075,
+      "epoch": 3.365079365079365,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0546875,
+      "kl": 0.001259908196516335,
+      "learning_rate": 3.221030765387417e-07,
+      "loss": -0.0114,
+      "num_tokens": 12495003.0,
+      "reward": 0.4375,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 636,
+      "step_time": 161.46651719231158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 364.0,
+      "completions/mean_length": 671.0,
+      "completions/mean_terminated_length": 318.0,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "entropy": 0.42324888706207275,
+      "epoch": 3.3703703703703702,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.03125,
+      "kl": 0.0016913211438804865,
+      "learning_rate": 3.205587779464576e-07,
+      "loss": 0.008,
+      "num_tokens": 12512171.0,
+      "reward": 0.71875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 637,
+      "step_time": 103.04572070110589
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 756.0,
+      "completions/max_terminated_length": 756.0,
+      "completions/mean_length": 563.5,
+      "completions/mean_terminated_length": 563.5,
+      "completions/min_length": 413.0,
+      "completions/min_terminated_length": 413.0,
+      "entropy": 0.41703638434410095,
+      "epoch": 3.375661375661376,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0859375,
+      "kl": 0.0013338474091142416,
+      "learning_rate": 3.190164416946285e-07,
+      "loss": -0.022,
+      "num_tokens": 12524003.0,
+      "reward": 0.9166666269302368,
+      "reward_std": 0.044543541967868805,
+      "rewards/itbench_correctness/mean": 0.9166666269302368,
+      "rewards/itbench_correctness/std": 0.10540926456451416,
+      "step": 638,
+      "step_time": 89.99323462788016
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 761.0,
+      "completions/max_terminated_length": 761.0,
+      "completions/mean_length": 590.5625,
+      "completions/mean_terminated_length": 590.5625,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.3149539530277252,
+      "epoch": 3.380952380952381,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.28125,
+      "kl": 0.0011156360851600766,
+      "learning_rate": 3.174760846499972e-07,
+      "loss": 0.0079,
+      "num_tokens": 12539556.0,
+      "reward": 0.40625,
+      "reward_std": 0.01767767034471035,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.420267790555954,
+      "step": 639,
+      "step_time": 1139.211639557965
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 851.0,
+      "completions/mean_length": 881.3125,
+      "completions/mean_terminated_length": 567.4000244140625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.3472094237804413,
+      "epoch": 3.386243386243386,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.34375,
+      "kl": 0.0015863279113546014,
+      "learning_rate": 3.15937723657661e-07,
+      "loss": 0.0695,
+      "num_tokens": 12568353.0,
+      "reward": 0.34375,
+      "reward_std": 0.16925080120563507,
+      "rewards/itbench_correctness/mean": 0.34375,
+      "rewards/itbench_correctness/std": 0.4236907958984375,
+      "step": 640,
+      "step_time": 180.37901693582535
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 982.0,
+      "completions/mean_length": 970.4375,
+      "completions/mean_terminated_length": 881.1666870117188,
+      "completions/min_length": 710.0,
+      "completions/min_terminated_length": 710.0,
+      "entropy": 0.6265215277671814,
+      "epoch": 3.3915343915343916,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.046875,
+      "kl": 0.0014786267420277,
+      "learning_rate": 3.1440137554088953e-07,
+      "loss": 0.0438,
+      "num_tokens": 12603128.0,
+      "reward": 0.19999998807907104,
+      "reward_std": 0.3343248665332794,
+      "rewards/itbench_correctness/mean": 0.19999998807907104,
+      "rewards/itbench_correctness/std": 0.3326660096645355,
+      "step": 641,
+      "step_time": 174.49532955139875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 924.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 603.75,
+      "completions/mean_terminated_length": 603.75,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4306418299674988,
+      "epoch": 3.3968253968253967,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4140625,
+      "kl": 0.002129571046680212,
+      "learning_rate": 3.1286705710093984e-07,
+      "loss": -0.0944,
+      "num_tokens": 12617060.0,
+      "reward": 0.737500011920929,
+      "reward_std": 0.25792384147644043,
+      "rewards/itbench_correctness/mean": 0.737500011920929,
+      "rewards/itbench_correctness/std": 0.26884526014328003,
+      "step": 642,
+      "step_time": 78.10744374617934
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 781.0,
+      "completions/max_terminated_length": 781.0,
+      "completions/mean_length": 557.0,
+      "completions/mean_terminated_length": 557.0,
+      "completions/min_length": 387.0,
+      "completions/min_terminated_length": 387.0,
+      "entropy": 0.35727110505104065,
+      "epoch": 3.402116402116402,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.041015625,
+      "kl": 0.0019330759532749653,
+      "learning_rate": 3.113347851168721e-07,
+      "loss": 0.0,
+      "num_tokens": 12629716.0,
+      "reward": 0.75,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 643,
+      "step_time": 797.8743101553991
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1004.0,
+      "completions/mean_length": 699.75,
+      "completions/mean_terminated_length": 552.3636474609375,
+      "completions/min_length": 361.0,
+      "completions/min_terminated_length": 361.0,
+      "entropy": 0.3372633159160614,
+      "epoch": 3.4074074074074074,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5859375,
+      "kl": 0.00145020114723593,
+      "learning_rate": 3.0980457634536774e-07,
+      "loss": 0.0122,
+      "num_tokens": 12654240.0,
+      "reward": 0.46875,
+      "reward_std": 0.3471629321575165,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.3859512209892273,
+      "step": 644,
+      "step_time": 142.23401138465852
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 616.0,
+      "completions/max_terminated_length": 616.0,
+      "completions/mean_length": 412.625,
+      "completions/mean_terminated_length": 412.625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.43380793929100037,
+      "epoch": 3.4126984126984126,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.921875,
+      "kl": 0.003082259325310588,
+      "learning_rate": 3.082764475205442e-07,
+      "loss": -0.0126,
+      "num_tokens": 12663858.0,
+      "reward": 0.828125,
+      "reward_std": 0.3143535256385803,
+      "rewards/itbench_correctness/mean": 0.828125,
+      "rewards/itbench_correctness/std": 0.3502231538295746,
+      "step": 645,
+      "step_time": 1102.395908644423
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 792.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 635.9375,
+      "completions/mean_terminated_length": 635.9375,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "entropy": 0.4497297406196594,
+      "epoch": 3.417989417989418,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8359375,
+      "kl": 0.0015347811859101057,
+      "learning_rate": 3.06750415353774e-07,
+      "loss": 0.0033,
+      "num_tokens": 12680857.0,
+      "reward": 0.5078125,
+      "reward_std": 0.19887377321720123,
+      "rewards/itbench_correctness/mean": 0.5078125,
+      "rewards/itbench_correctness/std": 0.4642843008041382,
+      "step": 646,
+      "step_time": 96.87032896187156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 994.0,
+      "completions/mean_length": 726.5625,
+      "completions/mean_terminated_length": 657.923095703125,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 0.3275699019432068,
+      "epoch": 3.4232804232804233,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.001753173884935677,
+      "learning_rate": 3.052264965335e-07,
+      "loss": -0.0118,
+      "num_tokens": 12701762.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 647,
+      "step_time": 256.6041612662375
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 966.4375,
+      "completions/mean_terminated_length": 870.5,
+      "completions/min_length": 778.0,
+      "completions/min_terminated_length": 778.0,
+      "entropy": 0.5587531328201294,
+      "epoch": 3.4285714285714284,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3984375,
+      "kl": 0.0016219299286603928,
+      "learning_rate": 3.037047077250543e-07,
+      "loss": 0.0127,
+      "num_tokens": 12725865.0,
+      "reward": 0.3958333432674408,
+      "reward_std": 0.0862581878900528,
+      "rewards/itbench_correctness/mean": 0.3958333432674408,
+      "rewards/itbench_correctness/std": 0.4254627227783203,
+      "step": 648,
+      "step_time": 119.4891459485516
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 877.0,
+      "completions/mean_length": 857.0,
+      "completions/mean_terminated_length": 727.1111450195312,
+      "completions/min_length": 572.0,
+      "completions/min_terminated_length": 572.0,
+      "entropy": 0.3640606701374054,
+      "epoch": 3.433862433862434,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5234375,
+      "kl": 0.001420785440132022,
+      "learning_rate": 3.02185065570476e-07,
+      "loss": -0.0097,
+      "num_tokens": 12745521.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 649,
+      "step_time": 151.27136832941324
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 640.0,
+      "completions/max_terminated_length": 640.0,
+      "completions/mean_length": 518.3125,
+      "completions/mean_terminated_length": 518.3125,
+      "completions/min_length": 448.0,
+      "completions/min_terminated_length": 448.0,
+      "entropy": 0.3357048034667969,
+      "epoch": 3.439153439153439,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0135498046875,
+      "kl": 0.0010722068836912513,
+      "learning_rate": 3.006675866883275e-07,
+      "loss": 0.0,
+      "num_tokens": 12758590.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 650,
+      "step_time": 951.2108103726059
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 911.0,
+      "completions/mean_length": 753.5,
+      "completions/mean_terminated_length": 735.4666748046875,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.5069674849510193,
+      "epoch": 3.4444444444444446,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3984375,
+      "kl": 0.0015233854064717889,
+      "learning_rate": 2.9915228767351535e-07,
+      "loss": 0.0302,
+      "num_tokens": 12775142.0,
+      "reward": 0.6875,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.3095695972442627,
+      "step": 651,
+      "step_time": 164.39702508877963
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 945.8125,
+      "completions/mean_terminated_length": 711.25,
+      "completions/min_length": 618.0,
+      "completions/min_terminated_length": 618.0,
+      "entropy": 0.6090002059936523,
+      "epoch": 3.4497354497354498,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.8359375,
+      "kl": 0.0016115251928567886,
+      "learning_rate": 2.9763918509710647e-07,
+      "loss": 0.0001,
+      "num_tokens": 12806947.0,
+      "reward": 0.5625,
+      "reward_std": 0.09449111670255661,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.4699290990829468,
+      "step": 652,
+      "step_time": 204.87098419014364
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 806.375,
+      "completions/mean_terminated_length": 707.45458984375,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.5605332255363464,
+      "epoch": 3.455026455026455,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.703125,
+      "kl": 0.001556203467771411,
+      "learning_rate": 2.961282955061483e-07,
+      "loss": -0.0919,
+      "num_tokens": 12831673.0,
+      "reward": 0.8125,
+      "reward_std": 0.4082317352294922,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 653,
+      "step_time": 89.19978978857398
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 883.1875,
+      "completions/mean_terminated_length": 819.1818237304688,
+      "completions/min_length": 698.0,
+      "completions/min_terminated_length": 698.0,
+      "entropy": 0.33967873454093933,
+      "epoch": 3.4603174603174605,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.0015498069114983082,
+      "learning_rate": 2.9461963542348733e-07,
+      "loss": 0.0158,
+      "num_tokens": 12860092.0,
+      "reward": 0.6000000238418579,
+      "reward_std": 0.28192007541656494,
+      "rewards/itbench_correctness/mean": 0.6000000238418579,
+      "rewards/itbench_correctness/std": 0.4242640733718872,
+      "step": 654,
+      "step_time": 85.3845539437607
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 841.75,
+      "completions/mean_terminated_length": 829.6000366210938,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 0.4942084848880768,
+      "epoch": 3.4656084656084656,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4765625,
+      "kl": 0.0017516858642920852,
+      "learning_rate": 2.931132213475884e-07,
+      "loss": 0.0093,
+      "num_tokens": 12887824.0,
+      "reward": 0.9375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 655,
+      "step_time": 330.32038860116154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 744.625,
+      "completions/mean_terminated_length": 704.7142944335938,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.4163169264793396,
+      "epoch": 3.4708994708994707,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9609375,
+      "kl": 0.0016559087671339512,
+      "learning_rate": 2.916090697523549e-07,
+      "loss": 0.0156,
+      "num_tokens": 12904234.0,
+      "reward": 0.875,
+      "reward_std": 0.25583362579345703,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.26457512378692627,
+      "step": 656,
+      "step_time": 138.49934119079262
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 955.0,
+      "completions/mean_length": 745.0625,
+      "completions/mean_terminated_length": 618.2727661132812,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4966026246547699,
+      "epoch": 3.4761904761904763,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.0017499123932793736,
+      "learning_rate": 2.901071970869472e-07,
+      "loss": -0.0134,
+      "num_tokens": 12932827.0,
+      "reward": 0.515625,
+      "reward_std": 0.32311493158340454,
+      "rewards/itbench_correctness/mean": 0.515625,
+      "rewards/itbench_correctness/std": 0.436970591545105,
+      "step": 657,
+      "step_time": 79.10722716152668
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 881.0,
+      "completions/max_terminated_length": 881.0,
+      "completions/mean_length": 691.5,
+      "completions/mean_terminated_length": 691.5,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "entropy": 0.37310194969177246,
+      "epoch": 3.4814814814814814,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3203125,
+      "kl": 0.0011259306920692325,
+      "learning_rate": 2.8860761977560433e-07,
+      "loss": -0.0094,
+      "num_tokens": 12948403.0,
+      "reward": 0.8828125,
+      "reward_std": 0.07790146768093109,
+      "rewards/itbench_correctness/mean": 0.8828125,
+      "rewards/itbench_correctness/std": 0.16117246448993683,
+      "step": 658,
+      "step_time": 615.1303538642824
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 583.0,
+      "completions/max_terminated_length": 583.0,
+      "completions/mean_length": 474.0,
+      "completions/mean_terminated_length": 474.0,
+      "completions/min_length": 364.0,
+      "completions/min_terminated_length": 364.0,
+      "entropy": 0.5063291192054749,
+      "epoch": 3.4867724867724865,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6328125,
+      "kl": 0.0021006267052143812,
+      "learning_rate": 2.8711035421746363e-07,
+      "loss": 0.004,
+      "num_tokens": 12967043.0,
+      "reward": 0.2265625,
+      "reward_std": 0.24306795001029968,
+      "rewards/itbench_correctness/mean": 0.2265625,
+      "rewards/itbench_correctness/std": 0.2784583568572998,
+      "step": 659,
+      "step_time": 88.2505495576188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 932.0,
+      "completions/mean_length": 750.375,
+      "completions/mean_terminated_length": 537.5555419921875,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.5357321500778198,
+      "epoch": 3.492063492063492,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0,
+      "kl": 0.0018287766724824905,
+      "learning_rate": 2.856154167863814e-07,
+      "loss": -0.2463,
+      "num_tokens": 12990473.0,
+      "reward": 0.4114583134651184,
+      "reward_std": 0.1860596239566803,
+      "rewards/itbench_correctness/mean": 0.4114583134651184,
+      "rewards/itbench_correctness/std": 0.3502231538295746,
+      "step": 660,
+      "step_time": 113.05205366853625
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 691.875,
+      "completions/mean_terminated_length": 615.2307739257812,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "entropy": 0.5694670081138611,
+      "epoch": 3.497354497354497,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.0017705087084323168,
+      "learning_rate": 2.841228238307536e-07,
+      "loss": -0.0069,
+      "num_tokens": 13014367.0,
+      "reward": 0.4375,
+      "reward_std": 0.3532657027244568,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.38188132643699646,
+      "step": 661,
+      "step_time": 138.94573136605322
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 796.875,
+      "completions/mean_terminated_length": 764.4285888671875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.40658822655677795,
+      "epoch": 3.502645502645503,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.921875,
+      "kl": 0.001931068836711347,
+      "learning_rate": 2.8263259167333774e-07,
+      "loss": 0.0253,
+      "num_tokens": 13039493.0,
+      "reward": 0.10625000298023224,
+      "reward_std": 0.14168164134025574,
+      "rewards/itbench_correctness/mean": 0.10625000298023224,
+      "rewards/itbench_correctness/std": 0.1722267121076584,
+      "step": 662,
+      "step_time": 135.8809123178944
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 669.3125,
+      "completions/mean_terminated_length": 669.3125,
+      "completions/min_length": 418.0,
+      "completions/min_terminated_length": 418.0,
+      "entropy": 0.4213278591632843,
+      "epoch": 3.507936507936508,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.625,
+      "kl": 0.001925744814798236,
+      "learning_rate": 2.811447366110741e-07,
+      "loss": -0.0154,
+      "num_tokens": 13055098.0,
+      "reward": 0.4001736044883728,
+      "reward_std": 0.14328064024448395,
+      "rewards/itbench_correctness/mean": 0.4001736044883728,
+      "rewards/itbench_correctness/std": 0.45731422305107117,
+      "step": 663,
+      "step_time": 1172.1156589342281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 480.0,
+      "completions/mean_length": 654.1875,
+      "completions/mean_terminated_length": 366.5555725097656,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4555268883705139,
+      "epoch": 3.5132275132275135,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.2734375,
+      "kl": 0.0016494187293574214,
+      "learning_rate": 2.7965927491490704e-07,
+      "loss": -0.1445,
+      "num_tokens": 13078749.0,
+      "reward": 0.890625,
+      "reward_std": 0.2414703369140625,
+      "rewards/itbench_correctness/mean": 0.890625,
+      "rewards/itbench_correctness/std": 0.2576940953731537,
+      "step": 664,
+      "step_time": 790.0815438805148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 962.0,
+      "completions/max_terminated_length": 962.0,
+      "completions/mean_length": 565.625,
+      "completions/mean_terminated_length": 565.625,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "entropy": 0.43668508529663086,
+      "epoch": 3.5185185185185186,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.041259765625,
+      "kl": 0.0019728399347513914,
+      "learning_rate": 2.7817622282960813e-07,
+      "loss": 0.0,
+      "num_tokens": 13091575.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 1.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 665,
+      "step_time": 94.42669316660613
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 763.0,
+      "completions/max_terminated_length": 763.0,
+      "completions/mean_length": 603.8125,
+      "completions/mean_terminated_length": 603.8125,
+      "completions/min_length": 493.0,
+      "completions/min_terminated_length": 493.0,
+      "entropy": 0.4007866680622101,
+      "epoch": 3.5238095238095237,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.703125,
+      "kl": 0.001383933937177062,
+      "learning_rate": 2.7669559657359673e-07,
+      "loss": -0.0116,
+      "num_tokens": 13105124.0,
+      "reward": 0.78125,
+      "reward_std": 0.3471629321575165,
+      "rewards/itbench_correctness/mean": 0.78125,
+      "rewards/itbench_correctness/std": 0.4069705307483673,
+      "step": 666,
+      "step_time": 72.2391463033855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 888.0,
+      "completions/mean_length": 718.625,
+      "completions/mean_terminated_length": 616.8333740234375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4369455575942993,
+      "epoch": 3.5291005291005293,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6875,
+      "kl": 0.0016333801904693246,
+      "learning_rate": 2.7521741233876493e-07,
+      "loss": -0.1036,
+      "num_tokens": 13134246.0,
+      "reward": 0.42500001192092896,
+      "reward_std": 0.3273707628250122,
+      "rewards/itbench_correctness/mean": 0.42500001192092896,
+      "rewards/itbench_correctness/std": 0.44347113370895386,
+      "step": 667,
+      "step_time": 192.8937590336427
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 837.0,
+      "completions/mean_length": 604.125,
+      "completions/mean_terminated_length": 507.23077392578125,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "entropy": 0.3542313277721405,
+      "epoch": 3.5343915343915344,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.734375,
+      "kl": 0.0024427901953458786,
+      "learning_rate": 2.737416862902981e-07,
+      "loss": -0.0391,
+      "num_tokens": 13147840.0,
+      "reward": 0.2569444477558136,
+      "reward_std": 0.12661024928092957,
+      "rewards/itbench_correctness/mean": 0.2569444477558136,
+      "rewards/itbench_correctness/std": 0.23537467420101166,
+      "step": 668,
+      "step_time": 92.6653502555564
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 399.0,
+      "completions/mean_length": 689.0,
+      "completions/mean_terminated_length": 354.0,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "entropy": 0.42670536041259766,
+      "epoch": 3.5396825396825395,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.6640625,
+      "kl": 0.0015613737050443888,
+      "learning_rate": 2.722684345665003e-07,
+      "loss": 0.0,
+      "num_tokens": 13163432.0,
+      "reward": 0.5,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.3651483952999115,
+      "step": 669,
+      "step_time": 71.54559296742082
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 809.0,
+      "completions/max_terminated_length": 809.0,
+      "completions/mean_length": 584.25,
+      "completions/mean_terminated_length": 584.25,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.45186135172843933,
+      "epoch": 3.544973544973545,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.75,
+      "kl": 0.0028362891171127558,
+      "learning_rate": 2.707976732786166e-07,
+      "loss": -0.0174,
+      "num_tokens": 13176020.0,
+      "reward": 0.3971354365348816,
+      "reward_std": 0.3280077576637268,
+      "rewards/itbench_correctness/mean": 0.3971354365348816,
+      "rewards/itbench_correctness/std": 0.4046509563922882,
+      "step": 670,
+      "step_time": 115.24186983983964
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 692.0,
+      "completions/max_terminated_length": 692.0,
+      "completions/mean_length": 513.3125,
+      "completions/mean_terminated_length": 513.3125,
+      "completions/min_length": 371.0,
+      "completions/min_terminated_length": 371.0,
+      "entropy": 0.4909290075302124,
+      "epoch": 3.5502645502645502,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.0017170688370242715,
+      "learning_rate": 2.6932941851065615e-07,
+      "loss": -0.0309,
+      "num_tokens": 13189441.0,
+      "reward": 0.4270833432674408,
+      "reward_std": 0.36084234714508057,
+      "rewards/itbench_correctness/mean": 0.4270833432674408,
+      "rewards/itbench_correctness/std": 0.40583136677742004,
+      "step": 671,
+      "step_time": 68.62113481201231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 781.125,
+      "completions/mean_terminated_length": 592.2222290039062,
+      "completions/min_length": 446.0,
+      "completions/min_terminated_length": 446.0,
+      "entropy": 0.35333654284477234,
+      "epoch": 3.5555555555555554,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0257568359375,
+      "kl": 0.0011492978082969785,
+      "learning_rate": 2.6786368631921834e-07,
+      "loss": 0.0,
+      "num_tokens": 13207635.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 672,
+      "step_time": 7667.425364185125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 803.0,
+      "completions/mean_length": 563.0625,
+      "completions/mean_terminated_length": 532.3333740234375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5292485356330872,
+      "epoch": 3.560846560846561,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2109375,
+      "kl": 0.0018245880492031574,
+      "learning_rate": 2.664004927333151e-07,
+      "loss": 0.0172,
+      "num_tokens": 13229564.0,
+      "reward": 0.34166666865348816,
+      "reward_std": 0.13930098712444305,
+      "rewards/itbench_correctness/mean": 0.34166666865348816,
+      "rewards/itbench_correctness/std": 0.4009248614311218,
+      "step": 673,
+      "step_time": 94.37730458006263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 988.0,
+      "completions/mean_length": 1013.6875,
+      "completions/mean_terminated_length": 969.0,
+      "completions/min_length": 946.0,
+      "completions/min_terminated_length": 946.0,
+      "entropy": 0.3610580265522003,
+      "epoch": 3.566137566137566,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.609375,
+      "kl": 0.0011879053199663758,
+      "learning_rate": 2.6493985375419775e-07,
+      "loss": 0.0,
+      "num_tokens": 13256847.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 674,
+      "step_time": 230.72378408256918
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 635.0,
+      "completions/max_terminated_length": 635.0,
+      "completions/mean_length": 416.375,
+      "completions/mean_terminated_length": 416.375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5355749130249023,
+      "epoch": 3.571428571428571,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.65625,
+      "kl": 0.0020900224335491657,
+      "learning_rate": 2.6348178535517965e-07,
+      "loss": -0.1134,
+      "num_tokens": 13268133.0,
+      "reward": 0.5729166269302368,
+      "reward_std": 0.20013636350631714,
+      "rewards/itbench_correctness/mean": 0.5729166269302368,
+      "rewards/itbench_correctness/std": 0.28361913561820984,
+      "step": 675,
+      "step_time": 56.21729406807572
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 622.0,
+      "completions/max_terminated_length": 622.0,
+      "completions/mean_length": 439.875,
+      "completions/mean_terminated_length": 439.875,
+      "completions/min_length": 325.0,
+      "completions/min_terminated_length": 325.0,
+      "entropy": 0.4046604037284851,
+      "epoch": 3.5767195767195767,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.9609375,
+      "kl": 0.0022217335645109415,
+      "learning_rate": 2.620263034814632e-07,
+      "loss": -0.0015,
+      "num_tokens": 13278187.0,
+      "reward": 0.6875,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 676,
+      "step_time": 1194.6907618306577
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 747.0,
+      "completions/mean_length": 816.25,
+      "completions/mean_terminated_length": 608.5,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "entropy": 0.45329248905181885,
+      "epoch": 3.582010582010582,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.09375,
+      "kl": 0.0015461933799088001,
+      "learning_rate": 2.605734240499652e-07,
+      "loss": -0.0181,
+      "num_tokens": 13301351.0,
+      "reward": 0.4479166567325592,
+      "reward_std": 0.06200198084115982,
+      "rewards/itbench_correctness/mean": 0.4479166567325592,
+      "rewards/itbench_correctness/std": 0.4702983796596527,
+      "step": 677,
+      "step_time": 130.93884664587677
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 697.0,
+      "completions/max_terminated_length": 697.0,
+      "completions/mean_length": 537.1875,
+      "completions/mean_terminated_length": 537.1875,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 0.4207097291946411,
+      "epoch": 3.5873015873015874,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.052734375,
+      "kl": 0.001912532257847488,
+      "learning_rate": 2.591231629491423e-07,
+      "loss": 0.0,
+      "num_tokens": 13314330.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 678,
+      "step_time": 95.27887518052012
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 978.0,
+      "completions/max_terminated_length": 978.0,
+      "completions/mean_length": 665.8125,
+      "completions/mean_terminated_length": 665.8125,
+      "completions/min_length": 536.0,
+      "completions/min_terminated_length": 536.0,
+      "entropy": 0.36646953225135803,
+      "epoch": 3.5925925925925926,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.625,
+      "kl": 0.002286773407831788,
+      "learning_rate": 2.5767553603881764e-07,
+      "loss": -0.0025,
+      "num_tokens": 13329519.0,
+      "reward": 0.625,
+      "reward_std": 0.3104073107242584,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.3979112207889557,
+      "step": 679,
+      "step_time": 807.2301516216248
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 599.0,
+      "completions/max_terminated_length": 599.0,
+      "completions/mean_length": 451.3125,
+      "completions/mean_terminated_length": 451.3125,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "entropy": 0.3833264112472534,
+      "epoch": 3.597883597883598,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.484375,
+      "kl": 0.0012348750606179237,
+      "learning_rate": 2.5623055915000686e-07,
+      "loss": 0.01,
+      "num_tokens": 13339652.0,
+      "reward": 0.6193181872367859,
+      "reward_std": 0.13179811835289001,
+      "rewards/itbench_correctness/mean": 0.6193181872367859,
+      "rewards/itbench_correctness/std": 0.4134864807128906,
+      "step": 680,
+      "step_time": 135.08529091719538
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1020.0,
+      "completions/mean_length": 646.125,
+      "completions/mean_terminated_length": 558.923095703125,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "entropy": 0.4178757965564728,
+      "epoch": 3.6031746031746033,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.40625,
+      "kl": 0.0014911647886037827,
+      "learning_rate": 2.547882480847461e-07,
+      "loss": 0.0,
+      "num_tokens": 13353918.0,
+      "reward": 0.38749998807907104,
+      "reward_std": 0.1157275065779686,
+      "rewards/itbench_correctness/mean": 0.38749998807907104,
+      "rewards/itbench_correctness/std": 0.4303099811077118,
+      "step": 681,
+      "step_time": 99.35422214772552
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 886.0,
+      "completions/mean_length": 769.75,
+      "completions/mean_terminated_length": 572.0,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 0.7534914016723633,
+      "epoch": 3.6084656084656084,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.03125,
+      "kl": 0.0021665135864168406,
+      "learning_rate": 2.533486186159175e-07,
+      "loss": 0.0043,
+      "num_tokens": 13375346.0,
+      "reward": 0.390625,
+      "reward_std": 0.27564918994903564,
+      "rewards/itbench_correctness/mean": 0.390625,
+      "rewards/itbench_correctness/std": 0.4913311004638672,
+      "step": 682,
+      "step_time": 92.11163073871285
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 654.0,
+      "completions/mean_length": 768.75,
+      "completions/mean_terminated_length": 513.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3460162580013275,
+      "epoch": 3.613756613756614,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0234375,
+      "kl": 0.0016589618753641844,
+      "learning_rate": 2.5191168648707884e-07,
+      "loss": -0.0447,
+      "num_tokens": 13402526.0,
+      "reward": 0.11328125,
+      "reward_std": 0.08341467380523682,
+      "rewards/itbench_correctness/mean": 0.11328125,
+      "rewards/itbench_correctness/std": 0.16332921385765076,
+      "step": 683,
+      "step_time": 159.6508161853999
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 757.0,
+      "completions/max_terminated_length": 757.0,
+      "completions/mean_length": 546.1875,
+      "completions/mean_terminated_length": 546.1875,
+      "completions/min_length": 440.0,
+      "completions/min_terminated_length": 440.0,
+      "entropy": 0.44124042987823486,
+      "epoch": 3.619047619047619,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.0013375874841585755,
+      "learning_rate": 2.5047746741228977e-07,
+      "loss": -0.0346,
+      "num_tokens": 13415305.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.21535253524780273,
+      "rewards/itbench_correctness/mean": 0.5416666865348816,
+      "rewards/itbench_correctness/std": 0.4238273799419403,
+      "step": 684,
+      "step_time": 587.8082643058151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 800.875,
+      "completions/mean_terminated_length": 786.0000610351562,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.34712034463882446,
+      "epoch": 3.624338624338624,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0234375,
+      "kl": 0.0008109880145639181,
+      "learning_rate": 2.490459770759398e-07,
+      "loss": -0.008,
+      "num_tokens": 13434959.0,
+      "reward": 0.296875,
+      "reward_std": 0.0646936446428299,
+      "rewards/itbench_correctness/mean": 0.296875,
+      "rewards/itbench_correctness/std": 0.31909704208374023,
+      "step": 685,
+      "step_time": 89.69222616031766
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 577.0,
+      "completions/max_terminated_length": 577.0,
+      "completions/mean_length": 470.125,
+      "completions/mean_terminated_length": 470.125,
+      "completions/min_length": 313.0,
+      "completions/min_terminated_length": 313.0,
+      "entropy": 0.45094388723373413,
+      "epoch": 3.6296296296296298,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0274658203125,
+      "kl": 0.0016529399435967207,
+      "learning_rate": 2.476172311325783e-07,
+      "loss": 0.0,
+      "num_tokens": 13445337.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 686,
+      "step_time": 106.66353179235011
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 804.6875,
+      "completions/mean_terminated_length": 673.1000366210938,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "entropy": 0.4324660301208496,
+      "epoch": 3.634920634920635,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2109375,
+      "kl": 0.0018160316394641995,
+      "learning_rate": 2.4619124520674145e-07,
+      "loss": 0.0029,
+      "num_tokens": 13466644.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 687,
+      "step_time": 257.70799226593226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 765.375,
+      "completions/mean_terminated_length": 610.2000122070312,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "entropy": 0.535685122013092,
+      "epoch": 3.64021164021164,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.515625,
+      "kl": 0.0018447366310283542,
+      "learning_rate": 2.447680348927837e-07,
+      "loss": 0.0164,
+      "num_tokens": 13488698.0,
+      "reward": 0.1875,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 688,
+      "step_time": 100.56179421767592
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 980.0,
+      "completions/mean_length": 680.0625,
+      "completions/mean_terminated_length": 473.70001220703125,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.36467236280441284,
+      "epoch": 3.6455026455026456,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.78125,
+      "kl": 0.0013507335679605603,
+      "learning_rate": 2.4334761575470434e-07,
+      "loss": 0.0176,
+      "num_tokens": 13505539.0,
+      "reward": 0.518750011920929,
+      "reward_std": 0.2103695124387741,
+      "rewards/itbench_correctness/mean": 0.518750011920929,
+      "rewards/itbench_correctness/std": 0.38929542899131775,
+      "step": 689,
+      "step_time": 76.81764477398247
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 815.0,
+      "completions/mean_length": 653.4375,
+      "completions/mean_terminated_length": 628.7333374023438,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.4254423677921295,
+      "epoch": 3.6507936507936507,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.28125,
+      "kl": 0.0013152090832591057,
+      "learning_rate": 2.419300033259798e-07,
+      "loss": 0.0775,
+      "num_tokens": 13524146.0,
+      "reward": 0.9375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 690,
+      "step_time": 80.11625996977091
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 765.9375,
+      "completions/mean_terminated_length": 565.2222290039062,
+      "completions/min_length": 443.0,
+      "completions/min_terminated_length": 443.0,
+      "entropy": 0.3042023777961731,
+      "epoch": 3.656084656084656,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.546875,
+      "kl": 0.002678963588550687,
+      "learning_rate": 2.4051521310939254e-07,
+      "loss": 0.0039,
+      "num_tokens": 13544377.0,
+      "reward": 0.375,
+      "reward_std": 0.49871626496315,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 691,
+      "step_time": 937.4359912928194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 990.8125,
+      "completions/mean_terminated_length": 847.0,
+      "completions/min_length": 785.0,
+      "completions/min_terminated_length": 785.0,
+      "entropy": 0.4723396301269531,
+      "epoch": 3.6613756613756614,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.171875,
+      "kl": 0.001547610154375434,
+      "learning_rate": 2.3910326057686124e-07,
+      "loss": 0.0235,
+      "num_tokens": 13570094.0,
+      "reward": 0.2622767686843872,
+      "reward_std": 0.17326994240283966,
+      "rewards/itbench_correctness/mean": 0.2622767686843872,
+      "rewards/itbench_correctness/std": 0.22186197340488434,
+      "step": 692,
+      "step_time": 241.44549081102014
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 585.0,
+      "completions/mean_length": 634.1875,
+      "completions/mean_terminated_length": 457.0,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "entropy": 0.39735883474349976,
+      "epoch": 3.6666666666666665,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0194091796875,
+      "kl": 0.0012317921500653028,
+      "learning_rate": 2.3769416116927333e-07,
+      "loss": 0.0,
+      "num_tokens": 13591193.0,
+      "reward": 0.3333333432674408,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.3333333432674408,
+      "rewards/itbench_correctness/std": 0.3442651927471161,
+      "step": 693,
+      "step_time": 156.18573713861406
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 587.0,
+      "completions/mean_length": 772.6875,
+      "completions/mean_terminated_length": 521.375,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.41672733426094055,
+      "epoch": 3.671957671957672,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.984375,
+      "kl": 0.0018979725427925587,
+      "learning_rate": 2.362879302963135e-07,
+      "loss": 0.0086,
+      "num_tokens": 13612116.0,
+      "reward": 0.53125,
+      "reward_std": 0.29986464977264404,
+      "rewards/itbench_correctness/mean": 0.53125,
+      "rewards/itbench_correctness/std": 0.4366062581539154,
+      "step": 694,
+      "step_time": 268.16689282283187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 731.0,
+      "completions/mean_length": 817.3125,
+      "completions/mean_terminated_length": 610.625,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "entropy": 0.5040911436080933,
+      "epoch": 3.677248677248677,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8515625,
+      "kl": 0.0017554876394569874,
+      "learning_rate": 2.3488458333629773e-07,
+      "loss": 0.0111,
+      "num_tokens": 13636593.0,
+      "reward": 0.768750011920929,
+      "reward_std": 0.3954527974128723,
+      "rewards/itbench_correctness/mean": 0.768750011920929,
+      "rewards/itbench_correctness/std": 0.39355337619781494,
+      "step": 695,
+      "step_time": 83.74904467258602
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 977.0,
+      "completions/mean_length": 973.5,
+      "completions/mean_terminated_length": 822.0,
+      "completions/min_length": 554.0,
+      "completions/min_terminated_length": 554.0,
+      "entropy": 0.44170519709587097,
+      "epoch": 3.682539682539683,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3515625,
+      "kl": 0.0014584577875211835,
+      "learning_rate": 2.3348413563600323e-07,
+      "loss": 0.0101,
+      "num_tokens": 13659681.0,
+      "reward": 0.6041666865348816,
+      "reward_std": 0.19287918508052826,
+      "rewards/itbench_correctness/mean": 0.6041666865348816,
+      "rewards/itbench_correctness/std": 0.48638883233070374,
+      "step": 696,
+      "step_time": 80.2720007058233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 911.125,
+      "completions/mean_terminated_length": 766.0000610351562,
+      "completions/min_length": 662.0,
+      "completions/min_terminated_length": 662.0,
+      "entropy": 0.4477980434894562,
+      "epoch": 3.687830687830688,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04833984375,
+      "kl": 0.0018148425733670592,
+      "learning_rate": 2.3208660251050156e-07,
+      "loss": 0.0001,
+      "num_tokens": 13701451.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 697,
+      "step_time": 590.3981730565429
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 743.0,
+      "completions/mean_length": 921.625,
+      "completions/mean_terminated_length": 478.0,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.5208191871643066,
+      "epoch": 3.693121693121693,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.140625,
+      "kl": 0.003547599073499441,
+      "learning_rate": 2.306919992429917e-07,
+      "loss": -0.0535,
+      "num_tokens": 13724541.0,
+      "reward": 0.48284316062927246,
+      "reward_std": 0.2941243052482605,
+      "rewards/itbench_correctness/mean": 0.48284316062927246,
+      "rewards/itbench_correctness/std": 0.4697091579437256,
+      "step": 698,
+      "step_time": 156.73526183422655
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 939.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 654.875,
+      "completions/mean_terminated_length": 654.875,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 0.3054018020629883,
+      "epoch": 3.6984126984126986,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5625,
+      "kl": 0.0012958102161064744,
+      "learning_rate": 2.2930034108463097e-07,
+      "loss": -0.0132,
+      "num_tokens": 13740411.0,
+      "reward": 0.6156250238418579,
+      "reward_std": 0.1601249873638153,
+      "rewards/itbench_correctness/mean": 0.6156250238418579,
+      "rewards/itbench_correctness/std": 0.16301201283931732,
+      "step": 699,
+      "step_time": 99.30126603785902
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 749.0,
+      "completions/mean_length": 779.1875,
+      "completions/mean_terminated_length": 534.375,
+      "completions/min_length": 443.0,
+      "completions/min_terminated_length": 443.0,
+      "entropy": 0.6083259582519531,
+      "epoch": 3.7037037037037037,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.020751953125,
+      "kl": 0.0015235234750434756,
+      "learning_rate": 2.2791164325437046e-07,
+      "loss": 0.0,
+      "num_tokens": 13769414.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 700,
+      "step_time": 121.30535170529038
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 895.125,
+      "completions/mean_terminated_length": 766.25,
+      "completions/min_length": 389.0,
+      "completions/min_terminated_length": 389.0,
+      "entropy": 0.33514872193336487,
+      "epoch": 3.708994708994709,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9921875,
+      "kl": 0.0013990112347528338,
+      "learning_rate": 2.2652592093878665e-07,
+      "loss": 0.036,
+      "num_tokens": 13794072.0,
+      "reward": 0.125,
+      "reward_std": 0.1746530830860138,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.17743022739887238,
+      "step": 701,
+      "step_time": 441.7110221767798
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 952.0,
+      "completions/max_terminated_length": 952.0,
+      "completions/mean_length": 561.3125,
+      "completions/mean_terminated_length": 561.3125,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "entropy": 0.5237724184989929,
+      "epoch": 3.7142857142857144,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0223388671875,
+      "kl": 0.0015220106579363346,
+      "learning_rate": 2.2514318929191706e-07,
+      "loss": 0.0,
+      "num_tokens": 13810757.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 702,
+      "step_time": 111.1009431509301
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 439.0,
+      "completions/mean_length": 699.9375,
+      "completions/mean_terminated_length": 375.875,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.4943298399448395,
+      "epoch": 3.7195767195767195,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.828125,
+      "kl": 0.001631591934710741,
+      "learning_rate": 2.237634634350934e-07,
+      "loss": 0.0044,
+      "num_tokens": 13831820.0,
+      "reward": 0.6041666865348816,
+      "reward_std": 0.49329501390457153,
+      "rewards/itbench_correctness/mean": 0.6041666865348816,
+      "rewards/itbench_correctness/std": 0.4901813864707947,
+      "step": 703,
+      "step_time": 122.44539823755622
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1000.0,
+      "completions/mean_length": 715.6875,
+      "completions/mean_terminated_length": 530.7000122070312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4499170482158661,
+      "epoch": 3.7248677248677247,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.0019055476877838373,
+      "learning_rate": 2.223867584567766e-07,
+      "loss": -0.0356,
+      "num_tokens": 13848519.0,
+      "reward": 0.6761363744735718,
+      "reward_std": 0.2798159420490265,
+      "rewards/itbench_correctness/mean": 0.6761363744735718,
+      "rewards/itbench_correctness/std": 0.4717574715614319,
+      "step": 704,
+      "step_time": 72.04201124608517
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 882.0,
+      "completions/mean_length": 997.9375,
+      "completions/mean_terminated_length": 815.5,
+      "completions/min_length": 749.0,
+      "completions/min_terminated_length": 749.0,
+      "entropy": 0.5070458054542542,
+      "epoch": 3.7301587301587302,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4453125,
+      "kl": 0.0012196371098980308,
+      "learning_rate": 2.21013089412392e-07,
+      "loss": 0.0007,
+      "num_tokens": 13874222.0,
+      "reward": 0.27085813879966736,
+      "reward_std": 0.10523707419633865,
+      "rewards/itbench_correctness/mean": 0.27085813879966736,
+      "rewards/itbench_correctness/std": 0.3145284056663513,
+      "step": 705,
+      "step_time": 71.73409328702837
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 828.0,
+      "completions/max_terminated_length": 828.0,
+      "completions/mean_length": 560.5,
+      "completions/mean_terminated_length": 560.5,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "entropy": 0.4085637927055359,
+      "epoch": 3.7354497354497354,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.001750650000758469,
+      "learning_rate": 2.1964247132416368e-07,
+      "loss": 0.0159,
+      "num_tokens": 13886822.0,
+      "reward": 0.578125,
+      "reward_std": 0.13797250390052795,
+      "rewards/itbench_correctness/mean": 0.578125,
+      "rewards/itbench_correctness/std": 0.3949551582336426,
+      "step": 706,
+      "step_time": 854.83407723438
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 703.0,
+      "completions/mean_length": 760.5,
+      "completions/mean_terminated_length": 497.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4549638330936432,
+      "epoch": 3.7407407407407405,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6015625,
+      "kl": 0.004151123110204935,
+      "learning_rate": 2.1827491918095177e-07,
+      "loss": -0.0855,
+      "num_tokens": 13909070.0,
+      "reward": 0.4791666567325592,
+      "reward_std": 0.221320241689682,
+      "rewards/itbench_correctness/mean": 0.4791666567325592,
+      "rewards/itbench_correctness/std": 0.47871360182762146,
+      "step": 707,
+      "step_time": 122.79243450798094
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 957.75,
+      "completions/mean_terminated_length": 847.3333740234375,
+      "completions/min_length": 769.0,
+      "completions/min_terminated_length": 769.0,
+      "entropy": 0.48238056898117065,
+      "epoch": 3.746031746031746,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4375,
+      "kl": 0.001507254783064127,
+      "learning_rate": 2.1691044793808733e-07,
+      "loss": 0.0161,
+      "num_tokens": 13939002.0,
+      "reward": 0.515625,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.515625,
+      "rewards/itbench_correctness/std": 0.503891110420227,
+      "step": 708,
+      "step_time": 193.1200410258025
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 843.0,
+      "completions/max_terminated_length": 843.0,
+      "completions/mean_length": 613.0625,
+      "completions/mean_terminated_length": 613.0625,
+      "completions/min_length": 351.0,
+      "completions/min_terminated_length": 351.0,
+      "entropy": 0.39963299036026,
+      "epoch": 3.751322751322751,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.75,
+      "kl": 0.0013470432022586465,
+      "learning_rate": 2.1554907251720945e-07,
+      "loss": -0.0121,
+      "num_tokens": 13952611.0,
+      "reward": 0.6875,
+      "reward_std": 0.33614614605903625,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.4425306022167206,
+      "step": 709,
+      "step_time": 139.45972900651395
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 739.0,
+      "completions/max_terminated_length": 739.0,
+      "completions/mean_length": 533.5,
+      "completions/mean_terminated_length": 533.5,
+      "completions/min_length": 355.0,
+      "completions/min_terminated_length": 355.0,
+      "entropy": 0.5510777831077576,
+      "epoch": 3.7566137566137567,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0260009765625,
+      "kl": 0.0015925114275887609,
+      "learning_rate": 2.1419080780610122e-07,
+      "loss": 0.0,
+      "num_tokens": 13963859.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 710,
+      "step_time": 515.4728924324736
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 950.6875,
+      "completions/mean_terminated_length": 877.375,
+      "completions/min_length": 796.0,
+      "completions/min_terminated_length": 796.0,
+      "entropy": 0.3702583611011505,
+      "epoch": 3.761904761904762,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.201171875,
+      "kl": 0.001159166102297604,
+      "learning_rate": 2.128356686585282e-07,
+      "loss": 0.0,
+      "num_tokens": 13988342.0,
+      "reward": 0.875,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.12909944355487823,
+      "step": 711,
+      "step_time": 94.62389472685754
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 636.3125,
+      "completions/mean_terminated_length": 546.84619140625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5091837644577026,
+      "epoch": 3.7671957671957674,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3671875,
+      "kl": 0.0018951293313875794,
+      "learning_rate": 2.1148366989407497e-07,
+      "loss": -0.0344,
+      "num_tokens": 14007139.0,
+      "reward": 0.1041666716337204,
+      "reward_std": 0.1178511306643486,
+      "rewards/itbench_correctness/mean": 0.1041666716337204,
+      "rewards/itbench_correctness/std": 0.13437096774578094,
+      "step": 712,
+      "step_time": 456.887752013281
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 657.875,
+      "completions/mean_terminated_length": 573.3846435546875,
+      "completions/min_length": 307.0,
+      "completions/min_terminated_length": 307.0,
+      "entropy": 0.40737220644950867,
+      "epoch": 3.7724867724867726,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3984375,
+      "kl": 0.0015902683371677995,
+      "learning_rate": 2.101348262979833e-07,
+      "loss": -0.0082,
+      "num_tokens": 14027257.0,
+      "reward": 0.4742647111415863,
+      "reward_std": 0.0482964813709259,
+      "rewards/itbench_correctness/mean": 0.4742647111415863,
+      "rewards/itbench_correctness/std": 0.4942431151866913,
+      "step": 713,
+      "step_time": 178.64222278352827
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 391.0,
+      "completions/mean_length": 685.0625,
+      "completions/mean_terminated_length": 346.125,
+      "completions/min_length": 297.0,
+      "completions/min_terminated_length": 297.0,
+      "entropy": 0.6481160521507263,
+      "epoch": 3.7777777777777777,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 7.1875,
+      "kl": 0.0014102908316999674,
+      "learning_rate": 2.0878915262099096e-07,
+      "loss": 0.0,
+      "num_tokens": 14044666.0,
+      "reward": 0.046875,
+      "reward_std": 0.09300297498703003,
+      "rewards/itbench_correctness/mean": 0.046875,
+      "rewards/itbench_correctness/std": 0.1359764039516449,
+      "step": 714,
+      "step_time": 107.09331988729537
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 932.0,
+      "completions/mean_length": 779.3125,
+      "completions/mean_terminated_length": 668.0909423828125,
+      "completions/min_length": 421.0,
+      "completions/min_terminated_length": 421.0,
+      "entropy": 0.49530836939811707,
+      "epoch": 3.7830687830687832,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.0014117928221821785,
+      "learning_rate": 2.0744666357916925e-07,
+      "loss": 0.0542,
+      "num_tokens": 14060735.0,
+      "reward": 0.4479166865348816,
+      "reward_std": 0.20154890418052673,
+      "rewards/itbench_correctness/mean": 0.4479166865348816,
+      "rewards/itbench_correctness/std": 0.4550386667251587,
+      "step": 715,
+      "step_time": 420.523594789207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 808.0,
+      "completions/mean_length": 562.0625,
+      "completions/mean_terminated_length": 531.2667236328125,
+      "completions/min_length": 336.0,
+      "completions/min_terminated_length": 336.0,
+      "entropy": 0.50172358751297,
+      "epoch": 3.7883597883597884,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2421875,
+      "kl": 0.0018034385284408927,
+      "learning_rate": 2.0610737385376348e-07,
+      "loss": 0.0156,
+      "num_tokens": 14075944.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 716,
+      "step_time": 1006.8671864075586
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 628.0,
+      "completions/mean_length": 761.0625,
+      "completions/mean_terminated_length": 556.5555419921875,
+      "completions/min_length": 468.0,
+      "completions/min_terminated_length": 468.0,
+      "entropy": 0.41783690452575684,
+      "epoch": 3.7936507936507935,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.049560546875,
+      "kl": 0.0019152258755639195,
+      "learning_rate": 2.0477129809103145e-07,
+      "loss": 0.0001,
+      "num_tokens": 14098977.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 717,
+      "step_time": 450.0135377245024
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 872.0,
+      "completions/mean_length": 786.25,
+      "completions/mean_terminated_length": 643.6000366210938,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "entropy": 0.521462619304657,
+      "epoch": 3.798941798941799,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0230712890625,
+      "kl": 0.001451818854548037,
+      "learning_rate": 2.0343845090208367e-07,
+      "loss": 0.0,
+      "num_tokens": 14115053.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 718,
+      "step_time": 349.72742245160043
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 944.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 635.5625,
+      "completions/mean_terminated_length": 635.5625,
+      "completions/min_length": 366.0,
+      "completions/min_terminated_length": 366.0,
+      "entropy": 0.4814632833003998,
+      "epoch": 3.804232804232804,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0014020799426361918,
+      "learning_rate": 2.0210884686272367e-07,
+      "loss": 0.0339,
+      "num_tokens": 14129190.0,
+      "reward": 0.7466298937797546,
+      "reward_std": 0.3058916926383972,
+      "rewards/itbench_correctness/mean": 0.7466298937797546,
+      "rewards/itbench_correctness/std": 0.38068902492523193,
+      "step": 719,
+      "step_time": 175.0832874653861
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 673.0,
+      "completions/max_terminated_length": 673.0,
+      "completions/mean_length": 479.125,
+      "completions/mean_terminated_length": 479.125,
+      "completions/min_length": 325.0,
+      "completions/min_terminated_length": 325.0,
+      "entropy": 0.44873467087745667,
+      "epoch": 3.8095238095238093,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.21875,
+      "kl": 0.0020029842853546143,
+      "learning_rate": 2.0078250051328782e-07,
+      "loss": 0.0077,
+      "num_tokens": 14139808.0,
+      "reward": 0.648809552192688,
+      "reward_std": 0.17577169835567474,
+      "rewards/itbench_correctness/mean": 0.648809552192688,
+      "rewards/itbench_correctness/std": 0.28511843085289,
+      "step": 720,
+      "step_time": 102.81194345280528
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 570.0,
+      "completions/max_terminated_length": 570.0,
+      "completions/mean_length": 414.3125,
+      "completions/mean_terminated_length": 414.3125,
+      "completions/min_length": 328.0,
+      "completions/min_terminated_length": 328.0,
+      "entropy": 0.35721829533576965,
+      "epoch": 3.814814814814815,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.9375,
+      "kl": 0.0016476555028930306,
+      "learning_rate": 1.9945942635848745e-07,
+      "loss": -0.0019,
+      "num_tokens": 14150685.0,
+      "reward": 0.453125,
+      "reward_std": 0.13258251547813416,
+      "rewards/itbench_correctness/mean": 0.453125,
+      "rewards/itbench_correctness/std": 0.5018196105957031,
+      "step": 721,
+      "step_time": 877.2679685084149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 821.0,
+      "completions/mean_length": 784.125,
+      "completions/mean_terminated_length": 544.25,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.45911046862602234,
+      "epoch": 3.82010582010582,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.328125,
+      "kl": 0.0014677182771265507,
+      "learning_rate": 1.981396388672496e-07,
+      "loss": 0.0,
+      "num_tokens": 14172399.0,
+      "reward": 0.2109375,
+      "reward_std": 0.06629125773906708,
+      "rewards/itbench_correctness/mean": 0.2109375,
+      "rewards/itbench_correctness/std": 0.2359323352575302,
+      "step": 722,
+      "step_time": 208.89351680781692
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 906.0,
+      "completions/mean_length": 920.625,
+      "completions/mean_terminated_length": 817.25,
+      "completions/min_length": 737.0,
+      "completions/min_terminated_length": 737.0,
+      "entropy": 0.262864887714386,
+      "epoch": 3.825396825396825,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0010848907986655831,
+      "learning_rate": 1.9682315247255892e-07,
+      "loss": 0.0043,
+      "num_tokens": 14196345.0,
+      "reward": 0.3854166865348816,
+      "reward_std": 0.297717809677124,
+      "rewards/itbench_correctness/mean": 0.3854166865348816,
+      "rewards/itbench_correctness/std": 0.3145764470100403,
+      "step": 723,
+      "step_time": 847.0749486461282
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 602.0,
+      "completions/max_terminated_length": 602.0,
+      "completions/mean_length": 442.375,
+      "completions/mean_terminated_length": 442.375,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "entropy": 0.47018933296203613,
+      "epoch": 3.8306878306878307,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.9609375,
+      "kl": 0.0014332140563055873,
+      "learning_rate": 1.9550998157129944e-07,
+      "loss": -0.0054,
+      "num_tokens": 14206399.0,
+      "reward": 0.921875,
+      "reward_std": 0.13258251547813416,
+      "rewards/itbench_correctness/mean": 0.921875,
+      "rewards/itbench_correctness/std": 0.1983000785112381,
+      "step": 724,
+      "step_time": 91.33387219905853
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 879.0,
+      "completions/mean_length": 687.5625,
+      "completions/mean_terminated_length": 425.8888854980469,
+      "completions/min_length": 314.0,
+      "completions/min_terminated_length": 314.0,
+      "entropy": 0.3752386271953583,
+      "epoch": 3.835978835978836,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.640625,
+      "kl": 0.0017074119532480836,
+      "learning_rate": 1.942001405240979e-07,
+      "loss": 0.0057,
+      "num_tokens": 14222120.0,
+      "reward": 0.2232142835855484,
+      "reward_std": 0.18483898043632507,
+      "rewards/itbench_correctness/mean": 0.2232142835855484,
+      "rewards/itbench_correctness/std": 0.21329134702682495,
+      "step": 725,
+      "step_time": 93.2281863456592
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 730.0,
+      "completions/mean_length": 709.0,
+      "completions/mean_terminated_length": 604.0,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "entropy": 0.5275035500526428,
+      "epoch": 3.8412698412698414,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.28125,
+      "kl": 0.002380735008046031,
+      "learning_rate": 1.9289364365516607e-07,
+      "loss": 0.0403,
+      "num_tokens": 14258400.0,
+      "reward": 0.453125,
+      "reward_std": 0.13258251547813416,
+      "rewards/itbench_correctness/mean": 0.453125,
+      "rewards/itbench_correctness/std": 0.5018196105957031,
+      "step": 726,
+      "step_time": 588.1754347216338
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 767.0,
+      "completions/mean_length": 655.8125,
+      "completions/mean_terminated_length": 631.2667236328125,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "entropy": 0.5062422752380371,
+      "epoch": 3.8465608465608465,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6953125,
+      "kl": 0.001622261363081634,
+      "learning_rate": 1.915905052521445e-07,
+      "loss": 0.0287,
+      "num_tokens": 14277157.0,
+      "reward": 0.5,
+      "reward_std": 0.2177756428718567,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.3872983455657959,
+      "step": 727,
+      "step_time": 455.7577704479918
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 664.0,
+      "completions/max_terminated_length": 664.0,
+      "completions/mean_length": 502.9375,
+      "completions/mean_terminated_length": 502.9375,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "entropy": 0.3757922351360321,
+      "epoch": 3.851851851851852,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1875,
+      "kl": 0.0013325168984010816,
+      "learning_rate": 1.9029073956594604e-07,
+      "loss": -0.0056,
+      "num_tokens": 14288780.0,
+      "reward": 0.90625,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.90625,
+      "rewards/itbench_correctness/std": 0.20155644416809082,
+      "step": 728,
+      "step_time": 82.46759236324579
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 680.0,
+      "completions/max_terminated_length": 680.0,
+      "completions/mean_length": 541.125,
+      "completions/mean_terminated_length": 541.125,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.3862323760986328,
+      "epoch": 3.857142857142857,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.033935546875,
+      "kl": 0.0012876316905021667,
+      "learning_rate": 1.8899436081059972e-07,
+      "loss": 0.0,
+      "num_tokens": 14300838.0,
+      "reward": 0.75,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 729,
+      "step_time": 74.73872442170978
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 920.1875,
+      "completions/mean_terminated_length": 786.7142944335938,
+      "completions/min_length": 736.0,
+      "completions/min_terminated_length": 736.0,
+      "entropy": 0.22712762653827667,
+      "epoch": 3.8624338624338623,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 10.0,
+      "kl": 0.0014261262258514762,
+      "learning_rate": 1.877013831630961e-07,
+      "loss": -0.022,
+      "num_tokens": 14323721.0,
+      "reward": 0.1458333432674408,
+      "reward_std": 0.0589255690574646,
+      "rewards/itbench_correctness/mean": 0.1458333432674408,
+      "rewards/itbench_correctness/std": 0.17078252136707306,
+      "step": 730,
+      "step_time": 134.0663373246789
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 893.0,
+      "completions/max_terminated_length": 893.0,
+      "completions/mean_length": 542.5,
+      "completions/mean_terminated_length": 542.5,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "entropy": 0.26359447836875916,
+      "epoch": 3.867724867724868,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.265625,
+      "kl": 0.0024193329736590385,
+      "learning_rate": 1.8641182076323148e-07,
+      "loss": -0.0109,
+      "num_tokens": 14337273.0,
+      "reward": 0.4166666865348816,
+      "reward_std": 0.08908706903457642,
+      "rewards/itbench_correctness/mean": 0.4166666865348816,
+      "rewards/itbench_correctness/std": 0.14907118678092957,
+      "step": 731,
+      "step_time": 69.43494361732155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 542.0,
+      "completions/mean_length": 759.0625,
+      "completions/mean_terminated_length": 494.125,
+      "completions/min_length": 468.0,
+      "completions/min_terminated_length": 468.0,
+      "entropy": 0.3925895392894745,
+      "epoch": 3.873015873015873,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5078125,
+      "kl": 0.0015624676598235965,
+      "learning_rate": 1.8512568771345378e-07,
+      "loss": 0.0,
+      "num_tokens": 14358434.0,
+      "reward": 0.9166666865348816,
+      "reward_std": 0.17817416787147522,
+      "rewards/itbench_correctness/mean": 0.9166666865348816,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 732,
+      "step_time": 105.75646356213838
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 645.6875,
+      "completions/mean_terminated_length": 620.4666748046875,
+      "completions/min_length": 438.0,
+      "completions/min_terminated_length": 438.0,
+      "entropy": 0.6411770582199097,
+      "epoch": 3.878306878306878,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.0023670424707233906,
+      "learning_rate": 1.8384299807870805e-07,
+      "loss": -0.0062,
+      "num_tokens": 14375989.0,
+      "reward": 0.4270833134651184,
+      "reward_std": 0.019287927076220512,
+      "rewards/itbench_correctness/mean": 0.4270833134651184,
+      "rewards/itbench_correctness/std": 0.44187626242637634,
+      "step": 733,
+      "step_time": 85.2570496154949
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 450.0,
+      "completions/mean_length": 710.0625,
+      "completions/mean_terminated_length": 396.125,
+      "completions/min_length": 334.0,
+      "completions/min_terminated_length": 334.0,
+      "entropy": 0.4816477298736572,
+      "epoch": 3.8835978835978837,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0220947265625,
+      "kl": 0.0012989024398848414,
+      "learning_rate": 1.8256376588628235e-07,
+      "loss": 0.0,
+      "num_tokens": 14392414.0,
+      "reward": 0.6785714626312256,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.6785714626312256,
+      "rewards/itbench_correctness/std": 0.18442778289318085,
+      "step": 734,
+      "step_time": 1033.4855123637244
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 899.0,
+      "completions/mean_length": 602.5625,
+      "completions/mean_terminated_length": 411.0,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "entropy": 0.44476714730262756,
+      "epoch": 3.888888888888889,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8515625,
+      "kl": 0.0017304952489212155,
+      "learning_rate": 1.812880051256551e-07,
+      "loss": -0.0347,
+      "num_tokens": 14405887.0,
+      "reward": 0.3031249940395355,
+      "reward_std": 0.3322408199310303,
+      "rewards/itbench_correctness/mean": 0.3031249940395355,
+      "rewards/itbench_correctness/std": 0.3288711905479431,
+      "step": 735,
+      "step_time": 81.48759328760207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 607.625,
+      "completions/mean_terminated_length": 548.1428833007812,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "entropy": 0.3587739169597626,
+      "epoch": 3.894179894179894,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.04931640625,
+      "kl": 0.0017037625657394528,
+      "learning_rate": 1.8001572974834168e-07,
+      "loss": 0.0,
+      "num_tokens": 14424505.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 736,
+      "step_time": 320.6608420452103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 850.0,
+      "completions/mean_length": 610.8125,
+      "completions/mean_terminated_length": 583.2667236328125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4322111904621124,
+      "epoch": 3.8994708994708995,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.498046875,
+      "kl": 0.0016470147529616952,
+      "learning_rate": 1.787469536677419e-07,
+      "loss": -0.1173,
+      "num_tokens": 14461894.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 737,
+      "step_time": 313.65273729898036
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 737.0,
+      "completions/mean_length": 715.9375,
+      "completions/mean_terminated_length": 476.3333435058594,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 0.7011785507202148,
+      "epoch": 3.9047619047619047,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.796875,
+      "kl": 0.001664389856159687,
+      "learning_rate": 1.7748169075898727e-07,
+      "loss": 0.0427,
+      "num_tokens": 14482421.0,
+      "reward": 0.2447916716337204,
+      "reward_std": 0.235783189535141,
+      "rewards/itbench_correctness/mean": 0.2447916716337204,
+      "rewards/itbench_correctness/std": 0.3475692868232727,
+      "step": 738,
+      "step_time": 84.38055996689945
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 554.0,
+      "completions/max_terminated_length": 554.0,
+      "completions/mean_length": 416.9375,
+      "completions/mean_terminated_length": 416.9375,
+      "completions/min_length": 303.0,
+      "completions/min_terminated_length": 303.0,
+      "entropy": 0.4149302840232849,
+      "epoch": 3.91005291005291,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.09375,
+      "kl": 0.0016000322066247463,
+      "learning_rate": 1.762199548587906e-07,
+      "loss": -0.0046,
+      "num_tokens": 14491636.0,
+      "reward": 0.546875,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.546875,
+      "rewards/itbench_correctness/std": 0.07739239931106567,
+      "step": 739,
+      "step_time": 123.19425270985812
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 993.0,
+      "completions/mean_length": 759.5,
+      "completions/mean_terminated_length": 639.2727661132812,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "entropy": 0.35286372900009155,
+      "epoch": 3.9153439153439153,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.84375,
+      "kl": 0.0021646912209689617,
+      "learning_rate": 1.7496175976529337e-07,
+      "loss": -0.0035,
+      "num_tokens": 14509100.0,
+      "reward": 0.75,
+      "reward_std": 0.1259881556034088,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.31031644344329834,
+      "step": 740,
+      "step_time": 139.75384074263275
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 909.0,
+      "completions/max_terminated_length": 909.0,
+      "completions/mean_length": 614.625,
+      "completions/mean_terminated_length": 614.625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5694529414176941,
+      "epoch": 3.9206349206349205,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0546875,
+      "kl": 0.0038674459792673588,
+      "learning_rate": 1.7370711923791564e-07,
+      "loss": 0.0097,
+      "num_tokens": 14532734.0,
+      "reward": 0.265625,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.265625,
+      "rewards/itbench_correctness/std": 0.28090256452560425,
+      "step": 741,
+      "step_time": 372.90891189686954
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 855.0,
+      "completions/mean_length": 923.75,
+      "completions/mean_terminated_length": 623.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5044655203819275,
+      "epoch": 3.925925925925926,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.890625,
+      "kl": 0.0016132418531924486,
+      "learning_rate": 1.7245604699720535e-07,
+      "loss": -0.069,
+      "num_tokens": 14559434.0,
+      "reward": 0.18854166567325592,
+      "reward_std": 0.15936720371246338,
+      "rewards/itbench_correctness/mean": 0.18854166567325592,
+      "rewards/itbench_correctness/std": 0.15572041273117065,
+      "step": 742,
+      "step_time": 162.81722828093916
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 870.875,
+      "completions/mean_terminated_length": 819.8333740234375,
+      "completions/min_length": 529.0,
+      "completions/min_terminated_length": 529.0,
+      "entropy": 0.4018946588039398,
+      "epoch": 3.931216931216931,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.0014265145873650908,
+      "learning_rate": 1.7120855672468776e-07,
+      "loss": -0.0312,
+      "num_tokens": 14580296.0,
+      "reward": 0.5625,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 743,
+      "step_time": 541.2288927352056
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 715.0,
+      "completions/mean_length": 771.0625,
+      "completions/mean_terminated_length": 574.3333129882812,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.5732349753379822,
+      "epoch": 3.9365079365079367,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4296875,
+      "kl": 0.0019290586933493614,
+      "learning_rate": 1.6996466206271675e-07,
+      "loss": -0.0139,
+      "num_tokens": 14620177.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 744,
+      "step_time": 638.7111993785948
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 632.0,
+      "completions/max_terminated_length": 632.0,
+      "completions/mean_length": 472.5,
+      "completions/mean_terminated_length": 472.5,
+      "completions/min_length": 353.0,
+      "completions/min_terminated_length": 353.0,
+      "entropy": 0.40634921193122864,
+      "epoch": 3.941798941798942,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1875,
+      "kl": 0.0012795755174010992,
+      "learning_rate": 1.6872437661432516e-07,
+      "loss": 0.0089,
+      "num_tokens": 14630313.0,
+      "reward": 0.734375,
+      "reward_std": 0.15738674998283386,
+      "rewards/itbench_correctness/mean": 0.734375,
+      "rewards/itbench_correctness/std": 0.34856685996055603,
+      "step": 745,
+      "step_time": 94.9421982690692
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 629.0,
+      "completions/max_terminated_length": 629.0,
+      "completions/mean_length": 483.375,
+      "completions/mean_terminated_length": 483.375,
+      "completions/min_length": 410.0,
+      "completions/min_terminated_length": 410.0,
+      "entropy": 0.39513835310935974,
+      "epoch": 3.947089947089947,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.125,
+      "kl": 0.004012894816696644,
+      "learning_rate": 1.674877139430758e-07,
+      "loss": -0.015,
+      "num_tokens": 14646847.0,
+      "reward": 0.4375,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 746,
+      "step_time": 83.17668206058443
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1015.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 630.6875,
+      "completions/mean_terminated_length": 630.6875,
+      "completions/min_length": 491.0,
+      "completions/min_terminated_length": 491.0,
+      "entropy": 0.4281042516231537,
+      "epoch": 3.9523809523809526,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6640625,
+      "kl": 0.001534756040200591,
+      "learning_rate": 1.6625468757291377e-07,
+      "loss": -0.012,
+      "num_tokens": 14660506.0,
+      "reward": 0.640625,
+      "reward_std": 0.27564918994903564,
+      "rewards/itbench_correctness/mean": 0.640625,
+      "rewards/itbench_correctness/std": 0.341183602809906,
+      "step": 747,
+      "step_time": 418.7796147307381
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 585.625,
+      "completions/mean_terminated_length": 439.5,
+      "completions/min_length": 337.0,
+      "completions/min_terminated_length": 337.0,
+      "entropy": 0.35346850752830505,
+      "epoch": 3.9576719576719577,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8671875,
+      "kl": 0.0018646334065124393,
+      "learning_rate": 1.6502531098801753e-07,
+      "loss": -0.0214,
+      "num_tokens": 14676340.0,
+      "reward": 0.7291666269302368,
+      "reward_std": 0.3471825420856476,
+      "rewards/itbench_correctness/mean": 0.7291666269302368,
+      "rewards/itbench_correctness/std": 0.3542075455188751,
+      "step": 748,
+      "step_time": 124.75608675274998
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 735.3125,
+      "completions/mean_terminated_length": 694.0714721679688,
+      "completions/min_length": 547.0,
+      "completions/min_terminated_length": 547.0,
+      "entropy": 0.5439863801002502,
+      "epoch": 3.962962962962963,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7109375,
+      "kl": 0.0013193426420912147,
+      "learning_rate": 1.6379959763265266e-07,
+      "loss": 0.028,
+      "num_tokens": 14694473.0,
+      "reward": 0.375,
+      "reward_std": 0.249358132481575,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.273861289024353,
+      "step": 749,
+      "step_time": 466.49796204734594
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 735.25,
+      "completions/mean_terminated_length": 562.0,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "entropy": 0.4950697124004364,
+      "epoch": 3.9682539682539684,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.0012579681351780891,
+      "learning_rate": 1.62577560911024e-07,
+      "loss": -0.0039,
+      "num_tokens": 14711301.0,
+      "reward": 0.65625,
+      "reward_std": 0.30173346400260925,
+      "rewards/itbench_correctness/mean": 0.65625,
+      "rewards/itbench_correctness/std": 0.375,
+      "step": 750,
+      "step_time": 988.6653963262215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 682.0,
+      "completions/max_terminated_length": 682.0,
+      "completions/mean_length": 411.375,
+      "completions/mean_terminated_length": 411.375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.47645092010498047,
+      "epoch": 3.9735449735449735,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.06201171875,
+      "kl": 0.002736019669100642,
+      "learning_rate": 1.6135921418712955e-07,
+      "loss": 0.0,
+      "num_tokens": 14720819.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 751,
+      "step_time": 143.75118728913367
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 789.0,
+      "completions/mean_length": 756.6875,
+      "completions/mean_terminated_length": 548.7777709960938,
+      "completions/min_length": 348.0,
+      "completions/min_terminated_length": 348.0,
+      "entropy": 0.5418353080749512,
+      "epoch": 3.9788359788359786,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.375,
+      "kl": 0.0015679231146350503,
+      "learning_rate": 1.601445707846135e-07,
+      "loss": 0.0288,
+      "num_tokens": 14743110.0,
+      "reward": 0.3697916865348816,
+      "reward_std": 0.014731383882462978,
+      "rewards/itbench_correctness/mean": 0.3697916865348816,
+      "rewards/itbench_correctness/std": 0.3824491500854492,
+      "step": 752,
+      "step_time": 114.25977344904095
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 973.0,
+      "completions/mean_length": 878.625,
+      "completions/mean_terminated_length": 812.5454711914062,
+      "completions/min_length": 700.0,
+      "completions/min_terminated_length": 700.0,
+      "entropy": 0.29022619128227234,
+      "epoch": 3.984126984126984,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.041748046875,
+      "kl": 0.001140591804869473,
+      "learning_rate": 1.5893364398662174e-07,
+      "loss": 0.0,
+      "num_tokens": 14769432.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 753,
+      "step_time": 683.4805310554802
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 809.3125,
+      "completions/mean_terminated_length": 680.5,
+      "completions/min_length": 461.0,
+      "completions/min_terminated_length": 461.0,
+      "entropy": 0.2903698980808258,
+      "epoch": 3.9894179894179893,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6171875,
+      "kl": 0.0009075679117813706,
+      "learning_rate": 1.5772644703565564e-07,
+      "loss": -0.0137,
+      "num_tokens": 14788877.0,
+      "reward": 0.7447916865348816,
+      "reward_std": 0.1927933692932129,
+      "rewards/itbench_correctness/mean": 0.7447916865348816,
+      "rewards/itbench_correctness/std": 0.2643453776836395,
+      "step": 754,
+      "step_time": 105.73342135362327
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 823.0,
+      "completions/mean_terminated_length": 666.6666870117188,
+      "completions/min_length": 583.0,
+      "completions/min_terminated_length": 583.0,
+      "entropy": 0.4835965931415558,
+      "epoch": 3.9947089947089944,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.453125,
+      "kl": 0.001044503878802061,
+      "learning_rate": 1.565229931334277e-07,
+      "loss": 0.0002,
+      "num_tokens": 14808949.0,
+      "reward": 0.596875011920929,
+      "reward_std": 0.041052017360925674,
+      "rewards/itbench_correctness/mean": 0.596875011920929,
+      "rewards/itbench_correctness/std": 0.4201066493988037,
+      "step": 755,
+      "step_time": 158.0458857798949
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 702.8125,
+      "completions/mean_terminated_length": 656.9285888671875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.26465097069740295,
+      "epoch": 4.0,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.050537109375,
+      "kl": 0.001511210692115128,
+      "learning_rate": 1.553232954407171e-07,
+      "loss": 0.0,
+      "num_tokens": 14833258.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 756,
+      "step_time": 233.77977779414505
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 826.0,
+      "completions/max_terminated_length": 826.0,
+      "completions/mean_length": 684.9375,
+      "completions/mean_terminated_length": 684.9375,
+      "completions/min_length": 562.0,
+      "completions/min_terminated_length": 562.0,
+      "entropy": 0.49639564752578735,
+      "epoch": 4.005291005291006,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9765625,
+      "kl": 0.0017367677064612508,
+      "learning_rate": 1.5412736707722534e-07,
+      "loss": -0.0237,
+      "num_tokens": 14860073.0,
+      "reward": 0.38749998807907104,
+      "reward_std": 0.42026326060295105,
+      "rewards/itbench_correctness/mean": 0.38749998807907104,
+      "rewards/itbench_correctness/std": 0.49244290590286255,
+      "step": 757,
+      "step_time": 126.27328568976372
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 557.0,
+      "completions/mean_length": 696.6875,
+      "completions/mean_terminated_length": 442.1111145019531,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5597918629646301,
+      "epoch": 4.01058201058201,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.921875,
+      "kl": 0.0017616557888686657,
+      "learning_rate": 1.529352211214337e-07,
+      "loss": -0.0553,
+      "num_tokens": 14877180.0,
+      "reward": 0.5062500238418579,
+      "reward_std": 0.23287571966648102,
+      "rewards/itbench_correctness/mean": 0.5062500238418579,
+      "rewards/itbench_correctness/std": 0.33042481541633606,
+      "step": 758,
+      "step_time": 145.3167848372832
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 443.0,
+      "completions/mean_length": 701.1875,
+      "completions/mean_terminated_length": 378.375,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "entropy": 0.3793564438819885,
+      "epoch": 4.015873015873016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7265625,
+      "kl": 0.0020792728755623102,
+      "learning_rate": 1.517468706104589e-07,
+      "loss": -0.0105,
+      "num_tokens": 14894423.0,
+      "reward": 0.75,
+      "reward_std": 0.1315174549818039,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.2357022613286972,
+      "step": 759,
+      "step_time": 143.815074888058
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 896.0,
+      "completions/max_terminated_length": 896.0,
+      "completions/mean_length": 579.625,
+      "completions/mean_terminated_length": 579.625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4658184051513672,
+      "epoch": 4.021164021164021,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.703125,
+      "kl": 0.0024365338031202555,
+      "learning_rate": 1.5056232853991208e-07,
+      "loss": -0.0313,
+      "num_tokens": 14908353.0,
+      "reward": 0.7057291865348816,
+      "reward_std": 0.188043013215065,
+      "rewards/itbench_correctness/mean": 0.7057291865348816,
+      "rewards/itbench_correctness/std": 0.325060099363327,
+      "step": 760,
+      "step_time": 169.37305662687868
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 720.0,
+      "completions/max_terminated_length": 720.0,
+      "completions/mean_length": 617.875,
+      "completions/mean_terminated_length": 617.875,
+      "completions/min_length": 518.0,
+      "completions/min_terminated_length": 518.0,
+      "entropy": 0.2735181152820587,
+      "epoch": 4.026455026455026,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8515625,
+      "kl": 0.0019248781027272344,
+      "learning_rate": 1.493816078637557e-07,
+      "loss": 0.0042,
+      "num_tokens": 14922327.0,
+      "reward": 0.5,
+      "reward_std": 0.4629100561141968,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 761,
+      "step_time": 152.33051012922078
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 825.9375,
+      "completions/mean_terminated_length": 707.1000366210938,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 0.5787363052368164,
+      "epoch": 4.031746031746032,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.828125,
+      "kl": 0.0014200946316123009,
+      "learning_rate": 1.4820472149416153e-07,
+      "loss": 0.022,
+      "num_tokens": 14957710.0,
+      "reward": 0.09375,
+      "reward_std": 0.03788072243332863,
+      "rewards/itbench_correctness/mean": 0.09375,
+      "rewards/itbench_correctness/std": 0.10978876054286957,
+      "step": 762,
+      "step_time": 102.25276782084256
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 949.5625,
+      "completions/mean_terminated_length": 875.125,
+      "completions/min_length": 767.0,
+      "completions/min_terminated_length": 767.0,
+      "entropy": 0.3391035199165344,
+      "epoch": 4.037037037037037,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8984375,
+      "kl": 0.0015424852026626468,
+      "learning_rate": 1.470316823013707e-07,
+      "loss": 0.0162,
+      "num_tokens": 14984159.0,
+      "reward": 0.4062500298023224,
+      "reward_std": 0.2351749688386917,
+      "rewards/itbench_correctness/mean": 0.4062500298023224,
+      "rewards/itbench_correctness/std": 0.2916666865348816,
+      "step": 763,
+      "step_time": 115.0978871025145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 812.3125,
+      "completions/mean_terminated_length": 782.0714721679688,
+      "completions/min_length": 617.0,
+      "completions/min_terminated_length": 617.0,
+      "entropy": 0.5022697448730469,
+      "epoch": 4.042328042328043,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 2.21875,
+      "kl": 0.0012976002180948853,
+      "learning_rate": 1.4586250311355132e-07,
+      "loss": -0.0125,
+      "num_tokens": 15003988.0,
+      "reward": 0.15625,
+      "reward_std": 0.18600594997406006,
+      "rewards/itbench_correctness/mean": 0.15625,
+      "rewards/itbench_correctness/std": 0.3010398745536804,
+      "step": 764,
+      "step_time": 219.78546244930476
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 832.0,
+      "completions/mean_length": 883.4375,
+      "completions/mean_terminated_length": 742.875,
+      "completions/min_length": 634.0,
+      "completions/min_terminated_length": 634.0,
+      "entropy": 0.4867350459098816,
+      "epoch": 4.0476190476190474,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9453125,
+      "kl": 0.00221477122977376,
+      "learning_rate": 1.4469719671666043e-07,
+      "loss": -0.0099,
+      "num_tokens": 15032371.0,
+      "reward": 0.7265625,
+      "reward_std": 0.3056884706020355,
+      "rewards/itbench_correctness/mean": 0.7265625,
+      "rewards/itbench_correctness/std": 0.3329750895500183,
+      "step": 765,
+      "step_time": 196.3956578373909
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 634.0,
+      "completions/mean_terminated_length": 634.0,
+      "completions/min_length": 397.0,
+      "completions/min_terminated_length": 397.0,
+      "entropy": 0.44164037704467773,
+      "epoch": 4.052910052910053,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.078125,
+      "kl": 0.0014499113894999027,
+      "learning_rate": 1.435357758543015e-07,
+      "loss": -0.0066,
+      "num_tokens": 15046147.0,
+      "reward": 0.8051470518112183,
+      "reward_std": 0.252979040145874,
+      "rewards/itbench_correctness/mean": 0.8051470518112183,
+      "rewards/itbench_correctness/std": 0.399953156709671,
+      "step": 766,
+      "step_time": 170.79877135157585
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 512.0,
+      "completions/mean_length": 731.25,
+      "completions/mean_terminated_length": 438.5,
+      "completions/min_length": 369.0,
+      "completions/min_terminated_length": 369.0,
+      "entropy": 0.2762393057346344,
+      "epoch": 4.058201058201059,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.033447265625,
+      "kl": 0.0011394057655707002,
+      "learning_rate": 1.4237825322758735e-07,
+      "loss": 0.0,
+      "num_tokens": 15063599.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 767,
+      "step_time": 863.5087349172682
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 764.25,
+      "completions/mean_terminated_length": 608.4000244140625,
+      "completions/min_length": 362.0,
+      "completions/min_terminated_length": 362.0,
+      "entropy": 0.3663722574710846,
+      "epoch": 4.063492063492063,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1953125,
+      "kl": 0.0017420370131731033,
+      "learning_rate": 1.412246414949997e-07,
+      "loss": -0.0017,
+      "num_tokens": 15080899.0,
+      "reward": 0.234375,
+      "reward_std": 0.1953546553850174,
+      "rewards/itbench_correctness/mean": 0.234375,
+      "rewards/itbench_correctness/std": 0.36032232642173767,
+      "step": 768,
+      "step_time": 277.51467712502927
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 764.5625,
+      "completions/mean_terminated_length": 704.6923217773438,
+      "completions/min_length": 474.0,
+      "completions/min_terminated_length": 474.0,
+      "entropy": 0.34791138768196106,
+      "epoch": 4.068783068783069,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.96875,
+      "kl": 0.0010995686752721667,
+      "learning_rate": 1.400749532722516e-07,
+      "loss": 0.0227,
+      "num_tokens": 15098204.0,
+      "reward": 0.46875,
+      "reward_std": 0.353828489780426,
+      "rewards/itbench_correctness/mean": 0.46875,
+      "rewards/itbench_correctness/std": 0.4181916415691376,
+      "step": 769,
+      "step_time": 572.5676989480853
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 407.0,
+      "completions/mean_length": 687.0625,
+      "completions/mean_terminated_length": 350.125,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "entropy": 0.5006822347640991,
+      "epoch": 4.074074074074074,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5703125,
+      "kl": 0.0013974824687466025,
+      "learning_rate": 1.389292011321498e-07,
+      "loss": 0.0,
+      "num_tokens": 15122029.0,
+      "reward": 0.6875,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.3095695972442627,
+      "step": 770,
+      "step_time": 206.3093525590375
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 810.0,
+      "completions/max_terminated_length": 810.0,
+      "completions/mean_length": 523.125,
+      "completions/mean_terminated_length": 523.125,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "entropy": 0.4262843430042267,
+      "epoch": 4.079365079365079,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.0012783248675987124,
+      "learning_rate": 1.3778739760445552e-07,
+      "loss": -0.0522,
+      "num_tokens": 15132895.0,
+      "reward": 0.31534090638160706,
+      "reward_std": 0.3429919481277466,
+      "rewards/itbench_correctness/mean": 0.31534090638160706,
+      "rewards/itbench_correctness/std": 0.34952497482299805,
+      "step": 771,
+      "step_time": 136.4253909336403
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1006.0,
+      "completions/mean_length": 935.0,
+      "completions/mean_terminated_length": 905.3333740234375,
+      "completions/min_length": 770.0,
+      "completions/min_terminated_length": 770.0,
+      "entropy": 0.498395711183548,
+      "epoch": 4.084656084656085,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.1875,
+      "kl": 0.001486019929870963,
+      "learning_rate": 1.3664955517574967e-07,
+      "loss": 0.0119,
+      "num_tokens": 15163503.0,
+      "reward": 0.4791666865348816,
+      "reward_std": 0.4529353678226471,
+      "rewards/itbench_correctness/mean": 0.4791666865348816,
+      "rewards/itbench_correctness/std": 0.4549115002155304,
+      "step": 772,
+      "step_time": 149.56670145317912
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 941.0,
+      "completions/mean_length": 700.25,
+      "completions/mean_terminated_length": 506.0,
+      "completions/min_length": 325.0,
+      "completions/min_terminated_length": 325.0,
+      "entropy": 0.4455551505088806,
+      "epoch": 4.08994708994709,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.40625,
+      "kl": 0.001568189705722034,
+      "learning_rate": 1.3551568628929432e-07,
+      "loss": 0.0036,
+      "num_tokens": 15185307.0,
+      "reward": 0.125,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.273861289024353,
+      "step": 773,
+      "step_time": 821.3875108454376
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 904.0,
+      "completions/max_terminated_length": 904.0,
+      "completions/mean_length": 763.9375,
+      "completions/mean_terminated_length": 763.9375,
+      "completions/min_length": 583.0,
+      "completions/min_terminated_length": 583.0,
+      "entropy": 0.44768059253692627,
+      "epoch": 4.095238095238095,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0322265625,
+      "kl": 0.0012937538558617234,
+      "learning_rate": 1.3438580334489818e-07,
+      "loss": 0.0,
+      "num_tokens": 15207562.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 774,
+      "step_time": 537.4289036728442
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 642.8125,
+      "completions/mean_terminated_length": 617.4000244140625,
+      "completions/min_length": 366.0,
+      "completions/min_terminated_length": 366.0,
+      "entropy": 0.4698103964328766,
+      "epoch": 4.1005291005291005,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0859375,
+      "kl": 0.0019668727181851864,
+      "learning_rate": 1.3325991869878012e-07,
+      "loss": -0.0025,
+      "num_tokens": 15221975.0,
+      "reward": 0.15625,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.15625,
+      "rewards/itbench_correctness/std": 0.23935678601264954,
+      "step": 775,
+      "step_time": 191.42062663193792
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 817.0,
+      "completions/max_terminated_length": 817.0,
+      "completions/mean_length": 593.375,
+      "completions/mean_terminated_length": 593.375,
+      "completions/min_length": 433.0,
+      "completions/min_terminated_length": 433.0,
+      "entropy": 0.434800922870636,
+      "epoch": 4.105820105820106,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3359375,
+      "kl": 0.0018295013578608632,
+      "learning_rate": 1.321380446634342e-07,
+      "loss": 0.0433,
+      "num_tokens": 15235517.0,
+      "reward": 0.75,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.30276504158973694,
+      "step": 776,
+      "step_time": 495.94140707794577
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 567.0,
+      "completions/mean_length": 683.5625,
+      "completions/mean_terminated_length": 418.77777099609375,
+      "completions/min_length": 343.0,
+      "completions/min_terminated_length": 343.0,
+      "entropy": 0.5295785069465637,
+      "epoch": 4.111111111111111,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8359375,
+      "kl": 0.002165441866964102,
+      "learning_rate": 1.3102019350749527e-07,
+      "loss": 0.0164,
+      "num_tokens": 15252526.0,
+      "reward": 0.75,
+      "reward_std": 0.4629100561141968,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 777,
+      "step_time": 161.9596445625648
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 699.0,
+      "completions/mean_length": 830.625,
+      "completions/mean_terminated_length": 637.25,
+      "completions/min_length": 560.0,
+      "completions/min_terminated_length": 560.0,
+      "entropy": 0.4839729070663452,
+      "epoch": 4.116402116402116,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4921875,
+      "kl": 0.0012055831030011177,
+      "learning_rate": 1.299063774556042e-07,
+      "loss": 0.0,
+      "num_tokens": 15273064.0,
+      "reward": 0.9375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 778,
+      "step_time": 104.92770658805966
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 831.1875,
+      "completions/mean_terminated_length": 681.2222290039062,
+      "completions/min_length": 532.0,
+      "completions/min_terminated_length": 532.0,
+      "entropy": 0.42589667439460754,
+      "epoch": 4.121693121693122,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.203125,
+      "kl": 0.001626756857149303,
+      "learning_rate": 1.287966086882751e-07,
+      "loss": 0.0145,
+      "num_tokens": 15292003.0,
+      "reward": 0.8660714626312256,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.8660714626312256,
+      "rewards/itbench_correctness/std": 0.24169892072677612,
+      "step": 779,
+      "step_time": 1019.1612946912646
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 866.0,
+      "completions/max_terminated_length": 866.0,
+      "completions/mean_length": 657.125,
+      "completions/mean_terminated_length": 657.125,
+      "completions/min_length": 467.0,
+      "completions/min_terminated_length": 467.0,
+      "entropy": 0.366749107837677,
+      "epoch": 4.1269841269841265,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0078125,
+      "kl": 0.0010745770996436477,
+      "learning_rate": 1.2769089934176126e-07,
+      "loss": 0.0175,
+      "num_tokens": 15308181.0,
+      "reward": 0.9187500476837158,
+      "reward_std": 0.0258774496614933,
+      "rewards/itbench_correctness/mean": 0.9187500476837158,
+      "rewards/itbench_correctness/std": 0.09105858951807022,
+      "step": 780,
+      "step_time": 172.30875083897263
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 919.0,
+      "completions/mean_length": 626.6875,
+      "completions/mean_terminated_length": 600.2000122070312,
+      "completions/min_length": 322.0,
+      "completions/min_terminated_length": 322.0,
+      "entropy": 0.3909444510936737,
+      "epoch": 4.132275132275132,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0703125,
+      "kl": 0.0016821923200041056,
+      "learning_rate": 1.2658926150792322e-07,
+      "loss": 0.0125,
+      "num_tokens": 15333040.0,
+      "reward": 0.640625,
+      "reward_std": 0.0867956355214119,
+      "rewards/itbench_correctness/mean": 0.640625,
+      "rewards/itbench_correctness/std": 0.3896446228027344,
+      "step": 781,
+      "step_time": 372.3970377044752
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 615.75,
+      "completions/mean_terminated_length": 588.5333862304688,
+      "completions/min_length": 434.0,
+      "completions/min_terminated_length": 434.0,
+      "entropy": 0.43524158000946045,
+      "epoch": 4.137566137566138,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5390625,
+      "kl": 0.0014195973053574562,
+      "learning_rate": 1.2549170723409547e-07,
+      "loss": 0.0509,
+      "num_tokens": 15354348.0,
+      "reward": 0.04375000298023224,
+      "reward_std": 0.0176776684820652,
+      "rewards/itbench_correctness/mean": 0.04375000298023224,
+      "rewards/itbench_correctness/std": 0.05123475566506386,
+      "step": 782,
+      "step_time": 1192.5468442188576
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 530.0,
+      "completions/max_terminated_length": 530.0,
+      "completions/mean_length": 385.75,
+      "completions/mean_terminated_length": 385.75,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.3966299295425415,
+      "epoch": 4.142857142857143,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.1953125,
+      "kl": 0.0015992495464161038,
+      "learning_rate": 1.243982485229559e-07,
+      "loss": -0.0488,
+      "num_tokens": 15367896.0,
+      "reward": 0.6400861740112305,
+      "reward_std": 0.4733222424983978,
+      "rewards/itbench_correctness/mean": 0.6400861740112305,
+      "rewards/itbench_correctness/std": 0.4832458794116974,
+      "step": 783,
+      "step_time": 120.91243299655616
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 876.0,
+      "completions/max_terminated_length": 876.0,
+      "completions/mean_length": 658.6875,
+      "completions/mean_terminated_length": 658.6875,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 0.28237974643707275,
+      "epoch": 4.148148148148148,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.375,
+      "kl": 0.0011543561704456806,
+      "learning_rate": 1.2330889733239368e-07,
+      "loss": -0.006,
+      "num_tokens": 15385411.0,
+      "reward": 0.359375,
+      "reward_std": 0.04419417306780815,
+      "rewards/itbench_correctness/mean": 0.359375,
+      "rewards/itbench_correctness/std": 0.3760402202606201,
+      "step": 784,
+      "step_time": 991.2049660263583
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 839.375,
+      "completions/mean_terminated_length": 654.75,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "entropy": 0.3037974536418915,
+      "epoch": 4.1534391534391535,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.032958984375,
+      "kl": 0.0011951366905122995,
+      "learning_rate": 1.222236655753791e-07,
+      "loss": 0.0,
+      "num_tokens": 15404521.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 785,
+      "step_time": 491.68448298610747
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 994.0,
+      "completions/mean_length": 748.6875,
+      "completions/mean_terminated_length": 685.1538696289062,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.28850486874580383,
+      "epoch": 4.158730158730159,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.40625,
+      "kl": 0.0016689749900251627,
+      "learning_rate": 1.2114256511983274e-07,
+      "loss": -0.1494,
+      "num_tokens": 15422436.0,
+      "reward": 0.5833333730697632,
+      "reward_std": 0.3327338695526123,
+      "rewards/itbench_correctness/mean": 0.5833333730697632,
+      "rewards/itbench_correctness/std": 0.3648312985897064,
+      "step": 786,
+      "step_time": 87.55616814736277
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 623.0,
+      "completions/mean_length": 728.3125,
+      "completions/mean_terminated_length": 498.3333435058594,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.549214780330658,
+      "epoch": 4.164021164021164,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9453125,
+      "kl": 0.0020776954479515553,
+      "learning_rate": 1.2006560778849579e-07,
+      "loss": -0.0303,
+      "num_tokens": 15441321.0,
+      "reward": 0.4000000059604645,
+      "reward_std": 0.302165687084198,
+      "rewards/itbench_correctness/mean": 0.4000000059604645,
+      "rewards/itbench_correctness/std": 0.4898979663848877,
+      "step": 787,
+      "step_time": 132.10281661339104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 523.0,
+      "completions/mean_length": 955.3125,
+      "completions/mean_terminated_length": 474.5,
+      "completions/min_length": 426.0,
+      "completions/min_terminated_length": 426.0,
+      "entropy": 0.3663722574710846,
+      "epoch": 4.169312169312169,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0634765625,
+      "kl": 0.0013061100617051125,
+      "learning_rate": 1.1899280535880119e-07,
+      "loss": 0.0,
+      "num_tokens": 15465854.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 788,
+      "step_time": 161.56111018918455
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1007.0,
+      "completions/mean_length": 747.875,
+      "completions/mean_terminated_length": 655.8333740234375,
+      "completions/min_length": 463.0,
+      "completions/min_terminated_length": 463.0,
+      "entropy": 0.26207587122917175,
+      "epoch": 4.174603174603175,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.07373046875,
+      "kl": 0.0017630105139687657,
+      "learning_rate": 1.1792416956274443e-07,
+      "loss": 0.0001,
+      "num_tokens": 15485668.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 789,
+      "step_time": 249.84946880768985
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 826.875,
+      "completions/mean_terminated_length": 673.5555419921875,
+      "completions/min_length": 505.0,
+      "completions/min_terminated_length": 505.0,
+      "entropy": 0.5635676383972168,
+      "epoch": 4.1798941798941796,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.0018597254529595375,
+      "learning_rate": 1.1685971208675538e-07,
+      "loss": -0.0061,
+      "num_tokens": 15514546.0,
+      "reward": 0.40625,
+      "reward_std": 0.3471629321575165,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.4366062581539154,
+      "step": 790,
+      "step_time": 128.95786687266082
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 963.0,
+      "completions/max_terminated_length": 963.0,
+      "completions/mean_length": 678.5,
+      "completions/mean_terminated_length": 678.5,
+      "completions/min_length": 384.0,
+      "completions/min_terminated_length": 384.0,
+      "entropy": 0.5630066394805908,
+      "epoch": 4.185185185185185,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2265625,
+      "kl": 0.0021489164792001247,
+      "learning_rate": 1.1579944457157059e-07,
+      "loss": -0.0285,
+      "num_tokens": 15538698.0,
+      "reward": 0.34375,
+      "reward_std": 0.22903135418891907,
+      "rewards/itbench_correctness/mean": 0.34375,
+      "rewards/itbench_correctness/std": 0.4732423722743988,
+      "step": 791,
+      "step_time": 140.31763851176947
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 949.0,
+      "completions/mean_length": 693.5625,
+      "completions/mean_terminated_length": 646.357177734375,
+      "completions/min_length": 423.0,
+      "completions/min_terminated_length": 423.0,
+      "entropy": 0.48157158493995667,
+      "epoch": 4.190476190476191,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8671875,
+      "kl": 0.002031001029536128,
+      "learning_rate": 1.1474337861210543e-07,
+      "loss": 0.0427,
+      "num_tokens": 15560955.0,
+      "reward": 0.375,
+      "reward_std": 0.4355512857437134,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 792,
+      "step_time": 343.5579600026831
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 605.0,
+      "completions/max_terminated_length": 605.0,
+      "completions/mean_length": 522.625,
+      "completions/mean_terminated_length": 522.625,
+      "completions/min_length": 457.0,
+      "completions/min_terminated_length": 457.0,
+      "entropy": 0.5089691281318665,
+      "epoch": 4.195767195767195,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.00155110121704638,
+      "learning_rate": 1.1369152575732821e-07,
+      "loss": -0.0072,
+      "num_tokens": 15572493.0,
+      "reward": 0.7232142686843872,
+      "reward_std": 0.3042290210723877,
+      "rewards/itbench_correctness/mean": 0.7232142686843872,
+      "rewards/itbench_correctness/std": 0.4347764849662781,
+      "step": 793,
+      "step_time": 130.16915812157094
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 901.4375,
+      "completions/mean_terminated_length": 845.727294921875,
+      "completions/min_length": 659.0,
+      "completions/min_terminated_length": 659.0,
+      "entropy": 0.4392983317375183,
+      "epoch": 4.201058201058201,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9765625,
+      "kl": 0.0012185334926471114,
+      "learning_rate": 1.1264389751013325e-07,
+      "loss": 0.0073,
+      "num_tokens": 15592028.0,
+      "reward": 0.7326388955116272,
+      "reward_std": 0.3273431062698364,
+      "rewards/itbench_correctness/mean": 0.7326388955116272,
+      "rewards/itbench_correctness/std": 0.33346834778785706,
+      "step": 794,
+      "step_time": 521.8481605676934
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 762.0,
+      "completions/max_terminated_length": 762.0,
+      "completions/mean_length": 563.3125,
+      "completions/mean_terminated_length": 563.3125,
+      "completions/min_length": 476.0,
+      "completions/min_terminated_length": 476.0,
+      "entropy": 0.39232221245765686,
+      "epoch": 4.2063492063492065,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.921875,
+      "kl": 0.0014798998599871993,
+      "learning_rate": 1.1160050532721527e-07,
+      "loss": 0.0313,
+      "num_tokens": 15605457.0,
+      "reward": 0.8125,
+      "reward_std": 0.3657589256763458,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.35939764976501465,
+      "step": 795,
+      "step_time": 94.33376376517117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 926.625,
+      "completions/mean_terminated_length": 882.3636474609375,
+      "completions/min_length": 758.0,
+      "completions/min_terminated_length": 758.0,
+      "entropy": 0.40145689249038696,
+      "epoch": 4.211640211640212,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3671875,
+      "kl": 0.0010374147677794099,
+      "learning_rate": 1.1056136061894384e-07,
+      "loss": -0.0077,
+      "num_tokens": 15626107.0,
+      "reward": 0.9632353186607361,
+      "reward_std": 0.051545556634664536,
+      "rewards/itbench_correctness/mean": 0.9632353186607361,
+      "rewards/itbench_correctness/std": 0.08000864833593369,
+      "step": 796,
+      "step_time": 205.19225138891488
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 824.0,
+      "completions/max_terminated_length": 824.0,
+      "completions/mean_length": 572.9375,
+      "completions/mean_terminated_length": 572.9375,
+      "completions/min_length": 437.0,
+      "completions/min_terminated_length": 437.0,
+      "entropy": 0.45729246735572815,
+      "epoch": 4.216931216931217,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.375,
+      "kl": 0.0012878067791461945,
+      "learning_rate": 1.095264747492391e-07,
+      "loss": 0.0172,
+      "num_tokens": 15639266.0,
+      "reward": 0.9375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 797,
+      "step_time": 114.62835809681565
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 932.0,
+      "completions/mean_length": 729.6875,
+      "completions/mean_terminated_length": 710.0667114257812,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 512.0,
+      "entropy": 0.3563169240951538,
+      "epoch": 4.222222222222222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.171875,
+      "kl": 0.0015947025967761874,
+      "learning_rate": 1.0849585903544706e-07,
+      "loss": -0.0083,
+      "num_tokens": 15655869.0,
+      "reward": 0.4285714328289032,
+      "reward_std": 0.19342948496341705,
+      "rewards/itbench_correctness/mean": 0.4285714328289032,
+      "rewards/itbench_correctness/std": 0.27437829971313477,
+      "step": 798,
+      "step_time": 143.3468729155138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 754.0,
+      "completions/mean_length": 741.5,
+      "completions/mean_terminated_length": 572.0,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.5232636332511902,
+      "epoch": 4.227513227513228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.28125,
+      "kl": 0.002720025833696127,
+      "learning_rate": 1.0746952474821613e-07,
+      "loss": -0.0039,
+      "num_tokens": 15678173.0,
+      "reward": 0.3645833432674408,
+      "reward_std": 0.01928791031241417,
+      "rewards/itbench_correctness/mean": 0.3645833432674408,
+      "rewards/itbench_correctness/std": 0.3774610757827759,
+      "step": 799,
+      "step_time": 129.29889920540154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 540.0,
+      "completions/mean_length": 640.9375,
+      "completions/mean_terminated_length": 411.1000061035156,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.37913212180137634,
+      "epoch": 4.232804232804233,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.015625,
+      "kl": 0.0011215686099603772,
+      "learning_rate": 1.0644748311137375e-07,
+      "loss": 0.0096,
+      "num_tokens": 15702932.0,
+      "reward": 0.21875,
+      "reward_std": 0.0883883461356163,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.2561737895011902,
+      "step": 800,
+      "step_time": 164.42878744658083
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 927.0,
+      "completions/mean_length": 705.5,
+      "completions/mean_terminated_length": 684.2667236328125,
+      "completions/min_length": 534.0,
+      "completions/min_terminated_length": 534.0,
+      "entropy": 0.3798724412918091,
+      "epoch": 4.238095238095238,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.0012407073518261313,
+      "learning_rate": 1.0542974530180327e-07,
+      "loss": -0.0115,
+      "num_tokens": 15719212.0,
+      "reward": 0.905024528503418,
+      "reward_std": 0.16057346761226654,
+      "rewards/itbench_correctness/mean": 0.905024528503418,
+      "rewards/itbench_correctness/std": 0.17238253355026245,
+      "step": 801,
+      "step_time": 167.65833072923124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 873.875,
+      "completions/mean_terminated_length": 783.7999877929688,
+      "completions/min_length": 589.0,
+      "completions/min_terminated_length": 589.0,
+      "entropy": 0.5355457067489624,
+      "epoch": 4.243386243386244,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4296875,
+      "kl": 0.001486996770836413,
+      "learning_rate": 1.0441632244932235e-07,
+      "loss": 0.0187,
+      "num_tokens": 15744658.0,
+      "reward": 0.2916666865348816,
+      "reward_std": 0.18722420930862427,
+      "rewards/itbench_correctness/mean": 0.2916666865348816,
+      "rewards/itbench_correctness/std": 0.395187109708786,
+      "step": 802,
+      "step_time": 429.37837726902217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.466796875,
+      "epoch": 4.248677248677248,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.015625,
+      "kl": 0.0013433246640488505,
+      "learning_rate": 1.0340722563656107e-07,
+      "loss": 0.0001,
+      "num_tokens": 15774058.0,
+      "reward": 0.5989583730697632,
+      "reward_std": 0.2046467512845993,
+      "rewards/itbench_correctness/mean": 0.5989583730697632,
+      "rewards/itbench_correctness/std": 0.3842606544494629,
+      "step": 803,
+      "step_time": 261.3252537054941
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 897.0,
+      "completions/max_terminated_length": 897.0,
+      "completions/mean_length": 631.0625,
+      "completions/mean_terminated_length": 631.0625,
+      "completions/min_length": 538.0,
+      "completions/min_terminated_length": 538.0,
+      "entropy": 0.4817272424697876,
+      "epoch": 4.253968253968254,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.875,
+      "kl": 0.0022742159198969603,
+      "learning_rate": 1.0240246589884045e-07,
+      "loss": 0.0008,
+      "num_tokens": 15797947.0,
+      "reward": 0.40833333134651184,
+      "reward_std": 0.37586042284965515,
+      "rewards/itbench_correctness/mean": 0.40833333134651184,
+      "rewards/itbench_correctness/std": 0.392994225025177,
+      "step": 804,
+      "step_time": 447.9019009033218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 941.0,
+      "completions/max_terminated_length": 941.0,
+      "completions/mean_length": 759.8125,
+      "completions/mean_terminated_length": 759.8125,
+      "completions/min_length": 583.0,
+      "completions/min_terminated_length": 583.0,
+      "entropy": 0.5580323934555054,
+      "epoch": 4.2592592592592595,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.06005859375,
+      "kl": 0.002108614193275571,
+      "learning_rate": 1.0140205422405212e-07,
+      "loss": 0.0001,
+      "num_tokens": 15830432.0,
+      "reward": 0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.05000000074505806,
+      "rewards/itbench_correctness/std": 0.05163978040218353,
+      "step": 805,
+      "step_time": 109.95679971016943
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 724.0,
+      "completions/mean_length": 816.0,
+      "completions/mean_terminated_length": 608.0,
+      "completions/min_length": 512.0,
+      "completions/min_terminated_length": 512.0,
+      "entropy": 0.44117647409439087,
+      "epoch": 4.264550264550264,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.0013343130704015493,
+      "learning_rate": 1.0040600155253764e-07,
+      "loss": -0.0138,
+      "num_tokens": 15850416.0,
+      "reward": 0.9583333730697632,
+      "reward_std": 0.11785111576318741,
+      "rewards/itbench_correctness/mean": 0.9583333730697632,
+      "rewards/itbench_correctness/std": 0.1666666567325592,
+      "step": 806,
+      "step_time": 525.2538241520524
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 798.0,
+      "completions/max_terminated_length": 798.0,
+      "completions/mean_length": 573.125,
+      "completions/mean_terminated_length": 573.125,
+      "completions/min_length": 399.0,
+      "completions/min_terminated_length": 399.0,
+      "entropy": 0.3297709822654724,
+      "epoch": 4.26984126984127,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.125,
+      "kl": 0.0016436699079349637,
+      "learning_rate": 9.941431877696954e-08,
+      "loss": -0.0137,
+      "num_tokens": 15863410.0,
+      "reward": 0.828125,
+      "reward_std": 0.22097086906433105,
+      "rewards/itbench_correctness/mean": 0.828125,
+      "rewards/itbench_correctness/std": 0.3502231538295746,
+      "step": 807,
+      "step_time": 648.7502203145996
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 924.8125,
+      "completions/mean_terminated_length": 825.625,
+      "completions/min_length": 718.0,
+      "completions/min_terminated_length": 718.0,
+      "entropy": 0.27573156356811523,
+      "epoch": 4.275132275132275,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.0010158391669392586,
+      "learning_rate": 9.842701674223187e-08,
+      "loss": 0.0033,
+      "num_tokens": 15890943.0,
+      "reward": 0.7604166865348816,
+      "reward_std": 0.12147815525531769,
+      "rewards/itbench_correctness/mean": 0.7604166865348816,
+      "rewards/itbench_correctness/std": 0.2979482412338257,
+      "step": 808,
+      "step_time": 365.2914238469675
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 762.0,
+      "completions/max_terminated_length": 762.0,
+      "completions/mean_length": 530.375,
+      "completions/mean_terminated_length": 530.375,
+      "completions/min_length": 368.0,
+      "completions/min_terminated_length": 368.0,
+      "entropy": 0.41480085253715515,
+      "epoch": 4.28042328042328,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0703125,
+      "kl": 0.0014939939137548208,
+      "learning_rate": 9.744410624530147e-08,
+      "loss": 0.009,
+      "num_tokens": 15902429.0,
+      "reward": 0.6875,
+      "reward_std": 0.03857584670186043,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.28463754057884216,
+      "step": 809,
+      "step_time": 69.47060746885836
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 611.0,
+      "completions/mean_length": 745.8125,
+      "completions/mean_terminated_length": 467.625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4344255328178406,
+      "epoch": 4.285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5390625,
+      "kl": 0.0032775227446109056,
+      "learning_rate": 9.646559803512993e-08,
+      "loss": -0.0804,
+      "num_tokens": 15922018.0,
+      "reward": 0.6015625,
+      "reward_std": 0.24306795001029968,
+      "rewards/itbench_correctness/mean": 0.6015625,
+      "rewards/itbench_correctness/std": 0.3824775218963623,
+      "step": 810,
+      "step_time": 242.85561118088663
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 983.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 685.3125,
+      "completions/mean_terminated_length": 685.3125,
+      "completions/min_length": 386.0,
+      "completions/min_terminated_length": 386.0,
+      "entropy": 0.3764705955982208,
+      "epoch": 4.291005291005291,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.359375,
+      "kl": 0.001442611450329423,
+      "learning_rate": 9.549150281252632e-08,
+      "loss": -0.0122,
+      "num_tokens": 15938535.0,
+      "reward": 0.8571428656578064,
+      "reward_std": 0.11921755969524384,
+      "rewards/itbench_correctness/mean": 0.8571428656578064,
+      "rewards/itbench_correctness/std": 0.21977105736732483,
+      "step": 811,
+      "step_time": 366.22836083732545
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1017.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 713.875,
+      "completions/mean_terminated_length": 713.875,
+      "completions/min_length": 444.0,
+      "completions/min_terminated_length": 444.0,
+      "entropy": 0.36981263756752014,
+      "epoch": 4.296296296296296,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.890625,
+      "kl": 0.0017484528943896294,
+      "learning_rate": 9.452183123003999e-08,
+      "loss": 0.0069,
+      "num_tokens": 15954229.0,
+      "reward": 0.6102941036224365,
+      "reward_std": 0.3698710799217224,
+      "rewards/itbench_correctness/mean": 0.6102941036224365,
+      "rewards/itbench_correctness/std": 0.455075204372406,
+      "step": 812,
+      "step_time": 313.4723123824224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1013.0,
+      "completions/mean_length": 814.5,
+      "completions/mean_terminated_length": 766.1538696289062,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.586863100528717,
+      "epoch": 4.301587301587301,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.09375,
+      "kl": 0.0016130534932017326,
+      "learning_rate": 9.355659389184394e-08,
+      "loss": -0.0401,
+      "num_tokens": 15987341.0,
+      "reward": 0.22499999403953552,
+      "reward_std": 0.28192007541656494,
+      "rewards/itbench_correctness/mean": 0.22499999403953552,
+      "rewards/itbench_correctness/std": 0.3872983455657959,
+      "step": 813,
+      "step_time": 213.50641488097608
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 424.0,
+      "completions/mean_length": 672.9375,
+      "completions/mean_terminated_length": 321.875,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "entropy": 0.3982539176940918,
+      "epoch": 4.306878306878307,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.828125,
+      "kl": 0.0019140745280310512,
+      "learning_rate": 9.259580135361927e-08,
+      "loss": 0.0014,
+      "num_tokens": 16004804.0,
+      "reward": 0.53125,
+      "reward_std": 0.23289713263511658,
+      "rewards/itbench_correctness/mean": 0.53125,
+      "rewards/itbench_correctness/std": 0.3326033651828766,
+      "step": 814,
+      "step_time": 924.7196069033816
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 823.25,
+      "completions/mean_terminated_length": 667.1111450195312,
+      "completions/min_length": 427.0,
+      "completions/min_terminated_length": 427.0,
+      "entropy": 0.590343177318573,
+      "epoch": 4.3121693121693125,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.484375,
+      "kl": 0.0014230167726054788,
+      "learning_rate": 9.163946412243895e-08,
+      "loss": -0.0126,
+      "num_tokens": 16059928.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 815,
+      "step_time": 286.4796100119129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 703.25,
+      "completions/mean_terminated_length": 681.86669921875,
+      "completions/min_length": 380.0,
+      "completions/min_terminated_length": 380.0,
+      "entropy": 0.27444010972976685,
+      "epoch": 4.317460317460317,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.06396484375,
+      "kl": 0.0018459860002622008,
+      "learning_rate": 9.068759265665382e-08,
+      "loss": 0.0001,
+      "num_tokens": 16078292.0,
+      "reward": 0.25,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 816,
+      "step_time": 321.2267580414191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 792.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 578.3125,
+      "completions/mean_terminated_length": 578.3125,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.4253755509853363,
+      "epoch": 4.322751322751323,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0595703125,
+      "kl": 0.001751541974954307,
+      "learning_rate": 8.974019736577775e-08,
+      "loss": 0.0,
+      "num_tokens": 16090817.0,
+      "reward": 0.8333333730697632,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.8333333730697632,
+      "rewards/itbench_correctness/std": 0.17213258147239685,
+      "step": 817,
+      "step_time": 207.34377425536513
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1014.0,
+      "completions/mean_length": 811.9375,
+      "completions/mean_terminated_length": 684.7000122070312,
+      "completions/min_length": 476.0,
+      "completions/min_terminated_length": 476.0,
+      "entropy": 0.330074667930603,
+      "epoch": 4.328042328042328,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6640625,
+      "kl": 0.00094667385565117,
+      "learning_rate": 8.879728861037383e-08,
+      "loss": 0.0025,
+      "num_tokens": 16109496.0,
+      "reward": 0.8690475821495056,
+      "reward_std": 0.13734711706638336,
+      "rewards/itbench_correctness/mean": 0.8690475821495056,
+      "rewards/itbench_correctness/std": 0.15356682240962982,
+      "step": 818,
+      "step_time": 157.14436247292906
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1023.0,
+      "completions/mean_length": 744.25,
+      "completions/mean_terminated_length": 651.0,
+      "completions/min_length": 477.0,
+      "completions/min_terminated_length": 477.0,
+      "entropy": 0.3815922141075134,
+      "epoch": 4.333333333333333,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.734375,
+      "kl": 0.0016716917743906379,
+      "learning_rate": 8.785887670194136e-08,
+      "loss": -0.0363,
+      "num_tokens": 16129908.0,
+      "reward": 0.7552083730697632,
+      "reward_std": 0.12075783312320709,
+      "rewards/itbench_correctness/mean": 0.7552083730697632,
+      "rewards/itbench_correctness/std": 0.16796371340751648,
+      "step": 819,
+      "step_time": 186.5824106996879
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 861.0,
+      "completions/mean_length": 825.875,
+      "completions/mean_terminated_length": 671.7777709960938,
+      "completions/min_length": 499.0,
+      "completions/min_terminated_length": 499.0,
+      "entropy": 0.5860450863838196,
+      "epoch": 4.338624338624339,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.125,
+      "kl": 0.0019729058258235455,
+      "learning_rate": 8.692497190280224e-08,
+      "loss": -0.0117,
+      "num_tokens": 16162090.0,
+      "reward": 0.171875,
+      "reward_std": 0.3820367455482483,
+      "rewards/itbench_correctness/mean": 0.171875,
+      "rewards/itbench_correctness/std": 0.37325987219810486,
+      "step": 820,
+      "step_time": 353.5038594137877
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 996.0,
+      "completions/mean_length": 909.6875,
+      "completions/mean_terminated_length": 795.375,
+      "completions/min_length": 625.0,
+      "completions/min_terminated_length": 625.0,
+      "entropy": 0.6024046540260315,
+      "epoch": 4.343915343915344,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.28125,
+      "kl": 0.001822230638936162,
+      "learning_rate": 8.599558442598998e-08,
+      "loss": 0.035,
+      "num_tokens": 16191157.0,
+      "reward": 0.8125,
+      "reward_std": 0.3458075523376465,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.3403429687023163,
+      "step": 821,
+      "step_time": 505.8641969123855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 515.0,
+      "completions/max_terminated_length": 515.0,
+      "completions/mean_length": 400.875,
+      "completions/mean_terminated_length": 400.875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.46897411346435547,
+      "epoch": 4.349206349206349,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.515625,
+      "kl": 0.002614083932712674,
+      "learning_rate": 8.507072443513702e-08,
+      "loss": -0.0603,
+      "num_tokens": 16206291.0,
+      "reward": 0.171875,
+      "reward_std": 0.07281029224395752,
+      "rewards/itbench_correctness/mean": 0.171875,
+      "rewards/itbench_correctness/std": 0.20348526537418365,
+      "step": 822,
+      "step_time": 195.5613472936675
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 874.0,
+      "completions/mean_length": 786.8125,
+      "completions/mean_terminated_length": 752.9285888671875,
+      "completions/min_length": 514.0,
+      "completions/min_terminated_length": 514.0,
+      "entropy": 0.4295813739299774,
+      "epoch": 4.354497354497354,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.0021104791667312384,
+      "learning_rate": 8.415040204436425e-08,
+      "loss": 0.0155,
+      "num_tokens": 16229032.0,
+      "reward": 0.8999999761581421,
+      "reward_std": 0.09258200973272324,
+      "rewards/itbench_correctness/mean": 0.8999999761581421,
+      "rewards/itbench_correctness/std": 0.1632993221282959,
+      "step": 823,
+      "step_time": 379.2500517424196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 730.0,
+      "completions/mean_length": 892.75,
+      "completions/mean_terminated_length": 499.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5667880177497864,
+      "epoch": 4.35978835978836,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.109375,
+      "kl": 0.0018997631268575788,
+      "learning_rate": 8.32346273181696e-08,
+      "loss": -0.0691,
+      "num_tokens": 16254500.0,
+      "reward": 0.125,
+      "reward_std": 0.1674824357032776,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.17320507764816284,
+      "step": 824,
+      "step_time": 547.5418346459046
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 597.0,
+      "completions/mean_length": 736.0625,
+      "completions/mean_terminated_length": 512.1111450195312,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.41572555899620056,
+      "epoch": 4.365079365079365,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.040283203125,
+      "kl": 0.0017400278011336923,
+      "learning_rate": 8.232341027131883e-08,
+      "loss": 0.0001,
+      "num_tokens": 16277821.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 825,
+      "step_time": 1151.8339996775612
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 762.0,
+      "completions/max_terminated_length": 762.0,
+      "completions/mean_length": 460.5,
+      "completions/mean_terminated_length": 460.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.41910967230796814,
+      "epoch": 4.37037037037037,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.234375,
+      "kl": 0.0028239088132977486,
+      "learning_rate": 8.141676086873573e-08,
+      "loss": -0.0546,
+      "num_tokens": 16288837.0,
+      "reward": 0.375,
+      "reward_std": 0.19918900728225708,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.20412415266036987,
+      "step": 826,
+      "step_time": 428.2450467739254
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 684.0,
+      "completions/mean_length": 784.5,
+      "completions/mean_terminated_length": 598.2222290039062,
+      "completions/min_length": 479.0,
+      "completions/min_terminated_length": 479.0,
+      "entropy": 0.4869343638420105,
+      "epoch": 4.375661375661376,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.515625,
+      "kl": 0.001279774820432067,
+      "learning_rate": 8.051468902539271e-08,
+      "loss": 0.0081,
+      "num_tokens": 16312141.0,
+      "reward": 0.75,
+      "reward_std": 0.2182178944349289,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.394405335187912,
+      "step": 827,
+      "step_time": 134.5602146498859
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 960.0,
+      "completions/mean_length": 912.3125,
+      "completions/mean_terminated_length": 825.4444580078125,
+      "completions/min_length": 675.0,
+      "completions/min_terminated_length": 675.0,
+      "entropy": 0.5195587873458862,
+      "epoch": 4.380952380952381,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.21875,
+      "kl": 0.002217257162556052,
+      "learning_rate": 7.961720460620319e-08,
+      "loss": 0.0265,
+      "num_tokens": 16336338.0,
+      "reward": 0.5885416269302368,
+      "reward_std": 0.38563019037246704,
+      "rewards/itbench_correctness/mean": 0.5885416269302368,
+      "rewards/itbench_correctness/std": 0.4767450988292694,
+      "step": 828,
+      "step_time": 96.03808457683772
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 571.0,
+      "completions/max_terminated_length": 571.0,
+      "completions/mean_length": 416.9375,
+      "completions/mean_terminated_length": 416.9375,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 0.3237895369529724,
+      "epoch": 4.386243386243386,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.109375,
+      "kl": 0.001336383749730885,
+      "learning_rate": 7.872431742591267e-08,
+      "loss": -0.009,
+      "num_tokens": 16345745.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 829,
+      "step_time": 79.61548331100494
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 966.0,
+      "completions/mean_length": 697.0625,
+      "completions/mean_terminated_length": 588.0833740234375,
+      "completions/min_length": 341.0,
+      "completions/min_terminated_length": 341.0,
+      "entropy": 0.39594727754592896,
+      "epoch": 4.391534391534392,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2890625,
+      "kl": 0.00119964184705168,
+      "learning_rate": 7.783603724899257e-08,
+      "loss": 0.0261,
+      "num_tokens": 16370170.0,
+      "reward": 0.25,
+      "reward_std": 0.26726123690605164,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.44721361994743347,
+      "step": 830,
+      "step_time": 94.27793037891388
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 753.0625,
+      "completions/mean_terminated_length": 735.0000610351562,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "entropy": 0.33463358879089355,
+      "epoch": 4.396825396825397,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.25,
+      "kl": 0.0012139711761847138,
+      "learning_rate": 7.695237378953224e-08,
+      "loss": -0.0273,
+      "num_tokens": 16387731.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 831,
+      "step_time": 357.8091874551028
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 993.0,
+      "completions/mean_length": 880.25,
+      "completions/mean_terminated_length": 794.0,
+      "completions/min_length": 596.0,
+      "completions/min_terminated_length": 596.0,
+      "entropy": 0.48168134689331055,
+      "epoch": 4.402116402116402,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.0625,
+      "kl": 0.0016746832989156246,
+      "learning_rate": 7.607333671113408e-08,
+      "loss": 0.0418,
+      "num_tokens": 16408031.0,
+      "reward": 0.375,
+      "reward_std": 0.20927216112613678,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.4249182939529419,
+      "step": 832,
+      "step_time": 128.6728245029226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 810.9375,
+      "completions/mean_terminated_length": 780.5000610351562,
+      "completions/min_length": 484.0,
+      "completions/min_terminated_length": 484.0,
+      "entropy": 0.3551445007324219,
+      "epoch": 4.407407407407407,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0556640625,
+      "kl": 0.0015545395435765386,
+      "learning_rate": 7.519893562680663e-08,
+      "loss": 0.0,
+      "num_tokens": 16426366.0,
+      "reward": 0.0833333358168602,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0833333358168602,
+      "rewards/itbench_correctness/std": 0.08606629818677902,
+      "step": 833,
+      "step_time": 679.0917965397239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 555.0,
+      "completions/max_terminated_length": 555.0,
+      "completions/mean_length": 399.3125,
+      "completions/mean_terminated_length": 399.3125,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "entropy": 0.3430896997451782,
+      "epoch": 4.412698412698413,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0546875,
+      "kl": 0.0014030004385858774,
+      "learning_rate": 7.432918009885996e-08,
+      "loss": 0.0148,
+      "num_tokens": 16436691.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 834,
+      "step_time": 1008.2919538905844
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 792.0,
+      "completions/mean_length": 666.6875,
+      "completions/mean_terminated_length": 642.86669921875,
+      "completions/min_length": 567.0,
+      "completions/min_terminated_length": 567.0,
+      "entropy": 0.4679853618144989,
+      "epoch": 4.417989417989418,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8203125,
+      "kl": 0.001401036512106657,
+      "learning_rate": 7.346407963880136e-08,
+      "loss": 0.0403,
+      "num_tokens": 16456902.0,
+      "reward": 0.5125000476837158,
+      "reward_std": 0.1552647352218628,
+      "rewards/itbench_correctness/mean": 0.5125000476837158,
+      "rewards/itbench_correctness/std": 0.38100746273994446,
+      "step": 835,
+      "step_time": 458.4377150340006
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 970.0,
+      "completions/mean_length": 636.4375,
+      "completions/mean_terminated_length": 507.25,
+      "completions/min_length": 261.0,
+      "completions/min_terminated_length": 261.0,
+      "entropy": 0.5530786514282227,
+      "epoch": 4.423280423280423,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6328125,
+      "kl": 0.0014363399241119623,
+      "learning_rate": 7.260364370723043e-08,
+      "loss": -0.003,
+      "num_tokens": 16482653.0,
+      "reward": 0.3046875,
+      "reward_std": 0.19887377321720123,
+      "rewards/itbench_correctness/mean": 0.3046875,
+      "rewards/itbench_correctness/std": 0.3060798943042755,
+      "step": 836,
+      "step_time": 170.9447146616876
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 696.0,
+      "completions/mean_length": 629.3125,
+      "completions/mean_terminated_length": 572.9285888671875,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.45764225721359253,
+      "epoch": 4.428571428571429,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7265625,
+      "kl": 0.0012081885943189263,
+      "learning_rate": 7.17478817137373e-08,
+      "loss": 0.0681,
+      "num_tokens": 16500194.0,
+      "reward": 0.84375,
+      "reward_std": 0.2893187999725342,
+      "rewards/itbench_correctness/mean": 0.84375,
+      "rewards/itbench_correctness/std": 0.286865234375,
+      "step": 837,
+      "step_time": 777.2624794654548
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 690.0,
+      "completions/max_terminated_length": 690.0,
+      "completions/mean_length": 558.375,
+      "completions/mean_terminated_length": 558.375,
+      "completions/min_length": 450.0,
+      "completions/min_terminated_length": 450.0,
+      "entropy": 0.40116408467292786,
+      "epoch": 4.4338624338624335,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.0,
+      "kl": 0.002113268943503499,
+      "learning_rate": 7.089680301679751e-08,
+      "loss": -0.0104,
+      "num_tokens": 16512536.0,
+      "reward": 0.90625,
+      "reward_std": 0.2346404492855072,
+      "rewards/itbench_correctness/mean": 0.90625,
+      "rewards/itbench_correctness/std": 0.2561737895011902,
+      "step": 838,
+      "step_time": 86.38017075136304
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1004.0,
+      "completions/mean_length": 955.75,
+      "completions/mean_terminated_length": 660.0,
+      "completions/min_length": 480.0,
+      "completions/min_terminated_length": 480.0,
+      "entropy": 0.31388962268829346,
+      "epoch": 4.439153439153439,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.037109375,
+      "kl": 0.001120888045988977,
+      "learning_rate": 7.005041692367153e-08,
+      "loss": 0.0,
+      "num_tokens": 16542196.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 839,
+      "step_time": 386.5617507044226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 813.0,
+      "completions/max_terminated_length": 813.0,
+      "completions/mean_length": 622.1875,
+      "completions/mean_terminated_length": 622.1875,
+      "completions/min_length": 481.0,
+      "completions/min_terminated_length": 481.0,
+      "entropy": 0.40020090341567993,
+      "epoch": 4.444444444444445,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1875,
+      "kl": 0.0012403958244249225,
+      "learning_rate": 6.92087326903022e-08,
+      "loss": 0.0021,
+      "num_tokens": 16556511.0,
+      "reward": 0.4296875,
+      "reward_std": 0.17499202489852905,
+      "rewards/itbench_correctness/mean": 0.4296875,
+      "rewards/itbench_correctness/std": 0.5040848851203918,
+      "step": 840,
+      "step_time": 135.079236516729
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 881.0,
+      "completions/mean_length": 765.9375,
+      "completions/mean_terminated_length": 611.1000366210938,
+      "completions/min_length": 459.0,
+      "completions/min_terminated_length": 459.0,
+      "entropy": 0.5144022703170776,
+      "epoch": 4.449735449735449,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.765625,
+      "kl": 0.0017959108809009194,
+      "learning_rate": 6.837175952121304e-08,
+      "loss": 0.0412,
+      "num_tokens": 16577070.0,
+      "reward": 0.2395833432674408,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.2395833432674408,
+      "rewards/itbench_correctness/std": 0.19214914739131927,
+      "step": 841,
+      "step_time": 980.2466245274991
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 989.0,
+      "completions/mean_length": 794.0,
+      "completions/mean_terminated_length": 656.0,
+      "completions/min_length": 534.0,
+      "completions/min_terminated_length": 534.0,
+      "entropy": 0.6423173546791077,
+      "epoch": 4.455026455026455,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9453125,
+      "kl": 0.001680102082900703,
+      "learning_rate": 6.753950656940905e-08,
+      "loss": 0.0577,
+      "num_tokens": 16599126.0,
+      "reward": 0.25,
+      "reward_std": 0.1462520956993103,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.24152295291423798,
+      "step": 842,
+      "step_time": 89.48983163572848
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 943.0,
+      "completions/mean_length": 873.75,
+      "completions/mean_terminated_length": 756.888916015625,
+      "completions/min_length": 690.0,
+      "completions/min_terminated_length": 690.0,
+      "entropy": 0.4738197326660156,
+      "epoch": 4.4603174603174605,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8671875,
+      "kl": 0.0010829598177224398,
+      "learning_rate": 6.671198293627479e-08,
+      "loss": 0.0027,
+      "num_tokens": 16620106.0,
+      "reward": 0.4750000238418579,
+      "reward_std": 0.19992218911647797,
+      "rewards/itbench_correctness/mean": 0.4750000238418579,
+      "rewards/itbench_correctness/std": 0.4358898997306824,
+      "step": 843,
+      "step_time": 68.92372180242091
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 900.0,
+      "completions/max_terminated_length": 900.0,
+      "completions/mean_length": 608.5,
+      "completions/mean_terminated_length": 608.5,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4272801876068115,
+      "epoch": 4.465608465608465,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.55078125,
+      "kl": 0.0015739505179226398,
+      "learning_rate": 6.588919767147638e-08,
+      "loss": -0.0799,
+      "num_tokens": 16633634.0,
+      "reward": 0.8125,
+      "reward_std": 0.1298656165599823,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.2626432776451111,
+      "step": 844,
+      "step_time": 799.5955674275756
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 987.0,
+      "completions/mean_length": 709.375,
+      "completions/mean_terminated_length": 636.7692260742188,
+      "completions/min_length": 472.0,
+      "completions/min_terminated_length": 472.0,
+      "entropy": 0.38625550270080566,
+      "epoch": 4.470899470899471,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0078125,
+      "kl": 0.0017984964651986957,
+      "learning_rate": 6.507115977286143e-08,
+      "loss": 0.0032,
+      "num_tokens": 16650712.0,
+      "reward": 0.421875,
+      "reward_std": 0.09300297498703003,
+      "rewards/itbench_correctness/mean": 0.421875,
+      "rewards/itbench_correctness/std": 0.4538607597351074,
+      "step": 845,
+      "step_time": 264.149626750499
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 687.0,
+      "completions/max_terminated_length": 687.0,
+      "completions/mean_length": 482.4375,
+      "completions/mean_terminated_length": 482.4375,
+      "completions/min_length": 312.0,
+      "completions/min_terminated_length": 312.0,
+      "entropy": 0.4415079653263092,
+      "epoch": 4.476190476190476,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.390625,
+      "kl": 0.001339221140369773,
+      "learning_rate": 6.42578781863613e-08,
+      "loss": 0.0102,
+      "num_tokens": 16661415.0,
+      "reward": 0.484375,
+      "reward_std": 0.16849708557128906,
+      "rewards/itbench_correctness/mean": 0.484375,
+      "rewards/itbench_correctness/std": 0.17001838982105255,
+      "step": 846,
+      "step_time": 347.23770444560796
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 730.5625,
+      "completions/mean_terminated_length": 554.5,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.533835232257843,
+      "epoch": 4.481481481481482,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5390625,
+      "kl": 0.0018466972978785634,
+      "learning_rate": 6.34493618058935e-08,
+      "loss": -0.0754,
+      "num_tokens": 16700320.0,
+      "reward": 0.03125,
+      "reward_std": 0.043129101395606995,
+      "rewards/itbench_correctness/mean": 0.03125,
+      "rewards/itbench_correctness/std": 0.06718548387289047,
+      "step": 847,
+      "step_time": 213.22028856538236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 576.0,
+      "completions/max_terminated_length": 576.0,
+      "completions/mean_length": 455.5,
+      "completions/mean_terminated_length": 455.5,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.40175631642341614,
+      "epoch": 4.4867724867724865,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4375,
+      "kl": 0.0014747100649401546,
+      "learning_rate": 6.26456194732633e-08,
+      "loss": -0.0161,
+      "num_tokens": 16710784.0,
+      "reward": 0.4895833432674408,
+      "reward_std": 0.21374498307704926,
+      "rewards/itbench_correctness/mean": 0.4895833432674408,
+      "rewards/itbench_correctness/std": 0.20983901619911194,
+      "step": 848,
+      "step_time": 55.42429989017546
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 619.0,
+      "completions/max_terminated_length": 619.0,
+      "completions/mean_length": 442.25,
+      "completions/mean_terminated_length": 442.25,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "entropy": 0.35048049688339233,
+      "epoch": 4.492063492063492,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.015625,
+      "kl": 0.003505163826048374,
+      "learning_rate": 6.184665997806831e-08,
+      "loss": 0.0185,
+      "num_tokens": 16721036.0,
+      "reward": 0.4943181872367859,
+      "reward_std": 0.016070598736405373,
+      "rewards/itbench_correctness/mean": 0.4943181872367859,
+      "rewards/itbench_correctness/std": 0.5110015273094177,
+      "step": 849,
+      "step_time": 1021.3690116815269
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 918.0,
+      "completions/mean_length": 910.4375,
+      "completions/mean_terminated_length": 660.6000366210938,
+      "completions/min_length": 518.0,
+      "completions/min_terminated_length": 518.0,
+      "entropy": 0.4964646100997925,
+      "epoch": 4.497354497354498,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 8.875,
+      "kl": 0.0017396226758137345,
+      "learning_rate": 6.105249205760127e-08,
+      "loss": 0.0387,
+      "num_tokens": 16749483.0,
+      "reward": 0.4375,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.48733973503112793,
+      "step": 850,
+      "step_time": 576.2763332147151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 930.0,
+      "completions/max_terminated_length": 930.0,
+      "completions/mean_length": 593.75,
+      "completions/mean_terminated_length": 593.75,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3183158040046692,
+      "epoch": 4.502645502645502,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0289306640625,
+      "kl": 0.001586488215252757,
+      "learning_rate": 6.026312439675551e-08,
+      "loss": 0.0,
+      "num_tokens": 16763647.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 851,
+      "step_time": 732.8523011729121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 860.0,
+      "completions/mean_length": 584.375,
+      "completions/mean_terminated_length": 555.0667114257812,
+      "completions/min_length": 401.0,
+      "completions/min_terminated_length": 401.0,
+      "entropy": 0.5373262166976929,
+      "epoch": 4.507936507936508,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1953125,
+      "kl": 0.0018168711103498936,
+      "learning_rate": 5.9478565627929244e-08,
+      "loss": 0.0319,
+      "num_tokens": 16794397.0,
+      "reward": 0.0729166716337204,
+      "reward_std": 0.0294627845287323,
+      "rewards/itbench_correctness/mean": 0.0729166716337204,
+      "rewards/itbench_correctness/std": 0.08539126068353653,
+      "step": 852,
+      "step_time": 84.26783776376396
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 949.0,
+      "completions/mean_length": 935.625,
+      "completions/mean_terminated_length": 741.2000122070312,
+      "completions/min_length": 626.0,
+      "completions/min_terminated_length": 626.0,
+      "entropy": 0.5600534677505493,
+      "epoch": 4.5132275132275135,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.034912109375,
+      "kl": 0.0015273833414539695,
+      "learning_rate": 5.869882433093154e-08,
+      "loss": 0.0001,
+      "num_tokens": 16832975.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 853,
+      "step_time": 153.46809213608503
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 668.0,
+      "completions/mean_length": 617.25,
+      "completions/mean_terminated_length": 481.66668701171875,
+      "completions/min_length": 400.0,
+      "completions/min_terminated_length": 400.0,
+      "entropy": 0.37748077511787415,
+      "epoch": 4.518518518518518,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 2.90625,
+      "kl": 0.0018203725339844823,
+      "learning_rate": 5.7923909032888295e-08,
+      "loss": -0.0292,
+      "num_tokens": 16855027.0,
+      "reward": 0.2395833432674408,
+      "reward_std": 0.13684004545211792,
+      "rewards/itbench_correctness/mean": 0.2395833432674408,
+      "rewards/itbench_correctness/std": 0.31012991070747375,
+      "step": 854,
+      "step_time": 189.47853012941778
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 566.0,
+      "completions/max_terminated_length": 566.0,
+      "completions/mean_length": 370.75,
+      "completions/mean_terminated_length": 370.75,
+      "completions/min_length": 297.0,
+      "completions/min_terminated_length": 297.0,
+      "entropy": 0.34794336557388306,
+      "epoch": 4.523809523809524,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.546875,
+      "kl": 0.002147156512364745,
+      "learning_rate": 5.7153828208148846e-08,
+      "loss": -0.0062,
+      "num_tokens": 16863943.0,
+      "reward": 0.53125,
+      "reward_std": 0.405046284198761,
+      "rewards/itbench_correctness/mean": 0.53125,
+      "rewards/itbench_correctness/std": 0.4905354380607605,
+      "step": 855,
+      "step_time": 1156.329422229901
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 950.0,
+      "completions/mean_length": 743.25,
+      "completions/mean_terminated_length": 615.6363525390625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.44399595260620117,
+      "epoch": 4.529100529100529,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.75,
+      "kl": 0.0015257069608196616,
+      "learning_rate": 5.638859027819409e-08,
+      "loss": -0.0599,
+      "num_tokens": 16885899.0,
+      "reward": 0.3812499940395355,
+      "reward_std": 0.3087776303291321,
+      "rewards/itbench_correctness/mean": 0.3812499940395355,
+      "rewards/itbench_correctness/std": 0.46219584345817566,
+      "step": 856,
+      "step_time": 461.6809099484235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 819.0,
+      "completions/max_terminated_length": 819.0,
+      "completions/mean_length": 481.375,
+      "completions/mean_terminated_length": 481.375,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "entropy": 0.4964944124221802,
+      "epoch": 4.534391534391535,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.203125,
+      "kl": 0.0020852303132414818,
+      "learning_rate": 5.562820361154313e-08,
+      "loss": -0.0059,
+      "num_tokens": 16896417.0,
+      "reward": 0.359375,
+      "reward_std": 0.031000997871160507,
+      "rewards/itbench_correctness/mean": 0.359375,
+      "rewards/itbench_correctness/std": 0.3735698163509369,
+      "step": 857,
+      "step_time": 93.96854640357196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 901.0,
+      "completions/mean_length": 994.9375,
+      "completions/mean_terminated_length": 869.0,
+      "completions/min_length": 831.0,
+      "completions/min_terminated_length": 831.0,
+      "entropy": 0.4140963554382324,
+      "epoch": 4.5396825396825395,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4296875,
+      "kl": 0.001920070848427713,
+      "learning_rate": 5.48726765236629e-08,
+      "loss": 0.0154,
+      "num_tokens": 16921448.0,
+      "reward": 0.2678571343421936,
+      "reward_std": 0.04959750175476074,
+      "rewards/itbench_correctness/mean": 0.2678571343421936,
+      "rewards/itbench_correctness/std": 0.12975645065307617,
+      "step": 858,
+      "step_time": 6701.7141972742975
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 825.0,
+      "completions/mean_length": 783.1875,
+      "completions/mean_terminated_length": 595.888916015625,
+      "completions/min_length": 378.0,
+      "completions/min_terminated_length": 378.0,
+      "entropy": 0.43923071026802063,
+      "epoch": 4.544973544973545,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.140625,
+      "kl": 0.001265035243704915,
+      "learning_rate": 5.412201727687643e-08,
+      "loss": -0.0124,
+      "num_tokens": 16938763.0,
+      "reward": 0.8556547164916992,
+      "reward_std": 0.053405825048685074,
+      "rewards/itbench_correctness/mean": 0.8556547164916992,
+      "rewards/itbench_correctness/std": 0.07298243790864944,
+      "step": 859,
+      "step_time": 1027.743779040873
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 619.0,
+      "completions/mean_length": 712.5625,
+      "completions/mean_terminated_length": 401.125,
+      "completions/min_length": 338.0,
+      "completions/min_terminated_length": 338.0,
+      "entropy": 0.5192527174949646,
+      "epoch": 4.550264550264551,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.84375,
+      "kl": 0.0014884049305692315,
+      "learning_rate": 5.337623408027292e-08,
+      "loss": -0.0094,
+      "num_tokens": 16957572.0,
+      "reward": 0.6171875,
+      "reward_std": 0.37874263525009155,
+      "rewards/itbench_correctness/mean": 0.6171875,
+      "rewards/itbench_correctness/std": 0.3991364538669586,
+      "step": 860,
+      "step_time": 262.18378533329815
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 984.5625,
+      "completions/mean_terminated_length": 708.5,
+      "completions/min_length": 527.0,
+      "completions/min_terminated_length": 527.0,
+      "entropy": 0.5484669804573059,
+      "epoch": 4.555555555555555,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.25,
+      "kl": 0.0018317234935238957,
+      "learning_rate": 5.263533508961826e-08,
+      "loss": 0.0379,
+      "num_tokens": 16981197.0,
+      "reward": 0.265625,
+      "reward_std": 0.39435434341430664,
+      "rewards/itbench_correctness/mean": 0.265625,
+      "rewards/itbench_correctness/std": 0.4096280336380005,
+      "step": 861,
+      "step_time": 77.7364522125572
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 764.0,
+      "completions/max_terminated_length": 764.0,
+      "completions/mean_length": 457.0625,
+      "completions/mean_terminated_length": 457.0625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.345685750246048,
+      "epoch": 4.560846560846561,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.2109375,
+      "kl": 0.002134555485099554,
+      "learning_rate": 5.1899328407264855e-08,
+      "loss": -0.087,
+      "num_tokens": 16993086.0,
+      "reward": 0.3812499940395355,
+      "reward_std": 0.318040132522583,
+      "rewards/itbench_correctness/mean": 0.3812499940395355,
+      "rewards/itbench_correctness/std": 0.4069705307483673,
+      "step": 862,
+      "step_time": 827.9984636185691
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 666.0,
+      "completions/max_terminated_length": 666.0,
+      "completions/mean_length": 547.8125,
+      "completions/mean_terminated_length": 547.8125,
+      "completions/min_length": 458.0,
+      "completions/min_terminated_length": 458.0,
+      "entropy": 0.46731317043304443,
+      "epoch": 4.5661375661375665,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.234375,
+      "kl": 0.0013622003607451916,
+      "learning_rate": 5.116822208206395e-08,
+      "loss": -0.0166,
+      "num_tokens": 17005059.0,
+      "reward": 0.5588235259056091,
+      "reward_std": 0.16637806594371796,
+      "rewards/itbench_correctness/mean": 0.5588235259056091,
+      "rewards/itbench_correctness/std": 0.5092002749443054,
+      "step": 863,
+      "step_time": 190.58081929571927
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 903.0,
+      "completions/mean_length": 952.5,
+      "completions/mean_terminated_length": 452.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.2813648283481598,
+      "epoch": 4.571428571428571,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.421875,
+      "kl": 0.001091470243409276,
+      "learning_rate": 5.044202410927706e-08,
+      "loss": -0.0038,
+      "num_tokens": 17038027.0,
+      "reward": 0.048076923936605453,
+      "reward_std": 0.13598206639289856,
+      "rewards/itbench_correctness/mean": 0.048076923936605453,
+      "rewards/itbench_correctness/std": 0.192307710647583,
+      "step": 864,
+      "step_time": 116.58771913684905
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 703.0,
+      "completions/max_terminated_length": 703.0,
+      "completions/mean_length": 526.375,
+      "completions/mean_terminated_length": 526.375,
+      "completions/min_length": 404.0,
+      "completions/min_terminated_length": 404.0,
+      "entropy": 0.391355961561203,
+      "epoch": 4.576719576719577,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.03125,
+      "kl": 0.001298400922678411,
+      "learning_rate": 4.972074243048896e-08,
+      "loss": -0.0148,
+      "num_tokens": 17049241.0,
+      "reward": 0.71875,
+      "reward_std": 0.18196186423301697,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.3823356628417969,
+      "step": 865,
+      "step_time": 239.0914654675871
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 688.0,
+      "completions/max_terminated_length": 688.0,
+      "completions/mean_length": 552.75,
+      "completions/mean_terminated_length": 552.75,
+      "completions/min_length": 465.0,
+      "completions/min_terminated_length": 465.0,
+      "entropy": 0.510176420211792,
+      "epoch": 4.582010582010582,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3125,
+      "kl": 0.0015060057630762458,
+      "learning_rate": 4.9004384933520547e-08,
+      "loss": -0.0102,
+      "num_tokens": 17061309.0,
+      "reward": 0.8125,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 866,
+      "step_time": 120.828508451581
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 807.0,
+      "completions/mean_length": 870.375,
+      "completions/mean_terminated_length": 750.888916015625,
+      "completions/min_length": 648.0,
+      "completions/min_terminated_length": 648.0,
+      "entropy": 0.25850924849510193,
+      "epoch": 4.587301587301587,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.75,
+      "kl": 0.001840650918893516,
+      "learning_rate": 4.829295945234257e-08,
+      "loss": 0.01,
+      "num_tokens": 17083635.0,
+      "reward": 0.6875,
+      "reward_std": 0.12400396168231964,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.28463754057884216,
+      "step": 867,
+      "step_time": 78.40537928510457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 831.0,
+      "completions/max_terminated_length": 831.0,
+      "completions/mean_length": 641.875,
+      "completions/mean_terminated_length": 641.875,
+      "completions/min_length": 407.0,
+      "completions/min_terminated_length": 407.0,
+      "entropy": 0.5577409863471985,
+      "epoch": 4.592592592592593,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5,
+      "kl": 0.0019760173745453358,
+      "learning_rate": 4.758647376699032e-08,
+      "loss": -0.0254,
+      "num_tokens": 17104393.0,
+      "reward": 0.625,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 868,
+      "step_time": 90.65404016617686
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 803.0,
+      "completions/max_terminated_length": 803.0,
+      "completions/mean_length": 561.0,
+      "completions/mean_terminated_length": 561.0,
+      "completions/min_length": 357.0,
+      "completions/min_terminated_length": 357.0,
+      "entropy": 0.35650622844696045,
+      "epoch": 4.597883597883598,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.40625,
+      "kl": 0.0018372276099398732,
+      "learning_rate": 4.6884935603477724e-08,
+      "loss": -0.0127,
+      "num_tokens": 17119761.0,
+      "reward": 0.3125,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.3095695972442627,
+      "step": 869,
+      "step_time": 567.832233437337
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1000.0,
+      "completions/mean_length": 735.75,
+      "completions/mean_terminated_length": 562.7999877929688,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4512402415275574,
+      "epoch": 4.603174603174603,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3125,
+      "kl": 0.0015587429516017437,
+      "learning_rate": 4.6188352633713956e-08,
+      "loss": -0.0225,
+      "num_tokens": 17156341.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 870,
+      "step_time": 147.64252108428627
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 814.0,
+      "completions/mean_length": 688.875,
+      "completions/mean_terminated_length": 666.5333862304688,
+      "completions/min_length": 500.0,
+      "completions/min_terminated_length": 500.0,
+      "entropy": 0.4325893521308899,
+      "epoch": 4.608465608465608,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.171875,
+      "kl": 0.0013199535897001624,
+      "learning_rate": 4.549673247541874e-08,
+      "loss": -0.0139,
+      "num_tokens": 17174155.0,
+      "reward": 0.40625,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.4552929699420929,
+      "step": 871,
+      "step_time": 94.71705105807632
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 889.0,
+      "completions/mean_length": 609.0625,
+      "completions/mean_terminated_length": 420.4545593261719,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.36613649129867554,
+      "epoch": 4.613756613756614,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.71484375,
+      "kl": 0.001597036374732852,
+      "learning_rate": 4.48100826920394e-08,
+      "loss": -0.1975,
+      "num_tokens": 17194796.0,
+      "reward": 0.0625,
+      "reward_std": 0.03857583925127983,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.0833333358168602,
+      "step": 872,
+      "step_time": 804.2430580342188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 1.0,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 0.0,
+      "completions/mean_length": 1024.0,
+      "completions/mean_terminated_length": 0.0,
+      "completions/min_length": 1024.0,
+      "completions/min_terminated_length": 0.0,
+      "entropy": 0.296875,
+      "epoch": 4.619047619047619,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4765625,
+      "kl": 0.001162711065262556,
+      "learning_rate": 4.412841079266777e-08,
+      "loss": 0.0,
+      "num_tokens": 17221364.0,
+      "reward": 0.2708333432674408,
+      "reward_std": 0.19795581698417664,
+      "rewards/itbench_correctness/mean": 0.2708333432674408,
+      "rewards/itbench_correctness/std": 0.3890872597694397,
+      "step": 873,
+      "step_time": 150.55902750603855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 846.0,
+      "completions/max_terminated_length": 846.0,
+      "completions/mean_length": 616.125,
+      "completions/mean_terminated_length": 616.125,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.5778048038482666,
+      "epoch": 4.624338624338624,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.09375,
+      "kl": 0.002010586205869913,
+      "learning_rate": 4.3451724231958645e-08,
+      "loss": 0.0149,
+      "num_tokens": 17239318.0,
+      "reward": 0.625,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 874,
+      "step_time": 94.01774641126394
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 924.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 537.5,
+      "completions/mean_terminated_length": 537.5,
+      "completions/min_length": 2.0,
+      "completions/min_terminated_length": 2.0,
+      "entropy": 0.26232558488845825,
+      "epoch": 4.62962962962963,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.109375,
+      "kl": 0.0017294755671173334,
+      "learning_rate": 4.2780030410047796e-08,
+      "loss": -0.0713,
+      "num_tokens": 17252630.0,
+      "reward": 0.375,
+      "reward_std": 0.08908706903457642,
+      "rewards/itbench_correctness/mean": 0.375,
+      "rewards/itbench_correctness/std": 0.17743021249771118,
+      "step": 875,
+      "step_time": 71.65096860099584
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 940.0,
+      "completions/mean_length": 792.8125,
+      "completions/mean_terminated_length": 715.75,
+      "completions/min_length": 616.0,
+      "completions/min_terminated_length": 616.0,
+      "entropy": 0.47930628061294556,
+      "epoch": 4.634920634920634,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8671875,
+      "kl": 0.0011707143858075142,
+      "learning_rate": 4.2113336672471245e-08,
+      "loss": 0.0018,
+      "num_tokens": 17273627.0,
+      "reward": 0.84375,
+      "reward_std": 0.2088201940059662,
+      "rewards/itbench_correctness/mean": 0.84375,
+      "rewards/itbench_correctness/std": 0.24757154285907745,
+      "step": 876,
+      "step_time": 72.07463994249701
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 834.0,
+      "completions/max_terminated_length": 834.0,
+      "completions/mean_length": 585.375,
+      "completions/mean_terminated_length": 585.375,
+      "completions/min_length": 422.0,
+      "completions/min_terminated_length": 422.0,
+      "entropy": 0.5500747561454773,
+      "epoch": 4.64021164021164,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9609375,
+      "kl": 0.0019222473492845893,
+      "learning_rate": 4.145165031008507e-08,
+      "loss": 0.0357,
+      "num_tokens": 17291033.0,
+      "reward": 0.800000011920929,
+      "reward_std": 0.2777460217475891,
+      "rewards/itbench_correctness/mean": 0.800000011920929,
+      "rewards/itbench_correctness/std": 0.3265986442565918,
+      "step": 877,
+      "step_time": 86.38187370076776
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 962.0,
+      "completions/mean_length": 519.375,
+      "completions/mean_terminated_length": 402.923095703125,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "entropy": 0.379302054643631,
+      "epoch": 4.645502645502646,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.546875,
+      "kl": 0.00152775296010077,
+      "learning_rate": 4.0794978558985e-08,
+      "loss": 0.0501,
+      "num_tokens": 17305767.0,
+      "reward": 0.2395833432674408,
+      "reward_std": 0.10386862605810165,
+      "rewards/itbench_correctness/mean": 0.2395833432674408,
+      "rewards/itbench_correctness/std": 0.2852468192577362,
+      "step": 878,
+      "step_time": 181.77928131632507
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1008.0,
+      "completions/mean_length": 672.125,
+      "completions/mean_terminated_length": 554.8333740234375,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.5118095874786377,
+      "epoch": 4.650793650793651,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.65625,
+      "kl": 0.0015323893167078495,
+      "learning_rate": 4.0143328600428294e-08,
+      "loss": 0.0114,
+      "num_tokens": 17320513.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 879,
+      "step_time": 106.95045015309006
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 721.0625,
+      "completions/mean_terminated_length": 539.2999877929688,
+      "completions/min_length": 363.0,
+      "completions/min_terminated_length": 363.0,
+      "entropy": 0.3689000606536865,
+      "epoch": 4.656084656084656,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9765625,
+      "kl": 0.0019987330306321383,
+      "learning_rate": 3.949670756075446e-08,
+      "loss": 0.0123,
+      "num_tokens": 17337194.0,
+      "reward": 0.5625,
+      "reward_std": 0.3098883032798767,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.4466957151889801,
+      "step": 880,
+      "step_time": 1010.3906643372029
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 400.0,
+      "completions/mean_length": 663.1875,
+      "completions/mean_terminated_length": 302.375,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5669588446617126,
+      "epoch": 4.661375661375661,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.40625,
+      "kl": 0.0028189942240715027,
+      "learning_rate": 3.8855122511307626e-08,
+      "loss": 0.0001,
+      "num_tokens": 17354189.0,
+      "reward": 0.125,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 881,
+      "step_time": 113.85438270866871
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 675.0,
+      "completions/max_terminated_length": 675.0,
+      "completions/mean_length": 442.6875,
+      "completions/mean_terminated_length": 442.6875,
+      "completions/min_length": 366.0,
+      "completions/min_terminated_length": 366.0,
+      "entropy": 0.4585627615451813,
+      "epoch": 4.666666666666667,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.609375,
+      "kl": 0.0014935840154066682,
+      "learning_rate": 3.821858046835913e-08,
+      "loss": 0.0196,
+      "num_tokens": 17363480.0,
+      "reward": 0.5653409361839294,
+      "reward_std": 0.1779802441596985,
+      "rewards/itbench_correctness/mean": 0.5653409361839294,
+      "rewards/itbench_correctness/std": 0.3175182640552521,
+      "step": 882,
+      "step_time": 61.071511584334075
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 825.4375,
+      "completions/mean_terminated_length": 779.6154174804688,
+      "completions/min_length": 561.0,
+      "completions/min_terminated_length": 561.0,
+      "entropy": 0.32225334644317627,
+      "epoch": 4.671957671957672,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4453125,
+      "kl": 0.0013399338349699974,
+      "learning_rate": 3.75870883930306e-08,
+      "loss": 0.0614,
+      "num_tokens": 17384615.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 883,
+      "step_time": 223.48245067708194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 709.0,
+      "completions/mean_length": 802.8125,
+      "completions/mean_terminated_length": 581.625,
+      "completions/min_length": 471.0,
+      "completions/min_terminated_length": 471.0,
+      "entropy": 0.5904242992401123,
+      "epoch": 4.677248677248677,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8203125,
+      "kl": 0.0015930193476378918,
+      "learning_rate": 3.6960653191218324e-08,
+      "loss": -0.0112,
+      "num_tokens": 17405748.0,
+      "reward": 0.8020833134651184,
+      "reward_std": 0.25074294209480286,
+      "rewards/itbench_correctness/mean": 0.8020833134651184,
+      "rewards/itbench_correctness/std": 0.32185086607933044,
+      "step": 884,
+      "step_time": 80.26837155316025
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 981.0,
+      "completions/mean_length": 697.0,
+      "completions/mean_terminated_length": 548.3636474609375,
+      "completions/min_length": 350.0,
+      "completions/min_terminated_length": 350.0,
+      "entropy": 0.42180773615837097,
+      "epoch": 4.682539682539683,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8515625,
+      "kl": 0.0014244532212615013,
+      "learning_rate": 3.63392817135173e-08,
+      "loss": 0.0199,
+      "num_tokens": 17421772.0,
+      "reward": 0.3482142686843872,
+      "reward_std": 0.14384004473686218,
+      "rewards/itbench_correctness/mean": 0.3482142686843872,
+      "rewards/itbench_correctness/std": 0.2632541060447693,
+      "step": 885,
+      "step_time": 135.67724260222167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1010.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 652.875,
+      "completions/mean_terminated_length": 652.875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3706682026386261,
+      "epoch": 4.6878306878306875,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.296875,
+      "kl": 0.001711677061393857,
+      "learning_rate": 3.572298075514652e-08,
+      "loss": -0.118,
+      "num_tokens": 17441162.0,
+      "reward": 0.5,
+      "reward_std": 0.3535533845424652,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 886,
+      "step_time": 319.866651549004
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1016.0,
+      "completions/mean_length": 697.0,
+      "completions/mean_terminated_length": 675.2000122070312,
+      "completions/min_length": 387.0,
+      "completions/min_terminated_length": 387.0,
+      "entropy": 0.4418938159942627,
+      "epoch": 4.693121693121693,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.828125,
+      "kl": 0.0013350816443562508,
+      "learning_rate": 3.5111757055874326e-08,
+      "loss": 0.0371,
+      "num_tokens": 17455858.0,
+      "reward": 0.9236111640930176,
+      "reward_std": 0.21606040000915527,
+      "rewards/itbench_correctness/mean": 0.9236111640930176,
+      "rewards/itbench_correctness/std": 0.20971761643886566,
+      "step": 887,
+      "step_time": 798.9065483696759
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1009.0,
+      "completions/mean_length": 778.75,
+      "completions/mean_terminated_length": 722.1538696289062,
+      "completions/min_length": 543.0,
+      "completions/min_terminated_length": 543.0,
+      "entropy": 0.4057784974575043,
+      "epoch": 4.698412698412699,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8984375,
+      "kl": 0.0015928231878206134,
+      "learning_rate": 3.450561729994533e-08,
+      "loss": 0.0253,
+      "num_tokens": 17493110.0,
+      "reward": 0.5625,
+      "reward_std": 0.5260357856750488,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 888,
+      "step_time": 112.42098965961486
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1005.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 633.0,
+      "completions/mean_terminated_length": 633.0,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.32543444633483887,
+      "epoch": 4.703703703703704,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.28125,
+      "kl": 0.0019525340758264065,
+      "learning_rate": 3.390456811600673e-08,
+      "loss": -0.145,
+      "num_tokens": 17514366.0,
+      "reward": 0.4375,
+      "reward_std": 0.23927490413188934,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.30276504158973694,
+      "step": 889,
+      "step_time": 95.3800596492365
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 921.0,
+      "completions/max_terminated_length": 921.0,
+      "completions/mean_length": 672.375,
+      "completions/mean_terminated_length": 672.375,
+      "completions/min_length": 478.0,
+      "completions/min_terminated_length": 478.0,
+      "entropy": 0.5919315814971924,
+      "epoch": 4.708994708994709,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0888671875,
+      "kl": 0.002438145689666271,
+      "learning_rate": 3.330861607703611e-08,
+      "loss": 0.0001,
+      "num_tokens": 17529404.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 890,
+      "step_time": 194.32226402964443
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 995.0,
+      "completions/mean_length": 910.875,
+      "completions/mean_terminated_length": 797.75,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.6191848516464233,
+      "epoch": 4.714285714285714,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.453125,
+      "kl": 0.002367243403568864,
+      "learning_rate": 3.271776770026963e-08,
+      "loss": 0.0057,
+      "num_tokens": 17554098.0,
+      "reward": 0.1015625,
+      "reward_std": 0.21758441627025604,
+      "rewards/itbench_correctness/mean": 0.1015625,
+      "rewards/itbench_correctness/std": 0.2550275921821594,
+      "step": 891,
+      "step_time": 93.79412244167179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 529.0,
+      "completions/max_terminated_length": 529.0,
+      "completions/mean_length": 419.75,
+      "completions/mean_terminated_length": 419.75,
+      "completions/min_length": 313.0,
+      "completions/min_terminated_length": 313.0,
+      "entropy": 0.38832637667655945,
+      "epoch": 4.71957671957672,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0022835221607238054,
+      "learning_rate": 3.213202944713023e-08,
+      "loss": 0.0096,
+      "num_tokens": 17563774.0,
+      "reward": 0.4837239682674408,
+      "reward_std": 0.10496115684509277,
+      "rewards/itbench_correctness/mean": 0.4837239682674408,
+      "rewards/itbench_correctness/std": 0.12295603007078171,
+      "step": 892,
+      "step_time": 44.90997119899839
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 523.0,
+      "completions/max_terminated_length": 523.0,
+      "completions/mean_length": 433.8125,
+      "completions/mean_terminated_length": 433.8125,
+      "completions/min_length": 349.0,
+      "completions/min_terminated_length": 349.0,
+      "entropy": 0.4886903762817383,
+      "epoch": 4.724867724867725,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.0014157530386000872,
+      "learning_rate": 3.155140772315773e-08,
+      "loss": 0.0198,
+      "num_tokens": 17573195.0,
+      "reward": 0.4437499940395355,
+      "reward_std": 0.11511446535587311,
+      "rewards/itbench_correctness/mean": 0.4437499940395355,
+      "rewards/itbench_correctness/std": 0.1263263076543808,
+      "step": 893,
+      "step_time": 62.78798679355532
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 948.0,
+      "completions/mean_length": 990.5,
+      "completions/mean_terminated_length": 890.0,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.43210500478744507,
+      "epoch": 4.73015873015873,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.140625,
+      "kl": 0.0013504911912605166,
+      "learning_rate": 3.097590887793827e-08,
+      "loss": 0.0194,
+      "num_tokens": 17598411.0,
+      "reward": 0.6432291865348816,
+      "reward_std": 0.2726758122444153,
+      "rewards/itbench_correctness/mean": 0.6432291865348816,
+      "rewards/itbench_correctness/std": 0.4446098804473877,
+      "step": 894,
+      "step_time": 102.70882797706872
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1002.0,
+      "completions/mean_length": 671.25,
+      "completions/mean_terminated_length": 553.6666870117188,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.3515828549861908,
+      "epoch": 4.735449735449736,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.78125,
+      "kl": 0.0012724708067253232,
+      "learning_rate": 3.040553920503502e-08,
+      "loss": 0.0152,
+      "num_tokens": 17614191.0,
+      "reward": 0.3125,
+      "reward_std": 0.2335786372423172,
+      "rewards/itbench_correctness/mean": 0.3125,
+      "rewards/itbench_correctness/std": 0.23471811413764954,
+      "step": 895,
+      "step_time": 131.4134486299008
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 698.0,
+      "completions/mean_length": 748.6875,
+      "completions/mean_terminated_length": 473.375,
+      "completions/min_length": 374.0,
+      "completions/min_terminated_length": 374.0,
+      "entropy": 0.44878536462783813,
+      "epoch": 4.7407407407407405,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.046875,
+      "kl": 0.0016055708983913064,
+      "learning_rate": 2.9840304941919416e-08,
+      "loss": -0.0042,
+      "num_tokens": 17634162.0,
+      "reward": 0.625,
+      "reward_std": 0.31586384773254395,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.40138646960258484,
+      "step": 896,
+      "step_time": 121.16828937549144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1018.0,
+      "completions/mean_length": 698.125,
+      "completions/mean_terminated_length": 651.5714721679688,
+      "completions/min_length": 455.0,
+      "completions/min_terminated_length": 455.0,
+      "entropy": 0.4755595326423645,
+      "epoch": 4.746031746031746,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 6.84375,
+      "kl": 0.0011872922768816352,
+      "learning_rate": 2.9280212269902628e-08,
+      "loss": 0.0145,
+      "num_tokens": 17650676.0,
+      "reward": 0.8125,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.8125,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 897,
+      "step_time": 88.3342649359256
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 435.0,
+      "completions/mean_length": 678.5,
+      "completions/mean_terminated_length": 333.0,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.38319823145866394,
+      "epoch": 4.751322751322752,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.640625,
+      "kl": 0.0016045935917645693,
+      "learning_rate": 2.872526731406849e-08,
+      "loss": -0.0215,
+      "num_tokens": 17670076.0,
+      "reward": 0.5572916865348816,
+      "reward_std": 0.398481547832489,
+      "rewards/itbench_correctness/mean": 0.5572916865348816,
+      "rewards/itbench_correctness/std": 0.38934746384620667,
+      "step": 898,
+      "step_time": 123.173085459508
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 939.0,
+      "completions/mean_length": 656.1875,
+      "completions/mean_terminated_length": 571.3077392578125,
+      "completions/min_length": 334.0,
+      "completions/min_terminated_length": 334.0,
+      "entropy": 0.55167156457901,
+      "epoch": 4.756613756613756,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.0,
+      "kl": 0.0021755106281489134,
+      "learning_rate": 2.8175476143206145e-08,
+      "loss": 0.0149,
+      "num_tokens": 17684807.0,
+      "reward": 0.778124988079071,
+      "reward_std": 0.3087267279624939,
+      "rewards/itbench_correctness/mean": 0.778124988079071,
+      "rewards/itbench_correctness/std": 0.3087441027164459,
+      "step": 899,
+      "step_time": 81.62243656814098
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 768.0,
+      "completions/max_terminated_length": 768.0,
+      "completions/mean_length": 475.625,
+      "completions/mean_terminated_length": 475.625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3910643756389618,
+      "epoch": 4.761904761904762,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.4921875,
+      "kl": 0.0022366566117852926,
+      "learning_rate": 2.7630844769743756e-08,
+      "loss": -0.0554,
+      "num_tokens": 17695441.0,
+      "reward": 0.6812499761581421,
+      "reward_std": 0.0752970278263092,
+      "rewards/itbench_correctness/mean": 0.6812499761581421,
+      "rewards/itbench_correctness/std": 0.3449033796787262,
+      "step": 900,
+      "step_time": 96.98080993723124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 706.625,
+      "completions/mean_terminated_length": 661.2857666015625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5575800538063049,
+      "epoch": 4.767195767195767,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.171875,
+      "kl": 0.0022240527905523777,
+      "learning_rate": 2.7091379149682682e-08,
+      "loss": -0.0212,
+      "num_tokens": 17720731.0,
+      "reward": 0.484375,
+      "reward_std": 0.46940183639526367,
+      "rewards/itbench_correctness/mean": 0.484375,
+      "rewards/itbench_correctness/std": 0.4696519374847412,
+      "step": 901,
+      "step_time": 119.61820879764855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 922.0,
+      "completions/mean_length": 772.5625,
+      "completions/mean_terminated_length": 577.0,
+      "completions/min_length": 436.0,
+      "completions/min_terminated_length": 436.0,
+      "entropy": 0.5074023008346558,
+      "epoch": 4.772486772486772,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 5.0625,
+      "kl": 0.0013676214730367064,
+      "learning_rate": 2.655708518253258e-08,
+      "loss": -0.0031,
+      "num_tokens": 17737724.0,
+      "reward": 0.5364583134651184,
+      "reward_std": 0.3415539562702179,
+      "rewards/itbench_correctness/mean": 0.5364583134651184,
+      "rewards/itbench_correctness/std": 0.4584280252456665,
+      "step": 902,
+      "step_time": 71.03902994468808
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 727.0,
+      "completions/max_terminated_length": 727.0,
+      "completions/mean_length": 556.0625,
+      "completions/mean_terminated_length": 556.0625,
+      "completions/min_length": 368.0,
+      "completions/min_terminated_length": 368.0,
+      "entropy": 0.467573344707489,
+      "epoch": 4.777777777777778,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4375,
+      "kl": 0.0027739950455725193,
+      "learning_rate": 2.6027968711246627e-08,
+      "loss": -0.0162,
+      "num_tokens": 17756685.0,
+      "reward": 0.6875,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.4787135720252991,
+      "step": 903,
+      "step_time": 73.05204797629267
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 663.0,
+      "completions/max_terminated_length": 663.0,
+      "completions/mean_length": 432.375,
+      "completions/mean_terminated_length": 432.375,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "entropy": 0.33304423093795776,
+      "epoch": 4.783068783068783,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.040771484375,
+      "kl": 0.0020940680988132954,
+      "learning_rate": 2.550403552215785e-08,
+      "loss": 0.0,
+      "num_tokens": 17766963.0,
+      "reward": 0.75,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 904,
+      "step_time": 75.88487505353987
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 940.0,
+      "completions/max_terminated_length": 940.0,
+      "completions/mean_length": 543.375,
+      "completions/mean_terminated_length": 543.375,
+      "completions/min_length": 326.0,
+      "completions/min_terminated_length": 326.0,
+      "entropy": 0.36438924074172974,
+      "epoch": 4.788359788359788,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0625,
+      "kl": 0.0024348387960344553,
+      "learning_rate": 2.4985291344915673e-08,
+      "loss": 0.0049,
+      "num_tokens": 17780065.0,
+      "reward": 0.22499999403953552,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.22499999403953552,
+      "rewards/itbench_correctness/std": 0.29552215337753296,
+      "step": 905,
+      "step_time": 420.1394767453894
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 911.4375,
+      "completions/mean_terminated_length": 766.7142944335938,
+      "completions/min_length": 623.0,
+      "completions/min_terminated_length": 623.0,
+      "entropy": 0.6890214681625366,
+      "epoch": 4.7936507936507935,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.65625,
+      "kl": 0.00153902149759233,
+      "learning_rate": 2.4471741852423233e-08,
+      "loss": 0.0055,
+      "num_tokens": 17825752.0,
+      "reward": 0.03125,
+      "reward_std": 0.0578637570142746,
+      "rewards/itbench_correctness/mean": 0.03125,
+      "rewards/itbench_correctness/std": 0.08539126068353653,
+      "step": 906,
+      "step_time": 132.87724316772074
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 963.0,
+      "completions/max_terminated_length": 963.0,
+      "completions/mean_length": 648.3125,
+      "completions/mean_terminated_length": 648.3125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4966740608215332,
+      "epoch": 4.798941798941799,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.4765625,
+      "kl": 0.0030572270043194294,
+      "learning_rate": 2.396339266077557e-08,
+      "loss": -0.1071,
+      "num_tokens": 17839621.0,
+      "reward": 0.9375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 907,
+      "step_time": 503.1076301559806
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.75,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1001.0,
+      "completions/mean_length": 917.625,
+      "completions/mean_terminated_length": 598.5,
+      "completions/min_length": 428.0,
+      "completions/min_terminated_length": 428.0,
+      "entropy": 0.3029560148715973,
+      "epoch": 4.804232804232804,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0234375,
+      "kl": 0.0009918678551912308,
+      "learning_rate": 2.3460249329197823e-08,
+      "loss": -0.0133,
+      "num_tokens": 17868991.0,
+      "reward": 0.3541666865348816,
+      "reward_std": 0.058925561606884,
+      "rewards/itbench_correctness/mean": 0.3541666865348816,
+      "rewards/itbench_correctness/std": 0.37453675270080566,
+      "step": 908,
+      "step_time": 243.62568031344563
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 847.0,
+      "completions/mean_length": 767.5625,
+      "completions/mean_terminated_length": 750.4667358398438,
+      "completions/min_length": 473.0,
+      "completions/min_terminated_length": 473.0,
+      "entropy": 0.2605651021003723,
+      "epoch": 4.809523809523809,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4609375,
+      "kl": 0.001077577588148415,
+      "learning_rate": 2.2962317359985107e-08,
+      "loss": -0.0185,
+      "num_tokens": 17888648.0,
+      "reward": 0.3333333432674408,
+      "reward_std": 0.2766174077987671,
+      "rewards/itbench_correctness/mean": 0.3333333432674408,
+      "rewards/itbench_correctness/std": 0.3162277638912201,
+      "step": 909,
+      "step_time": 128.46637521497905
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 942.0,
+      "completions/mean_length": 711.375,
+      "completions/mean_terminated_length": 690.5333862304688,
+      "completions/min_length": 395.0,
+      "completions/min_terminated_length": 395.0,
+      "entropy": 0.4948163628578186,
+      "epoch": 4.814814814814815,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.078125,
+      "kl": 0.002276236657053232,
+      "learning_rate": 2.2469602198441573e-08,
+      "loss": 0.0748,
+      "num_tokens": 17906702.0,
+      "reward": 0.5833333134651184,
+      "reward_std": 0.19287919998168945,
+      "rewards/itbench_correctness/mean": 0.5833333134651184,
+      "rewards/itbench_correctness/std": 0.28706690669059753,
+      "step": 910,
+      "step_time": 451.41445366758853
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 586.0,
+      "completions/max_terminated_length": 586.0,
+      "completions/mean_length": 479.5,
+      "completions/mean_terminated_length": 479.5,
+      "completions/min_length": 368.0,
+      "completions/min_terminated_length": 368.0,
+      "entropy": 0.31699687242507935,
+      "epoch": 4.8201058201058204,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4453125,
+      "kl": 0.001171560725197196,
+      "learning_rate": 2.1982109232821176e-08,
+      "loss": -0.0032,
+      "num_tokens": 17917550.0,
+      "reward": 0.9453125,
+      "reward_std": 0.08679073303937912,
+      "rewards/itbench_correctness/mean": 0.9453125,
+      "rewards/itbench_correctness/std": 0.10174263268709183,
+      "step": 911,
+      "step_time": 838.8527243016288
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 852.0,
+      "completions/max_terminated_length": 852.0,
+      "completions/mean_length": 508.5625,
+      "completions/mean_terminated_length": 508.5625,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5309081673622131,
+      "epoch": 4.825396825396825,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4453125,
+      "kl": 0.0025923862121999264,
+      "learning_rate": 2.1499843794269058e-08,
+      "loss": -0.1172,
+      "num_tokens": 17931191.0,
+      "reward": 0.5625,
+      "reward_std": 0.4082317352294922,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 912,
+      "step_time": 216.96005523204803
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 969.0,
+      "completions/mean_length": 761.25,
+      "completions/mean_terminated_length": 673.6666870117188,
+      "completions/min_length": 513.0,
+      "completions/min_terminated_length": 513.0,
+      "entropy": 0.47027915716171265,
+      "epoch": 4.830687830687831,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.84375,
+      "kl": 0.0013219509273767471,
+      "learning_rate": 2.1022811156762576e-08,
+      "loss": 0.0432,
+      "num_tokens": 17947115.0,
+      "reward": 0.25,
+      "reward_std": 0.30284827947616577,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.3259601294994354,
+      "step": 913,
+      "step_time": 100.75746689084917
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 558.0,
+      "completions/mean_length": 631.75,
+      "completions/mean_terminated_length": 396.3999938964844,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3593193590641022,
+      "epoch": 4.835978835978836,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4609375,
+      "kl": 0.002075113356113434,
+      "learning_rate": 2.055101653705449e-08,
+      "loss": -0.0584,
+      "num_tokens": 17963415.0,
+      "reward": 0.671875,
+      "reward_std": 0.4432469606399536,
+      "rewards/itbench_correctness/mean": 0.671875,
+      "rewards/itbench_correctness/std": 0.4718646705150604,
+      "step": 914,
+      "step_time": 378.09901642706245
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 956.0,
+      "completions/mean_length": 865.3125,
+      "completions/mean_terminated_length": 741.888916015625,
+      "completions/min_length": 617.0,
+      "completions/min_terminated_length": 617.0,
+      "entropy": 0.40216684341430664,
+      "epoch": 4.841269841269841,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2265625,
+      "kl": 0.0013008101377636194,
+      "learning_rate": 2.008446509461498e-08,
+      "loss": -0.0024,
+      "num_tokens": 17983796.0,
+      "reward": 0.8459821343421936,
+      "reward_std": 0.10143714398145676,
+      "rewards/itbench_correctness/mean": 0.8459821343421936,
+      "rewards/itbench_correctness/std": 0.21097390353679657,
+      "step": 915,
+      "step_time": 85.97270075790584
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 396.0,
+      "completions/mean_length": 609.9375,
+      "completions/mean_terminated_length": 287.8888854980469,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5607131719589233,
+      "epoch": 4.8465608465608465,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3125,
+      "kl": 0.0020648783538490534,
+      "learning_rate": 1.9623161931575926e-08,
+      "loss": -0.0784,
+      "num_tokens": 18000307.0,
+      "reward": 0.171875,
+      "reward_std": 0.1367267221212387,
+      "rewards/itbench_correctness/mean": 0.171875,
+      "rewards/itbench_correctness/std": 0.2576940953731537,
+      "step": 916,
+      "step_time": 217.30383673589677
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 837.0,
+      "completions/max_terminated_length": 837.0,
+      "completions/mean_length": 639.0625,
+      "completions/mean_terminated_length": 639.0625,
+      "completions/min_length": 443.0,
+      "completions/min_terminated_length": 443.0,
+      "entropy": 0.4193643033504486,
+      "epoch": 4.851851851851852,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.048095703125,
+      "kl": 0.0019990454893559217,
+      "learning_rate": 1.9167112092674796e-08,
+      "loss": 0.0,
+      "num_tokens": 18025980.0,
+      "reward": 0.05000000074505806,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.05000000074505806,
+      "rewards/itbench_correctness/std": 0.05163978040218353,
+      "step": 917,
+      "step_time": 113.52076997049153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 861.0,
+      "completions/max_terminated_length": 861.0,
+      "completions/mean_length": 583.625,
+      "completions/mean_terminated_length": 583.625,
+      "completions/min_length": 306.0,
+      "completions/min_terminated_length": 306.0,
+      "entropy": 0.24159349501132965,
+      "epoch": 4.857142857142857,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.4375,
+      "kl": 0.0012394418008625507,
+      "learning_rate": 1.8716320565199618e-08,
+      "loss": 0.0073,
+      "num_tokens": 18040222.0,
+      "reward": 0.4027777910232544,
+      "reward_std": 0.11368955671787262,
+      "rewards/itbench_correctness/mean": 0.4027777910232544,
+      "rewards/itbench_correctness/std": 0.17033012211322784,
+      "step": 918,
+      "step_time": 70.6566086569801
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1015.0,
+      "completions/mean_length": 733.75,
+      "completions/mean_terminated_length": 559.6000366210938,
+      "completions/min_length": 432.0,
+      "completions/min_terminated_length": 432.0,
+      "entropy": 0.5805792212486267,
+      "epoch": 4.862433862433862,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0245361328125,
+      "kl": 0.0014999855775386095,
+      "learning_rate": 1.82707922789343e-08,
+      "loss": 0.0,
+      "num_tokens": 18068698.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 919,
+      "step_time": 638.5085713258013
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 869.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 725.3125,
+      "completions/mean_terminated_length": 725.3125,
+      "completions/min_length": 536.0,
+      "completions/min_terminated_length": 536.0,
+      "entropy": 0.6038776636123657,
+      "epoch": 4.867724867724868,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.046142578125,
+      "kl": 0.0026757661253213882,
+      "learning_rate": 1.7830532106104746e-08,
+      "loss": 0.0001,
+      "num_tokens": 18099127.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 920,
+      "step_time": 105.40813992917538
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 599.0,
+      "completions/max_terminated_length": 599.0,
+      "completions/mean_length": 492.125,
+      "completions/mean_terminated_length": 492.125,
+      "completions/min_length": 394.0,
+      "completions/min_terminated_length": 394.0,
+      "entropy": 0.4124968349933624,
+      "epoch": 4.8730158730158735,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3515625,
+      "kl": 0.0012788712047040462,
+      "learning_rate": 1.7395544861325718e-08,
+      "loss": -0.0021,
+      "num_tokens": 18109953.0,
+      "reward": 0.20928031206130981,
+      "reward_std": 0.10235221683979034,
+      "rewards/itbench_correctness/mean": 0.20928031206130981,
+      "rewards/itbench_correctness/std": 0.1258237361907959,
+      "step": 921,
+      "step_time": 53.33936434518546
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 567.0,
+      "completions/max_terminated_length": 567.0,
+      "completions/mean_length": 424.125,
+      "completions/mean_terminated_length": 424.125,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "entropy": 0.38903623819351196,
+      "epoch": 4.878306878306878,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5546875,
+      "kl": 0.0015015568351373076,
+      "learning_rate": 1.6965835301547936e-08,
+      "loss": -0.0055,
+      "num_tokens": 18119051.0,
+      "reward": 0.5625,
+      "reward_std": 0.2177756428718567,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 922,
+      "step_time": 53.09215545654297
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 825.0,
+      "completions/max_terminated_length": 825.0,
+      "completions/mean_length": 631.75,
+      "completions/mean_terminated_length": 631.75,
+      "completions/min_length": 482.0,
+      "completions/min_terminated_length": 482.0,
+      "entropy": 0.42105263471603394,
+      "epoch": 4.883597883597884,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4765625,
+      "kl": 0.0011097542010247707,
+      "learning_rate": 1.654140812600646e-08,
+      "loss": 0.0024,
+      "num_tokens": 18132919.0,
+      "reward": 0.8697916865348816,
+      "reward_std": 0.014731383882462978,
+      "rewards/itbench_correctness/mean": 0.8697916865348816,
+      "rewards/itbench_correctness/std": 0.1359764039516449,
+      "step": 923,
+      "step_time": 800.9228875609115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 535.0,
+      "completions/max_terminated_length": 535.0,
+      "completions/mean_length": 391.125,
+      "completions/mean_terminated_length": 391.125,
+      "completions/min_length": 276.0,
+      "completions/min_terminated_length": 276.0,
+      "entropy": 0.3630552887916565,
+      "epoch": 4.888888888888889,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.96875,
+      "kl": 0.0026786988601088524,
+      "learning_rate": 1.612226797616878e-08,
+      "loss": -0.0025,
+      "num_tokens": 18141865.0,
+      "reward": 0.71875,
+      "reward_std": 0.0578637570142746,
+      "rewards/itbench_correctness/mean": 0.71875,
+      "rewards/itbench_correctness/std": 0.23935678601264954,
+      "step": 924,
+      "step_time": 811.8248612135649
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1024.0,
+      "completions/mean_length": 992.0,
+      "completions/mean_terminated_length": 921.6000366210938,
+      "completions/min_length": 743.0,
+      "completions/min_terminated_length": 743.0,
+      "entropy": 0.44556450843811035,
+      "epoch": 4.894179894179894,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 3.796875,
+      "kl": 0.0014187112683430314,
+      "learning_rate": 1.570841943568446e-08,
+      "loss": -0.0001,
+      "num_tokens": 18173001.0,
+      "reward": 0.30000001192092896,
+      "reward_std": 0.32691311836242676,
+      "rewards/itbench_correctness/mean": 0.30000001192092896,
+      "rewards/itbench_correctness/std": 0.37372004985809326,
+      "step": 925,
+      "step_time": 140.93405285663903
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1011.0,
+      "completions/mean_length": 780.3125,
+      "completions/mean_terminated_length": 590.7777709960938,
+      "completions/min_length": 440.0,
+      "completions/min_terminated_length": 440.0,
+      "entropy": 0.5049259066581726,
+      "epoch": 4.8994708994708995,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.859375,
+      "kl": 0.0018744791159406304,
+      "learning_rate": 1.5299867030334813e-08,
+      "loss": 0.0123,
+      "num_tokens": 18191094.0,
+      "reward": 0.6193181872367859,
+      "reward_std": 0.19284729659557343,
+      "rewards/itbench_correctness/mean": 0.6193181872367859,
+      "rewards/itbench_correctness/std": 0.4520004689693451,
+      "step": 926,
+      "step_time": 850.4311718912795
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 744.375,
+      "completions/mean_terminated_length": 576.6000366210938,
+      "completions/min_length": 411.0,
+      "completions/min_terminated_length": 411.0,
+      "entropy": 0.30495381355285645,
+      "epoch": 4.904761904761905,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6328125,
+      "kl": 0.0011643210891634226,
+      "learning_rate": 1.4896615227983466e-08,
+      "loss": 0.0384,
+      "num_tokens": 18211444.0,
+      "reward": 0.1875,
+      "reward_std": 0.2700308561325073,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.32702362537384033,
+      "step": 927,
+      "step_time": 924.7460502795875
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 693.0,
+      "completions/mean_length": 743.0625,
+      "completions/mean_terminated_length": 524.5555419921875,
+      "completions/min_length": 450.0,
+      "completions/min_terminated_length": 450.0,
+      "entropy": 0.2853057384490967,
+      "epoch": 4.91005291005291,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.026123046875,
+      "kl": 0.0013434689026325941,
+      "learning_rate": 1.4498668438527595e-08,
+      "loss": 0.0,
+      "num_tokens": 18229109.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 928,
+      "step_time": 146.66280045732856
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 827.8125,
+      "completions/mean_terminated_length": 675.2222290039062,
+      "completions/min_length": 520.0,
+      "completions/min_terminated_length": 520.0,
+      "entropy": 0.5242733359336853,
+      "epoch": 4.915343915343915,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.0,
+      "kl": 0.0013374233385547996,
+      "learning_rate": 1.4106031013849496e-08,
+      "loss": -0.0194,
+      "num_tokens": 18248834.0,
+      "reward": 0.40312498807907104,
+      "reward_std": 0.18545761704444885,
+      "rewards/itbench_correctness/mean": 0.40312498807907104,
+      "rewards/itbench_correctness/std": 0.21328286826610565,
+      "step": 929,
+      "step_time": 373.5206086365506
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 700.0,
+      "completions/max_terminated_length": 700.0,
+      "completions/mean_length": 554.125,
+      "completions/mean_terminated_length": 554.125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5053011775016785,
+      "epoch": 4.920634920634921,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.375,
+      "kl": 0.002205699449405074,
+      "learning_rate": 1.3718707247769134e-08,
+      "loss": -0.0714,
+      "num_tokens": 18262628.0,
+      "reward": 0.4375,
+      "reward_std": 0.22226819396018982,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.32221025228500366,
+      "step": 930,
+      "step_time": 99.51419737841934
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 500.0,
+      "completions/max_terminated_length": 500.0,
+      "completions/mean_length": 370.1875,
+      "completions/mean_terminated_length": 370.1875,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.35117340087890625,
+      "epoch": 4.925925925925926,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.40625,
+      "kl": 0.00277713593095541,
+      "learning_rate": 1.3336701375997127e-08,
+      "loss": -0.0628,
+      "num_tokens": 18275143.0,
+      "reward": 0.6875,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.3095695972442627,
+      "step": 931,
+      "step_time": 64.9049273962155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 660.0,
+      "completions/mean_length": 748.5,
+      "completions/mean_terminated_length": 473.0,
+      "completions/min_length": 9.0,
+      "completions/min_terminated_length": 9.0,
+      "entropy": 0.3607214391231537,
+      "epoch": 4.931216931216931,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.0014514221111312509,
+      "learning_rate": 1.2960017576088444e-08,
+      "loss": -0.0523,
+      "num_tokens": 18299639.0,
+      "reward": 0.4375,
+      "reward_std": 0.290380597114563,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.45896419882774353,
+      "step": 932,
+      "step_time": 117.33534361980855
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 668.0,
+      "completions/max_terminated_length": 668.0,
+      "completions/mean_length": 515.25,
+      "completions/mean_terminated_length": 515.25,
+      "completions/min_length": 400.0,
+      "completions/min_terminated_length": 400.0,
+      "entropy": 0.3842794895172119,
+      "epoch": 4.936507936507937,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6875,
+      "kl": 0.0029691443778574467,
+      "learning_rate": 1.2588659967396997e-08,
+      "loss": -0.0036,
+      "num_tokens": 18311107.0,
+      "reward": 0.8636363744735718,
+      "reward_std": 0.09819302707910538,
+      "rewards/itbench_correctness/mean": 0.8636363744735718,
+      "rewards/itbench_correctness/std": 0.1603485643863678,
+      "step": 933,
+      "step_time": 64.65582219231874
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 887.0,
+      "completions/max_terminated_length": 887.0,
+      "completions/mean_length": 692.875,
+      "completions/mean_terminated_length": 692.875,
+      "completions/min_length": 532.0,
+      "completions/min_terminated_length": 532.0,
+      "entropy": 0.3896806836128235,
+      "epoch": 4.941798941798941,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.78125,
+      "kl": 0.001258630771189928,
+      "learning_rate": 1.2222632611029848e-08,
+      "loss": 0.0216,
+      "num_tokens": 18326641.0,
+      "reward": 0.545036792755127,
+      "reward_std": 0.15080596506595612,
+      "rewards/itbench_correctness/mean": 0.545036792755127,
+      "rewards/itbench_correctness/std": 0.3676183223724365,
+      "step": 934,
+      "step_time": 169.38152172323316
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 956.0,
+      "completions/mean_length": 851.6875,
+      "completions/mean_terminated_length": 717.6666870117188,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5119248628616333,
+      "epoch": 4.947089947089947,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.265625,
+      "kl": 0.0015083615435287356,
+      "learning_rate": 1.1861939509803686e-08,
+      "loss": 0.0377,
+      "num_tokens": 18350692.0,
+      "reward": 0.0390625,
+      "reward_std": 0.05725783854722977,
+      "rewards/itbench_correctness/mean": 0.0390625,
+      "rewards/itbench_correctness/std": 0.08801929652690887,
+      "step": 935,
+      "step_time": 103.84523029625416
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 951.0,
+      "completions/max_terminated_length": 951.0,
+      "completions/mean_length": 659.0625,
+      "completions/mean_terminated_length": 659.0625,
+      "completions/min_length": 403.0,
+      "completions/min_terminated_length": 403.0,
+      "entropy": 0.2715979218482971,
+      "epoch": 4.9523809523809526,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6953125,
+      "kl": 0.000976603594608605,
+      "learning_rate": 1.1506584608200364e-08,
+      "loss": -0.0178,
+      "num_tokens": 18366501.0,
+      "reward": 0.6177083253860474,
+      "reward_std": 0.17081069946289062,
+      "rewards/itbench_correctness/mean": 0.6177083253860474,
+      "rewards/itbench_correctness/std": 0.21085968613624573,
+      "step": 936,
+      "step_time": 92.50707028061152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 954.0,
+      "completions/mean_length": 699.0625,
+      "completions/mean_terminated_length": 504.1000061035156,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4091193675994873,
+      "epoch": 4.957671957671957,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 4.21875,
+      "kl": 0.002065515611320734,
+      "learning_rate": 1.115657179232421e-08,
+      "loss": -0.065,
+      "num_tokens": 18387390.0,
+      "reward": 0.1875,
+      "reward_std": 0.10681165754795074,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.14751020073890686,
+      "step": 937,
+      "step_time": 594.838815539144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 972.0,
+      "completions/mean_length": 817.4375,
+      "completions/mean_terminated_length": 787.9285888671875,
+      "completions/min_length": 646.0,
+      "completions/min_terminated_length": 646.0,
+      "entropy": 0.4061472713947296,
+      "epoch": 4.962962962962963,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8046875,
+      "kl": 0.0015314513584598899,
+      "learning_rate": 1.0811904889859335e-08,
+      "loss": 0.0224,
+      "num_tokens": 18405933.0,
+      "reward": 0.854687511920929,
+      "reward_std": 0.22741259634494781,
+      "rewards/itbench_correctness/mean": 0.854687511920929,
+      "rewards/itbench_correctness/std": 0.24986976385116577,
+      "step": 938,
+      "step_time": 84.86788581125438
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 994.0,
+      "completions/mean_length": 857.3125,
+      "completions/mean_terminated_length": 801.75,
+      "completions/min_length": 680.0,
+      "completions/min_terminated_length": 680.0,
+      "entropy": 0.5902165174484253,
+      "epoch": 4.968253968253968,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5078125,
+      "kl": 0.0016059105983003974,
+      "learning_rate": 1.0472587670027678e-08,
+      "loss": 0.0006,
+      "num_tokens": 18446842.0,
+      "reward": 0.1875,
+      "reward_std": 0.2587745785713196,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 939,
+      "step_time": 161.18546231649816
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 533.8125,
+      "completions/mean_terminated_length": 533.8125,
+      "completions/min_length": 257.0,
+      "completions/min_terminated_length": 257.0,
+      "entropy": 0.5657417178153992,
+      "epoch": 4.973544973544973,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.01708984375,
+      "kl": 0.0015139211900532246,
+      "learning_rate": 1.0138623843548078e-08,
+      "loss": 0.0,
+      "num_tokens": 18468391.0,
+      "reward": 0.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 940,
+      "step_time": 92.19830699265003
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 534.0,
+      "completions/mean_length": 734.0,
+      "completions/mean_terminated_length": 444.0,
+      "completions/min_length": 361.0,
+      "completions/min_terminated_length": 361.0,
+      "entropy": 0.3760218024253845,
+      "epoch": 4.978835978835979,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2109375,
+      "kl": 0.001119913998991251,
+      "learning_rate": 9.810017062595321e-09,
+      "loss": 0.0,
+      "num_tokens": 18486999.0,
+      "reward": 0.734375,
+      "reward_std": 0.19408094882965088,
+      "rewards/itbench_correctness/mean": 0.734375,
+      "rewards/itbench_correctness/std": 0.3815402090549469,
+      "step": 941,
+      "step_time": 845.4121110225096
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 649.0,
+      "completions/mean_length": 763.5625,
+      "completions/mean_terminated_length": 503.125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.5107637047767639,
+      "epoch": 4.984126984126984,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7265625,
+      "kl": 0.002076751785352826,
+      "learning_rate": 9.486770920760667e-09,
+      "loss": -0.0707,
+      "num_tokens": 18506976.0,
+      "reward": 0.53125,
+      "reward_std": 0.41746097803115845,
+      "rewards/itbench_correctness/mean": 0.53125,
+      "rewards/itbench_correctness/std": 0.4989572763442993,
+      "step": 942,
+      "step_time": 94.76647205464542
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 935.0,
+      "completions/mean_length": 720.125,
+      "completions/mean_terminated_length": 582.0,
+      "completions/min_length": 392.0,
+      "completions/min_terminated_length": 392.0,
+      "entropy": 0.5610136985778809,
+      "epoch": 4.98941798941799,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.9375,
+      "kl": 0.0018183528445661068,
+      "learning_rate": 9.168888953011989e-09,
+      "loss": 0.0444,
+      "num_tokens": 18531946.0,
+      "reward": 0.1979166716337204,
+      "reward_std": 0.2609178125858307,
+      "rewards/itbench_correctness/mean": 0.1979166716337204,
+      "rewards/itbench_correctness/std": 0.3232860863208771,
+      "step": 943,
+      "step_time": 101.69731870479882
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 730.0,
+      "completions/max_terminated_length": 730.0,
+      "completions/mean_length": 479.875,
+      "completions/mean_terminated_length": 479.875,
+      "completions/min_length": 299.0,
+      "completions/min_terminated_length": 299.0,
+      "entropy": 0.4459494650363922,
+      "epoch": 4.994708994708994,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1796875,
+      "kl": 0.0016703108558431268,
+      "learning_rate": 8.856374635655695e-09,
+      "loss": -0.0015,
+      "num_tokens": 18542184.0,
+      "reward": 0.4583333432674408,
+      "reward_std": 0.1178511306643486,
+      "rewards/itbench_correctness/mean": 0.4583333432674408,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 944,
+      "step_time": 465.4684395249933
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 983.0,
+      "completions/mean_length": 718.6875,
+      "completions/mean_terminated_length": 481.22222900390625,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.5816158056259155,
+      "epoch": 5.0,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.84375,
+      "kl": 0.0013618582161143422,
+      "learning_rate": 8.54923138629815e-09,
+      "loss": 0.013,
+      "num_tokens": 18560147.0,
+      "reward": 0.40625,
+      "reward_std": 0.19776971638202667,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.306526243686676,
+      "step": 945,
+      "step_time": 143.76468984037638
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 575.8125,
+      "completions/mean_terminated_length": 545.933349609375,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.4619559347629547,
+      "epoch": 5.005291005291006,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6796875,
+      "kl": 0.0013607589062303305,
+      "learning_rate": 8.247462563808816e-09,
+      "loss": 0.0691,
+      "num_tokens": 18571720.0,
+      "reward": 0.8020833134651184,
+      "reward_std": 0.4064691960811615,
+      "rewards/itbench_correctness/mean": 0.8020833134651184,
+      "rewards/itbench_correctness/std": 0.40008679032325745,
+      "step": 946,
+      "step_time": 82.60051180887967
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 797.0,
+      "completions/mean_length": 862.5625,
+      "completions/mean_terminated_length": 701.125,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "entropy": 0.5077893137931824,
+      "epoch": 5.01058201058201,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.421875,
+      "kl": 0.0012988558737561107,
+      "learning_rate": 7.951071468283166e-09,
+      "loss": 0.0,
+      "num_tokens": 18591313.0,
+      "reward": 0.796875,
+      "reward_std": 0.13258251547813416,
+      "rewards/itbench_correctness/mean": 0.796875,
+      "rewards/itbench_correctness/std": 0.27716949582099915,
+      "step": 947,
+      "step_time": 1016.9221306946129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 903.0,
+      "completions/mean_length": 920.5625,
+      "completions/mean_terminated_length": 787.5714721679688,
+      "completions/min_length": 688.0,
+      "completions/min_terminated_length": 688.0,
+      "entropy": 0.4605879485607147,
+      "epoch": 5.015873015873016,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5390625,
+      "kl": 0.001742081018164754,
+      "learning_rate": 7.660061341006718e-09,
+      "loss": 0.0001,
+      "num_tokens": 18623898.0,
+      "reward": 0.25,
+      "reward_std": 0.15430334210395813,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.3333333432674408,
+      "step": 948,
+      "step_time": 110.10245905164629
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 989.0,
+      "completions/mean_length": 639.3125,
+      "completions/mean_terminated_length": 511.0833435058594,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4192003011703491,
+      "epoch": 5.021164021164021,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 0.443359375,
+      "kl": 0.0027288675773888826,
+      "learning_rate": 7.374435364419673e-09,
+      "loss": -0.0643,
+      "num_tokens": 18638975.0,
+      "reward": 0.17500001192092896,
+      "reward_std": 0.0707106739282608,
+      "rewards/itbench_correctness/mean": 0.17500001192092896,
+      "rewards/itbench_correctness/std": 0.20493902266025543,
+      "step": 949,
+      "step_time": 263.15858253091574
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 997.0,
+      "completions/mean_length": 859.5625,
+      "completions/mean_terminated_length": 821.6154174804688,
+      "completions/min_length": 680.0,
+      "completions/min_terminated_length": 680.0,
+      "entropy": 0.3839162290096283,
+      "epoch": 5.026455026455026,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.296875,
+      "kl": 0.000997141352854669,
+      "learning_rate": 7.09419666208183e-09,
+      "loss": -0.0236,
+      "num_tokens": 18657616.0,
+      "reward": 0.6669219732284546,
+      "reward_std": 0.24255049228668213,
+      "rewards/itbench_correctness/mean": 0.6669219732284546,
+      "rewards/itbench_correctness/std": 0.4117698669433594,
+      "step": 950,
+      "step_time": 535.0438889786601
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 988.0,
+      "completions/mean_length": 944.6875,
+      "completions/mean_terminated_length": 883.0,
+      "completions/min_length": 833.0,
+      "completions/min_terminated_length": 833.0,
+      "entropy": 0.39801523089408875,
+      "epoch": 5.031746031746032,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0703125,
+      "kl": 0.001400322187691927,
+      "learning_rate": 6.819348298638839e-09,
+      "loss": 0.0019,
+      "num_tokens": 18678923.0,
+      "reward": 0.328125,
+      "reward_std": 0.0646936446428299,
+      "rewards/itbench_correctness/mean": 0.328125,
+      "rewards/itbench_correctness/std": 0.3502231538295746,
+      "step": 951,
+      "step_time": 90.80604922864586
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 787.0625,
+      "completions/mean_terminated_length": 753.2142944335938,
+      "completions/min_length": 617.0,
+      "completions/min_terminated_length": 617.0,
+      "entropy": 0.36845865845680237,
+      "epoch": 5.037037037037037,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.3515625,
+      "kl": 0.0017281656619161367,
+      "learning_rate": 6.549893279788277e-09,
+      "loss": 0.007,
+      "num_tokens": 18701348.0,
+      "reward": 0.9375,
+      "reward_std": 0.1157275140285492,
+      "rewards/itbench_correctness/mean": 0.9375,
+      "rewards/itbench_correctness/std": 0.17078252136707306,
+      "step": 952,
+      "step_time": 321.3709614155814
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 838.0,
+      "completions/max_terminated_length": 838.0,
+      "completions/mean_length": 685.25,
+      "completions/mean_terminated_length": 685.25,
+      "completions/min_length": 584.0,
+      "completions/min_terminated_length": 584.0,
+      "entropy": 0.4961692690849304,
+      "epoch": 5.042328042328043,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.265625,
+      "kl": 0.0015087542124092579,
+      "learning_rate": 6.2858345522471265e-09,
+      "loss": 0.0125,
+      "num_tokens": 18722424.0,
+      "reward": 0.34687501192092896,
+      "reward_std": 0.0646936446428299,
+      "rewards/itbench_correctness/mean": 0.34687501192092896,
+      "rewards/itbench_correctness/std": 0.2698571979999542,
+      "step": 953,
+      "step_time": 409.55402624513954
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 591.0,
+      "completions/max_terminated_length": 591.0,
+      "completions/mean_length": 433.375,
+      "completions/mean_terminated_length": 433.375,
+      "completions/min_length": 345.0,
+      "completions/min_terminated_length": 345.0,
+      "entropy": 0.5030285716056824,
+      "epoch": 5.0476190476190474,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.07666015625,
+      "kl": 0.0032849370036274195,
+      "learning_rate": 6.0271750037193534e-09,
+      "loss": 0.0001,
+      "num_tokens": 18731814.0,
+      "reward": 0.25,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.25,
+      "rewards/itbench_correctness/std": 0.25819888710975647,
+      "step": 954,
+      "step_time": 96.037104123272
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 762.25,
+      "completions/mean_terminated_length": 701.84619140625,
+      "completions/min_length": 548.0,
+      "completions/min_terminated_length": 548.0,
+      "entropy": 0.3961954712867737,
+      "epoch": 5.052910052910053,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.765625,
+      "kl": 0.0015258269850164652,
+      "learning_rate": 5.773917462864264e-09,
+      "loss": 0.0162,
+      "num_tokens": 18749154.0,
+      "reward": 0.5750000476837158,
+      "reward_std": 0.24238379299640656,
+      "rewards/itbench_correctness/mean": 0.5750000476837158,
+      "rewards/itbench_correctness/std": 0.4028027057647705,
+      "step": 955,
+      "step_time": 67.30359940230846
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 889.0,
+      "completions/mean_length": 587.3125,
+      "completions/mean_terminated_length": 558.2000122070312,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3524529039859772,
+      "epoch": 5.058201058201059,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.05322265625,
+      "kl": 0.002124128630384803,
+      "learning_rate": 5.526064699265753e-09,
+      "loss": 0.0,
+      "num_tokens": 18767295.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 956,
+      "step_time": 319.00149345304817
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 791.0,
+      "completions/mean_length": 828.9375,
+      "completions/mean_terminated_length": 633.875,
+      "completions/min_length": 485.0,
+      "completions/min_terminated_length": 485.0,
+      "entropy": 0.35708361864089966,
+      "epoch": 5.063492063492063,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7421875,
+      "kl": 0.0012050803052261472,
+      "learning_rate": 5.283619423401997e-09,
+      "loss": -0.0121,
+      "num_tokens": 18791526.0,
+      "reward": 0.1875,
+      "reward_std": 0.4082317352294922,
+      "rewards/itbench_correctness/mean": 0.1875,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 957,
+      "step_time": 935.1128796143457
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.9375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 960.0,
+      "completions/mean_length": 1020.0,
+      "completions/mean_terminated_length": 960.0,
+      "completions/min_length": 960.0,
+      "completions/min_terminated_length": 960.0,
+      "entropy": 0.31568628549575806,
+      "epoch": 5.068783068783069,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.0625,
+      "kl": 0.00110468955244869,
+      "learning_rate": 5.046584286615696e-09,
+      "loss": 0.0017,
+      "num_tokens": 18820270.0,
+      "reward": 0.3437500298023224,
+      "reward_std": 0.23224487900733948,
+      "rewards/itbench_correctness/mean": 0.3437500298023224,
+      "rewards/itbench_correctness/std": 0.30712980031967163,
+      "step": 958,
+      "step_time": 187.4145915368572
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 916.0,
+      "completions/max_terminated_length": 916.0,
+      "completions/mean_length": 732.625,
+      "completions/mean_terminated_length": 732.625,
+      "completions/min_length": 564.0,
+      "completions/min_terminated_length": 564.0,
+      "entropy": 0.3330489695072174,
+      "epoch": 5.074074074074074,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.859375,
+      "kl": 0.0017922122497111559,
+      "learning_rate": 4.8149618810850444e-09,
+      "loss": 0.0026,
+      "num_tokens": 18846408.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.3205420970916748,
+      "rewards/itbench_correctness/mean": 0.5416666865348816,
+      "rewards/itbench_correctness/std": 0.40138646960258484,
+      "step": 959,
+      "step_time": 115.98179497290403
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 869.0,
+      "completions/mean_length": 747.9375,
+      "completions/mean_terminated_length": 582.2999877929688,
+      "completions/min_length": 453.0,
+      "completions/min_terminated_length": 453.0,
+      "entropy": 0.4786496162414551,
+      "epoch": 5.079365079365079,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0233154296875,
+      "kl": 0.0014122002758085728,
+      "learning_rate": 4.588754739795586e-09,
+      "loss": 0.0,
+      "num_tokens": 18866879.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 960,
+      "step_time": 447.1240088623017
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1022.0,
+      "completions/mean_length": 872.0625,
+      "completions/mean_terminated_length": 780.9000244140625,
+      "completions/min_length": 605.0,
+      "completions/min_terminated_length": 605.0,
+      "entropy": 0.36006593704223633,
+      "epoch": 5.084656084656085,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0078125,
+      "kl": 0.0014327183598652482,
+      "learning_rate": 4.367965336512403e-09,
+      "loss": 0.0033,
+      "num_tokens": 18892360.0,
+      "reward": 0.5625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.5625,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 961,
+      "step_time": 565.5308811077848
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 702.0,
+      "completions/max_terminated_length": 702.0,
+      "completions/mean_length": 447.9375,
+      "completions/mean_terminated_length": 447.9375,
+      "completions/min_length": 308.0,
+      "completions/min_terminated_length": 308.0,
+      "entropy": 0.4866750240325928,
+      "epoch": 5.08994708994709,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.6171875,
+      "kl": 0.0028053594287484884,
+      "learning_rate": 4.152596085753024e-09,
+      "loss": -0.0055,
+      "num_tokens": 18904239.0,
+      "reward": 0.2922794222831726,
+      "reward_std": 0.34673309326171875,
+      "rewards/itbench_correctness/mean": 0.2922794222831726,
+      "rewards/itbench_correctness/std": 0.35596761107444763,
+      "step": 962,
+      "step_time": 62.929508111439645
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 578.0,
+      "completions/mean_length": 579.75,
+      "completions/mean_terminated_length": 431.66668701171875,
+      "completions/min_length": 335.0,
+      "completions/min_terminated_length": 335.0,
+      "entropy": 0.3777490258216858,
+      "epoch": 5.095238095238095,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.03125,
+      "kl": 0.0031987950205802917,
+      "learning_rate": 3.9426493427611175e-09,
+      "loss": 0.0033,
+      "num_tokens": 18924067.0,
+      "reward": 0.1171875,
+      "reward_std": 0.1269381046295166,
+      "rewards/itbench_correctness/mean": 0.1171875,
+      "rewards/itbench_correctness/std": 0.2114865630865097,
+      "step": 963,
+      "step_time": 148.80320667196065
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 722.0,
+      "completions/max_terminated_length": 722.0,
+      "completions/mean_length": 530.5625,
+      "completions/mean_terminated_length": 530.5625,
+      "completions/min_length": 365.0,
+      "completions/min_terminated_length": 365.0,
+      "entropy": 0.5051242709159851,
+      "epoch": 5.1005291005291005,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.640625,
+      "kl": 0.0027365966234356165,
+      "learning_rate": 3.7381274034805066e-09,
+      "loss": -0.0151,
+      "num_tokens": 18945556.0,
+      "reward": 0.5562499761581421,
+      "reward_std": 0.3939805328845978,
+      "rewards/itbench_correctness/mean": 0.5562499761581421,
+      "rewards/itbench_correctness/std": 0.4657878875732422,
+      "step": 964,
+      "step_time": 113.64905078150332
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 634.0,
+      "completions/mean_length": 575.75,
+      "completions/mean_terminated_length": 545.86669921875,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.35084670782089233,
+      "epoch": 5.105820105820106,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7578125,
+      "kl": 0.0013689551269635558,
+      "learning_rate": 3.53903250453047e-09,
+      "loss": -0.0007,
+      "num_tokens": 18957760.0,
+      "reward": 0.6875,
+      "reward_std": 0.3613206446170807,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.36830443143844604,
+      "step": 965,
+      "step_time": 143.10534042678773
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 707.0,
+      "completions/mean_length": 667.4375,
+      "completions/mean_terminated_length": 453.5,
+      "completions/min_length": 332.0,
+      "completions/min_terminated_length": 332.0,
+      "entropy": 0.4554733633995056,
+      "epoch": 5.111111111111111,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.001402169349603355,
+      "learning_rate": 3.3453668231809283e-09,
+      "loss": 0.0134,
+      "num_tokens": 18976887.0,
+      "reward": 0.7916666865348816,
+      "reward_std": 0.3177001476287842,
+      "rewards/itbench_correctness/mean": 0.7916666865348816,
+      "rewards/itbench_correctness/std": 0.4013864994049072,
+      "step": 966,
+      "step_time": 106.49516909942031
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 808.4375,
+      "completions/mean_terminated_length": 640.7777709960938,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 0.4626207947731018,
+      "epoch": 5.116402116402116,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.03515625,
+      "kl": 0.0013422613264992833,
+      "learning_rate": 3.1571324773286278e-09,
+      "loss": 0.0,
+      "num_tokens": 19000374.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 967,
+      "step_time": 106.7643784377724
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1010.0,
+      "completions/mean_length": 628.375,
+      "completions/mean_terminated_length": 448.54547119140625,
+      "completions/min_length": 275.0,
+      "completions/min_terminated_length": 275.0,
+      "entropy": 0.4742391109466553,
+      "epoch": 5.121693121693122,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.001217088894918561,
+      "learning_rate": 2.9743315254743828e-09,
+      "loss": 0.0213,
+      "num_tokens": 19014292.0,
+      "reward": 0.5,
+      "reward_std": 0.3745020925998688,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.40824830532073975,
+      "step": 968,
+      "step_time": 76.93132317159325
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 570.0,
+      "completions/max_terminated_length": 570.0,
+      "completions/mean_length": 427.0625,
+      "completions/mean_terminated_length": 427.0625,
+      "completions/min_length": 365.0,
+      "completions/min_terminated_length": 365.0,
+      "entropy": 0.4425581693649292,
+      "epoch": 5.1269841269841265,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1015625,
+      "kl": 0.0016664480790495872,
+      "learning_rate": 2.7969659666999267e-09,
+      "loss": -0.0014,
+      "num_tokens": 19023773.0,
+      "reward": 0.6875,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.6875,
+      "rewards/itbench_correctness/std": 0.370809942483902,
+      "step": 969,
+      "step_time": 200.08597892336547
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 942.0,
+      "completions/mean_length": 686.0,
+      "completions/mean_terminated_length": 483.20001220703125,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.3600583076477051,
+      "epoch": 5.132275132275132,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.578125,
+      "kl": 0.0012360253604128957,
+      "learning_rate": 2.6250377406467627e-09,
+      "loss": -0.0297,
+      "num_tokens": 19050061.0,
+      "reward": 0.515625,
+      "reward_std": 0.2414703369140625,
+      "rewards/itbench_correctness/mean": 0.515625,
+      "rewards/itbench_correctness/std": 0.4784414768218994,
+      "step": 970,
+      "step_time": 540.8324056314304
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 702.0,
+      "completions/mean_length": 750.4375,
+      "completions/mean_terminated_length": 537.6666870117188,
+      "completions/min_length": 414.0,
+      "completions/min_terminated_length": 414.0,
+      "entropy": 0.4610643684864044,
+      "epoch": 5.137566137566138,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.1171875,
+      "kl": 0.0013813948025926948,
+      "learning_rate": 2.458548727494292e-09,
+      "loss": 0.0043,
+      "num_tokens": 19075564.0,
+      "reward": 0.9921875,
+      "reward_std": 0.022097086533904076,
+      "rewards/itbench_correctness/mean": 0.9921875,
+      "rewards/itbench_correctness/std": 0.03125,
+      "step": 971,
+      "step_time": 217.15286646224558
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 838.0,
+      "completions/max_terminated_length": 838.0,
+      "completions/mean_length": 657.375,
+      "completions/mean_terminated_length": 657.375,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "entropy": 0.40768206119537354,
+      "epoch": 5.142857142857143,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.640625,
+      "kl": 0.0014334998559206724,
+      "learning_rate": 2.2975007479397733e-09,
+      "loss": 0.0096,
+      "num_tokens": 19090834.0,
+      "reward": 0.75,
+      "reward_std": 0.13363061845302582,
+      "rewards/itbench_correctness/mean": 0.75,
+      "rewards/itbench_correctness/std": 0.3162277936935425,
+      "step": 972,
+      "step_time": 432.4331463770941
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 958.0,
+      "completions/mean_length": 740.25,
+      "completions/mean_terminated_length": 699.7142944335938,
+      "completions/min_length": 519.0,
+      "completions/min_terminated_length": 519.0,
+      "entropy": 0.5214454531669617,
+      "epoch": 5.148148148148148,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7890625,
+      "kl": 0.0014667396899312735,
+      "learning_rate": 2.14189556317812e-09,
+      "loss": -0.0047,
+      "num_tokens": 19107054.0,
+      "reward": 0.8541666865348816,
+      "reward_std": 0.290380597114563,
+      "rewards/itbench_correctness/mean": 0.8541666865348816,
+      "rewards/itbench_correctness/std": 0.3435921370983124,
+      "step": 973,
+      "step_time": 205.37443487346172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 893.0,
+      "completions/max_terminated_length": 893.0,
+      "completions/mean_length": 665.625,
+      "completions/mean_terminated_length": 665.625,
+      "completions/min_length": 555.0,
+      "completions/min_terminated_length": 555.0,
+      "entropy": 0.39962440729141235,
+      "epoch": 5.1534391534391535,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.703125,
+      "kl": 0.001995999598875642,
+      "learning_rate": 1.9917348748826334e-09,
+      "loss": -0.0209,
+      "num_tokens": 19121896.0,
+      "reward": 0.4375,
+      "reward_std": 0.13969546556472778,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.2922613024711609,
+      "step": 974,
+      "step_time": 112.39083941001445
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 915.0,
+      "completions/max_terminated_length": 915.0,
+      "completions/mean_length": 617.75,
+      "completions/mean_terminated_length": 617.75,
+      "completions/min_length": 339.0,
+      "completions/min_terminated_length": 339.0,
+      "entropy": 0.2751922309398651,
+      "epoch": 5.158730158730159,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5859375,
+      "kl": 0.0011612976668402553,
+      "learning_rate": 1.8470203251865768e-09,
+      "loss": -0.0003,
+      "num_tokens": 19137156.0,
+      "reward": 0.578125,
+      "reward_std": 0.3319548964500427,
+      "rewards/itbench_correctness/mean": 0.578125,
+      "rewards/itbench_correctness/std": 0.3842606544494629,
+      "step": 975,
+      "step_time": 101.55415380187333
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 936.0,
+      "completions/mean_length": 901.0,
+      "completions/mean_terminated_length": 778.0,
+      "completions/min_length": 627.0,
+      "completions/min_terminated_length": 627.0,
+      "entropy": 0.4217536151409149,
+      "epoch": 5.164021164021164,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4375,
+      "kl": 0.0018101928289979696,
+      "learning_rate": 1.7077534966650765e-09,
+      "loss": 0.0001,
+      "num_tokens": 19166732.0,
+      "reward": 0.40416669845581055,
+      "reward_std": 0.2077372521162033,
+      "rewards/itbench_correctness/mean": 0.40416669845581055,
+      "rewards/itbench_correctness/std": 0.423368364572525,
+      "step": 976,
+      "step_time": 115.02839307207614
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 370.0,
+      "completions/mean_length": 664.25,
+      "completions/mean_terminated_length": 304.5,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "entropy": 0.5600301027297974,
+      "epoch": 5.169312169312169,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.09375,
+      "kl": 0.002377049997448921,
+      "learning_rate": 1.5739359123178585e-09,
+      "loss": -0.0112,
+      "num_tokens": 19183648.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 977,
+      "step_time": 76.0952754272148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 979.0,
+      "completions/mean_length": 735.0,
+      "completions/mean_terminated_length": 638.6666870117188,
+      "completions/min_length": 358.0,
+      "completions/min_terminated_length": 358.0,
+      "entropy": 0.38639456033706665,
+      "epoch": 5.174603174603175,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7421875,
+      "kl": 0.0020921749528497458,
+      "learning_rate": 1.4455690355525963e-09,
+      "loss": 0.0233,
+      "num_tokens": 19205040.0,
+      "reward": 0.5208333730697632,
+      "reward_std": 0.347861647605896,
+      "rewards/itbench_correctness/mean": 0.5208333730697632,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 978,
+      "step_time": 102.6859831251204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1017.0,
+      "completions/mean_length": 640.625,
+      "completions/mean_terminated_length": 615.0667114257812,
+      "completions/min_length": 316.0,
+      "completions/min_terminated_length": 316.0,
+      "entropy": 0.3012682795524597,
+      "epoch": 5.1798941798941796,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.65625,
+      "kl": 0.0016695463564246893,
+      "learning_rate": 1.3226542701689214e-09,
+      "loss": 0.0201,
+      "num_tokens": 19219874.0,
+      "reward": 0.40625,
+      "reward_std": 0.24511480331420898,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.23935678601264954,
+      "step": 979,
+      "step_time": 449.7205182630569
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 811.0,
+      "completions/mean_length": 882.25,
+      "completions/mean_terminated_length": 740.5,
+      "completions/min_length": 671.0,
+      "completions/min_terminated_length": 671.0,
+      "entropy": 0.5644658803939819,
+      "epoch": 5.185185185185185,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.5234375,
+      "kl": 0.0016784444451332092,
+      "learning_rate": 1.2051929603428823e-09,
+      "loss": 0.0001,
+      "num_tokens": 19257654.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 980,
+      "step_time": 202.28774461336434
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 859.0,
+      "completions/mean_length": 706.5625,
+      "completions/mean_terminated_length": 633.3077392578125,
+      "completions/min_length": 472.0,
+      "completions/min_terminated_length": 472.0,
+      "entropy": 0.5095090866088867,
+      "epoch": 5.190476190476191,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.53125,
+      "kl": 0.0016119469655677676,
+      "learning_rate": 1.0931863906127325e-09,
+      "loss": -0.0168,
+      "num_tokens": 19297487.0,
+      "reward": 0.625,
+      "reward_std": 0.2314550280570984,
+      "rewards/itbench_correctness/mean": 0.625,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 981,
+      "step_time": 162.3620089488104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1021.0,
+      "completions/mean_length": 797.4375,
+      "completions/mean_terminated_length": 721.9166870117188,
+      "completions/min_length": 560.0,
+      "completions/min_terminated_length": 560.0,
+      "entropy": 0.4715103209018707,
+      "epoch": 5.195767195767195,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.96875,
+      "kl": 0.0011810938594862819,
+      "learning_rate": 9.866357858642205e-10,
+      "loss": 0.0159,
+      "num_tokens": 19314862.0,
+      "reward": 0.8062499761581421,
+      "reward_std": 0.2764522433280945,
+      "rewards/itbench_correctness/mean": 0.8062499761581421,
+      "rewards/itbench_correctness/std": 0.40078049898147583,
+      "step": 982,
+      "step_time": 87.88412514608353
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 926.0,
+      "completions/mean_length": 870.625,
+      "completions/mean_terminated_length": 751.3333129882812,
+      "completions/min_length": 569.0,
+      "completions/min_terminated_length": 569.0,
+      "entropy": 0.45254844427108765,
+      "epoch": 5.201058201058201,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.05224609375,
+      "kl": 0.001551373046822846,
+      "learning_rate": 8.855423113177662e-10,
+      "loss": 0.0001,
+      "num_tokens": 19335648.0,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 1.0,
+      "rewards/itbench_correctness/std": 0.0,
+      "step": 983,
+      "step_time": 624.4646268095821
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.6875,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 1001.0,
+      "completions/mean_length": 983.9375,
+      "completions/mean_terminated_length": 895.7999877929688,
+      "completions/min_length": 815.0,
+      "completions/min_terminated_length": 815.0,
+      "entropy": 0.5325541496276855,
+      "epoch": 5.2063492063492065,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.4921875,
+      "kl": 0.0016566679114475846,
+      "learning_rate": 7.899070725153611e-10,
+      "loss": -0.0178,
+      "num_tokens": 19373543.0,
+      "reward": 0.4375,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.4375,
+      "rewards/itbench_correctness/std": 0.5123475790023804,
+      "step": 984,
+      "step_time": 265.6073463913053
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 539.0,
+      "completions/max_terminated_length": 539.0,
+      "completions/mean_length": 461.125,
+      "completions/mean_terminated_length": 461.125,
+      "completions/min_length": 391.0,
+      "completions/min_terminated_length": 391.0,
+      "entropy": 0.4120357930660248,
+      "epoch": 5.211640211640212,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0234375,
+      "kl": 0.002073045587167144,
+      "learning_rate": 6.997311153086882e-10,
+      "loss": 0.0054,
+      "num_tokens": 19384705.0,
+      "reward": 0.28125,
+      "reward_std": 0.0294627845287323,
+      "rewards/itbench_correctness/mean": 0.28125,
+      "rewards/itbench_correctness/std": 0.145535409450531,
+      "step": 985,
+      "step_time": 46.38171513937414
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 976.0,
+      "completions/max_terminated_length": 976.0,
+      "completions/mean_length": 612.9375,
+      "completions/mean_terminated_length": 612.9375,
+      "completions/min_length": 8.0,
+      "completions/min_terminated_length": 8.0,
+      "entropy": 0.5286020040512085,
+      "epoch": 5.216931216931217,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.0022879249881953,
+      "learning_rate": 6.150154258476314e-10,
+      "loss": -0.0764,
+      "num_tokens": 19399360.0,
+      "reward": 0.7708333730697632,
+      "reward_std": 0.25392836332321167,
+      "rewards/itbench_correctness/mean": 0.7708333730697632,
+      "rewards/itbench_correctness/std": 0.26440009474754333,
+      "step": 986,
+      "step_time": 232.74916400574148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 887.0,
+      "completions/max_terminated_length": 887.0,
+      "completions/mean_length": 668.0,
+      "completions/mean_terminated_length": 668.0,
+      "completions/min_length": 429.0,
+      "completions/min_terminated_length": 429.0,
+      "entropy": 0.5179640650749207,
+      "epoch": 5.222222222222222,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.28125,
+      "kl": 0.0015953207621350884,
+      "learning_rate": 5.35760930569229e-10,
+      "loss": 0.0063,
+      "num_tokens": 19417072.0,
+      "reward": 0.0625,
+      "reward_std": 0.1767766922712326,
+      "rewards/itbench_correctness/mean": 0.0625,
+      "rewards/itbench_correctness/std": 0.25,
+      "step": 987,
+      "step_time": 90.40588045120239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 796.0,
+      "completions/max_terminated_length": 796.0,
+      "completions/mean_length": 559.8125,
+      "completions/mean_terminated_length": 559.8125,
+      "completions/min_length": 460.0,
+      "completions/min_terminated_length": 460.0,
+      "entropy": 0.5394663214683533,
+      "epoch": 5.227513227513228,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.140625,
+      "kl": 0.0017291223630309105,
+      "learning_rate": 4.619684961881254e-10,
+      "loss": -0.0123,
+      "num_tokens": 19447357.0,
+      "reward": 0.40625,
+      "reward_std": 0.1293872892856598,
+      "rewards/itbench_correctness/mean": 0.40625,
+      "rewards/itbench_correctness/std": 0.4552929699420929,
+      "step": 988,
+      "step_time": 91.7427905248478
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 926.0,
+      "completions/mean_length": 719.4375,
+      "completions/mean_terminated_length": 617.9166870117188,
+      "completions/min_length": 402.0,
+      "completions/min_terminated_length": 402.0,
+      "entropy": 0.3725132346153259,
+      "epoch": 5.232804232804233,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.671875,
+      "kl": 0.0019212639890611172,
+      "learning_rate": 3.9363892968641287e-10,
+      "loss": 0.0219,
+      "num_tokens": 19463452.0,
+      "reward": 0.21875,
+      "reward_std": 0.1978391408920288,
+      "rewards/itbench_correctness/mean": 0.21875,
+      "rewards/itbench_correctness/std": 0.22219711542129517,
+      "step": 989,
+      "step_time": 508.81171389855444
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 868.0,
+      "completions/mean_length": 719.0,
+      "completions/mean_terminated_length": 675.4285888671875,
+      "completions/min_length": 483.0,
+      "completions/min_terminated_length": 483.0,
+      "entropy": 0.2767733037471771,
+      "epoch": 5.238095238095238,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.984375,
+      "kl": 0.0014671377139165998,
+      "learning_rate": 3.3077297830541585e-10,
+      "loss": 0.0246,
+      "num_tokens": 19480260.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.37473249435424805,
+      "rewards/itbench_correctness/mean": 0.5416666865348816,
+      "rewards/itbench_correctness/std": 0.5,
+      "step": 990,
+      "step_time": 126.7575543159619
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 709.0,
+      "completions/max_terminated_length": 709.0,
+      "completions/mean_length": 516.25,
+      "completions/mean_terminated_length": 516.25,
+      "completions/min_length": 318.0,
+      "completions/min_terminated_length": 318.0,
+      "entropy": 0.46295398473739624,
+      "epoch": 5.243386243386244,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.0078125,
+      "kl": 0.0014694234123453498,
+      "learning_rate": 2.733713295369755e-10,
+      "loss": -0.0213,
+      "num_tokens": 19491424.0,
+      "reward": 0.875,
+      "reward_std": 0.18898223340511322,
+      "rewards/itbench_correctness/mean": 0.875,
+      "rewards/itbench_correctness/std": 0.28867512941360474,
+      "step": 991,
+      "step_time": 357.908637705259
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 913.0,
+      "completions/max_terminated_length": 913.0,
+      "completions/mean_length": 658.0625,
+      "completions/mean_terminated_length": 658.0625,
+      "completions/min_length": 454.0,
+      "completions/min_terminated_length": 454.0,
+      "entropy": 0.6351979970932007,
+      "epoch": 5.248677248677248,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.71875,
+      "kl": 0.0013824838679283857,
+      "learning_rate": 2.2143461111645556e-10,
+      "loss": 0.0117,
+      "num_tokens": 19507433.0,
+      "reward": 0.6458333730697632,
+      "reward_std": 0.2946278154850006,
+      "rewards/itbench_correctness/mean": 0.6458333730697632,
+      "rewards/itbench_correctness/std": 0.40311288833618164,
+      "step": 992,
+      "step_time": 522.8660918865353
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 895.0,
+      "completions/mean_length": 678.25,
+      "completions/mean_terminated_length": 655.2000122070312,
+      "completions/min_length": 462.0,
+      "completions/min_terminated_length": 462.0,
+      "entropy": 0.4423147737979889,
+      "epoch": 5.253968253968254,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.024169921875,
+      "kl": 0.0016678622923791409,
+      "learning_rate": 1.7496339101535918e-10,
+      "loss": 0.0,
+      "num_tokens": 19545629.0,
+      "reward": 0.5,
+      "reward_std": 0.0,
+      "rewards/itbench_correctness/mean": 0.5,
+      "rewards/itbench_correctness/std": 0.5163977742195129,
+      "step": 993,
+      "step_time": 334.44221889507025
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 955.0,
+      "completions/mean_length": 698.9375,
+      "completions/mean_terminated_length": 677.2667236328125,
+      "completions/min_length": 451.0,
+      "completions/min_terminated_length": 451.0,
+      "entropy": 0.39774659276008606,
+      "epoch": 5.2592592592592595,
+      "frac_reward_zero_std": 0.5,
+      "grad_norm": 1.2265625,
+      "kl": 0.0019350027432665229,
+      "learning_rate": 1.3395817743561132e-10,
+      "loss": 0.0034,
+      "num_tokens": 19562180.0,
+      "reward": 0.1041666716337204,
+      "reward_std": 0.03857583925127983,
+      "rewards/itbench_correctness/mean": 0.1041666716337204,
+      "rewards/itbench_correctness/std": 0.11979921907186508,
+      "step": 994,
+      "step_time": 805.2065543290228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0625,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 999.0,
+      "completions/mean_length": 563.8125,
+      "completions/mean_terminated_length": 533.1333618164062,
+      "completions/min_length": 1.0,
+      "completions/min_terminated_length": 1.0,
+      "entropy": 0.4930717349052429,
+      "epoch": 5.264550264550264,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.78125,
+      "kl": 0.0037524921353906393,
+      "learning_rate": 9.841941880361914e-11,
+      "loss": -0.0262,
+      "num_tokens": 19574545.0,
+      "reward": 0.5104166269302368,
+      "reward_std": 0.32622629404067993,
+      "rewards/itbench_correctness/mean": 0.5104166269302368,
+      "rewards/itbench_correctness/std": 0.3812578022480011,
+      "step": 995,
+      "step_time": 363.2403373187408
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 977.0,
+      "completions/mean_length": 771.625,
+      "completions/mean_terminated_length": 575.3333129882812,
+      "completions/min_length": 466.0,
+      "completions/min_terminated_length": 466.0,
+      "entropy": 0.5106106996536255,
+      "epoch": 5.26984126984127,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8671875,
+      "kl": 0.0013155718334019184,
+      "learning_rate": 6.834750376549791e-11,
+      "loss": -0.011,
+      "num_tokens": 19592107.0,
+      "reward": 0.125,
+      "reward_std": 0.3535533845424652,
+      "rewards/itbench_correctness/mean": 0.125,
+      "rewards/itbench_correctness/std": 0.3415650427341461,
+      "step": 996,
+      "step_time": 159.3905362924561
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.8125,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 991.0,
+      "completions/mean_length": 999.8125,
+      "completions/mean_terminated_length": 895.0,
+      "completions/min_length": 774.0,
+      "completions/min_terminated_length": 774.0,
+      "entropy": 0.6281177997589111,
+      "epoch": 5.275132275132275,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.203125,
+      "kl": 0.001502976636402309,
+      "learning_rate": 4.3742761183018783e-11,
+      "loss": 0.0133,
+      "num_tokens": 19617248.0,
+      "reward": 0.21250000596046448,
+      "reward_std": 0.1787744164466858,
+      "rewards/itbench_correctness/mean": 0.21250000596046448,
+      "rewards/itbench_correctness/std": 0.20124614238739014,
+      "step": 997,
+      "step_time": 138.13474278803915
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 967.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 651.625,
+      "completions/mean_terminated_length": 651.625,
+      "completions/min_length": 475.0,
+      "completions/min_terminated_length": 475.0,
+      "entropy": 0.40821024775505066,
+      "epoch": 5.28042328042328,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5,
+      "kl": 0.0013937059557065368,
+      "learning_rate": 2.4605460129556442e-11,
+      "loss": -0.0343,
+      "num_tokens": 19632762.0,
+      "reward": 0.8848214149475098,
+      "reward_std": 0.04293148219585419,
+      "rewards/itbench_correctness/mean": 0.8848214149475098,
+      "rewards/itbench_correctness/std": 0.10898028314113617,
+      "step": 998,
+      "step_time": 133.00229213759303
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 436.0,
+      "completions/mean_length": 707.75,
+      "completions/mean_terminated_length": 391.5,
+      "completions/min_length": 339.0,
+      "completions/min_terminated_length": 339.0,
+      "entropy": 0.6329918503761292,
+      "epoch": 5.285714285714286,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.828125,
+      "kl": 0.0015219503547996283,
+      "learning_rate": 1.0935809887702152e-11,
+      "loss": 0.0067,
+      "num_tokens": 19651310.0,
+      "reward": 0.5859375,
+      "reward_std": 0.2041938304901123,
+      "rewards/itbench_correctness/mean": 0.5859375,
+      "rewards/itbench_correctness/std": 0.4557931423187256,
+      "step": 999,
+      "step_time": 188.53237317036837
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 1024.0,
+      "completions/max_terminated_length": 995.0,
+      "completions/mean_length": 665.75,
+      "completions/mean_terminated_length": 546.3333740234375,
+      "completions/min_length": 320.0,
+      "completions/min_terminated_length": 320.0,
+      "entropy": 0.42959070205688477,
+      "epoch": 5.291005291005291,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 2.046875,
+      "kl": 0.001996886683627963,
+      "learning_rate": 2.7339599464326622e-12,
+      "loss": 0.021,
+      "num_tokens": 19667050.0,
+      "reward": 0.1160714328289032,
+      "reward_std": 0.23404696583747864,
+      "rewards/itbench_correctness/mean": 0.1160714328289032,
+      "rewards/itbench_correctness/std": 0.25404882431030273,
+      "step": 1000,
+      "step_time": 844.8066724454984
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1000,
+  "num_input_tokens_seen": 19667050,
+  "num_train_epochs": 6,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}