{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.291005291005291, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 723.75, "completions/mean_terminated_length": 490.22222900390625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5195164084434509, "epoch": 0.005291005291005291, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.0010675216326490045, "learning_rate": 0.0, "loss": -0.1441, "num_tokens": 18452.0, "reward": 0.828125, "reward_std": 0.3463020324707031, "rewards/itbench_correctness/mean": 0.828125, "rewards/itbench_correctness/std": 0.33811673521995544, "step": 1, "step_time": 91.14044637419283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 604.75, "completions/mean_terminated_length": 544.857177734375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3654402792453766, "epoch": 0.010582010582010581, "frac_reward_zero_std": 0.5, "grad_norm": 0.8046875, "kl": 0.0009289107983931899, "learning_rate": 2e-08, "loss": -0.0658, "num_tokens": 33008.0, "reward": 0.3645833432674408, "reward_std": 0.1873345822095871, "rewards/itbench_correctness/mean": 0.3645833432674408, "rewards/itbench_correctness/std": 0.4552929401397705, "step": 2, "step_time": 828.1970858396962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 957.5625, "completions/mean_terminated_length": 905.888916015625, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "entropy": 0.5472227931022644, "epoch": 0.015873015873015872, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.001058843918144703, "learning_rate": 4e-08, "loss": 0.0343, "num_tokens": 55673.0, "reward": 0.34687501192092896, "reward_std": 0.3456803262233734, "rewards/itbench_correctness/mean": 0.34687501192092896, "rewards/itbench_correctness/std": 0.4120957851409912, "step": 3, "step_time": 151.529059112072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 532.5625, "completions/mean_terminated_length": 532.5625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5482924580574036, "epoch": 0.021164021164021163, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.0013292429503053427, "learning_rate": 6e-08, "loss": -0.1305, "num_tokens": 68794.0, "reward": 0.7916666865348816, "reward_std": 0.32439103722572327, "rewards/itbench_correctness/mean": 0.7916666865348816, "rewards/itbench_correctness/std": 0.34960296750068665, "step": 4, "step_time": 417.0535086672753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 711.4375, "completions/mean_terminated_length": 468.3333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.47509443759918213, "epoch": 0.026455026455026454, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0010377272265031934, "learning_rate": 8e-08, "loss": -0.0456, "num_tokens": 83449.0, "reward": 0.3515625, "reward_std": 0.2974616289138794, "rewards/itbench_correctness/mean": 0.3515625, "rewards/itbench_correctness/std": 0.32021722197532654, "step": 5, "step_time": 128.02622807957232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 951.0, "completions/mean_terminated_length": 440.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5846477150917053, "epoch": 0.031746031746031744, "frac_reward_zero_std": 0.5, "grad_norm": 1.5625, "kl": 0.000945708598010242, "learning_rate": 1e-07, "loss": 0.0034, "num_tokens": 122025.0, "reward": 0.25, "reward_std": 0.2182178944349289, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.394405335187912, "step": 6, "step_time": 145.1888073068112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 544.4375, "completions/mean_terminated_length": 544.4375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.27000343799591064, "epoch": 0.037037037037037035, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.001091663958504796, "learning_rate": 1.2e-07, "loss": -0.0527, "num_tokens": 135296.0, "reward": 0.3736979365348816, "reward_std": 0.31324487924575806, "rewards/itbench_correctness/mean": 0.3736979365348816, "rewards/itbench_correctness/std": 0.3162706792354584, "step": 7, "step_time": 83.35824911855161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 683.25, "completions/mean_terminated_length": 660.5333862304688, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "entropy": 0.47713136672973633, "epoch": 0.042328042328042326, "frac_reward_zero_std": 0.5, "grad_norm": 1.1953125, "kl": 0.0009320093668065965, "learning_rate": 1.4e-07, "loss": 0.0447, "num_tokens": 150748.0, "reward": 0.9322916269302368, "reward_std": 0.062747523188591, "rewards/itbench_correctness/mean": 0.9322916269302368, "rewards/itbench_correctness/std": 0.11063265055418015, "step": 8, "step_time": 179.78012859076262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 518.6875, "completions/mean_terminated_length": 446.5000305175781, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.42993131279945374, "epoch": 0.047619047619047616, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.001125591923482716, "learning_rate": 1.6e-07, "loss": -0.0315, "num_tokens": 165327.0, "reward": 0.578125, "reward_std": 0.24882009625434875, "rewards/itbench_correctness/mean": 0.578125, "rewards/itbench_correctness/std": 0.2660909593105316, "step": 9, "step_time": 145.98578487429768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 700.0625, "completions/mean_terminated_length": 552.8181762695312, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.4028211832046509, "epoch": 0.05291005291005291, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0010170488385483623, "learning_rate": 1.8e-07, "loss": -0.0257, "num_tokens": 181176.0, "reward": 0.5083333253860474, "reward_std": 0.3309464454650879, "rewards/itbench_correctness/mean": 0.5083333253860474, "rewards/itbench_correctness/std": 0.3380225598812103, "step": 10, "step_time": 135.02681362256408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 466.4375, "completions/mean_terminated_length": 466.4375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.37089642882347107, "epoch": 0.0582010582010582, "frac_reward_zero_std": 0.5, "grad_norm": 1.15625, "kl": 0.0011742996284738183, "learning_rate": 2e-07, "loss": -0.0049, "num_tokens": 192455.0, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.4989572763442993, "step": 11, "step_time": 994.5879717040807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 501.25, "completions/mean_terminated_length": 327.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.49077308177948, "epoch": 0.06349206349206349, "frac_reward_zero_std": 0.5, "grad_norm": 0.890625, "kl": 0.0011536800302565098, "learning_rate": 2.1999999999999998e-07, "loss": -0.0581, "num_tokens": 210611.0, "reward": 0.09375, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.09375, "rewards/itbench_correctness/std": 0.20155644416809082, "step": 12, "step_time": 106.51383402384818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 420.9375, "completions/mean_terminated_length": 420.9375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5107646584510803, "epoch": 0.06878306878306878, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.0012960818130522966, "learning_rate": 2.4e-07, "loss": -0.1036, "num_tokens": 220666.0, "reward": 0.5572916865348816, "reward_std": 0.2719196677207947, "rewards/itbench_correctness/mean": 0.5572916865348816, "rewards/itbench_correctness/std": 0.2750736474990845, "step": 13, "step_time": 78.42556338571012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 553.4375, "completions/mean_terminated_length": 486.21429443359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6179559826850891, "epoch": 0.07407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.0013870123075321317, "learning_rate": 2.6e-07, "loss": -0.1253, "num_tokens": 237537.0, "reward": 0.5, "reward_std": 0.3650856614112854, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 14, "step_time": 266.15765621792525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 504.9375, "completions/mean_terminated_length": 504.9375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.5228369832038879, "epoch": 0.07936507936507936, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.0013084918027743697, "learning_rate": 2.8e-07, "loss": 0.0175, "num_tokens": 253968.0, "reward": 0.9035714864730835, "reward_std": 0.06060914695262909, "rewards/itbench_correctness/mean": 0.9035714864730835, "rewards/itbench_correctness/std": 0.10054273903369904, "step": 15, "step_time": 132.4059884781018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 420.5, "completions/mean_terminated_length": 420.5, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "entropy": 0.4851367473602295, "epoch": 0.08465608465608465, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.001197831821627915, "learning_rate": 3e-07, "loss": 0.0638, "num_tokens": 263192.0, "reward": 0.4375, "reward_std": 0.38298875093460083, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 16, "step_time": 94.08578859362751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 573.8125, "completions/mean_terminated_length": 573.8125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.3694586753845215, "epoch": 0.08994708994708994, "frac_reward_zero_std": 0.5, "grad_norm": 1.28125, "kl": 0.0009872535010799766, "learning_rate": 3.2e-07, "loss": -0.0021, "num_tokens": 276349.0, "reward": 0.7132352590560913, "reward_std": 0.24745365977287292, "rewards/itbench_correctness/mean": 0.7132352590560913, "rewards/itbench_correctness/std": 0.44946467876434326, "step": 17, "step_time": 803.3225803021342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 918.625, "completions/mean_terminated_length": 783.1428833007812, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "entropy": 0.3722955584526062, "epoch": 0.09523809523809523, "frac_reward_zero_std": 0.5, "grad_norm": 1.7421875, "kl": 0.0011494335485622287, "learning_rate": 3.4000000000000003e-07, "loss": 0.0019, "num_tokens": 305519.0, "reward": 0.7291666865348816, "reward_std": 0.23464766144752502, "rewards/itbench_correctness/mean": 0.7291666865348816, "rewards/itbench_correctness/std": 0.4254627227783203, "step": 18, "step_time": 293.30187319312245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 638.5, "completions/mean_terminated_length": 638.5, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 0.4792482256889343, "epoch": 0.10052910052910052, "frac_reward_zero_std": 0.5, "grad_norm": 1.390625, "kl": 0.001083478331565857, "learning_rate": 3.6e-07, "loss": -0.0201, "num_tokens": 319703.0, "reward": 0.71875, "reward_std": 0.09797047078609467, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.16520188748836517, "step": 19, "step_time": 138.92694834899157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 680.625, "completions/mean_terminated_length": 524.5454711914062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.39669421315193176, "epoch": 0.10582010582010581, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0011998838745057583, "learning_rate": 3.7999999999999996e-07, "loss": -0.1584, "num_tokens": 342409.0, "reward": 0.1770833432674408, "reward_std": 0.3077988028526306, "rewards/itbench_correctness/mean": 0.1770833432674408, "rewards/itbench_correctness/std": 0.3413955569267273, "step": 20, "step_time": 374.13402384892106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 655.375, "completions/mean_terminated_length": 655.375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4669082462787628, "epoch": 0.1111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.0013768852222710848, "learning_rate": 4e-07, "loss": -0.115, "num_tokens": 365695.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 21, "step_time": 114.49692635703832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 880.375, "completions/mean_terminated_length": 847.2307739257812, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "entropy": 0.511145830154419, "epoch": 0.1164021164021164, "frac_reward_zero_std": 0.5, "grad_norm": 1.4609375, "kl": 0.0011111602652817965, "learning_rate": 4.1999999999999995e-07, "loss": 0.0192, "num_tokens": 389429.0, "reward": 0.59375, "reward_std": 0.03788072243332863, "rewards/itbench_correctness/mean": 0.59375, "rewards/itbench_correctness/std": 0.4227531850337982, "step": 22, "step_time": 103.79572249855846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 653.375, "completions/mean_terminated_length": 653.375, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "entropy": 0.5203749537467957, "epoch": 0.12169312169312169, "frac_reward_zero_std": 1.0, "grad_norm": 0.021240234375, "kl": 0.0011424734257161617, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "num_tokens": 405051.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 23, "step_time": 158.34662247169763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 565.0, "completions/mean_terminated_length": 565.0, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.5274336338043213, "epoch": 0.12698412698412698, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0010382338659837842, "learning_rate": 4.6e-07, "loss": -0.0043, "num_tokens": 417107.0, "reward": 0.578125, "reward_std": 0.1099528968334198, "rewards/itbench_correctness/mean": 0.578125, "rewards/itbench_correctness/std": 0.19116783142089844, "step": 24, "step_time": 93.74338541273028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 806.9375, "completions/mean_terminated_length": 734.5833740234375, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 0.4733947813510895, "epoch": 0.13227513227513227, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0012686237460002303, "learning_rate": 4.8e-07, "loss": -0.027, "num_tokens": 443242.0, "reward": 0.5, "reward_std": 0.39511844515800476, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.4654746949672699, "step": 25, "step_time": 117.60556835308671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 717.125, "completions/mean_terminated_length": 673.2857666015625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.46017083525657654, "epoch": 0.13756613756613756, "frac_reward_zero_std": 0.5, "grad_norm": 1.234375, "kl": 0.0011198390275239944, "learning_rate": 5e-07, "loss": -0.0578, "num_tokens": 460940.0, "reward": 0.15625, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.15625, "rewards/itbench_correctness/std": 0.23935678601264954, "step": 26, "step_time": 435.78113711997867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 709.125, "completions/mean_terminated_length": 664.1428833007812, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.38639166951179504, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.001125096925534308, "learning_rate": 5.2e-07, "loss": 0.0106, "num_tokens": 477014.0, "reward": 0.3166666626930237, "reward_std": 0.175833061337471, "rewards/itbench_correctness/mean": 0.3166666626930237, "rewards/itbench_correctness/std": 0.2388242930173874, "step": 27, "step_time": 136.88797108456492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 603.9375, "completions/mean_terminated_length": 543.9285888671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.42057332396507263, "epoch": 0.14814814814814814, "frac_reward_zero_std": 0.5, "grad_norm": 1.0390625, "kl": 0.0010505876271054149, "learning_rate": 5.4e-07, "loss": -0.012, "num_tokens": 491261.0, "reward": 0.46875, "reward_std": 0.04312910512089729, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.4876958429813385, "step": 28, "step_time": 450.98891491629183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 822.4375, "completions/mean_terminated_length": 755.25, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "entropy": 0.4498822093009949, "epoch": 0.15343915343915343, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0008466003928333521, "learning_rate": 5.6e-07, "loss": -0.0265, "num_tokens": 510580.0, "reward": 0.831250011920929, "reward_std": 0.2088201940059662, "rewards/itbench_correctness/mean": 0.831250011920929, "rewards/itbench_correctness/std": 0.24958299100399017, "step": 29, "step_time": 82.54267377220094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 790.4375, "completions/mean_terminated_length": 650.2999877929688, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "entropy": 0.33146199584007263, "epoch": 0.15873015873015872, "frac_reward_zero_std": 0.5, "grad_norm": 1.3125, "kl": 0.000913652591407299, "learning_rate": 5.8e-07, "loss": -0.0192, "num_tokens": 529675.0, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.4989572763442993, "step": 30, "step_time": 153.0279028210789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 852.0625, "completions/mean_terminated_length": 773.9091186523438, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "entropy": 0.624367356300354, "epoch": 0.164021164021164, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0011790527496486902, "learning_rate": 6e-07, "loss": 0.0451, "num_tokens": 554588.0, "reward": 0.17812499403953552, "reward_std": 0.21488739550113678, "rewards/itbench_correctness/mean": 0.17812499403953552, "rewards/itbench_correctness/std": 0.21210749447345734, "step": 31, "step_time": 496.71210376080126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 464.375, "completions/mean_terminated_length": 464.375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4242261052131653, "epoch": 0.1693121693121693, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.001077913912013173, "learning_rate": 6.2e-07, "loss": -0.0869, "num_tokens": 565106.0, "reward": 0.53125, "reward_std": 0.12696418166160583, "rewards/itbench_correctness/mean": 0.53125, "rewards/itbench_correctness/std": 0.43977582454681396, "step": 32, "step_time": 62.5571150816977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 790.375, "completions/mean_terminated_length": 490.0000305175781, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5136802196502686, "epoch": 0.1746031746031746, "frac_reward_zero_std": 0.5, "grad_norm": 1.34375, "kl": 0.0010994027834385633, "learning_rate": 6.4e-07, "loss": -0.0752, "num_tokens": 590992.0, "reward": 0.2723214328289032, "reward_std": 0.22582654654979706, "rewards/itbench_correctness/mean": 0.2723214328289032, "rewards/itbench_correctness/std": 0.417490690946579, "step": 33, "step_time": 873.944114420563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 382.3125, "completions/mean_terminated_length": 382.3125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.41327446699142456, "epoch": 0.17989417989417988, "frac_reward_zero_std": 0.5, "grad_norm": 0.38671875, "kl": 0.0016043909126892686, "learning_rate": 6.6e-07, "loss": -0.05, "num_tokens": 599933.0, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 34, "step_time": 811.917650568299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 961.25, "completions/mean_terminated_length": 856.6666870117188, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "entropy": 0.6283485293388367, "epoch": 0.18518518518518517, "frac_reward_zero_std": 0.5, "grad_norm": 1.515625, "kl": 0.0012013108935207129, "learning_rate": 6.800000000000001e-07, "loss": 0.0029, "num_tokens": 628801.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 35, "step_time": 104.12166160158813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 765.4375, "completions/mean_terminated_length": 506.875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "entropy": 0.29917532205581665, "epoch": 0.19047619047619047, "frac_reward_zero_std": 0.5, "grad_norm": 1.390625, "kl": 0.0010058790212497115, "learning_rate": 7e-07, "loss": 0.0029, "num_tokens": 648056.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 36, "step_time": 884.992473276332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 540.0625, "completions/mean_terminated_length": 540.0625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "entropy": 0.4203217327594757, "epoch": 0.19576719576719576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0106201171875, "kl": 0.001015704357996583, "learning_rate": 7.2e-07, "loss": 0.0, "num_tokens": 660377.0, "reward": 0.5833333134651184, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5833333134651184, "rewards/itbench_correctness/std": 0.4303314983844757, "step": 37, "step_time": 85.28049738146365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 456.1875, "completions/mean_terminated_length": 456.1875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5830935835838318, "epoch": 0.20105820105820105, "frac_reward_zero_std": 1.0, "grad_norm": 0.017822265625, "kl": 0.0013373795663937926, "learning_rate": 7.4e-07, "loss": 0.0, "num_tokens": 688692.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 38, "step_time": 211.86649047024548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 711.375, "completions/mean_terminated_length": 639.2307739257812, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "entropy": 0.2656826674938202, "epoch": 0.20634920634920634, "frac_reward_zero_std": 0.5, "grad_norm": 1.265625, "kl": 0.0009032340021803975, "learning_rate": 7.599999999999999e-07, "loss": 0.0308, "num_tokens": 707554.0, "reward": 0.484375, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.484375, "rewards/itbench_correctness/std": 0.503891110420227, "step": 39, "step_time": 823.7539153788239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 455.6875, "completions/mean_terminated_length": 455.6875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.4432862401008606, "epoch": 0.21164021164021163, "frac_reward_zero_std": 0.5, "grad_norm": 0.9609375, "kl": 0.0011500748805701733, "learning_rate": 7.799999999999999e-07, "loss": -0.0166, "num_tokens": 717741.0, "reward": 0.2395833432674408, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.2395833432674408, "rewards/itbench_correctness/std": 0.19214914739131927, "step": 40, "step_time": 798.0437586428598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 644.25, "completions/mean_terminated_length": 416.3999938964844, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6643383502960205, "epoch": 0.21693121693121692, "frac_reward_zero_std": 0.5, "grad_norm": 1.3984375, "kl": 0.0012484320905059576, "learning_rate": 8e-07, "loss": 0.0101, "num_tokens": 743129.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.015625, "rewards/itbench_correctness/std": 0.0625, "step": 41, "step_time": 98.21953046228737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 725.875, "completions/mean_terminated_length": 547.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5951437950134277, "epoch": 0.2222222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.0012599489418789744, "learning_rate": 8.199999999999999e-07, "loss": -0.1275, "num_tokens": 779247.0, "reward": 0.4375, "reward_std": 0.4082317352294922, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 42, "step_time": 374.9474004274234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 411.625, "completions/mean_terminated_length": 411.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.449438214302063, "epoch": 0.2275132275132275, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0011913544731214643, "learning_rate": 8.399999999999999e-07, "loss": -0.063, "num_tokens": 794473.0, "reward": 0.2911931872367859, "reward_std": 0.16020165383815765, "rewards/itbench_correctness/mean": 0.2911931872367859, "rewards/itbench_correctness/std": 0.1646159142255783, "step": 43, "step_time": 82.59138822741807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 500.0625, "completions/mean_terminated_length": 500.0625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5279340147972107, "epoch": 0.2328042328042328, "frac_reward_zero_std": 0.5, "grad_norm": 0.91796875, "kl": 0.0011128420010209084, "learning_rate": 8.599999999999999e-07, "loss": -0.0494, "num_tokens": 805986.0, "reward": 0.65625, "reward_std": 0.09643959254026413, "rewards/itbench_correctness/mean": 0.65625, "rewards/itbench_correctness/std": 0.3786855936050415, "step": 44, "step_time": 486.3739328915253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 970.5625, "completions/mean_terminated_length": 596.5, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "entropy": 0.5398930907249451, "epoch": 0.23809523809523808, "frac_reward_zero_std": 0.5, "grad_norm": 1.5625, "kl": 0.0010154710616916418, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "num_tokens": 836355.0, "reward": 0.109375, "reward_std": 0.14074896275997162, "rewards/itbench_correctness/mean": 0.109375, "rewards/itbench_correctness/std": 0.22302372753620148, "step": 45, "step_time": 134.77385379187763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 522.5, "completions/mean_terminated_length": 522.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.45741626620292664, "epoch": 0.24338624338624337, "frac_reward_zero_std": 1.0, "grad_norm": 0.02490234375, "kl": 0.0011397113557904959, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 849099.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 46, "step_time": 88.64014313649386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 596.6875, "completions/mean_terminated_length": 596.6875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 0.3787577152252197, "epoch": 0.24867724867724866, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0011281240731477737, "learning_rate": 9.2e-07, "loss": -0.0062, "num_tokens": 862838.0, "reward": 0.7421875, "reward_std": 0.3093565106391907, "rewards/itbench_correctness/mean": 0.7421875, "rewards/itbench_correctness/std": 0.3337562382221222, "step": 47, "step_time": 70.17125954851508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 423.0625, "completions/mean_terminated_length": 423.0625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.45146992802619934, "epoch": 0.25396825396825395, "frac_reward_zero_std": 0.5, "grad_norm": 1.2109375, "kl": 0.0013216013321653008, "learning_rate": 9.399999999999999e-07, "loss": 0.0073, "num_tokens": 872559.0, "reward": 0.11328125, "reward_std": 0.08985587954521179, "rewards/itbench_correctness/mean": 0.11328125, "rewards/itbench_correctness/std": 0.16958704590797424, "step": 48, "step_time": 87.19072807300836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 815.375, "completions/mean_terminated_length": 653.1111450195312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.39981603622436523, "epoch": 0.25925925925925924, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.0008777660550549626, "learning_rate": 9.6e-07, "loss": 0.0344, "num_tokens": 891605.0, "reward": 0.32207342982292175, "reward_std": 0.2425267994403839, "rewards/itbench_correctness/mean": 0.32207342982292175, "rewards/itbench_correctness/std": 0.32837510108947754, "step": 49, "step_time": 145.8219982078299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 576.25, "completions/mean_terminated_length": 546.4000244140625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.5032538175582886, "epoch": 0.26455026455026454, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0010894184233620763, "learning_rate": 9.8e-07, "loss": 0.002, "num_tokens": 903569.0, "reward": 0.3849431872367859, "reward_std": 0.13158553838729858, "rewards/itbench_correctness/mean": 0.3849431872367859, "rewards/itbench_correctness/std": 0.20182853937149048, "step": 50, "step_time": 374.8412516852841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 721.0625, "completions/mean_terminated_length": 539.2999877929688, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "entropy": 0.3356158435344696, "epoch": 0.2698412698412698, "frac_reward_zero_std": 0.5, "grad_norm": 1.0703125, "kl": 0.0009243504609912634, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 926762.0, "reward": 0.4732142686843872, "reward_std": 0.07576144486665726, "rewards/itbench_correctness/mean": 0.4732142686843872, "rewards/itbench_correctness/std": 0.4995746612548828, "step": 51, "step_time": 99.96388372033834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 644.8125, "completions/mean_terminated_length": 590.6428833007812, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "entropy": 0.43733644485473633, "epoch": 0.2751322751322751, "frac_reward_zero_std": 0.5, "grad_norm": 1.09375, "kl": 0.0011704964563250542, "learning_rate": 9.999972660400534e-07, "loss": -0.0123, "num_tokens": 941111.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 52, "step_time": 114.90833497233689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 588.4375, "completions/mean_terminated_length": 588.4375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.42825278639793396, "epoch": 0.2804232804232804, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.0011681180913001299, "learning_rate": 9.999890641901124e-07, "loss": -0.0885, "num_tokens": 953494.0, "reward": 0.8244047164916992, "reward_std": 0.27204394340515137, "rewards/itbench_correctness/mean": 0.8244047164916992, "rewards/itbench_correctness/std": 0.2754608690738678, "step": 53, "step_time": 127.88445741310716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 788.75, "completions/mean_terminated_length": 647.6000366210938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3549920618534088, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 1.3359375, "kl": 0.0009949615923687816, "learning_rate": 9.999753945398703e-07, "loss": -0.1075, "num_tokens": 981738.0, "reward": 0.5416666865348816, "reward_std": 0.044543541967868805, "rewards/itbench_correctness/mean": 0.5416666865348816, "rewards/itbench_correctness/std": 0.4772607088088989, "step": 54, "step_time": 270.70763381849974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 641.1875, "completions/mean_terminated_length": 615.6666870117188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4897163510322571, "epoch": 0.291005291005291, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.0010646218433976173, "learning_rate": 9.99956257238817e-07, "loss": -0.0731, "num_tokens": 996773.0, "reward": 0.6666666865348816, "reward_std": 0.35634830594062805, "rewards/itbench_correctness/mean": 0.6666666865348816, "rewards/itbench_correctness/std": 0.42163705825805664, "step": 55, "step_time": 172.69540655519813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 365.875, "completions/mean_terminated_length": 365.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3881106972694397, "epoch": 0.2962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.001151208532974124, "learning_rate": 9.999316524962345e-07, "loss": -0.0416, "num_tokens": 1010651.0, "reward": 0.359375, "reward_std": 0.1751839816570282, "rewards/itbench_correctness/mean": 0.359375, "rewards/itbench_correctness/std": 0.1875, "step": 56, "step_time": 88.2503134328872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 699.6875, "completions/mean_terminated_length": 552.2727661132812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.523090660572052, "epoch": 0.30158730158730157, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.001052780426107347, "learning_rate": 9.999015805811963e-07, "loss": -0.1995, "num_tokens": 1031726.0, "reward": 0.2864583432674408, "reward_std": 0.1927037239074707, "rewards/itbench_correctness/mean": 0.2864583432674408, "rewards/itbench_correctness/std": 0.2652195990085602, "step": 57, "step_time": 354.65077784564346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 657.0625, "completions/mean_terminated_length": 490.2727355957031, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "entropy": 0.47179681062698364, "epoch": 0.30687830687830686, "frac_reward_zero_std": 0.5, "grad_norm": 1.03125, "kl": 0.0011720317415893078, "learning_rate": 9.998660418225644e-07, "loss": 0.0026, "num_tokens": 1048359.0, "reward": 0.6420454978942871, "reward_std": 0.07464002817869186, "rewards/itbench_correctness/mean": 0.6420454978942871, "rewards/itbench_correctness/std": 0.38350099325180054, "step": 58, "step_time": 612.9088207762688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 757.1875, "completions/mean_terminated_length": 597.1000366210938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5388361811637878, "epoch": 0.31216931216931215, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.001013660104945302, "learning_rate": 9.998250366089846e-07, "loss": -0.0625, "num_tokens": 1067810.0, "reward": 0.375, "reward_std": 0.33407655358314514, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.3979112207889557, "step": 59, "step_time": 368.7298939973116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 921.625, "completions/mean_terminated_length": 696.4000244140625, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "entropy": 0.49694833159446716, "epoch": 0.31746031746031744, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.0011713014682754874, "learning_rate": 9.997785653888834e-07, "loss": 0.0514, "num_tokens": 1093876.0, "reward": 0.140625, "reward_std": 0.26977968215942383, "rewards/itbench_correctness/mean": 0.140625, "rewards/itbench_correctness/std": 0.2733854353427887, "step": 60, "step_time": 751.7327463729307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 780.5625, "completions/mean_terminated_length": 634.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4586436152458191, "epoch": 0.32275132275132273, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.0011096000671386719, "learning_rate": 9.99726628670463e-07, "loss": -0.0469, "num_tokens": 1113397.0, "reward": 0.5833333134651184, "reward_std": 0.3903999924659729, "rewards/itbench_correctness/mean": 0.5833333134651184, "rewards/itbench_correctness/std": 0.3884918689727783, "step": 61, "step_time": 380.8789173979312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 782.6875, "completions/mean_terminated_length": 702.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5136149525642395, "epoch": 0.328042328042328, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0011364103993400931, "learning_rate": 9.996692270216946e-07, "loss": -0.0176, "num_tokens": 1133432.0, "reward": 0.581250011920929, "reward_std": 0.37959763407707214, "rewards/itbench_correctness/mean": 0.581250011920929, "rewards/itbench_correctness/std": 0.4445503354072571, "step": 62, "step_time": 86.81900852825493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 403.1875, "completions/mean_terminated_length": 403.1875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3745155930519104, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078125, "kl": 0.0013223676942288876, "learning_rate": 9.996063610703135e-07, "loss": 0.0032, "num_tokens": 1148299.0, "reward": 0.4635416865348816, "reward_std": 0.06842001527547836, "rewards/itbench_correctness/mean": 0.4635416865348816, "rewards/itbench_correctness/std": 0.48778483271598816, "step": 63, "step_time": 649.0491365483031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 997.625, "completions/mean_terminated_length": 883.3333740234375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "entropy": 0.3588522672653198, "epoch": 0.3386243386243386, "frac_reward_zero_std": 0.5, "grad_norm": 1.4765625, "kl": 0.0009991804836317897, "learning_rate": 9.995380315038117e-07, "loss": 0.0, "num_tokens": 1174997.0, "reward": 0.2083333432674408, "reward_std": 0.11785111576318741, "rewards/itbench_correctness/mean": 0.2083333432674408, "rewards/itbench_correctness/std": 0.2687419056892395, "step": 64, "step_time": 110.76476481370628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 454.0625, "completions/mean_terminated_length": 454.0625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.5461803078651428, "epoch": 0.3439153439153439, "frac_reward_zero_std": 0.5, "grad_norm": 1.2421875, "kl": 0.0015210240380838513, "learning_rate": 9.994642390694308e-07, "loss": -0.0216, "num_tokens": 1187070.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 65, "step_time": 79.26396809145808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 719.8125, "completions/mean_terminated_length": 581.5454711914062, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "entropy": 0.5056872367858887, "epoch": 0.3492063492063492, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0013450310798361897, "learning_rate": 9.993849845741523e-07, "loss": -0.0131, "num_tokens": 1207347.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.5, "step": 66, "step_time": 337.5469845244661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 396.625, "completions/mean_terminated_length": 396.625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 0.35297825932502747, "epoch": 0.3544973544973545, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.0014365667011588812, "learning_rate": 9.993002688846912e-07, "loss": 0.0055, "num_tokens": 1216421.0, "reward": 0.28125, "reward_std": 0.3061639666557312, "rewards/itbench_correctness/mean": 0.28125, "rewards/itbench_correctness/std": 0.3400367796421051, "step": 67, "step_time": 1142.9677757564932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 682.5, "completions/mean_terminated_length": 603.6923217773438, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4981684982776642, "epoch": 0.35978835978835977, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0014837104827165604, "learning_rate": 9.992100929274846e-07, "loss": -0.0849, "num_tokens": 1231053.0, "reward": 0.5208333134651184, "reward_std": 0.4382143020629883, "rewards/itbench_correctness/mean": 0.5208333134651184, "rewards/itbench_correctness/std": 0.4326561689376831, "step": 68, "step_time": 479.8328125309199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 817.875, "completions/mean_terminated_length": 694.2000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5012990832328796, "epoch": 0.36507936507936506, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.001043464639224112, "learning_rate": 9.991144576886822e-07, "loss": 0.0323, "num_tokens": 1249699.0, "reward": 0.4642857015132904, "reward_std": 0.22637419402599335, "rewards/itbench_correctness/mean": 0.4642857015132904, "rewards/itbench_correctness/std": 0.4928053915500641, "step": 69, "step_time": 82.86210318095982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 693.5625, "completions/mean_terminated_length": 646.357177734375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.2220419943332672, "epoch": 0.37037037037037035, "frac_reward_zero_std": 0.5, "grad_norm": 0.388671875, "kl": 0.0010405785869807005, "learning_rate": 9.990133642141357e-07, "loss": -0.0727, "num_tokens": 1271964.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 70, "step_time": 202.81221913732588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 769.125, "completions/mean_terminated_length": 710.3077392578125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4264586269855499, "epoch": 0.37566137566137564, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.0012975148856639862, "learning_rate": 9.989068136093872e-07, "loss": -0.0651, "num_tokens": 1293950.0, "reward": 0.6597222089767456, "reward_std": 0.4048736095428467, "rewards/itbench_correctness/mean": 0.6597222089767456, "rewards/itbench_correctness/std": 0.4526442587375641, "step": 71, "step_time": 454.2652143603191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 711.125, "completions/mean_terminated_length": 690.2667236328125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.28124451637268066, "epoch": 0.38095238095238093, "frac_reward_zero_std": 0.5, "grad_norm": 0.51953125, "kl": 0.0010576838394626975, "learning_rate": 9.98794807039657e-07, "loss": -0.085, "num_tokens": 1311536.0, "reward": 0.8125, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.25, "step": 72, "step_time": 104.54512037336826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 516.25, "completions/mean_terminated_length": 516.25, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.46295398473739624, "epoch": 0.3862433862433862, "frac_reward_zero_std": 0.5, "grad_norm": 1.1640625, "kl": 0.0009957690490409732, "learning_rate": 9.98677345729831e-07, "loss": -0.0093, "num_tokens": 1322844.0, "reward": 0.6812499761581421, "reward_std": 0.062321171164512634, "rewards/itbench_correctness/mean": 0.6812499761581421, "rewards/itbench_correctness/std": 0.3400367796421051, "step": 73, "step_time": 635.4008999932557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 674.0, "completions/mean_terminated_length": 674.0, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "entropy": 0.48961424827575684, "epoch": 0.3915343915343915, "frac_reward_zero_std": 1.0, "grad_norm": 0.0283203125, "kl": 0.0012715591583400965, "learning_rate": 9.985544309644473e-07, "loss": 0.0, "num_tokens": 1342212.0, "reward": 1.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 1.0, "rewards/itbench_correctness/std": 0.0, "step": 74, "step_time": 107.24268661439419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 889.125, "completions/mean_terminated_length": 844.1666870117188, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "entropy": 0.5128637552261353, "epoch": 0.3968253968253968, "frac_reward_zero_std": 0.5, "grad_norm": 1.640625, "kl": 0.0014187946217134595, "learning_rate": 9.98426064087682e-07, "loss": 0.0213, "num_tokens": 1371870.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 75, "step_time": 283.4586225701496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 880.9375, "completions/mean_terminated_length": 566.2000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6129833459854126, "epoch": 0.4021164021164021, "frac_reward_zero_std": 0.5, "grad_norm": 1.2421875, "kl": 0.0010527537669986486, "learning_rate": 9.982922465033348e-07, "loss": 0.0007, "num_tokens": 1399837.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 76, "step_time": 86.58735218271613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 715.6875, "completions/mean_terminated_length": 715.6875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "entropy": 0.3325473666191101, "epoch": 0.4074074074074074, "frac_reward_zero_std": 0.5, "grad_norm": 1.2265625, "kl": 0.0010047402465716004, "learning_rate": 9.981529796748134e-07, "loss": 0.0174, "num_tokens": 1423416.0, "reward": 0.25833335518836975, "reward_std": 0.07715167105197906, "rewards/itbench_correctness/mean": 0.25833335518836975, "rewards/itbench_correctness/std": 0.19455552101135254, "step": 77, "step_time": 97.38318173773587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 376.1875, "completions/mean_terminated_length": 376.1875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4253198206424713, "epoch": 0.4126984126984127, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0014126732712611556, "learning_rate": 9.980082651251174e-07, "loss": -0.0444, "num_tokens": 1432859.0, "reward": 0.5520833730697632, "reward_std": 0.29967689514160156, "rewards/itbench_correctness/mean": 0.5520833730697632, "rewards/itbench_correctness/std": 0.32185083627700806, "step": 78, "step_time": 62.12770148552954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 779.1875, "completions/mean_terminated_length": 744.2142944335938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4029838740825653, "epoch": 0.41798941798941797, "frac_reward_zero_std": 0.5, "grad_norm": 1.1953125, "kl": 0.0012558053713291883, "learning_rate": 9.978581044368217e-07, "loss": -0.0309, "num_tokens": 1461190.0, "reward": 0.1354166716337204, "reward_std": 0.07634378224611282, "rewards/itbench_correctness/mean": 0.1354166716337204, "rewards/itbench_correctness/std": 0.17447009682655334, "step": 79, "step_time": 79.0433895830065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 858.5625, "completions/mean_terminated_length": 729.888916015625, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "entropy": 0.36106863617897034, "epoch": 0.42328042328042326, "frac_reward_zero_std": 0.5, "grad_norm": 1.6171875, "kl": 0.0013509814161807299, "learning_rate": 9.977024992520601e-07, "loss": 0.0063, "num_tokens": 1483655.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 80, "step_time": 7292.784606534056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 627.6875, "completions/mean_terminated_length": 627.6875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5671612024307251, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0016267532482743263, "learning_rate": 9.975414512725056e-07, "loss": -0.0913, "num_tokens": 1504522.0, "reward": 0.5, "reward_std": 0.3535533845424652, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 81, "step_time": 89.92690824903548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 600.375, "completions/mean_terminated_length": 600.375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.264834463596344, "epoch": 0.43386243386243384, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0011798108462244272, "learning_rate": 9.973749622593532e-07, "loss": -0.0018, "num_tokens": 1519384.0, "reward": 0.5625, "reward_std": 0.1462520956993103, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.19364917278289795, "step": 82, "step_time": 92.88886137399822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 491.9375, "completions/mean_terminated_length": 491.9375, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.35370346903800964, "epoch": 0.43915343915343913, "frac_reward_zero_std": 0.5, "grad_norm": 1.1484375, "kl": 0.001843614736571908, "learning_rate": 9.972030340333e-07, "loss": 0.0148, "num_tokens": 1531063.0, "reward": 0.3020833134651184, "reward_std": 0.1386406421661377, "rewards/itbench_correctness/mean": 0.3020833134651184, "rewards/itbench_correctness/std": 0.36498987674713135, "step": 83, "step_time": 1134.5993446996436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 475.75, "completions/mean_terminated_length": 475.75, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.41828691959381104, "epoch": 0.4444444444444444, "frac_reward_zero_std": 0.5, "grad_norm": 1.046875, "kl": 0.001323950826190412, "learning_rate": 9.970256684745255e-07, "loss": -0.0128, "num_tokens": 1542371.0, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 84, "step_time": 89.19195851124823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 987.0, "completions/mean_terminated_length": 876.0, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "entropy": 0.3343465030193329, "epoch": 0.4497354497354497, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0010419428581371903, "learning_rate": 9.968428675226713e-07, "loss": 0.0338, "num_tokens": 1576531.0, "reward": 0.6875, "reward_std": 0.32618680596351624, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.42108768224716187, "step": 85, "step_time": 85.11601546406746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 709.0625, "completions/mean_terminated_length": 520.1000366210938, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 0.33565446734428406, "epoch": 0.455026455026455, "frac_reward_zero_std": 0.5, "grad_norm": 0.98828125, "kl": 0.0012508188374340534, "learning_rate": 9.966546331768192e-07, "loss": -0.0029, "num_tokens": 1595508.0, "reward": 0.5104166865348816, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.5104166865348816, "rewards/itbench_correctness/std": 0.2543601393699646, "step": 86, "step_time": 110.2943638684228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 485.75, "completions/mean_terminated_length": 485.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.5352547764778137, "epoch": 0.4603174603174603, "frac_reward_zero_std": 0.5, "grad_norm": 1.4296875, "kl": 0.0011995767708867788, "learning_rate": 9.964609674954695e-07, "loss": 0.0036, "num_tokens": 1608696.0, "reward": 0.3125, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 87, "step_time": 85.32795084360987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 802.3125, "completions/mean_terminated_length": 751.1538696289062, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.40632545948028564, "epoch": 0.4656084656084656, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0009902386227622628, "learning_rate": 9.962618725965194e-07, "loss": -0.0316, "num_tokens": 1627885.0, "reward": 0.4479166865348816, "reward_std": 0.3577525019645691, "rewards/itbench_correctness/mean": 0.4479166865348816, "rewards/itbench_correctness/std": 0.420399934053421, "step": 88, "step_time": 81.01259941980243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 542.0625, "completions/mean_terminated_length": 542.0625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4243053197860718, "epoch": 0.4708994708994709, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0011555891251191497, "learning_rate": 9.960573506572389e-07, "loss": -0.0988, "num_tokens": 1640238.0, "reward": 0.53515625, "reward_std": 0.2504205107688904, "rewards/itbench_correctness/mean": 0.53515625, "rewards/itbench_correctness/std": 0.43777894973754883, "step": 89, "step_time": 97.55466525349766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 807.875, "completions/mean_terminated_length": 709.6364135742188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.48522359132766724, "epoch": 0.47619047619047616, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.001208159956149757, "learning_rate": 9.958474039142469e-07, "loss": -0.1015, "num_tokens": 1668412.0, "reward": 0.10625000298023224, "reward_std": 0.1334051787853241, "rewards/itbench_correctness/mean": 0.10625000298023224, "rewards/itbench_correctness/std": 0.16111589968204498, "step": 90, "step_time": 459.5639867214486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 543.0625, "completions/mean_terminated_length": 543.0625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.5929335951805115, "epoch": 0.48148148148148145, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.0014933362836018205, "learning_rate": 9.956320346634875e-07, "loss": -0.0536, "num_tokens": 1681853.0, "reward": 0.8125, "reward_std": 0.32946425676345825, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.3256048858165741, "step": 91, "step_time": 78.2018728973344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 682.625, "completions/mean_terminated_length": 527.45458984375, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "entropy": 0.38381248712539673, "epoch": 0.48677248677248675, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.001028747414238751, "learning_rate": 9.954112452602043e-07, "loss": 0.0, "num_tokens": 1707895.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 92, "step_time": 160.40463780704886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 984.0625, "completions/mean_terminated_length": 704.5, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.30079391598701477, "epoch": 0.49206349206349204, "frac_reward_zero_std": 0.5, "grad_norm": 1.890625, "kl": 0.0009068697690963745, "learning_rate": 9.95185038118915e-07, "loss": -0.0136, "num_tokens": 1733104.0, "reward": 0.53125, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.53125, "rewards/itbench_correctness/std": 0.4989572763442993, "step": 93, "step_time": 135.90597889758646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 646.875, "completions/mean_terminated_length": 475.4545593261719, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5750724673271179, "epoch": 0.4973544973544973, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.001511996379122138, "learning_rate": 9.949534157133844e-07, "loss": -0.1351, "num_tokens": 1762622.0, "reward": 0.4765625, "reward_std": 0.32506585121154785, "rewards/itbench_correctness/mean": 0.4765625, "rewards/itbench_correctness/std": 0.3958607614040375, "step": 94, "step_time": 178.96230245847255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 802.5625, "completions/mean_terminated_length": 751.4615478515625, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "entropy": 0.5806401371955872, "epoch": 0.5026455026455027, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0013606835855171084, "learning_rate": 9.947163805765979e-07, "loss": 0.0764, "num_tokens": 1795879.0, "reward": 0.48124998807907104, "reward_std": 0.1944543570280075, "rewards/itbench_correctness/mean": 0.48124998807907104, "rewards/itbench_correctness/std": 0.47359442710876465, "step": 95, "step_time": 182.67914429306984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 667.5, "completions/mean_terminated_length": 667.5, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "entropy": 0.32958802580833435, "epoch": 0.5079365079365079, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0008195637492462993, "learning_rate": 9.944739353007341e-07, "loss": 0.0178, "num_tokens": 1811303.0, "reward": 0.8374999761581421, "reward_std": 0.09672200679779053, "rewards/itbench_correctness/mean": 0.8374999761581421, "rewards/itbench_correctness/std": 0.1031898632645607, "step": 96, "step_time": 74.22002993617207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 519.4375, "completions/mean_terminated_length": 519.4375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3407532274723053, "epoch": 0.5132275132275133, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.0011502447305247188, "learning_rate": 9.942260825371357e-07, "loss": -0.1158, "num_tokens": 1824454.0, "reward": 0.5687500238418579, "reward_std": 0.23231291770935059, "rewards/itbench_correctness/mean": 0.5687500238418579, "rewards/itbench_correctness/std": 0.2676284909248352, "step": 97, "step_time": 72.25101596303284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 519.75, "completions/mean_terminated_length": 519.75, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.49639248847961426, "epoch": 0.5185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0014472255716100335, "learning_rate": 9.939728249962806e-07, "loss": -0.0098, "num_tokens": 1844642.0, "reward": 0.8500000238418579, "reward_std": 0.2121320366859436, "rewards/itbench_correctness/mean": 0.8500000238418579, "rewards/itbench_correctness/std": 0.24765567481517792, "step": 98, "step_time": 68.29791031684726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 692.9375, "completions/mean_terminated_length": 542.45458984375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5685938596725464, "epoch": 0.5238095238095238, "frac_reward_zero_std": 0.5, "grad_norm": 0.6171875, "kl": 0.0014234319096431136, "learning_rate": 9.937141654477528e-07, "loss": -0.1176, "num_tokens": 1866377.0, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.5, "step": 99, "step_time": 99.10520203411579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 337.6875, "completions/mean_terminated_length": 337.6875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3746066987514496, "epoch": 0.5291005291005291, "frac_reward_zero_std": 0.5, "grad_norm": 0.97265625, "kl": 0.0013704805169254541, "learning_rate": 9.934501067202117e-07, "loss": -0.0118, "num_tokens": 1874500.0, "reward": 0.3125, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.3095695972442627, "step": 100, "step_time": 831.8933219816536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 611.1875, "completions/mean_terminated_length": 473.5833435058594, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.5955619215965271, "epoch": 0.5343915343915344, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0012612693244591355, "learning_rate": 9.931806517013612e-07, "loss": 0.0328, "num_tokens": 1899799.0, "reward": 0.125, "reward_std": 0.2925041913986206, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.28867512941360474, "step": 101, "step_time": 185.49466035328805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 743.0625, "completions/mean_terminated_length": 724.3333740234375, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "entropy": 0.4737151861190796, "epoch": 0.5396825396825397, "frac_reward_zero_std": 0.5, "grad_norm": 1.609375, "kl": 0.0012106020003557205, "learning_rate": 9.929058033379181e-07, "loss": 0.0185, "num_tokens": 1915728.0, "reward": 0.8194444179534912, "reward_std": 0.20520132780075073, "rewards/itbench_correctness/mean": 0.8194444179534912, "rewards/itbench_correctness/std": 0.3367112874984741, "step": 102, "step_time": 418.81876328215003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 761.3125, "completions/mean_terminated_length": 557.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5017650723457336, "epoch": 0.544973544973545, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.0011779210763052106, "learning_rate": 9.926255646355803e-07, "loss": -0.1277, "num_tokens": 1953421.0, "reward": 0.2708333432674408, "reward_std": 0.4082317352294922, "rewards/itbench_correctness/mean": 0.2708333432674408, "rewards/itbench_correctness/std": 0.4254627227783203, "step": 103, "step_time": 131.8819383457303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 631.1875, "completions/mean_terminated_length": 605.0000610351562, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "entropy": 0.41825923323631287, "epoch": 0.5502645502645502, "frac_reward_zero_std": 0.5, "grad_norm": 1.3046875, "kl": 0.001125396229326725, "learning_rate": 9.923399386589932e-07, "loss": 0.0027, "num_tokens": 1967568.0, "reward": 0.967524528503418, "reward_std": 0.0356326624751091, "rewards/itbench_correctness/mean": 0.967524528503418, "rewards/itbench_correctness/std": 0.059118952602148056, "step": 104, "step_time": 237.89590667374432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 437.25, "completions/mean_terminated_length": 398.13336181640625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.41623786091804504, "epoch": 0.5555555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.001291301567107439, "learning_rate": 9.92048928531717e-07, "loss": -0.0479, "num_tokens": 1981084.0, "reward": 0.46875, "reward_std": 0.1883128434419632, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.2525334656238556, "step": 105, "step_time": 178.8811132274568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 725.9375, "completions/mean_terminated_length": 427.875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 0.5179509520530701, "epoch": 0.5608465608465608, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0009696544148027897, "learning_rate": 9.917525374361911e-07, "loss": 0.0018, "num_tokens": 1999387.0, "reward": 0.546875, "reward_std": 0.22097086906433105, "rewards/itbench_correctness/mean": 0.546875, "rewards/itbench_correctness/std": 0.5018196105957031, "step": 106, "step_time": 493.8660353682935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 911.375, "completions/mean_terminated_length": 843.7999877929688, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "entropy": 0.34014537930488586, "epoch": 0.5661375661375662, "frac_reward_zero_std": 0.5, "grad_norm": 1.4453125, "kl": 0.001088326214812696, "learning_rate": 9.914507686137017e-07, "loss": 0.0167, "num_tokens": 2022945.0, "reward": 0.35624998807907104, "reward_std": 0.11475905776023865, "rewards/itbench_correctness/mean": 0.35624998807907104, "rewards/itbench_correctness/std": 0.3999479115009308, "step": 107, "step_time": 235.87840359471738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 542.4375, "completions/mean_terminated_length": 542.4375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.4701002538204193, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0014570436906069517, "learning_rate": 9.911436253643443e-07, "loss": 0.0162, "num_tokens": 2036592.0, "reward": 0.8567708134651184, "reward_std": 0.19427995383739471, "rewards/itbench_correctness/mean": 0.8567708134651184, "rewards/itbench_correctness/std": 0.24054758250713348, "step": 108, "step_time": 129.46329625695944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 1023.4375, "completions/mean_terminated_length": 1015.0, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "entropy": 0.5901679396629333, "epoch": 0.5767195767195767, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0010806693462654948, "learning_rate": 9.90831111046988e-07, "loss": 0.0009, "num_tokens": 2060871.0, "reward": 0.15625, "reward_std": 0.3198433816432953, "rewards/itbench_correctness/mean": 0.15625, "rewards/itbench_correctness/std": 0.3520771861076355, "step": 109, "step_time": 73.70483169332147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 689.5, "completions/mean_terminated_length": 689.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.39158812165260315, "epoch": 0.582010582010582, "frac_reward_zero_std": 0.5, "grad_norm": 1.2109375, "kl": 0.001037560636177659, "learning_rate": 9.905132290792392e-07, "loss": -0.0033, "num_tokens": 2076943.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 110, "step_time": 73.8764311010018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 513.0, "completions/mean_terminated_length": 206.40000915527344, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5146198868751526, "epoch": 0.5873015873015873, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.0013582897372543812, "learning_rate": 9.901899829374047e-07, "loss": -0.1464, "num_tokens": 2089871.0, "reward": 0.3740079402923584, "reward_std": 0.34763163328170776, "rewards/itbench_correctness/mean": 0.3740079402923584, "rewards/itbench_correctness/std": 0.3568885028362274, "step": 111, "step_time": 695.7899582823738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 466.9375, "completions/mean_terminated_length": 466.9375, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "entropy": 0.3662160336971283, "epoch": 0.5925925925925926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01226806640625, "kl": 0.0011427823919802904, "learning_rate": 9.89861376156452e-07, "loss": 0.0, "num_tokens": 2100646.0, "reward": 0.4166666865348816, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.4166666865348816, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 112, "step_time": 65.8763862894848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 998.875, "completions/mean_terminated_length": 823.0, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "entropy": 0.3023401200771332, "epoch": 0.5978835978835979, "frac_reward_zero_std": 0.5, "grad_norm": 1.3046875, "kl": 0.0008460183744318783, "learning_rate": 9.895274123299722e-07, "loss": 0.0013, "num_tokens": 2126916.0, "reward": 0.28125, "reward_std": 0.2086307406425476, "rewards/itbench_correctness/mean": 0.28125, "rewards/itbench_correctness/std": 0.4069705307483673, "step": 113, "step_time": 870.3144110767171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 618.875, "completions/mean_terminated_length": 525.3846435546875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.47182387113571167, "epoch": 0.6031746031746031, "frac_reward_zero_std": 0.5, "grad_norm": 1.4609375, "kl": 0.0011947272578254342, "learning_rate": 9.891880951101407e-07, "loss": -0.0027, "num_tokens": 2140634.0, "reward": 0.15416666865348816, "reward_std": 0.21283237636089325, "rewards/itbench_correctness/mean": 0.15416666865348816, "rewards/itbench_correctness/std": 0.3315228819847107, "step": 114, "step_time": 111.45921329036355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 831.6875, "completions/mean_terminated_length": 639.375, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "entropy": 0.3174269199371338, "epoch": 0.6084656084656085, "frac_reward_zero_std": 0.5, "grad_norm": 1.296875, "kl": 0.0009422831353731453, "learning_rate": 9.888434282076757e-07, "loss": 0.0093, "num_tokens": 2159877.0, "reward": 0.10546875, "reward_std": 0.07999982684850693, "rewards/itbench_correctness/mean": 0.10546875, "rewards/itbench_correctness/std": 0.1543108969926834, "step": 115, "step_time": 162.2415656549856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 833.0, "completions/mean_terminated_length": 718.4000244140625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.5162065029144287, "epoch": 0.6137566137566137, "frac_reward_zero_std": 0.5, "grad_norm": 1.2421875, "kl": 0.0014166326727718115, "learning_rate": 9.884934153917996e-07, "loss": 0.0456, "num_tokens": 2190885.0, "reward": 0.21875, "reward_std": 0.1735912710428238, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.3275540769100189, "step": 116, "step_time": 763.6827120250091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 753.6875, "completions/mean_terminated_length": 591.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.41927191615104675, "epoch": 0.6190476190476191, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.0011654142290353775, "learning_rate": 9.881380604901963e-07, "loss": -0.1407, "num_tokens": 2212584.0, "reward": 0.2708333134651184, "reward_std": 0.3443610668182373, "rewards/itbench_correctness/mean": 0.2708333134651184, "rewards/itbench_correctness/std": 0.33471935987472534, "step": 117, "step_time": 234.95893322955817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 748.8125, "completions/mean_terminated_length": 623.727294921875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4727485179901123, "epoch": 0.6243386243386243, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0017693624831736088, "learning_rate": 9.8777736738897e-07, "loss": -0.09, "num_tokens": 2236157.0, "reward": 0.2291666716337204, "reward_std": 0.3471825420856476, "rewards/itbench_correctness/mean": 0.2291666716337204, "rewards/itbench_correctness/std": 0.35420751571655273, "step": 118, "step_time": 141.18642224557698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 832.25, "completions/mean_terminated_length": 683.1111450195312, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.4133373498916626, "epoch": 0.6296296296296297, "frac_reward_zero_std": 0.5, "grad_norm": 1.5, "kl": 0.0010757006239145994, "learning_rate": 9.87411340032603e-07, "loss": 0.0049, "num_tokens": 2259913.0, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.4989572763442993, "step": 119, "step_time": 577.6952238306403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 707.6875, "completions/mean_terminated_length": 461.6666564941406, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "entropy": 0.5736995339393616, "epoch": 0.6349206349206349, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.0011166096664965153, "learning_rate": 9.870399824239114e-07, "loss": -0.0078, "num_tokens": 2278228.0, "reward": 0.3671875, "reward_std": 0.2785572409629822, "rewards/itbench_correctness/mean": 0.3671875, "rewards/itbench_correctness/std": 0.2793920040130615, "step": 120, "step_time": 203.33785133063793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 780.75, "completions/mean_terminated_length": 670.1818237304688, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.40473902225494385, "epoch": 0.6402116402116402, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0012949309311807156, "learning_rate": 9.866632986240029e-07, "loss": 0.0027, "num_tokens": 2296336.0, "reward": 0.4776785671710968, "reward_std": 0.2322283834218979, "rewards/itbench_correctness/mean": 0.4776785671710968, "rewards/itbench_correctness/std": 0.4821428656578064, "step": 121, "step_time": 101.13796862587333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 618.75, "completions/mean_terminated_length": 618.75, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.5333333611488342, "epoch": 0.6455026455026455, "frac_reward_zero_std": 0.5, "grad_norm": 1.1328125, "kl": 0.0013620926765725017, "learning_rate": 9.862812927522308e-07, "loss": 0.0167, "num_tokens": 2314388.0, "reward": 0.6145833134651184, "reward_std": 0.043129097670316696, "rewards/itbench_correctness/mean": 0.6145833134651184, "rewards/itbench_correctness/std": 0.40239447355270386, "step": 122, "step_time": 715.118090393953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 589.375, "completions/mean_terminated_length": 444.5, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "entropy": 0.38006362318992615, "epoch": 0.6507936507936508, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0010308035416528583, "learning_rate": 9.858939689861506e-07, "loss": 0.0628, "num_tokens": 2330282.0, "reward": 0.5416666865348816, "reward_std": 0.17097428441047668, "rewards/itbench_correctness/mean": 0.5416666865348816, "rewards/itbench_correctness/std": 0.197202667593956, "step": 123, "step_time": 104.44047453720123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 700.125, "completions/mean_terminated_length": 376.25, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.43992143869400024, "epoch": 0.656084656084656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0233154296875, "kl": 0.0014201418962329626, "learning_rate": 9.855013315614725e-07, "loss": 0.0, "num_tokens": 2353412.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 124, "step_time": 91.80700621567667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 658.0, "completions/mean_terminated_length": 658.0, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 0.3844984769821167, "epoch": 0.6613756613756614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.0017445363337174058, "learning_rate": 9.851033847720164e-07, "loss": 0.0, "num_tokens": 2368164.0, "reward": 0.25, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 125, "step_time": 84.32240361534059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 700.6875, "completions/mean_terminated_length": 449.22222900390625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.6279546618461609, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.001191388233564794, "learning_rate": 9.847001329696652e-07, "loss": -0.0546, "num_tokens": 2386335.0, "reward": 0.41874998807907104, "reward_std": 0.2509503960609436, "rewards/itbench_correctness/mean": 0.41874998807907104, "rewards/itbench_correctness/std": 0.3046172559261322, "step": 126, "step_time": 192.25429659802467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 673.125, "completions/mean_terminated_length": 400.22222900390625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.6596100330352783, "epoch": 0.671957671957672, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.0013828243827447295, "learning_rate": 9.842915805643156e-07, "loss": -0.0019, "num_tokens": 2410073.0, "reward": 0.453125, "reward_std": 0.13258251547813416, "rewards/itbench_correctness/mean": 0.453125, "rewards/itbench_correctness/std": 0.5018196105957031, "step": 127, "step_time": 370.60414741840214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 675.125, "completions/mean_terminated_length": 465.8000183105469, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6161822080612183, "epoch": 0.6772486772486772, "frac_reward_zero_std": 0.5, "grad_norm": 1.0390625, "kl": 0.0011551063507795334, "learning_rate": 9.838777320238312e-07, "loss": -0.0151, "num_tokens": 2430699.0, "reward": 0.34375, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.34375, "rewards/itbench_correctness/std": 0.3966001570224762, "step": 128, "step_time": 101.63996140938252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 655.9375, "completions/mean_terminated_length": 655.9375, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 0.27136731147766113, "epoch": 0.6825396825396826, "frac_reward_zero_std": 0.5, "grad_norm": 1.4765625, "kl": 0.0009132423438131809, "learning_rate": 9.834585918739934e-07, "loss": 0.0035, "num_tokens": 2448146.0, "reward": 0.34375, "reward_std": 0.0578637570142746, "rewards/itbench_correctness/mean": 0.34375, "rewards/itbench_correctness/std": 0.36371922492980957, "step": 129, "step_time": 926.4854553686455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 507.75, "completions/mean_terminated_length": 507.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.486459881067276, "epoch": 0.6878306878306878, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0013310650829225779, "learning_rate": 9.83034164698452e-07, "loss": -0.08, "num_tokens": 2459726.0, "reward": 0.8690476417541504, "reward_std": 0.28752756118774414, "rewards/itbench_correctness/mean": 0.8690476417541504, "rewards/itbench_correctness/std": 0.2865068316459656, "step": 130, "step_time": 497.3244105326012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 458.625, "completions/mean_terminated_length": 458.625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.40119925141334534, "epoch": 0.6931216931216931, "frac_reward_zero_std": 0.5, "grad_norm": 1.328125, "kl": 0.0013875841395929456, "learning_rate": 9.826044551386742e-07, "loss": 0.0024, "num_tokens": 2469992.0, "reward": 0.4791666865348816, "reward_std": 0.19795583188533783, "rewards/itbench_correctness/mean": 0.4791666865348816, "rewards/itbench_correctness/std": 0.27131369709968567, "step": 131, "step_time": 64.11436599586159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 632.5625, "completions/mean_terminated_length": 606.4666748046875, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "entropy": 0.3730856776237488, "epoch": 0.6984126984126984, "frac_reward_zero_std": 0.5, "grad_norm": 1.0234375, "kl": 0.0012992192059755325, "learning_rate": 9.821694678938952e-07, "loss": -0.0026, "num_tokens": 2484161.0, "reward": 0.9255682229995728, "reward_std": 0.17330622673034668, "rewards/itbench_correctness/mean": 0.9255682229995728, "rewards/itbench_correctness/std": 0.24894750118255615, "step": 132, "step_time": 782.2131289467216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 779.25, "completions/mean_terminated_length": 762.933349609375, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "entropy": 0.6185434460639954, "epoch": 0.7037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.0011128331534564495, "learning_rate": 9.817292077210656e-07, "loss": 0.0277, "num_tokens": 2503445.0, "reward": 0.59375, "reward_std": 0.3061639666557312, "rewards/itbench_correctness/mean": 0.59375, "rewards/itbench_correctness/std": 0.41708314418792725, "step": 133, "step_time": 234.19261386059225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 657.4375, "completions/mean_terminated_length": 535.25, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "entropy": 0.31942200660705566, "epoch": 0.708994708994709, "frac_reward_zero_std": 0.5, "grad_norm": 1.109375, "kl": 0.0009678892092779279, "learning_rate": 9.812836794348002e-07, "loss": 0.0316, "num_tokens": 2520980.0, "reward": 0.78125, "reward_std": 0.1085391715168953, "rewards/itbench_correctness/mean": 0.78125, "rewards/itbench_correctness/std": 0.27024510502815247, "step": 134, "step_time": 130.00603658426553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 696.0, "completions/mean_terminated_length": 696.0, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "entropy": 0.36063218116760254, "epoch": 0.7142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 1.1015625, "kl": 0.0012922929599881172, "learning_rate": 9.808328879073251e-07, "loss": -0.024, "num_tokens": 2537100.0, "reward": 0.6875, "reward_std": 0.0862581878900528, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.3435921370983124, "step": 135, "step_time": 191.46370885893703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.3898509740829468, "epoch": 0.7195767195767195, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.0024722313974052668, "learning_rate": 9.803768380684242e-07, "loss": -0.0114, "num_tokens": 2544442.0, "reward": 0.21875, "reward_std": 0.3061639666557312, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 136, "step_time": 65.17159292474389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1024.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 960.5625, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.4580649435520172, "epoch": 0.7248677248677249, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.0012847312027588487, "learning_rate": 9.79915534905385e-07, "loss": -0.0218, "num_tokens": 2571915.0, "reward": 0.3541666865348816, "reward_std": 0.349293053150177, "rewards/itbench_correctness/mean": 0.3541666865348816, "rewards/itbench_correctness/std": 0.4121982753276825, "step": 137, "step_time": 95.23527884297073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 655.5, "completions/mean_terminated_length": 630.933349609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5095347166061401, "epoch": 0.7301587301587301, "frac_reward_zero_std": 0.5, "grad_norm": 1.2109375, "kl": 0.0019625271670520306, "learning_rate": 9.794489834629454e-07, "loss": 0.0004, "num_tokens": 2596083.0, "reward": 0.296875, "reward_std": 0.24944134056568146, "rewards/itbench_correctness/mean": 0.296875, "rewards/itbench_correctness/std": 0.4584280252456665, "step": 138, "step_time": 73.28423386160284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 568.3125, "completions/mean_terminated_length": 568.3125, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.47509074211120605, "epoch": 0.7354497354497355, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.001536766067147255, "learning_rate": 9.789771888432373e-07, "loss": 0.0225, "num_tokens": 2617728.0, "reward": 0.5104166865348816, "reward_std": 0.43504026532173157, "rewards/itbench_correctness/mean": 0.5104166865348816, "rewards/itbench_correctness/std": 0.43127182126045227, "step": 139, "step_time": 116.2228917106986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 554.75, "completions/mean_terminated_length": 487.71429443359375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.3929698169231415, "epoch": 0.7407407407407407, "frac_reward_zero_std": 0.5, "grad_norm": 1.1875, "kl": 0.001249143504537642, "learning_rate": 9.78500156205731e-07, "loss": -0.0021, "num_tokens": 2630956.0, "reward": 0.19062501192092896, "reward_std": 0.0265165064483881, "rewards/itbench_correctness/mean": 0.19062501192092896, "rewards/itbench_correctness/std": 0.2001822143793106, "step": 140, "step_time": 416.3123774584383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 462.125, "completions/mean_terminated_length": 462.125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.36353799700737, "epoch": 0.746031746031746, "frac_reward_zero_std": 0.5, "grad_norm": 1.125, "kl": 0.0016349649522453547, "learning_rate": 9.780178907671788e-07, "loss": 0.0084, "num_tokens": 2641358.0, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.5, "step": 141, "step_time": 87.9296273579821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 552.0, "completions/mean_terminated_length": 552.0, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "entropy": 0.5036231875419617, "epoch": 0.7513227513227513, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.0012071933597326279, "learning_rate": 9.775303978015585e-07, "loss": -0.0368, "num_tokens": 2652918.0, "reward": 0.65625, "reward_std": 0.4532671868801117, "rewards/itbench_correctness/mean": 0.65625, "rewards/itbench_correctness/std": 0.4732423722743988, "step": 142, "step_time": 125.74961478449404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 690.5, "completions/mean_terminated_length": 490.3999938964844, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.483707457780838, "epoch": 0.7566137566137566, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.0013088560663163662, "learning_rate": 9.77037682640015e-07, "loss": -0.1116, "num_tokens": 2668606.0, "reward": 0.5104166269302368, "reward_std": 0.39774924516677856, "rewards/itbench_correctness/mean": 0.5104166269302368, "rewards/itbench_correctness/std": 0.4732423722743988, "step": 143, "step_time": 81.490906807594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 869.1875, "completions/mean_terminated_length": 817.5833740234375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.36585891246795654, "epoch": 0.7619047619047619, "frac_reward_zero_std": 0.5, "grad_norm": 1.375, "kl": 0.001102715963497758, "learning_rate": 9.76539750670802e-07, "loss": 0.0248, "num_tokens": 2688489.0, "reward": 0.29411765933036804, "reward_std": 0.1618601679801941, "rewards/itbench_correctness/mean": 0.29411765933036804, "rewards/itbench_correctness/std": 0.3757345974445343, "step": 144, "step_time": 625.0967052578926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 753.125, "completions/mean_terminated_length": 714.4285888671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6479668021202087, "epoch": 0.7671957671957672, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0020443897228688, "learning_rate": 9.760366073392244e-07, "loss": -0.1502, "num_tokens": 2719323.0, "reward": 0.6875, "reward_std": 0.44403791427612305, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 145, "step_time": 126.37558931391686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 751.5, "completions/mean_terminated_length": 751.5, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.39387890696525574, "epoch": 0.7724867724867724, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.0012558232992887497, "learning_rate": 9.755282581475767e-07, "loss": -0.0169, "num_tokens": 2736931.0, "reward": 0.84375, "reward_std": 0.32239729166030884, "rewards/itbench_correctness/mean": 0.84375, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 146, "step_time": 86.20990402065217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 764.1875, "completions/mean_terminated_length": 677.5833740234375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "entropy": 0.5051116347312927, "epoch": 0.7777777777777778, "frac_reward_zero_std": 0.5, "grad_norm": 1.1015625, "kl": 0.0011690112296491861, "learning_rate": 9.750147086550842e-07, "loss": 0.0162, "num_tokens": 2773926.0, "reward": 0.4734848737716675, "reward_std": 0.05882110819220543, "rewards/itbench_correctness/mean": 0.4734848737716675, "rewards/itbench_correctness/std": 0.4955727159976959, "step": 147, "step_time": 137.13953017815948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 518.0, "completions/mean_terminated_length": 518.0, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "entropy": 0.3146718144416809, "epoch": 0.783068783068783, "frac_reward_zero_std": 1.0, "grad_norm": 0.028564453125, "kl": 0.001314603490754962, "learning_rate": 9.744959644778421e-07, "loss": 0.0, "num_tokens": 2787054.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 148, "step_time": 1022.448972039856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 704.8125, "completions/mean_terminated_length": 513.2999877929688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4767225384712219, "epoch": 0.7883597883597884, "frac_reward_zero_std": 0.5, "grad_norm": 0.3984375, "kl": 0.0012462595477700233, "learning_rate": 9.739720312887533e-07, "loss": -0.0812, "num_tokens": 2813323.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 149, "step_time": 102.6073711141944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 666.75, "completions/mean_terminated_length": 584.3077392578125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4049493670463562, "epoch": 0.7936507936507936, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0011266146320849657, "learning_rate": 9.734429148174674e-07, "loss": -0.0562, "num_tokens": 2830087.0, "reward": 0.453125, "reward_std": 0.15026018023490906, "rewards/itbench_correctness/mean": 0.453125, "rewards/itbench_correctness/std": 0.413710355758667, "step": 150, "step_time": 72.91534078493714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 764.9375, "completions/mean_terminated_length": 609.5, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.3294386863708496, "epoch": 0.798941798941799, "frac_reward_zero_std": 0.5, "grad_norm": 1.2734375, "kl": 0.0012626381358131766, "learning_rate": 9.729086208503173e-07, "loss": -0.0019, "num_tokens": 2847998.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 151, "step_time": 135.860564914532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 789.375, "completions/mean_terminated_length": 755.857177734375, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "entropy": 0.6638163328170776, "epoch": 0.8042328042328042, "frac_reward_zero_std": 0.5, "grad_norm": 1.4140625, "kl": 0.0012647550320252776, "learning_rate": 9.723691552302562e-07, "loss": 0.006, "num_tokens": 2892140.0, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.5, "step": 152, "step_time": 128.31029498856515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 438.8125, "completions/mean_terminated_length": 438.8125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.35778379440307617, "epoch": 0.8095238095238095, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.0011610172223299742, "learning_rate": 9.718245238567938e-07, "loss": -0.0117, "num_tokens": 2901465.0, "reward": 0.5062500238418579, "reward_std": 0.1627907156944275, "rewards/itbench_correctness/mean": 0.5062500238418579, "rewards/itbench_correctness/std": 0.17308476567268372, "step": 153, "step_time": 53.513846694491804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 529.3125, "completions/mean_terminated_length": 529.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.44775062799453735, "epoch": 0.8148148148148148, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.0016965508693829179, "learning_rate": 9.712747326859315e-07, "loss": 0.0038, "num_tokens": 2931910.0, "reward": 0.40625, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.4552929699420929, "step": 154, "step_time": 79.1174840349704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 713.0, "completions/mean_terminated_length": 609.3333740234375, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "entropy": 0.41234222054481506, "epoch": 0.8201058201058201, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.001036427216604352, "learning_rate": 9.707197877300973e-07, "loss": -0.0351, "num_tokens": 2949046.0, "reward": 0.47181373834609985, "reward_std": 0.2768261134624481, "rewards/itbench_correctness/mean": 0.47181373834609985, "rewards/itbench_correctness/std": 0.45311903953552246, "step": 155, "step_time": 1143.2126589166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 572.9375, "completions/mean_terminated_length": 572.9375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 0.4380931556224823, "epoch": 0.8253968253968254, "frac_reward_zero_std": 0.5, "grad_norm": 1.28125, "kl": 0.0014040054520592093, "learning_rate": 9.701596950580807e-07, "loss": 0.0116, "num_tokens": 2961597.0, "reward": 0.953125, "reward_std": 0.13258251547813416, "rewards/itbench_correctness/mean": 0.953125, "rewards/itbench_correctness/std": 0.1875, "step": 156, "step_time": 101.3859726889059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 650.625, "completions/mean_terminated_length": 650.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.40268972516059875, "epoch": 0.8306878306878307, "frac_reward_zero_std": 0.5, "grad_norm": 1.375, "kl": 0.0015219611814245582, "learning_rate": 9.695944607949648e-07, "loss": -0.0258, "num_tokens": 2981207.0, "reward": 0.78125, "reward_std": 0.2086307406425476, "rewards/itbench_correctness/mean": 0.78125, "rewards/itbench_correctness/std": 0.36371922492980957, "step": 157, "step_time": 316.9267311077565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 514.25, "completions/mean_terminated_length": 514.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5833738446235657, "epoch": 0.8359788359788359, "frac_reward_zero_std": 0.5, "grad_norm": 0.80078125, "kl": 0.0016437954036518931, "learning_rate": 9.690240911220617e-07, "loss": -0.0919, "num_tokens": 2994235.0, "reward": 0.84375, "reward_std": 0.15866193175315857, "rewards/itbench_correctness/mean": 0.84375, "rewards/itbench_correctness/std": 0.27024510502815247, "step": 158, "step_time": 80.80979425925761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 591.0, "completions/mean_terminated_length": 591.0, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.46023687720298767, "epoch": 0.8412698412698413, "frac_reward_zero_std": 0.5, "grad_norm": 1.34375, "kl": 0.001493943389505148, "learning_rate": 9.684485922768421e-07, "loss": -0.0018, "num_tokens": 3009803.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 159, "step_time": 92.85486916080117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 839.375, "completions/mean_terminated_length": 654.75, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "entropy": 0.3979151248931885, "epoch": 0.8465608465608465, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0009861888829618692, "learning_rate": 9.678679705528698e-07, "loss": 0.0391, "num_tokens": 3033361.0, "reward": 0.4520833492279053, "reward_std": 0.2401251643896103, "rewards/itbench_correctness/mean": 0.4520833492279053, "rewards/itbench_correctness/std": 0.3798574209213257, "step": 160, "step_time": 113.24223164469004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 866.9375, "completions/mean_terminated_length": 772.7000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4936918616294861, "epoch": 0.8518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0016467805253341794, "learning_rate": 9.672822322997304e-07, "loss": -0.0031, "num_tokens": 3052032.0, "reward": 0.535937488079071, "reward_std": 0.39822056889533997, "rewards/itbench_correctness/mean": 0.535937488079071, "rewards/itbench_correctness/std": 0.4591630697250366, "step": 161, "step_time": 73.70399552583694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 524.125, "completions/mean_terminated_length": 524.125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "entropy": 0.503696620464325, "epoch": 0.8571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 1.1015625, "kl": 0.0013778236461803317, "learning_rate": 9.666913839229637e-07, "loss": -0.0048, "num_tokens": 3063106.0, "reward": 0.5, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.3651483952999115, "step": 162, "step_time": 143.48580626491457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 587.75, "completions/mean_terminated_length": 587.75, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "entropy": 0.3862186372280121, "epoch": 0.8624338624338624, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0010811445536091924, "learning_rate": 9.660954318839932e-07, "loss": 0.0044, "num_tokens": 3076070.0, "reward": 0.643750011920929, "reward_std": 0.20177768170833588, "rewards/itbench_correctness/mean": 0.643750011920929, "rewards/itbench_correctness/std": 0.36142081022262573, "step": 163, "step_time": 79.29910835064948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 771.5625, "completions/mean_terminated_length": 620.1000366210938, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.6091535091400146, "epoch": 0.8677248677248677, "frac_reward_zero_std": 0.5, "grad_norm": 1.53125, "kl": 0.0012378692626953125, "learning_rate": 9.654943827000546e-07, "loss": 0.0099, "num_tokens": 3094839.0, "reward": 0.609375, "reward_std": 0.1043153703212738, "rewards/itbench_correctness/mean": 0.609375, "rewards/itbench_correctness/std": 0.4278702139854431, "step": 164, "step_time": 97.59095096122473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 990.5625, "completions/mean_terminated_length": 934.8333740234375, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "entropy": 0.5027446746826172, "epoch": 0.873015873015873, "frac_reward_zero_std": 0.5, "grad_norm": 1.390625, "kl": 0.0011952045606449246, "learning_rate": 9.648882429441256e-07, "loss": 0.0129, "num_tokens": 3138016.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 165, "step_time": 121.71588209550828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 771.75, "completions/mean_terminated_length": 771.75, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "entropy": 0.39909297227859497, "epoch": 0.8783068783068783, "frac_reward_zero_std": 0.5, "grad_norm": 1.46875, "kl": 0.0013839628081768751, "learning_rate": 9.642770192448535e-07, "loss": 0.0037, "num_tokens": 3161268.0, "reward": 0.47187501192092896, "reward_std": 0.08010874688625336, "rewards/itbench_correctness/mean": 0.47187501192092896, "rewards/itbench_correctness/std": 0.3993614614009857, "step": 166, "step_time": 104.31897877063602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 805.625, "completions/mean_terminated_length": 755.2307739257812, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.5784329175949097, "epoch": 0.8835978835978836, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.00185630121268332, "learning_rate": 9.636607182864826e-07, "loss": -0.0227, "num_tokens": 3196606.0, "reward": 0.25, "reward_std": 0.4355512857437134, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 167, "step_time": 113.68263853341341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 573.625, "completions/mean_terminated_length": 573.625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "entropy": 0.4166485071182251, "epoch": 0.8888888888888888, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0012832505162805319, "learning_rate": 9.630393468087817e-07, "loss": -0.0249, "num_tokens": 3209872.0, "reward": 0.2291666716337204, "reward_std": 0.14026343822479248, "rewards/itbench_correctness/mean": 0.2291666716337204, "rewards/itbench_correctness/std": 0.1787301003932953, "step": 168, "step_time": 417.7054488658905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 928.125, "completions/mean_terminated_length": 804.857177734375, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "entropy": 0.5990572571754456, "epoch": 0.8941798941798942, "frac_reward_zero_std": 1.0, "grad_norm": 0.03857421875, "kl": 0.0013898048782721162, "learning_rate": 9.624129116069694e-07, "loss": 0.0001, "num_tokens": 3258930.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 169, "step_time": 225.11859526112676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 959.5, "completions/mean_terminated_length": 876.5714721679688, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "entropy": 0.4815007746219635, "epoch": 0.8994708994708994, "frac_reward_zero_std": 0.5, "grad_norm": 1.3515625, "kl": 0.001060598180629313, "learning_rate": 9.61781419531641e-07, "loss": 0.0041, "num_tokens": 3282762.0, "reward": 0.625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.4564354717731476, "step": 170, "step_time": 735.476375034079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 735.125, "completions/mean_terminated_length": 446.25, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "entropy": 0.5223601460456848, "epoch": 0.9047619047619048, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.0013405587524175644, "learning_rate": 9.611448774886923e-07, "loss": 0.0105, "num_tokens": 3301500.0, "reward": 0.6875, "reward_std": 0.22201895713806152, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.33850160241127014, "step": 171, "step_time": 763.4565976867452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 626.875, "completions/mean_terminated_length": 626.875, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "entropy": 0.4658025801181793, "epoch": 0.91005291005291, "frac_reward_zero_std": 0.0, "grad_norm": 35.0, "kl": 0.0011788216652348638, "learning_rate": 9.605032924392455e-07, "loss": -0.0153, "num_tokens": 3315410.0, "reward": 0.7395833134651184, "reward_std": 0.16796313226222992, "rewards/itbench_correctness/mean": 0.7395833134651184, "rewards/itbench_correctness/std": 0.19924628734588623, "step": 172, "step_time": 103.72399638220668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 573.3125, "completions/mean_terminated_length": 573.3125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.3575711250305176, "epoch": 0.9153439153439153, "frac_reward_zero_std": 0.5, "grad_norm": 1.2734375, "kl": 0.0011289599351584911, "learning_rate": 9.598566713995717e-07, "loss": 0.0046, "num_tokens": 3328047.0, "reward": 0.3333333432674408, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.3333333432674408, "rewards/itbench_correctness/std": 0.4036867320537567, "step": 173, "step_time": 597.9741206569597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 885.25, "completions/mean_terminated_length": 802.0, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "entropy": 0.4992939829826355, "epoch": 0.9206349206349206, "frac_reward_zero_std": 0.5, "grad_norm": 1.484375, "kl": 0.0012533684493973851, "learning_rate": 9.59205021441015e-07, "loss": 0.0048, "num_tokens": 3350707.0, "reward": 0.03750000149011612, "reward_std": 0.1060660183429718, "rewards/itbench_correctness/mean": 0.03750000149011612, "rewards/itbench_correctness/std": 0.15000000596046448, "step": 174, "step_time": 158.50677568931133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 715.8125, "completions/mean_terminated_length": 476.1111145019531, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.5113070607185364, "epoch": 0.9259259259259259, "frac_reward_zero_std": 0.5, "grad_norm": 1.59375, "kl": 0.0011880681850016117, "learning_rate": 9.585483496899149e-07, "loss": -0.0041, "num_tokens": 3367576.0, "reward": 0.6875, "reward_std": 0.20044593513011932, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.42328083515167236, "step": 175, "step_time": 881.939713913016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 832.5625, "completions/mean_terminated_length": 411.3999938964844, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.5116732716560364, "epoch": 0.9312169312169312, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0010343582835048437, "learning_rate": 9.578866633275286e-07, "loss": 0.0285, "num_tokens": 3392569.0, "reward": 0.5052083730697632, "reward_std": 0.1857735514640808, "rewards/itbench_correctness/mean": 0.5052083730697632, "rewards/itbench_correctness/std": 0.2930029034614563, "step": 176, "step_time": 269.2045645285398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 568.8125, "completions/mean_terminated_length": 568.8125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "entropy": 0.5203824043273926, "epoch": 0.9365079365079365, "frac_reward_zero_std": 1.0, "grad_norm": 0.017578125, "kl": 0.0011270169634371996, "learning_rate": 9.572199695899521e-07, "loss": 0.0, "num_tokens": 3405782.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 177, "step_time": 226.17386937886477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 958.875, "completions/mean_terminated_length": 850.3333740234375, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "entropy": 0.39004039764404297, "epoch": 0.9417989417989417, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0013279912527650595, "learning_rate": 9.565482757680414e-07, "loss": -0.0199, "num_tokens": 3432116.0, "reward": 0.625, "reward_std": 0.28324785828590393, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.3626037836074829, "step": 178, "step_time": 150.1005060262978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 558.1875, "completions/mean_terminated_length": 491.64288330078125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.4765423834323883, "epoch": 0.9470899470899471, "frac_reward_zero_std": 0.5, "grad_norm": 1.296875, "kl": 0.0016712526557967067, "learning_rate": 9.558715892073323e-07, "loss": 0.0807, "num_tokens": 3467055.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 179, "step_time": 91.84085294324905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 847.5625, "completions/mean_terminated_length": 822.357177734375, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "entropy": 0.3374382555484772, "epoch": 0.9523809523809523, "frac_reward_zero_std": 1.0, "grad_norm": 0.04931640625, "kl": 0.002606587251648307, "learning_rate": 9.551899173079606e-07, "loss": 0.0001, "num_tokens": 3486896.0, "reward": 0.4375, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.4518480598926544, "step": 180, "step_time": 250.4422083152458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 560.0, "completions/mean_terminated_length": 529.0667114257812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5392857193946838, "epoch": 0.9576719576719577, "frac_reward_zero_std": 1.0, "grad_norm": 0.0206298828125, "kl": 0.0013467188691720366, "learning_rate": 9.545032675245813e-07, "loss": 0.0, "num_tokens": 3501360.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 181, "step_time": 231.51458043325692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 722.3125, "completions/mean_terminated_length": 702.2000122070312, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.41533270478248596, "epoch": 0.9629629629629629, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0013854112476110458, "learning_rate": 9.538116473662861e-07, "loss": -0.0126, "num_tokens": 3528605.0, "reward": 0.765625, "reward_std": 0.4136722683906555, "rewards/itbench_correctness/mean": 0.765625, "rewards/itbench_correctness/std": 0.40278977155685425, "step": 182, "step_time": 96.76111165247858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 632.6875, "completions/mean_terminated_length": 576.7857666015625, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "entropy": 0.46152326464653015, "epoch": 0.9682539682539683, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078125, "kl": 0.001549158594571054, "learning_rate": 9.531150643965222e-07, "loss": 0.005, "num_tokens": 3549936.0, "reward": 0.3125, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 183, "step_time": 141.31063493527472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 763.9375, "completions/mean_terminated_length": 607.9000244140625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "entropy": 0.3272519111633301, "epoch": 0.9735449735449735, "frac_reward_zero_std": 0.5, "grad_norm": 1.1171875, "kl": 0.0013428620295599103, "learning_rate": 9.524135262330098e-07, "loss": -0.0182, "num_tokens": 3567807.0, "reward": 0.3645833432674408, "reward_std": 0.01928791031241417, "rewards/itbench_correctness/mean": 0.3645833432674408, "rewards/itbench_correctness/std": 0.3774610757827759, "step": 184, "step_time": 143.63786490540951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 427.9375, "completions/mean_terminated_length": 427.9375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.4136117994785309, "epoch": 0.9788359788359788, "frac_reward_zero_std": 0.5, "grad_norm": 1.359375, "kl": 0.0012134211137890816, "learning_rate": 9.517070405476574e-07, "loss": -0.0009, "num_tokens": 3577486.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 185, "step_time": 170.13555748201907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 573.375, "completions/mean_terminated_length": 543.3333740234375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.2842816710472107, "epoch": 0.9841269841269841, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0015454581007361412, "learning_rate": 9.509956150664795e-07, "loss": -0.0815, "num_tokens": 3591212.0, "reward": 0.3984375, "reward_std": 0.28348496556282043, "rewards/itbench_correctness/mean": 0.3984375, "rewards/itbench_correctness/std": 0.2954002320766449, "step": 186, "step_time": 82.30455144122243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 573.4375, "completions/mean_terminated_length": 573.4375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "entropy": 0.5754768252372742, "epoch": 0.9894179894179894, "frac_reward_zero_std": 0.5, "grad_norm": 1.0546875, "kl": 0.002303932560607791, "learning_rate": 9.502792575695111e-07, "loss": 0.0049, "num_tokens": 3614019.0, "reward": 0.40625, "reward_std": 0.1735912710428238, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.48196646571159363, "step": 187, "step_time": 89.72001887392253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 801.25, "completions/mean_terminated_length": 667.6000366210938, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "entropy": 0.44430577754974365, "epoch": 0.9947089947089947, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0012758101802319288, "learning_rate": 9.495579758907229e-07, "loss": 0.0478, "num_tokens": 3631471.0, "reward": 0.4765625, "reward_std": 0.23403453826904297, "rewards/itbench_correctness/mean": 0.4765625, "rewards/itbench_correctness/std": 0.4835174083709717, "step": 188, "step_time": 79.22767079528421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 716.1875, "completions/mean_terminated_length": 613.5833740234375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "entropy": 0.6143642663955688, "epoch": 1.0, "frac_reward_zero_std": 0.5, "grad_norm": 1.5234375, "kl": 0.001404265291057527, "learning_rate": 9.488317779179361e-07, "loss": 0.0008, "num_tokens": 3658762.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 189, "step_time": 153.7122633298859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 575.1875, "completions/mean_terminated_length": 575.1875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.46245789527893066, "epoch": 1.0052910052910053, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.001671693054959178, "learning_rate": 9.481006715927351e-07, "loss": -0.0487, "num_tokens": 3673277.0, "reward": 0.6145833730697632, "reward_std": 0.2882373631000519, "rewards/itbench_correctness/mean": 0.6145833730697632, "rewards/itbench_correctness/std": 0.43341347575187683, "step": 190, "step_time": 71.11040670704097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 503.75, "completions/mean_terminated_length": 503.75, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 0.35334986448287964, "epoch": 1.0105820105820107, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.0010900463676080108, "learning_rate": 9.473646649103817e-07, "loss": 0.012, "num_tokens": 3684537.0, "reward": 0.875, "reward_std": 0.16866441071033478, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.18257419764995575, "step": 191, "step_time": 796.484293489717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 665.6875, "completions/mean_terminated_length": 502.8182067871094, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.43263542652130127, "epoch": 1.0158730158730158, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0013347232015803456, "learning_rate": 9.466237659197269e-07, "loss": -0.1131, "num_tokens": 3704212.0, "reward": 0.5625, "reward_std": 0.3535533845424652, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.4699290990829468, "step": 192, "step_time": 630.9985243473202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 414.125, "completions/mean_terminated_length": 414.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3936009705066681, "epoch": 1.0211640211640212, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0016619176603853703, "learning_rate": 9.458779827231236e-07, "loss": -0.0404, "num_tokens": 3713654.0, "reward": 0.65625, "reward_std": 0.1735912710428238, "rewards/itbench_correctness/mean": 0.65625, "rewards/itbench_correctness/std": 0.3400367796421051, "step": 193, "step_time": 692.6463372064754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 944.5625, "completions/mean_terminated_length": 600.3333740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.44041553139686584, "epoch": 1.0264550264550265, "frac_reward_zero_std": 0.5, "grad_norm": 1.484375, "kl": 0.0013270394410938025, "learning_rate": 9.451273234763371e-07, "loss": 0.0, "num_tokens": 3736343.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 194, "step_time": 4224.783679332584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 861.5, "completions/mean_terminated_length": 374.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.5571677088737488, "epoch": 1.0317460317460316, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0012215389870107174, "learning_rate": 9.443717963884568e-07, "loss": -0.0105, "num_tokens": 3761855.0, "reward": 0.375, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 195, "step_time": 750.6342479139566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 776.3125, "completions/mean_terminated_length": 740.9285888671875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 0.3168826997280121, "epoch": 1.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.0012352862395346165, "learning_rate": 9.436114097218058e-07, "loss": 0.0208, "num_tokens": 3779892.0, "reward": 0.578125, "reward_std": 0.25282490253448486, "rewards/itbench_correctness/mean": 0.578125, "rewards/itbench_correctness/std": 0.32556042075157166, "step": 196, "step_time": 167.4240329694003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 712.4375, "completions/mean_terminated_length": 525.5, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "entropy": 0.30879902839660645, "epoch": 1.0423280423280423, "frac_reward_zero_std": 0.5, "grad_norm": 1.421875, "kl": 0.001269629574380815, "learning_rate": 9.42846171791851e-07, "loss": -0.0334, "num_tokens": 3798771.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 197, "step_time": 1535.9739540033042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 782.875, "completions/mean_terminated_length": 541.75, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "entropy": 0.38831230998039246, "epoch": 1.0476190476190477, "frac_reward_zero_std": 0.5, "grad_norm": 1.421875, "kl": 0.001182721694931388, "learning_rate": 9.420760909671118e-07, "loss": 0.0, "num_tokens": 3818961.0, "reward": 0.3333333432674408, "reward_std": 0.17817413806915283, "rewards/itbench_correctness/mean": 0.3333333432674408, "rewards/itbench_correctness/std": 0.42163702845573425, "step": 198, "step_time": 118.89587634429336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 746.625, "completions/mean_terminated_length": 746.625, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "entropy": 0.409844309091568, "epoch": 1.052910052910053, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.001056072534993291, "learning_rate": 9.413011756690684e-07, "loss": 0.0058, "num_tokens": 3839107.0, "reward": 0.46875, "reward_std": 0.1944543570280075, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.43006783723831177, "step": 199, "step_time": 86.62848719768226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 775.25, "completions/mean_terminated_length": 581.7777709960938, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.611415684223175, "epoch": 1.0582010582010581, "frac_reward_zero_std": 0.5, "grad_norm": 1.515625, "kl": 0.0012188013643026352, "learning_rate": 9.405214343720706e-07, "loss": 0.0098, "num_tokens": 3858671.0, "reward": 0.1302083432674408, "reward_std": 0.09300297498703003, "rewards/itbench_correctness/mean": 0.1302083432674408, "rewards/itbench_correctness/std": 0.13252796232700348, "step": 200, "step_time": 82.59393281675875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 843.9375, "completions/mean_terminated_length": 703.888916015625, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "entropy": 0.46211951971054077, "epoch": 1.0634920634920635, "frac_reward_zero_std": 0.5, "grad_norm": 1.3359375, "kl": 0.000934995652642101, "learning_rate": 9.397368756032444e-07, "loss": -0.001, "num_tokens": 3879470.0, "reward": 0.71875, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.34308648109436035, "step": 201, "step_time": 210.63156687188894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 485.8125, "completions/mean_terminated_length": 485.8125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.2428920567035675, "epoch": 1.0687830687830688, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.002602006308734417, "learning_rate": 9.389475079423988e-07, "loss": -0.0939, "num_tokens": 3892331.0, "reward": 0.3125, "reward_std": 0.25763458013534546, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.25730079412460327, "step": 202, "step_time": 79.75068347156048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 439.5, "completions/mean_terminated_length": 439.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.457337886095047, "epoch": 1.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.0015101981116458774, "learning_rate": 9.381533400219317e-07, "loss": -0.0691, "num_tokens": 3902267.0, "reward": 0.5852272510528564, "reward_std": 0.252642422914505, "rewards/itbench_correctness/mean": 0.5852272510528564, "rewards/itbench_correctness/std": 0.3983004689216614, "step": 203, "step_time": 112.7746303929016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 522.6875, "completions/mean_terminated_length": 522.6875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.47829726338386536, "epoch": 1.0793650793650793, "frac_reward_zero_std": 0.5, "grad_norm": 1.2109375, "kl": 0.0021134628914296627, "learning_rate": 9.373543805267367e-07, "loss": -0.0106, "num_tokens": 3916214.0, "reward": 0.4910714328289032, "reward_std": 0.02525380812585354, "rewards/itbench_correctness/mean": 0.4910714328289032, "rewards/itbench_correctness/std": 0.5083487033843994, "step": 204, "step_time": 118.03211208153516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 762.875, "completions/mean_terminated_length": 606.2000122070312, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "entropy": 0.3617892861366272, "epoch": 1.0846560846560847, "frac_reward_zero_std": 0.5, "grad_norm": 1.53125, "kl": 0.0010750198271125555, "learning_rate": 9.365506381941065e-07, "loss": -0.0237, "num_tokens": 3933452.0, "reward": 0.4270833134651184, "reward_std": 0.053405821323394775, "rewards/itbench_correctness/mean": 0.4270833134651184, "rewards/itbench_correctness/std": 0.4470841884613037, "step": 205, "step_time": 255.03229981381446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 652.625, "completions/mean_terminated_length": 363.77777099609375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.4872629642486572, "epoch": 1.08994708994709, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0011171969817951322, "learning_rate": 9.357421218136386e-07, "loss": -0.0152, "num_tokens": 3953614.0, "reward": 0.5843750238418579, "reward_std": 0.21464183926582336, "rewards/itbench_correctness/mean": 0.5843750238418579, "rewards/itbench_correctness/std": 0.29686442017555237, "step": 206, "step_time": 127.40266931243241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 552.6875, "completions/mean_terminated_length": 552.6875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4143390357494354, "epoch": 1.0952380952380953, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.001612935564480722, "learning_rate": 9.349288402271387e-07, "loss": -0.021, "num_tokens": 3966409.0, "reward": 0.71875, "reward_std": 0.35564959049224854, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.44604745507240295, "step": 207, "step_time": 76.78453262429684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 522.875, "completions/mean_terminated_length": 522.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.2907004654407501, "epoch": 1.1005291005291005, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.001368207624182105, "learning_rate": 9.341108023285237e-07, "loss": -0.1923, "num_tokens": 3980703.0, "reward": 0.5208333134651184, "reward_std": 0.25392836332321167, "rewards/itbench_correctness/mean": 0.5208333134651184, "rewards/itbench_correctness/std": 0.45082229375839233, "step": 208, "step_time": 87.08768197055906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 712.4375, "completions/mean_terminated_length": 712.4375, "completions/min_length": 595.0, "completions/min_terminated_length": 595.0, "entropy": 0.4154750406742096, "epoch": 1.1058201058201058, "frac_reward_zero_std": 0.5, "grad_norm": 1.15625, "kl": 0.0015189462574198842, "learning_rate": 9.332880170637252e-07, "loss": 0.0093, "num_tokens": 3996494.0, "reward": 0.8671875, "reward_std": 0.07790146768093109, "rewards/itbench_correctness/mean": 0.8671875, "rewards/itbench_correctness/std": 0.17361806333065033, "step": 209, "step_time": 73.19756223168224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 527.75, "completions/mean_terminated_length": 527.75, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "entropy": 0.43202275037765503, "epoch": 1.1111111111111112, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.0011430936865508556, "learning_rate": 9.32460493430591e-07, "loss": -0.0004, "num_tokens": 4008082.0, "reward": 0.9166666865348816, "reward_std": 0.235702246427536, "rewards/itbench_correctness/mean": 0.9166666865348816, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 210, "step_time": 7547.0997234797105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 756.1875, "completions/mean_terminated_length": 547.888916015625, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "entropy": 0.4284651577472687, "epoch": 1.1164021164021163, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.0010993402684107423, "learning_rate": 9.316282404787869e-07, "loss": -0.0121, "num_tokens": 4028837.0, "reward": 0.7395833730697632, "reward_std": 0.28634417057037354, "rewards/itbench_correctness/mean": 0.7395833730697632, "rewards/itbench_correctness/std": 0.35988038778305054, "step": 211, "step_time": 127.47603439353406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 988.5, "completions/mean_terminated_length": 740.0, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "entropy": 0.3925139009952545, "epoch": 1.1216931216931216, "frac_reward_zero_std": 0.5, "grad_norm": 1.6640625, "kl": 0.0009769725147634745, "learning_rate": 9.307912673096979e-07, "loss": 0.0022, "num_tokens": 4061109.0, "reward": 0.375, "reward_std": 0.1725163757801056, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.45338237285614014, "step": 212, "step_time": 153.4806991070509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 641.0625, "completions/mean_terminated_length": 586.357177734375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "entropy": 0.4180559515953064, "epoch": 1.126984126984127, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.00120700488332659, "learning_rate": 9.299495830763284e-07, "loss": -0.0587, "num_tokens": 4076166.0, "reward": 0.3768939673900604, "reward_std": 0.29744255542755127, "rewards/itbench_correctness/mean": 0.3768939673900604, "rewards/itbench_correctness/std": 0.3607577383518219, "step": 213, "step_time": 132.46722139418125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 566.4375, "completions/mean_terminated_length": 535.933349609375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.4042811393737793, "epoch": 1.1322751322751323, "frac_reward_zero_std": 0.5, "grad_norm": 1.171875, "kl": 0.00115107255987823, "learning_rate": 9.291031969832025e-07, "loss": 0.0001, "num_tokens": 4089029.0, "reward": 0.38786762952804565, "reward_std": 0.16254664957523346, "rewards/itbench_correctness/mean": 0.38786762952804565, "rewards/itbench_correctness/std": 0.458029180765152, "step": 214, "step_time": 364.9181332997978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 452.25, "completions/mean_terminated_length": 452.25, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.4731896221637726, "epoch": 1.1375661375661377, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0016687542665749788, "learning_rate": 9.282521182862629e-07, "loss": 0.0181, "num_tokens": 4103865.0, "reward": 0.587890625, "reward_std": 0.31902575492858887, "rewards/itbench_correctness/mean": 0.587890625, "rewards/itbench_correctness/std": 0.37728795409202576, "step": 215, "step_time": 78.70893874578178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 621.3125, "completions/mean_terminated_length": 621.3125, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "entropy": 0.4731918275356293, "epoch": 1.1428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0012563822092488408, "learning_rate": 9.273963562927694e-07, "loss": 0.0316, "num_tokens": 4116998.0, "reward": 0.875, "reward_std": 0.2630348801612854, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.2687419056892395, "step": 216, "step_time": 189.60282021015882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 981.875, "completions/mean_terminated_length": 927.71435546875, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "entropy": 0.4297899305820465, "epoch": 1.1481481481481481, "frac_reward_zero_std": 0.5, "grad_norm": 1.5078125, "kl": 0.0011373235611245036, "learning_rate": 9.265359203611987e-07, "loss": 0.0, "num_tokens": 4144004.0, "reward": 0.02500000037252903, "reward_std": 0.04629100486636162, "rewards/itbench_correctness/mean": 0.02500000037252903, "rewards/itbench_correctness/std": 0.06831301003694534, "step": 217, "step_time": 195.92587360646576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 956.875, "completions/mean_terminated_length": 845.0, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "entropy": 0.5434356331825256, "epoch": 1.1534391534391535, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0011996476678177714, "learning_rate": 9.2567081990114e-07, "loss": 0.0111, "num_tokens": 4186818.0, "reward": 0.16249999403953552, "reward_std": 0.25583362579345703, "rewards/itbench_correctness/mean": 0.16249999403953552, "rewards/itbench_correctness/std": 0.2673948407173157, "step": 218, "step_time": 182.73709686659276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 886.0625, "completions/mean_terminated_length": 708.7142944335938, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6590957045555115, "epoch": 1.1587301587301586, "frac_reward_zero_std": 0.5, "grad_norm": 1.578125, "kl": 0.0019986084662377834, "learning_rate": 9.248010643731934e-07, "loss": 0.0001, "num_tokens": 4218627.0, "reward": 0.171875, "reward_std": 0.16952534019947052, "rewards/itbench_correctness/mean": 0.171875, "rewards/itbench_correctness/std": 0.29181545972824097, "step": 219, "step_time": 215.92658524494618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 586.3125, "completions/mean_terminated_length": 586.3125, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "entropy": 0.48438331484794617, "epoch": 1.164021164021164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0286865234375, "kl": 0.0012342262780293822, "learning_rate": 9.239266632888658e-07, "loss": 0.0, "num_tokens": 4232136.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 220, "step_time": 87.92167458124459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 588.4375, "completions/mean_terminated_length": 588.4375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "entropy": 0.3789697289466858, "epoch": 1.1693121693121693, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0008935832884162664, "learning_rate": 9.230476262104676e-07, "loss": 0.0133, "num_tokens": 4245863.0, "reward": 0.6875, "reward_std": 0.09531004726886749, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.15000000596046448, "step": 221, "step_time": 73.37130374461412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 957.1875, "completions/mean_terminated_length": 489.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.35938623547554016, "epoch": 1.1746031746031746, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0009425441385246813, "learning_rate": 9.221639627510075e-07, "loss": -0.1453, "num_tokens": 4274026.0, "reward": 0.2772817611694336, "reward_std": 0.13795886933803558, "rewards/itbench_correctness/mean": 0.2772817611694336, "rewards/itbench_correctness/std": 0.22852860391139984, "step": 222, "step_time": 114.55098836030811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 747.1875, "completions/mean_terminated_length": 683.3077392578125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.36135509610176086, "epoch": 1.17989417989418, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.00118353555444628, "learning_rate": 9.212756825740872e-07, "loss": -0.0024, "num_tokens": 4290805.0, "reward": 0.4583333432674408, "reward_std": 0.2527993321418762, "rewards/itbench_correctness/mean": 0.4583333432674408, "rewards/itbench_correctness/std": 0.30804041028022766, "step": 223, "step_time": 133.77505498286337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 501.3125, "completions/mean_terminated_length": 466.4667053222656, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4587956666946411, "epoch": 1.1851851851851851, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0022395735140889883, "learning_rate": 9.203827953937968e-07, "loss": -0.0998, "num_tokens": 4302938.0, "reward": 0.35208332538604736, "reward_std": 0.3391679525375366, "rewards/itbench_correctness/mean": 0.35208332538604736, "rewards/itbench_correctness/std": 0.3392188847064972, "step": 224, "step_time": 86.85009481851012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 717.4375, "completions/mean_terminated_length": 578.0909423828125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.36240091919898987, "epoch": 1.1904761904761905, "frac_reward_zero_std": 0.5, "grad_norm": 15.875, "kl": 0.0009973702253773808, "learning_rate": 9.194853109746072e-07, "loss": -0.0269, "num_tokens": 4321705.0, "reward": 0.4114583432674408, "reward_std": 0.17598573863506317, "rewards/itbench_correctness/mean": 0.4114583432674408, "rewards/itbench_correctness/std": 0.3488987386226654, "step": 225, "step_time": 695.8362277401611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 332.375, "completions/mean_terminated_length": 332.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.3760812282562256, "epoch": 1.1957671957671958, "frac_reward_zero_std": 0.5, "grad_norm": 1.09375, "kl": 0.0032024469692260027, "learning_rate": 9.185832391312642e-07, "loss": -0.0008, "num_tokens": 4329399.0, "reward": 0.3125, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.3095695972442627, "step": 226, "step_time": 71.38310491386801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 811.25, "completions/mean_terminated_length": 537.7142944335938, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "entropy": 0.4955315887928009, "epoch": 1.201058201058201, "frac_reward_zero_std": 0.5, "grad_norm": 1.21875, "kl": 0.0015454755630344152, "learning_rate": 9.176765897286811e-07, "loss": 0.0, "num_tokens": 4367643.0, "reward": 0.171875, "reward_std": 0.13258251547813416, "rewards/itbench_correctness/mean": 0.171875, "rewards/itbench_correctness/std": 0.25361964106559753, "step": 227, "step_time": 732.0901973983273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 692.625, "completions/mean_terminated_length": 434.8888854980469, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "entropy": 0.4908861219882965, "epoch": 1.2063492063492063, "frac_reward_zero_std": 1.0, "grad_norm": 0.01904296875, "kl": 0.0013219286920502782, "learning_rate": 9.167653726818304e-07, "loss": 0.0, "num_tokens": 4388877.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 228, "step_time": 883.394539824687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 532.375, "completions/mean_terminated_length": 368.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.33810752630233765, "epoch": 1.2116402116402116, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0016971830045804381, "learning_rate": 9.158495979556358e-07, "loss": -0.1874, "num_tokens": 4406811.0, "reward": 0.4947916567325592, "reward_std": 0.33243152499198914, "rewards/itbench_correctness/mean": 0.4947916567325592, "rewards/itbench_correctness/std": 0.3914227783679962, "step": 229, "step_time": 297.6574033163488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 516.5625, "completions/mean_terminated_length": 516.5625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5071990489959717, "epoch": 1.216931216931217, "frac_reward_zero_std": 0.5, "grad_norm": 1.1640625, "kl": 0.0015578863676637411, "learning_rate": 9.14929275564863e-07, "loss": -0.0018, "num_tokens": 4418316.0, "reward": 0.359375, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.359375, "rewards/itbench_correctness/std": 0.3760402202606201, "step": 230, "step_time": 98.72435673046857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 711.5625, "completions/mean_terminated_length": 468.5555725097656, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.34571805596351624, "epoch": 1.2222222222222223, "frac_reward_zero_std": 0.5, "grad_norm": 1.3671875, "kl": 0.0011883811093866825, "learning_rate": 9.1400441557401e-07, "loss": 0.0284, "num_tokens": 4450277.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 231, "step_time": 109.3037657784298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 506.375, "completions/mean_terminated_length": 506.375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.5213527679443359, "epoch": 1.2275132275132274, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.003350482787936926, "learning_rate": 9.130750280971977e-07, "loss": -0.0084, "num_tokens": 4470851.0, "reward": 0.5249999761581421, "reward_std": 0.34211215376853943, "rewards/itbench_correctness/mean": 0.5249999761581421, "rewards/itbench_correctness/std": 0.3803507089614868, "step": 232, "step_time": 117.60546538699418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 696.25, "completions/mean_terminated_length": 696.25, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "entropy": 0.5285457968711853, "epoch": 1.2328042328042328, "frac_reward_zero_std": 1.0, "grad_norm": 0.020263671875, "kl": 0.0014563931617885828, "learning_rate": 9.121411232980587e-07, "loss": 0.0, "num_tokens": 4490551.0, "reward": 1.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 1.0, "rewards/itbench_correctness/std": 0.0, "step": 233, "step_time": 93.69822262041271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 886.25, "completions/mean_terminated_length": 854.4615478515625, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "entropy": 0.2617771625518799, "epoch": 1.2380952380952381, "frac_reward_zero_std": 0.5, "grad_norm": 1.234375, "kl": 0.00089176808251068, "learning_rate": 9.112027113896261e-07, "loss": 0.0017, "num_tokens": 4513339.0, "reward": 0.375, "reward_std": 0.15669579803943634, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.4425306022167206, "step": 234, "step_time": 236.43063350580633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 487.125, "completions/mean_terminated_length": 487.125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 0.4331537187099457, "epoch": 1.2433862433862433, "frac_reward_zero_std": 0.5, "grad_norm": 1.03125, "kl": 0.0012799181276932359, "learning_rate": 9.102598026342222e-07, "loss": -0.0038, "num_tokens": 4523829.0, "reward": 0.3333333432674408, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.3333333432674408, "rewards/itbench_correctness/std": 0.4036867320537567, "step": 235, "step_time": 690.8860946493223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 448.0, "completions/mean_terminated_length": 448.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5066964030265808, "epoch": 1.2486772486772486, "frac_reward_zero_std": 1.0, "grad_norm": 0.034423828125, "kl": 0.0018882593140006065, "learning_rate": 9.093124073433462e-07, "loss": 0.0, "num_tokens": 4552069.0, "reward": 0.3125, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.3227486312389374, "step": 236, "step_time": 151.43605288118124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 564.9375, "completions/mean_terminated_length": 564.9375, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "entropy": 0.4885496199131012, "epoch": 1.253968253968254, "frac_reward_zero_std": 0.5, "grad_norm": 1.3515625, "kl": 0.001722036860883236, "learning_rate": 9.083605358775611e-07, "loss": -0.0206, "num_tokens": 4567172.0, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 237, "step_time": 79.38015065714717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 528.6875, "completions/mean_terminated_length": 528.6875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "entropy": 0.4085589349269867, "epoch": 1.2592592592592593, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.0010910117998719215, "learning_rate": 9.074041986463808e-07, "loss": 0.0075, "num_tokens": 4578575.0, "reward": 0.9047619104385376, "reward_std": 0.19606643915176392, "rewards/itbench_correctness/mean": 0.9047619104385376, "rewards/itbench_correctness/std": 0.2161296308040619, "step": 238, "step_time": 126.61944894865155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 696.9375, "completions/mean_terminated_length": 500.70001220703125, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "entropy": 0.5710698366165161, "epoch": 1.2645502645502646, "frac_reward_zero_std": 0.5, "grad_norm": 1.453125, "kl": 0.0016381683526560664, "learning_rate": 9.064434061081561e-07, "loss": 0.017, "num_tokens": 4602870.0, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 239, "step_time": 184.73187920358032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 644.0625, "completions/mean_terminated_length": 644.0625, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.4098981022834778, "epoch": 1.2698412698412698, "frac_reward_zero_std": 1.0, "grad_norm": 0.021240234375, "kl": 0.001283331774175167, "learning_rate": 9.0547816876996e-07, "loss": 0.0, "num_tokens": 4623511.0, "reward": 0.550000011920929, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.550000011920929, "rewards/itbench_correctness/std": 0.4647580087184906, "step": 240, "step_time": 119.32252531778067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 804.875, "completions/mean_terminated_length": 705.2727661132812, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "entropy": 0.4646684229373932, "epoch": 1.2751322751322751, "frac_reward_zero_std": 0.5, "grad_norm": 1.2109375, "kl": 0.0014155855169519782, "learning_rate": 9.045084971874737e-07, "loss": 0.0092, "num_tokens": 4644965.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 241, "step_time": 253.224197126925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 761.9375, "completions/mean_terminated_length": 604.7000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5197276473045349, "epoch": 1.2804232804232805, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.0014071539044380188, "learning_rate": 9.0353440196487e-07, "loss": -0.189, "num_tokens": 4670188.0, "reward": 0.609375, "reward_std": 0.3135034143924713, "rewards/itbench_correctness/mean": 0.609375, "rewards/itbench_correctness/std": 0.41047483682632446, "step": 242, "step_time": 248.56613456085324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 806.9375, "completions/mean_terminated_length": 756.84619140625, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "entropy": 0.4957013428211212, "epoch": 1.2857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0014731662813574076, "learning_rate": 9.025558937546987e-07, "loss": 0.0259, "num_tokens": 4690107.0, "reward": 0.6979166269302368, "reward_std": 0.18552666902542114, "rewards/itbench_correctness/mean": 0.6979166269302368, "rewards/itbench_correctness/std": 0.18225695192813873, "step": 243, "step_time": 141.42184507194906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 674.5625, "completions/mean_terminated_length": 674.5625, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "entropy": 0.5336792469024658, "epoch": 1.291005291005291, "frac_reward_zero_std": 1.0, "grad_norm": 0.035400390625, "kl": 0.0016851610271260142, "learning_rate": 9.015729832577681e-07, "loss": 0.0, "num_tokens": 4710412.0, "reward": 1.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 1.0, "rewards/itbench_correctness/std": 0.0, "step": 244, "step_time": 105.10308491624892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 527.75, "completions/mean_terminated_length": 527.75, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.5949786901473999, "epoch": 1.2962962962962963, "frac_reward_zero_std": 0.5, "grad_norm": 1.21875, "kl": 0.001679896144196391, "learning_rate": 9.005856812230304e-07, "loss": -0.0189, "num_tokens": 4723320.0, "reward": 0.4322916865348816, "reward_std": 0.031000997871160507, "rewards/itbench_correctness/mean": 0.4322916865348816, "rewards/itbench_correctness/std": 0.4484735131263733, "step": 245, "step_time": 98.61767490487546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 661.1875, "completions/mean_terminated_length": 577.4615478515625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "entropy": 0.3705454170703888, "epoch": 1.3015873015873016, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.0013237885432317853, "learning_rate": 8.995939984474623e-07, "loss": 0.0171, "num_tokens": 4739299.0, "reward": 0.6763392686843872, "reward_std": 0.17046323418617249, "rewards/itbench_correctness/mean": 0.6763392686843872, "rewards/itbench_correctness/std": 0.29665619134902954, "step": 246, "step_time": 81.29718050733209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 945.125, "completions/mean_terminated_length": 909.2727661132812, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "entropy": 0.31318607926368713, "epoch": 1.306878306878307, "frac_reward_zero_std": 0.5, "grad_norm": 1.515625, "kl": 0.001092436257749796, "learning_rate": 8.98597945775948e-07, "loss": 0.0178, "num_tokens": 4762901.0, "reward": 0.41874998807907104, "reward_std": 0.17100021243095398, "rewards/itbench_correctness/mean": 0.41874998807907104, "rewards/itbench_correctness/std": 0.4915536642074585, "step": 247, "step_time": 379.86342859547585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 466.1875, "completions/mean_terminated_length": 466.1875, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.49336370825767517, "epoch": 1.312169312169312, "frac_reward_zero_std": 0.5, "grad_norm": 1.2578125, "kl": 0.0015457995468750596, "learning_rate": 8.975975341011595e-07, "loss": -0.0118, "num_tokens": 4772808.0, "reward": 0.5645833611488342, "reward_std": 0.01928791031241417, "rewards/itbench_correctness/mean": 0.5645833611488342, "rewards/itbench_correctness/std": 0.17201152443885803, "step": 248, "step_time": 83.39602283388376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 502.6875, "completions/mean_terminated_length": 502.6875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "entropy": 0.3978614807128906, "epoch": 1.3174603174603174, "frac_reward_zero_std": 1.0, "grad_norm": 0.0164794921875, "kl": 0.001136748120188713, "learning_rate": 8.965927743634389e-07, "loss": 0.0, "num_tokens": 4783827.0, "reward": 0.5833333134651184, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5833333134651184, "rewards/itbench_correctness/std": 0.4303314983844757, "step": 249, "step_time": 808.8291652789339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 488.375, "completions/mean_terminated_length": 488.375, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "entropy": 0.3910928964614868, "epoch": 1.3227513227513228, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224609375, "kl": 0.0019690291956067085, "learning_rate": 8.955836775506775e-07, "loss": 0.0, "num_tokens": 4795977.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 250, "step_time": 1037.5402492322028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 415.9375, "completions/mean_terminated_length": 415.9375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.40631103515625, "epoch": 1.328042328042328, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.0014389187563210726, "learning_rate": 8.945702546981968e-07, "loss": -0.0199, "num_tokens": 4805024.0, "reward": 0.546875, "reward_std": 0.16521647572517395, "rewards/itbench_correctness/mean": 0.546875, "rewards/itbench_correctness/std": 0.24714809656143188, "step": 251, "step_time": 89.62413766887039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 614.9375, "completions/mean_terminated_length": 614.9375, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "entropy": 0.5171257257461548, "epoch": 1.3333333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 1.046875, "kl": 0.0015879496932029724, "learning_rate": 8.935525168886262e-07, "loss": 0.0103, "num_tokens": 4827879.0, "reward": 0.359375, "reward_std": 0.05866191163659096, "rewards/itbench_correctness/mean": 0.359375, "rewards/itbench_correctness/std": 0.3797157406806946, "step": 252, "step_time": 79.69811306335032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 657.75, "completions/mean_terminated_length": 657.75, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.3618395924568176, "epoch": 1.3386243386243386, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.001461725914850831, "learning_rate": 8.925304752517839e-07, "loss": -0.0421, "num_tokens": 4842899.0, "reward": 0.40937501192092896, "reward_std": 0.30845823884010315, "rewards/itbench_correctness/mean": 0.40937501192092896, "rewards/itbench_correctness/std": 0.3658055067062378, "step": 253, "step_time": 131.8951225792989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 725.875, "completions/mean_terminated_length": 626.5, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "entropy": 0.5731014013290405, "epoch": 1.343915343915344, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.0015030583599582314, "learning_rate": 8.91504140964553e-07, "loss": 0.0, "num_tokens": 4870481.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 254, "step_time": 419.8384141791612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 756.4375, "completions/mean_terminated_length": 488.875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.4759150743484497, "epoch": 1.3492063492063493, "frac_reward_zero_std": 0.5, "grad_norm": 0.953125, "kl": 0.0011384043609723449, "learning_rate": 8.904735252507609e-07, "loss": -0.0058, "num_tokens": 4889368.0, "reward": 0.2916666865348816, "reward_std": 0.1178511381149292, "rewards/itbench_correctness/mean": 0.2916666865348816, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 255, "step_time": 75.48513688519597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 621.25, "completions/mean_terminated_length": 563.7142944335938, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.36378270387649536, "epoch": 1.3544973544973544, "frac_reward_zero_std": 0.5, "grad_norm": 1.171875, "kl": 0.002011555014178157, "learning_rate": 8.894386393810562e-07, "loss": 0.014, "num_tokens": 4904140.0, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 256, "step_time": 448.9436140609905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 1007.375, "completions/mean_terminated_length": 935.3333740234375, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "entropy": 0.4546469748020172, "epoch": 1.3597883597883598, "frac_reward_zero_std": 0.5, "grad_norm": 1.6484375, "kl": 0.0013795711565762758, "learning_rate": 8.883994946727847e-07, "loss": 0.0001, "num_tokens": 4929690.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 257, "step_time": 249.96061486005783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 705.5, "completions/mean_terminated_length": 632.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5301204919815063, "epoch": 1.3650793650793651, "frac_reward_zero_std": 0.5, "grad_norm": 1.4609375, "kl": 0.0015691749285906553, "learning_rate": 8.873561024898667e-07, "loss": -0.0308, "num_tokens": 4954970.0, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 258, "step_time": 128.34479956980795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 485.875, "completions/mean_terminated_length": 485.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3889889419078827, "epoch": 1.3703703703703702, "frac_reward_zero_std": 0.5, "grad_norm": 1.0859375, "kl": 0.002027226844802499, "learning_rate": 8.863084742426718e-07, "loss": -0.0592, "num_tokens": 4980176.0, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 259, "step_time": 110.6389656001702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 611.4375, "completions/mean_terminated_length": 611.4375, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "entropy": 0.4971890151500702, "epoch": 1.3756613756613756, "frac_reward_zero_std": 0.5, "grad_norm": 1.2421875, "kl": 0.0010566740529611707, "learning_rate": 8.852566213878946e-07, "loss": -0.0071, "num_tokens": 4992951.0, "reward": 0.84375, "reward_std": 0.22903135418891907, "rewards/itbench_correctness/mean": 0.84375, "rewards/itbench_correctness/std": 0.3520771861076355, "step": 260, "step_time": 502.41869831830263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 419.0, "completions/mean_terminated_length": 419.0, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.4797135889530182, "epoch": 1.380952380952381, "frac_reward_zero_std": 1.0, "grad_norm": 0.0223388671875, "kl": 0.0018066860502585769, "learning_rate": 8.842005554284295e-07, "loss": 0.0, "num_tokens": 5002711.0, "reward": 0.2083333283662796, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.2083333283662796, "rewards/itbench_correctness/std": 0.21516574919223785, "step": 261, "step_time": 94.77580868080258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 758.8125, "completions/mean_terminated_length": 599.7000122070312, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.4796968996524811, "epoch": 1.3862433862433863, "frac_reward_zero_std": 0.5, "grad_norm": 1.234375, "kl": 0.0017543428111821413, "learning_rate": 8.831402879132445e-07, "loss": -0.0144, "num_tokens": 5039356.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 262, "step_time": 486.13853998761624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 641.75, "completions/mean_terminated_length": 514.3333740234375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 0.49863654375076294, "epoch": 1.3915343915343916, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0019419684540480375, "learning_rate": 8.820758304372555e-07, "loss": 0.0274, "num_tokens": 5056128.0, "reward": 0.5, "reward_std": 0.3535533845424652, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 263, "step_time": 1132.2881942698732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 859.75, "completions/mean_terminated_length": 785.0909423828125, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.47223028540611267, "epoch": 1.3968253968253967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0234375, "kl": 0.0012077168794348836, "learning_rate": 8.810071946411988e-07, "loss": 0.0, "num_tokens": 5077620.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 264, "step_time": 98.81888623256236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 616.875, "completions/mean_terminated_length": 589.7333374023438, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.40688955783843994, "epoch": 1.402116402116402, "frac_reward_zero_std": 0.5, "grad_norm": 1.125, "kl": 0.0010585308773443103, "learning_rate": 8.799343922115043e-07, "loss": -0.0001, "num_tokens": 5090890.0, "reward": 0.6937500238418579, "reward_std": 0.13999362289905548, "rewards/itbench_correctness/mean": 0.6937500238418579, "rewards/itbench_correctness/std": 0.3696281909942627, "step": 265, "step_time": 779.0254694251344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 599.3125, "completions/mean_terminated_length": 599.3125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "entropy": 0.5172593593597412, "epoch": 1.4074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0015477510169148445, "learning_rate": 8.788574348801674e-07, "loss": -0.0177, "num_tokens": 5103495.0, "reward": 0.46875, "reward_std": 0.1552036553621292, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.15478479862213135, "step": 266, "step_time": 430.513926978223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 705.0625, "completions/mean_terminated_length": 513.7000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3616700768470764, "epoch": 1.4126984126984126, "frac_reward_zero_std": 0.5, "grad_norm": 2.140625, "kl": 0.0013228055322542787, "learning_rate": 8.777763344246208e-07, "loss": 0.0035, "num_tokens": 5126072.0, "reward": 0.34375, "reward_std": 0.1735912710428238, "rewards/itbench_correctness/mean": 0.34375, "rewards/itbench_correctness/std": 0.42695629596710205, "step": 267, "step_time": 154.13704107049853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 493.375, "completions/mean_terminated_length": 493.375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.3830757439136505, "epoch": 1.417989417989418, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.0012491599190980196, "learning_rate": 8.766911026676063e-07, "loss": 0.0118, "num_tokens": 5137798.0, "reward": 0.796875, "reward_std": 0.26196980476379395, "rewards/itbench_correctness/mean": 0.796875, "rewards/itbench_correctness/std": 0.27716949582099915, "step": 268, "step_time": 78.56690625380725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 639.6875, "completions/mean_terminated_length": 639.6875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.41582804918289185, "epoch": 1.4232804232804233, "frac_reward_zero_std": 0.5, "grad_norm": 1.0703125, "kl": 0.0016272872453555465, "learning_rate": 8.756017514770442e-07, "loss": 0.0292, "num_tokens": 5150993.0, "reward": 0.78125, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.78125, "rewards/itbench_correctness/std": 0.286865234375, "step": 269, "step_time": 87.11506285239011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 907.8125, "completions/mean_terminated_length": 559.25, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.4648537039756775, "epoch": 1.4285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 2.046875, "kl": 0.0014328653924167156, "learning_rate": 8.745082927659046e-07, "loss": 0.0103, "num_tokens": 5182310.0, "reward": 0.3958333432674408, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.3958333432674408, "rewards/itbench_correctness/std": 0.47482940554618835, "step": 270, "step_time": 148.08130174782127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 821.0, "completions/mean_terminated_length": 663.1111450195312, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "entropy": 0.38246041536331177, "epoch": 1.433862433862434, "frac_reward_zero_std": 0.5, "grad_norm": 1.5703125, "kl": 0.0012811021879315376, "learning_rate": 8.734107384920769e-07, "loss": 0.045, "num_tokens": 5206270.0, "reward": 0.4124999940395355, "reward_std": 0.172688826918602, "rewards/itbench_correctness/mean": 0.4124999940395355, "rewards/itbench_correctness/std": 0.4869976043701172, "step": 271, "step_time": 145.7680284064263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 584.0, "completions/mean_terminated_length": 482.4615478515625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3476027250289917, "epoch": 1.439153439153439, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.002004146808758378, "learning_rate": 8.723091006582388e-07, "loss": 0.0194, "num_tokens": 5224206.0, "reward": 0.4375, "reward_std": 0.38298875093460083, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 272, "step_time": 309.5534623619169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 656.25, "completions/mean_terminated_length": 370.22222900390625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "entropy": 0.7039999961853027, "epoch": 1.4444444444444444, "frac_reward_zero_std": 0.5, "grad_norm": 1.421875, "kl": 0.0013335734838619828, "learning_rate": 8.712033913117249e-07, "loss": 0.074, "num_tokens": 5243858.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.015625, "rewards/itbench_correctness/std": 0.0625, "step": 273, "step_time": 103.5548415929079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 646.75, "completions/mean_terminated_length": 646.75, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "entropy": 0.5102435350418091, "epoch": 1.4497354497354498, "frac_reward_zero_std": 0.5, "grad_norm": 1.2265625, "kl": 0.0015617223689332604, "learning_rate": 8.700936225443958e-07, "loss": 0.0166, "num_tokens": 5259974.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 274, "step_time": 187.742013909854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 469.75, "completions/mean_terminated_length": 469.75, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.42575839161872864, "epoch": 1.455026455026455, "frac_reward_zero_std": 0.5, "grad_norm": 1.296875, "kl": 0.0016291391802951694, "learning_rate": 8.689798064925048e-07, "loss": 0.012, "num_tokens": 5270194.0, "reward": 0.4375, "reward_std": 0.0862581878900528, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.4669642150402069, "step": 275, "step_time": 89.66884011216462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 821.375, "completions/mean_terminated_length": 807.86669921875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4723786413669586, "epoch": 1.4603174603174602, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.001486663124524057, "learning_rate": 8.678619553365658e-07, "loss": -0.0841, "num_tokens": 5292856.0, "reward": 0.8402777910232544, "reward_std": 0.21910008788108826, "rewards/itbench_correctness/mean": 0.8402777910232544, "rewards/itbench_correctness/std": 0.3417908549308777, "step": 276, "step_time": 421.1291719619185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 588.8125, "completions/mean_terminated_length": 588.8125, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.43817004561424255, "epoch": 1.4656084656084656, "frac_reward_zero_std": 0.5, "grad_norm": 1.25, "kl": 0.001539916731417179, "learning_rate": 8.667400813012199e-07, "loss": -0.002, "num_tokens": 5312117.0, "reward": 0.43541669845581055, "reward_std": 0.1833198070526123, "rewards/itbench_correctness/mean": 0.43541669845581055, "rewards/itbench_correctness/std": 0.4274764358997345, "step": 277, "step_time": 134.6105333585292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 812.3125, "completions/mean_terminated_length": 741.75, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.4210202395915985, "epoch": 1.470899470899471, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.0012785056605935097, "learning_rate": 8.656141966551018e-07, "loss": -0.0175, "num_tokens": 5336450.0, "reward": 0.125, "reward_std": 0.2925041913986206, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.28867512941360474, "step": 278, "step_time": 588.067256687209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 479.125, "completions/mean_terminated_length": 479.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.4654317796230316, "epoch": 1.4761904761904763, "frac_reward_zero_std": 1.0, "grad_norm": 0.039794921875, "kl": 0.001857173629105091, "learning_rate": 8.644843137107057e-07, "loss": 0.0, "num_tokens": 5347140.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 279, "step_time": 93.42977315280586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 902.1875, "completions/mean_terminated_length": 807.4444580078125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4633183181285858, "epoch": 1.4814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0013472916325554252, "learning_rate": 8.633504448242504e-07, "loss": -0.0543, "num_tokens": 5367903.0, "reward": 0.3077380955219269, "reward_std": 0.26785334944725037, "rewards/itbench_correctness/mean": 0.3077380955219269, "rewards/itbench_correctness/std": 0.3630719482898712, "step": 280, "step_time": 90.29017782397568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 777.25, "completions/mean_terminated_length": 665.0909423828125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 0.3731103241443634, "epoch": 1.4867724867724867, "frac_reward_zero_std": 0.5, "grad_norm": 1.09375, "kl": 0.0013979077339172363, "learning_rate": 8.622126023955445e-07, "loss": 0.0109, "num_tokens": 5393947.0, "reward": 0.31041666865348816, "reward_std": 0.18059369921684265, "rewards/itbench_correctness/mean": 0.31041666865348816, "rewards/itbench_correctness/std": 0.32879552245140076, "step": 281, "step_time": 111.45759059861302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 780.375, "completions/mean_terminated_length": 536.75, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.6048374176025391, "epoch": 1.492063492063492, "frac_reward_zero_std": 0.5, "grad_norm": 1.53125, "kl": 0.0014561447314918041, "learning_rate": 8.610707988678503e-07, "loss": 0.0, "num_tokens": 5413105.0, "reward": 0.609375, "reward_std": 0.12387890368700027, "rewards/itbench_correctness/mean": 0.609375, "rewards/itbench_correctness/std": 0.4375000298023224, "step": 282, "step_time": 87.52544206380844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 567.25, "completions/mean_terminated_length": 567.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.39841338992118835, "epoch": 1.4973544973544972, "frac_reward_zero_std": 0.5, "grad_norm": 1.1328125, "kl": 0.0016809444641694427, "learning_rate": 8.599250467277483e-07, "loss": -0.1189, "num_tokens": 5431333.0, "reward": 0.3125, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 283, "step_time": 285.17205636110157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 790.9375, "completions/mean_terminated_length": 651.1000366210938, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.3767680823802948, "epoch": 1.5026455026455028, "frac_reward_zero_std": 0.5, "grad_norm": 1.59375, "kl": 0.001456994330510497, "learning_rate": 8.587753585050004e-07, "loss": -0.0216, "num_tokens": 5450876.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.03125, "rewards/itbench_correctness/std": 0.125, "step": 284, "step_time": 7258.076673376374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 572.8125, "completions/mean_terminated_length": 572.8125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.36661210656166077, "epoch": 1.507936507936508, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.001243554288521409, "learning_rate": 8.576217467724127e-07, "loss": -0.0147, "num_tokens": 5463649.0, "reward": 0.875, "reward_std": 0.2177756428718567, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.22360680997371674, "step": 285, "step_time": 65.26539001893252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 674.0, "completions/mean_terminated_length": 464.0, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 0.4094955623149872, "epoch": 1.5132275132275133, "frac_reward_zero_std": 0.5, "grad_norm": 1.4375, "kl": 0.0017368828412145376, "learning_rate": 8.564642241456986e-07, "loss": 0.0128, "num_tokens": 5481657.0, "reward": 0.1640625, "reward_std": 0.06629125773906708, "rewards/itbench_correctness/mean": 0.1640625, "rewards/itbench_correctness/std": 0.19213032722473145, "step": 286, "step_time": 1010.0560459299013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 677.5625, "completions/mean_terminated_length": 469.70001220703125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "entropy": 0.30107924342155457, "epoch": 1.5185185185185186, "frac_reward_zero_std": 0.5, "grad_norm": 1.1953125, "kl": 0.0014942490961402655, "learning_rate": 8.553028032833396e-07, "loss": 0.0399, "num_tokens": 5500474.0, "reward": 0.34375, "reward_std": 0.12938730418682098, "rewards/itbench_correctness/mean": 0.34375, "rewards/itbench_correctness/std": 0.3966001570224762, "step": 287, "step_time": 967.4249309562147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 861.8125, "completions/mean_terminated_length": 851.0000610351562, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "entropy": 0.3689897656440735, "epoch": 1.5238095238095237, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.001182331470772624, "learning_rate": 8.541374968864485e-07, "loss": 0.0161, "num_tokens": 5520511.0, "reward": 0.6545138955116272, "reward_std": 0.2626494765281677, "rewards/itbench_correctness/mean": 0.6545138955116272, "rewards/itbench_correctness/std": 0.284541517496109, "step": 288, "step_time": 416.6262904284522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 753.5625, "completions/mean_terminated_length": 591.2999877929688, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 0.6183959245681763, "epoch": 1.529100529100529, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.0017281738109886646, "learning_rate": 8.529683176986295e-07, "loss": 0.0359, "num_tokens": 5542376.0, "reward": 0.5255681872367859, "reward_std": 0.07393435388803482, "rewards/itbench_correctness/mean": 0.5255681872367859, "rewards/itbench_correctness/std": 0.48524191975593567, "step": 289, "step_time": 100.21258049272001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 957.125, "completions/mean_terminated_length": 756.5, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "entropy": 0.5600104331970215, "epoch": 1.5343915343915344, "frac_reward_zero_std": 1.0, "grad_norm": 0.224609375, "kl": 0.002044468652456999, "learning_rate": 8.517952785058384e-07, "loss": 0.0001, "num_tokens": 5578490.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 290, "step_time": 204.50664361845702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 846.8125, "completions/mean_terminated_length": 805.923095703125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.49125397205352783, "epoch": 1.5396825396825395, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.001689074793830514, "learning_rate": 8.506183921362442e-07, "loss": -0.1079, "num_tokens": 5598255.0, "reward": 0.7552083730697632, "reward_std": 0.3175256550312042, "rewards/itbench_correctness/mean": 0.7552083730697632, "rewards/itbench_correctness/std": 0.3325946629047394, "step": 291, "step_time": 141.62990444898605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 697.3125, "completions/mean_terminated_length": 443.22222900390625, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.5621582865715027, "epoch": 1.544973544973545, "frac_reward_zero_std": 0.5, "grad_norm": 1.140625, "kl": 0.0017327865352854133, "learning_rate": 8.494376714600877e-07, "loss": -0.0076, "num_tokens": 5618796.0, "reward": 0.21250000596046448, "reward_std": 0.02314549870789051, "rewards/itbench_correctness/mean": 0.21250000596046448, "rewards/itbench_correctness/std": 0.22173558175563812, "step": 292, "step_time": 116.08490148931742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 701.75, "completions/mean_terminated_length": 680.2667236328125, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "entropy": 0.36052724719047546, "epoch": 1.5502645502645502, "frac_reward_zero_std": 0.5, "grad_norm": 1.3203125, "kl": 0.0012737837387248874, "learning_rate": 8.48253129389541e-07, "loss": 0.0105, "num_tokens": 5634952.0, "reward": 0.10546875, "reward_std": 0.06008155643939972, "rewards/itbench_correctness/mean": 0.10546875, "rewards/itbench_correctness/std": 0.1363947093486786, "step": 293, "step_time": 391.972909046337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 493.25, "completions/mean_terminated_length": 493.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4176381230354309, "epoch": 1.5555555555555556, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.0017929230816662312, "learning_rate": 8.470647788785664e-07, "loss": -0.0618, "num_tokens": 5646612.0, "reward": 0.671875, "reward_std": 0.2810920476913452, "rewards/itbench_correctness/mean": 0.671875, "rewards/itbench_correctness/std": 0.3502231538295746, "step": 294, "step_time": 417.62054439727217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 440.1875, "completions/mean_terminated_length": 440.1875, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "entropy": 0.3566662073135376, "epoch": 1.560846560846561, "frac_reward_zero_std": 0.5, "grad_norm": 1.296875, "kl": 0.0015034499811008573, "learning_rate": 8.458726329227747e-07, "loss": 0.0048, "num_tokens": 5656511.0, "reward": 0.7447916865348816, "reward_std": 0.11666134744882584, "rewards/itbench_correctness/mean": 0.7447916865348816, "rewards/itbench_correctness/std": 0.17864912748336792, "step": 295, "step_time": 839.189504972659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 655.875, "completions/mean_terminated_length": 655.875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "entropy": 0.5641318559646606, "epoch": 1.566137566137566, "frac_reward_zero_std": 0.5, "grad_norm": 1.1953125, "kl": 0.002510908292606473, "learning_rate": 8.446767045592829e-07, "loss": 0.0185, "num_tokens": 5680541.0, "reward": 0.265625, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.265625, "rewards/itbench_correctness/std": 0.28090256452560425, "step": 296, "step_time": 371.1978498548269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 673.5, "completions/mean_terminated_length": 650.1333618164062, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 0.5404602885246277, "epoch": 1.5714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.001535046030767262, "learning_rate": 8.434770068665722e-07, "loss": 0.0315, "num_tokens": 5695821.0, "reward": 0.5397727489471436, "reward_std": 0.29765012860298157, "rewards/itbench_correctness/mean": 0.5397727489471436, "rewards/itbench_correctness/std": 0.45329374074935913, "step": 297, "step_time": 163.53786495421082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 764.0625, "completions/mean_terminated_length": 677.4166870117188, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.48425358533859253, "epoch": 1.5767195767195767, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.001875088200904429, "learning_rate": 8.422735529643443e-07, "loss": -0.0003, "num_tokens": 5711494.0, "reward": 0.390625, "reward_std": 0.0794283002614975, "rewards/itbench_correctness/mean": 0.390625, "rewards/itbench_correctness/std": 0.3492303192615509, "step": 298, "step_time": 102.2513699810952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 742.1875, "completions/mean_terminated_length": 523.0, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.36648422479629517, "epoch": 1.5820105820105819, "frac_reward_zero_std": 1.0, "grad_norm": 0.0281982421875, "kl": 0.0010558166541159153, "learning_rate": 8.410663560133783e-07, "loss": 0.0, "num_tokens": 5738233.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 299, "step_time": 170.24059600010514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 844.5625, "completions/mean_terminated_length": 705.0, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "entropy": 0.5233479142189026, "epoch": 1.5873015873015874, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.001366324140690267, "learning_rate": 8.398554292153865e-07, "loss": 0.0093, "num_tokens": 5760042.0, "reward": 0.4791666865348816, "reward_std": 0.16512766480445862, "rewards/itbench_correctness/mean": 0.4791666865348816, "rewards/itbench_correctness/std": 0.3019995093345642, "step": 300, "step_time": 285.300213762559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 753.6875, "completions/mean_terminated_length": 543.4444580078125, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "entropy": 0.536031186580658, "epoch": 1.5925925925925926, "frac_reward_zero_std": 0.5, "grad_norm": 1.5625, "kl": 0.0015036487020552158, "learning_rate": 8.386407858128706e-07, "loss": 0.0005, "num_tokens": 5783901.0, "reward": 0.28437501192092896, "reward_std": 0.07841908931732178, "rewards/itbench_correctness/mean": 0.28437501192092896, "rewards/itbench_correctness/std": 0.14226588606834412, "step": 301, "step_time": 148.27129491977394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 530.0625, "completions/mean_terminated_length": 530.0625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.42825138568878174, "epoch": 1.597883597883598, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.0016496152384206653, "learning_rate": 8.374224390889759e-07, "loss": -0.0793, "num_tokens": 5802318.0, "reward": 0.296875, "reward_std": 0.13962560892105103, "rewards/itbench_correctness/mean": 0.296875, "rewards/itbench_correctness/std": 0.1434326171875, "step": 302, "step_time": 129.92639573384076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 1009.8125, "completions/mean_terminated_length": 948.3333740234375, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "entropy": 0.43770501017570496, "epoch": 1.6031746031746033, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.0010520732030272484, "learning_rate": 8.362004023673472e-07, "loss": 0.0025, "num_tokens": 5825731.0, "reward": 0.5, "reward_std": 0.1356339007616043, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.47214052081108093, "step": 303, "step_time": 78.02016614936292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 747.8125, "completions/mean_terminated_length": 582.1000366210938, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.4011700749397278, "epoch": 1.6084656084656084, "frac_reward_zero_std": 0.5, "grad_norm": 1.1875, "kl": 0.0011779264314100146, "learning_rate": 8.349746890119824e-07, "loss": -0.0198, "num_tokens": 5864376.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 304, "step_time": 941.6569038927555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 496.3125, "completions/mean_terminated_length": 496.3125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.40297192335128784, "epoch": 1.6137566137566137, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.00194979936350137, "learning_rate": 8.337453124270862e-07, "loss": -0.0047, "num_tokens": 5884285.0, "reward": 0.5625, "reward_std": 0.3471825420856476, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 305, "step_time": 107.52555268164724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 965.625, "completions/mean_terminated_length": 837.2000122070312, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "entropy": 0.41838186979293823, "epoch": 1.619047619047619, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0014495647046715021, "learning_rate": 8.325122860569241e-07, "loss": -0.0125, "num_tokens": 5910599.0, "reward": 0.5354166626930237, "reward_std": 0.05260828882455826, "rewards/itbench_correctness/mean": 0.5354166626930237, "rewards/itbench_correctness/std": 0.4056031107902527, "step": 306, "step_time": 113.85000483132899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 630.75, "completions/mean_terminated_length": 574.5714721679688, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.3202536702156067, "epoch": 1.6243386243386242, "frac_reward_zero_std": 0.5, "grad_norm": 1.078125, "kl": 0.001761075691320002, "learning_rate": 8.312756233856748e-07, "loss": -0.0057, "num_tokens": 5925579.0, "reward": 0.1875, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.25, "step": 307, "step_time": 252.66727325879037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 831.3125, "completions/mean_terminated_length": 681.4444580078125, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "entropy": 0.4017743170261383, "epoch": 1.6296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.0010056800674647093, "learning_rate": 8.300353379372833e-07, "loss": 0.0837, "num_tokens": 5954544.0, "reward": 0.5625, "reward_std": 0.4082317352294922, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 308, "step_time": 91.5687418980524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 438.6875, "completions/mean_terminated_length": 399.66668701171875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.36244478821754456, "epoch": 1.6349206349206349, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.0014679917367175221, "learning_rate": 8.287914432753123e-07, "loss": -0.1423, "num_tokens": 5971995.0, "reward": 0.78125, "reward_std": 0.2706093192100525, "rewards/itbench_correctness/mean": 0.78125, "rewards/itbench_correctness/std": 0.29007503390312195, "step": 309, "step_time": 103.0185587760061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 951.5625, "completions/mean_terminated_length": 830.8333740234375, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "entropy": 0.5422660112380981, "epoch": 1.6402116402116402, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.00137800769880414, "learning_rate": 8.275439530027947e-07, "loss": -0.0189, "num_tokens": 5994244.0, "reward": 0.4236111044883728, "reward_std": 0.3206467628479004, "rewards/itbench_correctness/mean": 0.4236111044883728, "rewards/itbench_correctness/std": 0.4552505910396576, "step": 310, "step_time": 408.57210523914546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 718.8125, "completions/mean_terminated_length": 648.3846435546875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 0.4062255322933197, "epoch": 1.6455026455026456, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0012174441944807768, "learning_rate": 8.262928807620843e-07, "loss": 0.0105, "num_tokens": 6010465.0, "reward": 0.25, "reward_std": 0.3745020925998688, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.40824830532073975, "step": 311, "step_time": 475.81261223275214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 547.4375, "completions/mean_terminated_length": 547.4375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "entropy": 0.3781253695487976, "epoch": 1.6507936507936507, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0014938174281269312, "learning_rate": 8.250382402347064e-07, "loss": 0.0148, "num_tokens": 6023008.0, "reward": 0.7837010025978088, "reward_std": 0.2962387502193451, "rewards/itbench_correctness/mean": 0.7837010025978088, "rewards/itbench_correctness/std": 0.3930458724498749, "step": 312, "step_time": 166.25692852959037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 492.6875, "completions/mean_terminated_length": 492.6875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "entropy": 0.3694024980068207, "epoch": 1.656084656084656, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.0012153934221714735, "learning_rate": 8.237800451412094e-07, "loss": -0.0061, "num_tokens": 6034795.0, "reward": 0.671875, "reward_std": 0.25043365359306335, "rewards/itbench_correctness/mean": 0.671875, "rewards/itbench_correctness/std": 0.3353670835494995, "step": 313, "step_time": 146.2452635196969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 496.75, "completions/mean_terminated_length": 496.75, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "entropy": 0.5234020948410034, "epoch": 1.6613756613756614, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.0013322219019755721, "learning_rate": 8.225183092410127e-07, "loss": -0.0344, "num_tokens": 6045567.0, "reward": 0.5625, "reward_std": 0.3471825420856476, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 314, "step_time": 998.2186722587794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 660.6875, "completions/mean_terminated_length": 636.4666748046875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "entropy": 0.44499102234840393, "epoch": 1.6666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.0257568359375, "kl": 0.001278597628697753, "learning_rate": 8.212530463322582e-07, "loss": 0.0, "num_tokens": 6061834.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 315, "step_time": 1148.7406483720988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 454.25, "completions/mean_terminated_length": 454.25, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "entropy": 0.41607043147087097, "epoch": 1.671957671957672, "frac_reward_zero_std": 0.5, "grad_norm": 1.078125, "kl": 0.0012671623844653368, "learning_rate": 8.199842702516582e-07, "loss": 0.0057, "num_tokens": 6072118.0, "reward": 0.953125, "reward_std": 0.13258251547813416, "rewards/itbench_correctness/mean": 0.953125, "rewards/itbench_correctness/std": 0.1875, "step": 316, "step_time": 864.3541559455916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 566.625, "completions/mean_terminated_length": 566.625, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.4729759395122528, "epoch": 1.6772486772486772, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.001966112991794944, "learning_rate": 8.187119948743449e-07, "loss": 0.0121, "num_tokens": 6086384.0, "reward": 0.6979166269302368, "reward_std": 0.3103903532028198, "rewards/itbench_correctness/mean": 0.6979166269302368, "rewards/itbench_correctness/std": 0.42259669303894043, "step": 317, "step_time": 80.52064239047468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 786.5, "completions/mean_terminated_length": 549.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4755244851112366, "epoch": 1.6825396825396826, "frac_reward_zero_std": 0.5, "grad_norm": 0.494140625, "kl": 0.0016143879620358348, "learning_rate": 8.174362341137176e-07, "loss": -0.0945, "num_tokens": 6105360.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.25, "step": 318, "step_time": 496.4877818999812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 753.0, "completions/mean_terminated_length": 753.0, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "entropy": 0.2895086407661438, "epoch": 1.687830687830688, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0012620992492884398, "learning_rate": 8.16157001921292e-07, "loss": -0.0084, "num_tokens": 6123976.0, "reward": 0.8125, "reward_std": 0.1462520956993103, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.19364917278289795, "step": 319, "step_time": 96.52025901339948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 1001.3125, "completions/mean_terminated_length": 903.0, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "entropy": 0.39947569370269775, "epoch": 1.693121693121693, "frac_reward_zero_std": 0.5, "grad_norm": 1.59375, "kl": 0.0011935190996155143, "learning_rate": 8.148743122865463e-07, "loss": -0.0069, "num_tokens": 6166669.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 320, "step_time": 7083.372859461233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 675.4375, "completions/mean_terminated_length": 652.2000122070312, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "entropy": 0.37753307819366455, "epoch": 1.6984126984126984, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0014044505078345537, "learning_rate": 8.135881792367685e-07, "loss": -0.0313, "num_tokens": 6182420.0, "reward": 0.5208333730697632, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5208333730697632, "rewards/itbench_correctness/std": 0.48638883233070374, "step": 321, "step_time": 248.25969803985208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 685.5, "completions/mean_terminated_length": 572.6666870117188, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.6068562865257263, "epoch": 1.7037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.0015854442026466131, "learning_rate": 8.122986168369039e-07, "loss": -0.0155, "num_tokens": 6206140.0, "reward": 0.234375, "reward_std": 0.3006556034088135, "rewards/itbench_correctness/mean": 0.234375, "rewards/itbench_correctness/std": 0.3158157467842102, "step": 322, "step_time": 145.6991236684844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 787.6875, "completions/mean_terminated_length": 551.375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.4164087772369385, "epoch": 1.7089947089947088, "frac_reward_zero_std": 0.5, "grad_norm": 1.59375, "kl": 0.0013617142103612423, "learning_rate": 8.110056391894003e-07, "loss": 0.0, "num_tokens": 6227303.0, "reward": 0.5520833134651184, "reward_std": 0.06200198456645012, "rewards/itbench_correctness/mean": 0.5520833134651184, "rewards/itbench_correctness/std": 0.4702983796596527, "step": 323, "step_time": 140.916482466273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 699.0, "completions/mean_terminated_length": 699.0, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "entropy": 0.3690987229347229, "epoch": 1.7142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0010593783808872104, "learning_rate": 8.097092604340541e-07, "loss": 0.0083, "num_tokens": 6244119.0, "reward": 0.84375, "reward_std": 0.14777101576328278, "rewards/itbench_correctness/mean": 0.84375, "rewards/itbench_correctness/std": 0.16720746457576752, "step": 324, "step_time": 67.42365125380456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 972.25, "completions/mean_terminated_length": 748.0, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "entropy": 0.5759835243225098, "epoch": 1.7195767195767195, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.001520626712590456, "learning_rate": 8.084094947478554e-07, "loss": 0.0001, "num_tokens": 6288875.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 325, "step_time": 162.16153999976814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 747.1875, "completions/mean_terminated_length": 621.3636474609375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.40150564908981323, "epoch": 1.7248677248677249, "frac_reward_zero_std": 1.0, "grad_norm": 0.0517578125, "kl": 0.0012618422042578459, "learning_rate": 8.071063563448339e-07, "loss": 0.0, "num_tokens": 6307974.0, "reward": 1.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 1.0, "rewards/itbench_correctness/std": 0.0, "step": 326, "step_time": 658.432337153703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 683.375, "completions/mean_terminated_length": 683.375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4097311198711395, "epoch": 1.7301587301587302, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.0016633245395496488, "learning_rate": 8.057998594759022e-07, "loss": -0.0924, "num_tokens": 6323180.0, "reward": 0.45098039507865906, "reward_std": 0.24446815252304077, "rewards/itbench_correctness/mean": 0.45098039507865906, "rewards/itbench_correctness/std": 0.37708234786987305, "step": 327, "step_time": 183.20953813474625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 992.0625, "completions/mean_terminated_length": 938.8333740234375, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "entropy": 0.4838404953479767, "epoch": 1.7354497354497354, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0014802846126258373, "learning_rate": 8.044900184287006e-07, "loss": -0.0402, "num_tokens": 6345661.0, "reward": 0.7080707550048828, "reward_std": 0.3341723680496216, "rewards/itbench_correctness/mean": 0.7080707550048828, "rewards/itbench_correctness/std": 0.33529266715049744, "step": 328, "step_time": 849.2058257460594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 852.4375, "completions/mean_terminated_length": 827.9285888671875, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "entropy": 0.4762812554836273, "epoch": 1.7407407407407407, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0015186754753813148, "learning_rate": 8.031768475274412e-07, "loss": 0.0195, "num_tokens": 6375004.0, "reward": 0.4166666865348816, "reward_std": 0.3177001476287842, "rewards/itbench_correctness/mean": 0.4166666865348816, "rewards/itbench_correctness/std": 0.42163705825805664, "step": 329, "step_time": 127.72529877442867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 461.375, "completions/mean_terminated_length": 423.86669921875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "entropy": 0.353291779756546, "epoch": 1.746031746031746, "frac_reward_zero_std": 0.5, "grad_norm": 1.125, "kl": 0.0018113987753167748, "learning_rate": 8.018603611327504e-07, "loss": -0.0122, "num_tokens": 6384650.0, "reward": 0.53125, "reward_std": 0.0578637570142746, "rewards/itbench_correctness/mean": 0.53125, "rewards/itbench_correctness/std": 0.08539126068353653, "step": 330, "step_time": 139.3600283851847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 869.3125, "completions/mean_terminated_length": 611.5, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 0.39111366868019104, "epoch": 1.7513227513227512, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0013434779830276966, "learning_rate": 8.005405736415125e-07, "loss": -0.0235, "num_tokens": 6410383.0, "reward": 0.6458333730697632, "reward_std": 0.37862008810043335, "rewards/itbench_correctness/mean": 0.6458333730697632, "rewards/itbench_correctness/std": 0.36704525351524353, "step": 331, "step_time": 760.4793853284791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 822.4375, "completions/mean_terminated_length": 730.8181762695312, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "entropy": 0.2966790795326233, "epoch": 1.7566137566137567, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.0013310646172612906, "learning_rate": 7.992174994867123e-07, "loss": 0.0379, "num_tokens": 6431670.0, "reward": 0.4851190745830536, "reward_std": 0.19627538323402405, "rewards/itbench_correctness/mean": 0.4851190745830536, "rewards/itbench_correctness/std": 0.2754608690738678, "step": 332, "step_time": 100.20586761180311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 540.0, "completions/mean_terminated_length": 540.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.5, "epoch": 1.7619047619047619, "frac_reward_zero_std": 0.5, "grad_norm": 1.640625, "kl": 0.0020073342602699995, "learning_rate": 7.978911531372764e-07, "loss": -0.007, "num_tokens": 6452334.0, "reward": 0.25, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 333, "step_time": 130.50346516724676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 894.25, "completions/mean_terminated_length": 727.4285888671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5636007785797119, "epoch": 1.7671957671957672, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0019210126483812928, "learning_rate": 7.965615490979163e-07, "loss": -0.0575, "num_tokens": 6485010.0, "reward": 0.3125, "reward_std": 0.3924052119255066, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 334, "step_time": 287.8023096676916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 860.6875, "completions/mean_terminated_length": 697.375, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "entropy": 0.5019243359565735, "epoch": 1.7724867724867726, "frac_reward_zero_std": 0.5, "grad_norm": 1.3203125, "kl": 0.0012930968077853322, "learning_rate": 7.952287019089685e-07, "loss": 0.001, "num_tokens": 6506909.0, "reward": 0.956250011920929, "reward_std": 0.086344413459301, "rewards/itbench_correctness/mean": 0.956250011920929, "rewards/itbench_correctness/std": 0.1263262927532196, "step": 335, "step_time": 81.09772168658674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 642.625, "completions/mean_terminated_length": 642.625, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "entropy": 0.38280490040779114, "epoch": 1.7777777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.025146484375, "kl": 0.0013457894092425704, "learning_rate": 7.938926261462365e-07, "loss": 0.0, "num_tokens": 6521703.0, "reward": 1.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 1.0, "rewards/itbench_correctness/std": 0.0, "step": 336, "step_time": 95.01154231280088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 756.6875, "completions/mean_terminated_length": 548.7777709960938, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "entropy": 0.306599497795105, "epoch": 1.783068783068783, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0012715982738882303, "learning_rate": 7.925533364208308e-07, "loss": -0.0117, "num_tokens": 6541690.0, "reward": 0.25, "reward_std": 0.4355512857437134, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 337, "step_time": 142.65012488793582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 928.625, "completions/mean_terminated_length": 769.6666870117188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3445954918861389, "epoch": 1.7883597883597884, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.0012255455367267132, "learning_rate": 7.912108473790091e-07, "loss": -0.0833, "num_tokens": 6563700.0, "reward": 0.36250001192092896, "reward_std": 0.22638463973999023, "rewards/itbench_correctness/mean": 0.36250001192092896, "rewards/itbench_correctness/std": 0.4856267273426056, "step": 338, "step_time": 146.54249787330627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 537.0625, "completions/mean_terminated_length": 537.0625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.333294540643692, "epoch": 1.7936507936507935, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.0017877722857519984, "learning_rate": 7.898651737020166e-07, "loss": -0.092, "num_tokens": 6577333.0, "reward": 0.875568151473999, "reward_std": 0.17176605761051178, "rewards/itbench_correctness/mean": 0.875568151473999, "rewards/itbench_correctness/std": 0.24365714192390442, "step": 339, "step_time": 63.54152914788574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 494.625, "completions/mean_terminated_length": 494.625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.414455384016037, "epoch": 1.798941798941799, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0016418452141806483, "learning_rate": 7.88516330105925e-07, "loss": 0.0023, "num_tokens": 6588839.0, "reward": 0.46875, "reward_std": 0.23779192566871643, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.23935678601264954, "step": 340, "step_time": 421.9616943122819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 897.6875, "completions/mean_terminated_length": 771.375, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "entropy": 0.4611849784851074, "epoch": 1.8042328042328042, "frac_reward_zero_std": 0.5, "grad_norm": 1.3984375, "kl": 0.0015902061713859439, "learning_rate": 7.871643313414718e-07, "loss": 0.0001, "num_tokens": 6620194.0, "reward": 0.4583333432674408, "reward_std": 0.07715166360139847, "rewards/itbench_correctness/mean": 0.4583333432674408, "rewards/itbench_correctness/std": 0.4849589467048645, "step": 341, "step_time": 85.41169494390488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 788.5625, "completions/mean_terminated_length": 710.0833740234375, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "entropy": 0.5148609280586243, "epoch": 1.8095238095238095, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0013109652791172266, "learning_rate": 7.858091921938987e-07, "loss": 0.0209, "num_tokens": 6637355.0, "reward": 0.49609375, "reward_std": 0.31232208013534546, "rewards/itbench_correctness/mean": 0.49609375, "rewards/itbench_correctness/std": 0.42540958523750305, "step": 342, "step_time": 104.21269215922803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 768.9375, "completions/mean_terminated_length": 513.875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "entropy": 0.3641388416290283, "epoch": 1.8148148148148149, "frac_reward_zero_std": 0.5, "grad_norm": 1.1640625, "kl": 0.0012282090028747916, "learning_rate": 7.844509274827906e-07, "loss": 0.0, "num_tokens": 6656498.0, "reward": 0.84375, "reward_std": 0.08258593082427979, "rewards/itbench_correctness/mean": 0.84375, "rewards/itbench_correctness/std": 0.19690898060798645, "step": 343, "step_time": 250.37473237421364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 728.8125, "completions/mean_terminated_length": 551.7000122070312, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 0.4253494441509247, "epoch": 1.82010582010582, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.001717406790703535, "learning_rate": 7.830895520619128e-07, "loss": 0.0331, "num_tokens": 6673527.0, "reward": 0.3839285671710968, "reward_std": 0.24145764112472534, "rewards/itbench_correctness/mean": 0.3839285671710968, "rewards/itbench_correctness/std": 0.41063666343688965, "step": 344, "step_time": 111.02232545148581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 734.6875, "completions/mean_terminated_length": 509.6666564941406, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.43555933237075806, "epoch": 1.8253968253968254, "frac_reward_zero_std": 1.0, "grad_norm": 0.04296875, "kl": 0.001631794380955398, "learning_rate": 7.817250808190483e-07, "loss": 0.0, "num_tokens": 6696034.0, "reward": 0.4285714328289032, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.4285714328289032, "rewards/itbench_correctness/std": 0.4426266849040985, "step": 345, "step_time": 991.8486757231876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 619.3125, "completions/mean_terminated_length": 619.3125, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.4714905619621277, "epoch": 1.8306878306878307, "frac_reward_zero_std": 0.5, "grad_norm": 1.0546875, "kl": 0.001351330429315567, "learning_rate": 7.803575286758363e-07, "loss": 0.0057, "num_tokens": 6714223.0, "reward": 0.875, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.22360680997371674, "step": 346, "step_time": 114.43643134180456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 641.6875, "completions/mean_terminated_length": 641.6875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 0.3397292196750641, "epoch": 1.8359788359788358, "frac_reward_zero_std": 0.5, "grad_norm": 1.15625, "kl": 0.000967301893979311, "learning_rate": 7.789869105876082e-07, "loss": 0.0277, "num_tokens": 6727946.0, "reward": 0.9375, "reward_std": 0.03857583552598953, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.08333335071802139, "step": 347, "step_time": 799.3260576492175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 940.0625, "completions/mean_terminated_length": 688.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.4978392422199249, "epoch": 1.8412698412698414, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.00129983713850379, "learning_rate": 7.776132415432232e-07, "loss": -0.0185, "num_tokens": 6748563.0, "reward": 0.2109375, "reward_std": 0.37981581687927246, "rewards/itbench_correctness/mean": 0.2109375, "rewards/itbench_correctness/std": 0.3783702850341797, "step": 348, "step_time": 79.58014123514295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 987.375, "completions/mean_terminated_length": 877.5, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "entropy": 0.5266489386558533, "epoch": 1.8465608465608465, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.001141308806836605, "learning_rate": 7.762365365649067e-07, "loss": 0.0426, "num_tokens": 6769345.0, "reward": 0.28125, "reward_std": 0.32512497901916504, "rewards/itbench_correctness/mean": 0.28125, "rewards/itbench_correctness/std": 0.4366062581539154, "step": 349, "step_time": 74.42612945474684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 600.4375, "completions/mean_terminated_length": 572.2000122070312, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.2631414532661438, "epoch": 1.8518518518518519, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.0013918590266257524, "learning_rate": 7.74856810708083e-07, "loss": -0.0166, "num_tokens": 6783368.0, "reward": 0.4817708432674408, "reward_std": 0.2546592652797699, "rewards/itbench_correctness/mean": 0.4817708432674408, "rewards/itbench_correctness/std": 0.2613040506839752, "step": 350, "step_time": 84.01541598606855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 432.25, "completions/mean_terminated_length": 432.25, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 0.37478309869766235, "epoch": 1.8571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.0021659955382347107, "learning_rate": 7.734740790612136e-07, "loss": -0.0002, "num_tokens": 6793332.0, "reward": 0.2847222089767456, "reward_std": 0.1159602552652359, "rewards/itbench_correctness/mean": 0.2847222089767456, "rewards/itbench_correctness/std": 0.19016453623771667, "step": 351, "step_time": 52.17074024025351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 744.5625, "completions/mean_terminated_length": 527.2222290039062, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.44052714109420776, "epoch": 1.8624338624338623, "frac_reward_zero_std": 0.5, "grad_norm": 1.625, "kl": 0.0016061851056292653, "learning_rate": 7.720883567456298e-07, "loss": 0.0084, "num_tokens": 6812589.0, "reward": 0.78125, "reward_std": 0.23543904721736908, "rewards/itbench_correctness/mean": 0.78125, "rewards/itbench_correctness/std": 0.39308255910873413, "step": 352, "step_time": 82.14024385716766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 459.25, "completions/mean_terminated_length": 459.25, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "entropy": 0.5182362794876099, "epoch": 1.8677248677248677, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0017120328266173601, "learning_rate": 7.706996589153689e-07, "loss": 0.0197, "num_tokens": 6822457.0, "reward": 0.53125, "reward_std": 0.38816186785697937, "rewards/itbench_correctness/mean": 0.53125, "rewards/itbench_correctness/std": 0.4069705307483673, "step": 353, "step_time": 141.23254205007106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 991.625, "completions/mean_terminated_length": 920.4000244140625, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "entropy": 0.3872431516647339, "epoch": 1.873015873015873, "frac_reward_zero_std": 0.5, "grad_norm": 1.2734375, "kl": 0.0011720954207703471, "learning_rate": 7.693080007570083e-07, "loss": -0.0084, "num_tokens": 6850571.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.03125, "rewards/itbench_correctness/std": 0.125, "step": 354, "step_time": 552.564743893221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 522.0, "completions/mean_terminated_length": 522.0, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "entropy": 0.517241358757019, "epoch": 1.8783068783068781, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0022999588400125504, "learning_rate": 7.679133974894982e-07, "loss": -0.0017, "num_tokens": 6861683.0, "reward": 0.40625, "reward_std": 0.2882373631000519, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.42149975895881653, "step": 355, "step_time": 615.0445311861113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 993.8125, "completions/mean_terminated_length": 927.4000244140625, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "entropy": 0.4246273934841156, "epoch": 1.8835978835978837, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.001227955799549818, "learning_rate": 7.665158643639969e-07, "loss": 0.022, "num_tokens": 6892520.0, "reward": 0.1294642835855484, "reward_std": 0.2322283834218979, "rewards/itbench_correctness/mean": 0.1294642835855484, "rewards/itbench_correctness/std": 0.2531687021255493, "step": 356, "step_time": 94.74817245267332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 760.8125, "completions/mean_terminated_length": 760.8125, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "entropy": 0.316766619682312, "epoch": 1.8888888888888888, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0009881765581667423, "learning_rate": 7.651154166637024e-07, "loss": 0.0684, "num_tokens": 6910381.0, "reward": 0.7250000238418579, "reward_std": 0.28192007541656494, "rewards/itbench_correctness/mean": 0.7250000238418579, "rewards/itbench_correctness/std": 0.3696845769882202, "step": 357, "step_time": 73.87886378820986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 723.5, "completions/mean_terminated_length": 680.5714721679688, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 0.43676573038101196, "epoch": 1.8941798941798942, "frac_reward_zero_std": 0.5, "grad_norm": 1.25, "kl": 0.00232238182798028, "learning_rate": 7.637120697036865e-07, "loss": -0.0125, "num_tokens": 6932685.0, "reward": 0.9479166865348816, "reward_std": 0.019287927076220512, "rewards/itbench_correctness/mean": 0.9479166865348816, "rewards/itbench_correctness/std": 0.05989960953593254, "step": 358, "step_time": 316.1507151676342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 980.3125, "completions/mean_terminated_length": 849.25, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "entropy": 0.5059611201286316, "epoch": 1.8994708994708995, "frac_reward_zero_std": 0.5, "grad_norm": 1.4375, "kl": 0.0012296068016439676, "learning_rate": 7.623058388307268e-07, "loss": 0.0303, "num_tokens": 6960266.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.015625, "rewards/itbench_correctness/std": 0.0625, "step": 359, "step_time": 104.70608714781702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 538.4375, "completions/mean_terminated_length": 538.4375, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "entropy": 0.3045850396156311, "epoch": 1.9047619047619047, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.001858119503594935, "learning_rate": 7.608967394231386e-07, "loss": 0.0023, "num_tokens": 6972577.0, "reward": 0.8125, "reward_std": 0.2982703447341919, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.3354102075099945, "step": 360, "step_time": 63.012714352458715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1024.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 982.0625, "completions/mean_terminated_length": 353.0, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "entropy": 0.42563483119010925, "epoch": 1.91005291005291, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.0017107086023315787, "learning_rate": 7.594847868906076e-07, "loss": -0.0493, "num_tokens": 6998698.0, "reward": 0.6770833730697632, "reward_std": 0.2745841145515442, "rewards/itbench_correctness/mean": 0.6770833730697632, "rewards/itbench_correctness/std": 0.3303687572479248, "step": 361, "step_time": 104.44509523361921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 527.0625, "completions/mean_terminated_length": 527.0625, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "entropy": 0.46863511204719543, "epoch": 1.9153439153439153, "frac_reward_zero_std": 0.5, "grad_norm": 1.390625, "kl": 0.0021068877540528774, "learning_rate": 7.5806999667402e-07, "loss": 0.0109, "num_tokens": 7018539.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 362, "step_time": 93.05886326078326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 664.125, "completions/mean_terminated_length": 664.125, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "entropy": 0.41859591007232666, "epoch": 1.9206349206349205, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.001816941425204277, "learning_rate": 7.566523842452956e-07, "loss": 0.0, "num_tokens": 7034125.0, "reward": 0.25, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 363, "step_time": 467.18558633420616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 868.125, "completions/mean_terminated_length": 712.25, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "entropy": 0.49992799758911133, "epoch": 1.925925925925926, "frac_reward_zero_std": 0.5, "grad_norm": 1.6953125, "kl": 0.0011530888732522726, "learning_rate": 7.552319651072163e-07, "loss": 0.0, "num_tokens": 7056823.0, "reward": 0.5546875, "reward_std": 0.09704047441482544, "rewards/itbench_correctness/mean": 0.5546875, "rewards/itbench_correctness/std": 0.3563961982727051, "step": 364, "step_time": 246.65094076655805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 850.6875, "completions/mean_terminated_length": 677.375, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "entropy": 0.46785688400268555, "epoch": 1.9312169312169312, "frac_reward_zero_std": 0.5, "grad_norm": 1.4921875, "kl": 0.001800206839106977, "learning_rate": 7.538087547932584e-07, "loss": 0.0001, "num_tokens": 7084386.0, "reward": 0.6041666865348816, "reward_std": 0.07386711239814758, "rewards/itbench_correctness/mean": 0.6041666865348816, "rewards/itbench_correctness/std": 0.42108768224716187, "step": 365, "step_time": 140.03905525244772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 627.0625, "completions/mean_terminated_length": 600.6000366210938, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "entropy": 0.5358317494392395, "epoch": 1.9365079365079365, "frac_reward_zero_std": 0.5, "grad_norm": 1.328125, "kl": 0.0013238092651590705, "learning_rate": 7.523827688674219e-07, "loss": 0.0181, "num_tokens": 7100155.0, "reward": 0.65625, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.65625, "rewards/itbench_correctness/std": 0.3966001570224762, "step": 366, "step_time": 214.51375654805452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 813.6875, "completions/mean_terminated_length": 650.1111450195312, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.7029725909233093, "epoch": 1.9417989417989419, "frac_reward_zero_std": 1.0, "grad_norm": 0.01507568359375, "kl": 0.0012179253390058875, "learning_rate": 7.509540229240601e-07, "loss": 0.0, "num_tokens": 7147734.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 367, "step_time": 142.35515129286796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 393.875, "completions/mean_terminated_length": 393.875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.35036495327949524, "epoch": 1.947089947089947, "frac_reward_zero_std": 1.0, "grad_norm": 0.01324462890625, "kl": 0.0013395985588431358, "learning_rate": 7.495225325877103e-07, "loss": 0.0, "num_tokens": 7157244.0, "reward": 0.0833333358168602, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0833333358168602, "rewards/itbench_correctness/std": 0.08606629818677902, "step": 368, "step_time": 50.80347699671984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 622.8125, "completions/mean_terminated_length": 530.2307739257812, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.34520822763442993, "epoch": 1.9523809523809523, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.002235216787084937, "learning_rate": 7.480883135129211e-07, "loss": -0.0501, "num_tokens": 7173433.0, "reward": 0.5, "reward_std": 0.4629100561141968, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 369, "step_time": 573.6945302598178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 703.75, "completions/mean_terminated_length": 558.1818237304688, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "entropy": 0.47744226455688477, "epoch": 1.9576719576719577, "frac_reward_zero_std": 0.5, "grad_norm": 1.5078125, "kl": 0.0014034686610102654, "learning_rate": 7.466513813840824e-07, "loss": 0.0179, "num_tokens": 7199237.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 370, "step_time": 871.0688090631738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 799.0, "completions/mean_terminated_length": 574.0, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.5306633114814758, "epoch": 1.9629629629629628, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0018356168875470757, "learning_rate": 7.452117519152541e-07, "loss": 0.0105, "num_tokens": 7218325.0, "reward": 0.21250000596046448, "reward_std": 0.21977336704730988, "rewards/itbench_correctness/mean": 0.21250000596046448, "rewards/itbench_correctness/std": 0.239095538854599, "step": 371, "step_time": 154.47280544694513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 673.0625, "completions/mean_terminated_length": 649.6666870117188, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "entropy": 0.3461788594722748, "epoch": 1.9682539682539684, "frac_reward_zero_std": 0.5, "grad_norm": 1.671875, "kl": 0.0013453871943056583, "learning_rate": 7.437694408499932e-07, "loss": -0.0357, "num_tokens": 7233958.0, "reward": 0.8125, "reward_std": 0.13908715546131134, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.2713136672973633, "step": 372, "step_time": 89.71556733455509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 590.9375, "completions/mean_terminated_length": 590.9375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.3553675413131714, "epoch": 1.9735449735449735, "frac_reward_zero_std": 0.5, "grad_norm": 0.98046875, "kl": 0.0020165969617664814, "learning_rate": 7.423244639611826e-07, "loss": -0.0436, "num_tokens": 7253629.0, "reward": 0.125, "reward_std": 0.10564428567886353, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.19364917278289795, "step": 373, "step_time": 113.8127332655713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 715.875, "completions/mean_terminated_length": 407.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5308189392089844, "epoch": 1.9788359788359788, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.0015742821851745248, "learning_rate": 7.408768370508576e-07, "loss": -0.0489, "num_tokens": 7276483.0, "reward": 0.4375, "reward_std": 0.4082317352294922, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 374, "step_time": 63.292789563536644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 612.8125, "completions/mean_terminated_length": 612.8125, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "entropy": 0.40632331371307373, "epoch": 1.9841269841269842, "frac_reward_zero_std": 0.5, "grad_norm": 0.89453125, "kl": 0.0012454271782189608, "learning_rate": 7.394265759500347e-07, "loss": -0.015, "num_tokens": 7296752.0, "reward": 0.40937501192092896, "reward_std": 0.16952534019947052, "rewards/itbench_correctness/mean": 0.40937501192092896, "rewards/itbench_correctness/std": 0.3946385681629181, "step": 375, "step_time": 100.02406275831163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 618.0625, "completions/mean_terminated_length": 302.3333435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4271412789821625, "epoch": 1.9894179894179893, "frac_reward_zero_std": 0.5, "grad_norm": 1.390625, "kl": 0.0022359276190400124, "learning_rate": 7.379736965185368e-07, "loss": -0.0336, "num_tokens": 7315561.0, "reward": 0.02083333395421505, "reward_std": 0.03857583925127983, "rewards/itbench_correctness/mean": 0.02083333395421505, "rewards/itbench_correctness/std": 0.05692750960588455, "step": 376, "step_time": 119.51777216419578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 737.875, "completions/mean_terminated_length": 451.75, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "entropy": 0.4445197284221649, "epoch": 1.9947089947089947, "frac_reward_zero_std": 0.5, "grad_norm": 1.234375, "kl": 0.0014217497082427144, "learning_rate": 7.365182146448204e-07, "loss": 0.006, "num_tokens": 7339351.0, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.4989572763442993, "step": 377, "step_time": 101.3526473660022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 957.8125, "completions/mean_terminated_length": 872.7142944335938, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "entropy": 0.5074061751365662, "epoch": 2.0, "frac_reward_zero_std": 0.5, "grad_norm": 1.4765625, "kl": 0.0012597291497513652, "learning_rate": 7.350601462458024e-07, "loss": 0.0172, "num_tokens": 7379628.0, "reward": 0.40625, "reward_std": 0.08210401982069016, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.43430978059768677, "step": 378, "step_time": 126.26444634236395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 574.6875, "completions/mean_terminated_length": 574.6875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 0.4402392506599426, "epoch": 2.005291005291005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0262451171875, "kl": 0.0012956882128491998, "learning_rate": 7.335995072666847e-07, "loss": 0.0, "num_tokens": 7393239.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 379, "step_time": 1022.2069364916533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 945.5625, "completions/mean_terminated_length": 844.7142944335938, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "entropy": 0.5710886120796204, "epoch": 2.0105820105820107, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0013581872917711735, "learning_rate": 7.321363136807818e-07, "loss": 0.0597, "num_tokens": 7419288.0, "reward": 0.49687498807907104, "reward_std": 0.14920906722545624, "rewards/itbench_correctness/mean": 0.49687498807907104, "rewards/itbench_correctness/std": 0.4410097301006317, "step": 380, "step_time": 104.49046333320439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 816.75, "completions/mean_terminated_length": 747.6666870117188, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "entropy": 0.38934803009033203, "epoch": 2.015873015873016, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.001262744888663292, "learning_rate": 7.306705814893439e-07, "loss": -0.0128, "num_tokens": 7437700.0, "reward": 0.4375, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.4281744360923767, "step": 381, "step_time": 86.5950373802334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 804.875, "completions/mean_terminated_length": 790.2667236328125, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "entropy": 0.47212299704551697, "epoch": 2.0211640211640214, "frac_reward_zero_std": 0.5, "grad_norm": 1.484375, "kl": 0.0021258334163576365, "learning_rate": 7.292023267213835e-07, "loss": -0.0265, "num_tokens": 7457698.0, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 382, "step_time": 89.82852033432573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 692.875, "completions/mean_terminated_length": 670.800048828125, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "entropy": 0.26556017994880676, "epoch": 2.0264550264550265, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.0009645685204304755, "learning_rate": 7.277315654334996e-07, "loss": -0.0005, "num_tokens": 7474384.0, "reward": 0.796875, "reward_std": 0.11100947856903076, "rewards/itbench_correctness/mean": 0.796875, "rewards/itbench_correctness/std": 0.1359764039516449, "step": 383, "step_time": 838.6957097211853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 654.375, "completions/mean_terminated_length": 601.5714721679688, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.39732569456100464, "epoch": 2.0317460317460316, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.0012180046178400517, "learning_rate": 7.262583137097018e-07, "loss": 0.0288, "num_tokens": 7490990.0, "reward": 0.6614583134651184, "reward_std": 0.30096644163131714, "rewards/itbench_correctness/mean": 0.6614583134651184, "rewards/itbench_correctness/std": 0.2926076054573059, "step": 384, "step_time": 613.9163551460952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 685.25, "completions/mean_terminated_length": 662.6666870117188, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "entropy": 0.41152864694595337, "epoch": 2.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.0014390156138688326, "learning_rate": 7.247825876612352e-07, "loss": -0.0331, "num_tokens": 7505778.0, "reward": 0.3324652910232544, "reward_std": 0.2620442807674408, "rewards/itbench_correctness/mean": 0.3324652910232544, "rewards/itbench_correctness/std": 0.35357046127319336, "step": 385, "step_time": 686.7393183000386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 706.1875, "completions/mean_terminated_length": 561.727294921875, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 0.4871227443218231, "epoch": 2.0423280423280423, "frac_reward_zero_std": 0.5, "grad_norm": 1.125, "kl": 0.0018794104689732194, "learning_rate": 7.233044034264033e-07, "loss": 0.0135, "num_tokens": 7523709.0, "reward": 0.9829545617103577, "reward_std": 0.023524951189756393, "rewards/itbench_correctness/mean": 0.9829545617103577, "rewards/itbench_correctness/std": 0.03664661943912506, "step": 386, "step_time": 79.07936265133321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 968.625, "completions/mean_terminated_length": 913.25, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "entropy": 0.35927215218544006, "epoch": 2.0476190476190474, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.001336527056992054, "learning_rate": 7.21823777170392e-07, "loss": -0.0067, "num_tokens": 7549655.0, "reward": 0.5, "reward_std": 0.3729091286659241, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.38490018248558044, "step": 387, "step_time": 140.9922649441287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 678.0625, "completions/mean_terminated_length": 470.5, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 0.45128583908081055, "epoch": 2.052910052910053, "frac_reward_zero_std": 0.5, "grad_norm": 1.328125, "kl": 0.0015049315989017487, "learning_rate": 7.203407250850928e-07, "loss": -0.0119, "num_tokens": 7567664.0, "reward": 0.859375, "reward_std": 0.19408094882965088, "rewards/itbench_correctness/mean": 0.859375, "rewards/itbench_correctness/std": 0.30233466625213623, "step": 388, "step_time": 802.1390054896474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 659.5625, "completions/mean_terminated_length": 635.2667236328125, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 0.36842605471611023, "epoch": 2.058201058201058, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078125, "kl": 0.000988945015706122, "learning_rate": 7.188552633889259e-07, "loss": 0.0073, "num_tokens": 7583641.0, "reward": 0.40625, "reward_std": 0.01767767034471035, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.420267790555954, "step": 389, "step_time": 102.60351053066552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 627.5, "completions/mean_terminated_length": 536.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.4270916283130646, "epoch": 2.0634920634920633, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.0013897939352318645, "learning_rate": 7.173674083266623e-07, "loss": -0.0168, "num_tokens": 7597833.0, "reward": 0.4437499940395355, "reward_std": 0.08210402727127075, "rewards/itbench_correctness/mean": 0.4437499940395355, "rewards/itbench_correctness/std": 0.4718315303325653, "step": 390, "step_time": 941.484922320582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 509.375, "completions/mean_terminated_length": 509.375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.4750920236110687, "epoch": 2.068783068783069, "frac_reward_zero_std": 0.5, "grad_norm": 0.98828125, "kl": 0.001085254829376936, "learning_rate": 7.158771761692464e-07, "loss": -0.0065, "num_tokens": 7608767.0, "reward": 0.6937500238418579, "reward_std": 0.1399936079978943, "rewards/itbench_correctness/mean": 0.6937500238418579, "rewards/itbench_correctness/std": 0.3696281909942627, "step": 391, "step_time": 484.2868151040748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 783.5, "completions/mean_terminated_length": 543.0, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 0.5437141060829163, "epoch": 2.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.0014096169034019113, "learning_rate": 7.143845832136187e-07, "loss": 0.0198, "num_tokens": 7629503.0, "reward": 0.390625, "reward_std": 0.35405686497688293, "rewards/itbench_correctness/mean": 0.390625, "rewards/itbench_correctness/std": 0.40019527077674866, "step": 392, "step_time": 247.14675129018724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "entropy": 0.4428223967552185, "epoch": 2.0793650793650795, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.002443905221298337, "learning_rate": 7.128896457825363e-07, "loss": -0.0199, "num_tokens": 7638431.0, "reward": 0.42500001192092896, "reward_std": 0.1060660183429718, "rewards/itbench_correctness/mean": 0.42500001192092896, "rewards/itbench_correctness/std": 0.12382783740758896, "step": 393, "step_time": 81.88051935099065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 575.75, "completions/mean_terminated_length": 575.75, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.33174121379852295, "epoch": 2.0846560846560847, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.002629755064845085, "learning_rate": 7.113923802243956e-07, "loss": -0.0546, "num_tokens": 7652163.0, "reward": 0.3479166626930237, "reward_std": 0.15140824019908905, "rewards/itbench_correctness/mean": 0.3479166626930237, "rewards/itbench_correctness/std": 0.14930394291877747, "step": 394, "step_time": 82.69239473901689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 812.25, "completions/mean_terminated_length": 741.6666870117188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5195444822311401, "epoch": 2.0899470899470898, "frac_reward_zero_std": 0.5, "grad_norm": 1.3203125, "kl": 0.0016354058170691133, "learning_rate": 7.098928029130528e-07, "loss": -0.0397, "num_tokens": 7684783.0, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 395, "step_time": 280.40570612065494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 763.375, "completions/mean_terminated_length": 726.1428833007812, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.5344686508178711, "epoch": 2.0952380952380953, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.0014837183989584446, "learning_rate": 7.083909302476452e-07, "loss": 0.0739, "num_tokens": 7704229.0, "reward": 0.2749999761581421, "reward_std": 0.17728103697299957, "rewards/itbench_correctness/mean": 0.2749999761581421, "rewards/itbench_correctness/std": 0.3732738196849823, "step": 396, "step_time": 85.86852881591767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 552.5625, "completions/mean_terminated_length": 552.5625, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.46148625016212463, "epoch": 2.1005291005291005, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.0017779659247025847, "learning_rate": 7.068867786524115e-07, "loss": 0.0021, "num_tokens": 7725262.0, "reward": 0.6354166269302368, "reward_std": 0.36623916029930115, "rewards/itbench_correctness/mean": 0.6354166269302368, "rewards/itbench_correctness/std": 0.41373974084854126, "step": 397, "step_time": 108.55369200650603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 638.5, "completions/mean_terminated_length": 463.2727355957031, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.46045419573783875, "epoch": 2.105820105820106, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0019145008409395814, "learning_rate": 7.053803645765127e-07, "loss": -0.0524, "num_tokens": 7743718.0, "reward": 0.3229166567325592, "reward_std": 0.18293291330337524, "rewards/itbench_correctness/mean": 0.3229166567325592, "rewards/itbench_correctness/std": 0.25783106684684753, "step": 398, "step_time": 124.76364956516773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 718.875, "completions/mean_terminated_length": 481.5555725097656, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.3672404885292053, "epoch": 2.111111111111111, "frac_reward_zero_std": 0.5, "grad_norm": 1.3125, "kl": 0.001165916328318417, "learning_rate": 7.038717044938518e-07, "loss": 0.0019, "num_tokens": 7766796.0, "reward": 0.4196428656578064, "reward_std": 0.11090338230133057, "rewards/itbench_correctness/mean": 0.4196428656578064, "rewards/itbench_correctness/std": 0.45912888646125793, "step": 399, "step_time": 92.30510796047747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 742.4375, "completions/mean_terminated_length": 702.2142944335938, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.4337065517902374, "epoch": 2.1164021164021163, "frac_reward_zero_std": 0.5, "grad_norm": 1.25, "kl": 0.0014574953820556402, "learning_rate": 7.023608149028936e-07, "loss": -0.0175, "num_tokens": 7783843.0, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 400, "step_time": 163.5410966835916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 410.75, "completions/mean_terminated_length": 410.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.41387704014778137, "epoch": 2.121693121693122, "frac_reward_zero_std": 0.5, "grad_norm": 0.91796875, "kl": 0.001556264702230692, "learning_rate": 7.008477123264847e-07, "loss": -0.0122, "num_tokens": 7792695.0, "reward": 0.625, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.4281744360923767, "step": 401, "step_time": 68.41044199559838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 720.375, "completions/mean_terminated_length": 538.2000122070312, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 0.3942390978336334, "epoch": 2.126984126984127, "frac_reward_zero_std": 0.5, "grad_norm": 1.5234375, "kl": 0.0016359214205294847, "learning_rate": 6.993324133116725e-07, "loss": 0.026, "num_tokens": 7820917.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 402, "step_time": 80.19952200446278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 813.4375, "completions/mean_terminated_length": 649.6666870117188, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "entropy": 0.4278140664100647, "epoch": 2.132275132275132, "frac_reward_zero_std": 0.5, "grad_norm": 1.125, "kl": 0.0014102754648774862, "learning_rate": 6.978149344295241e-07, "loss": 0.0034, "num_tokens": 7839988.0, "reward": 0.8035714626312256, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.8035714626312256, "rewards/itbench_correctness/std": 0.1907735913991928, "step": 403, "step_time": 899.5258836848661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 411.5625, "completions/mean_terminated_length": 411.5625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.4130599796772003, "epoch": 2.1375661375661377, "frac_reward_zero_std": 0.5, "grad_norm": 0.7421875, "kl": 0.0012595870066434145, "learning_rate": 6.962952922749457e-07, "loss": -0.051, "num_tokens": 7853909.0, "reward": 0.39008620381355286, "reward_std": 0.2060610055923462, "rewards/itbench_correctness/mean": 0.39008620381355286, "rewards/itbench_correctness/std": 0.4915003180503845, "step": 404, "step_time": 101.09288472961634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 735.25, "completions/mean_terminated_length": 604.0, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 0.5277116894721985, "epoch": 2.142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 1.609375, "kl": 0.0016202793922275305, "learning_rate": 6.947735034665001e-07, "loss": 0.0106, "num_tokens": 7879449.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.25, "step": 405, "step_time": 199.16461731493473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 484.25, "completions/mean_terminated_length": 484.25, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.35105833411216736, "epoch": 2.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.0018317453796043992, "learning_rate": 6.932495846462261e-07, "loss": 0.0256, "num_tokens": 7889989.0, "reward": 0.5866477489471436, "reward_std": 0.1253555417060852, "rewards/itbench_correctness/mean": 0.5866477489471436, "rewards/itbench_correctness/std": 0.4179156720638275, "step": 406, "step_time": 137.3452343745157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 487.5, "completions/mean_terminated_length": 487.5, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "entropy": 0.38564103841781616, "epoch": 2.1534391534391535, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0013662968995049596, "learning_rate": 6.917235524794558e-07, "loss": 0.009, "num_tokens": 7901029.0, "reward": 0.890625, "reward_std": 0.22707363963127136, "rewards/itbench_correctness/mean": 0.890625, "rewards/itbench_correctness/std": 0.22302372753620148, "step": 407, "step_time": 68.24169243406504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 953.25, "completions/mean_terminated_length": 862.2857666015625, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "entropy": 0.36926305294036865, "epoch": 2.1587301587301586, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.0011503396090120077, "learning_rate": 6.901954236546324e-07, "loss": -0.0107, "num_tokens": 7924657.0, "reward": 0.5640318393707275, "reward_std": 0.34312185645103455, "rewards/itbench_correctness/mean": 0.5640318393707275, "rewards/itbench_correctness/std": 0.37077954411506653, "step": 408, "step_time": 846.7672138344496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 768.3125, "completions/mean_terminated_length": 569.4444580078125, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "entropy": 0.41389408707618713, "epoch": 2.164021164021164, "frac_reward_zero_std": 0.5, "grad_norm": 1.1875, "kl": 0.0032037936616688967, "learning_rate": 6.886652148831279e-07, "loss": 0.0154, "num_tokens": 7947958.0, "reward": 0.1041666716337204, "reward_std": 0.19795581698417664, "rewards/itbench_correctness/mean": 0.1041666716337204, "rewards/itbench_correctness/std": 0.291070818901062, "step": 409, "step_time": 180.43269913457334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 513.1875, "completions/mean_terminated_length": 513.1875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.378029465675354, "epoch": 2.1693121693121693, "frac_reward_zero_std": 0.5, "grad_norm": 0.458984375, "kl": 0.0019971805159002542, "learning_rate": 6.871329428990601e-07, "loss": -0.0628, "num_tokens": 7959761.0, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 410, "step_time": 105.00821590330452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 519.75, "completions/mean_terminated_length": 519.75, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "entropy": 0.4290524423122406, "epoch": 2.1746031746031744, "frac_reward_zero_std": 1.0, "grad_norm": 0.037841796875, "kl": 0.0021021170541644096, "learning_rate": 6.855986244591103e-07, "loss": 0.0, "num_tokens": 7976613.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 411, "step_time": 872.8376494199038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 883.0625, "completions/mean_terminated_length": 773.4444580078125, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "entropy": 0.5888597965240479, "epoch": 2.17989417989418, "frac_reward_zero_std": 0.5, "grad_norm": 1.7109375, "kl": 0.002118014730513096, "learning_rate": 6.840622763423391e-07, "loss": 0.0001, "num_tokens": 8008030.0, "reward": 0.25, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 412, "step_time": 87.85907210037112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 798.0625, "completions/mean_terminated_length": 572.125, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "entropy": 0.3759104013442993, "epoch": 2.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.0015699933283030987, "learning_rate": 6.825239153500029e-07, "loss": 0.0035, "num_tokens": 8027279.0, "reward": 0.4518229365348816, "reward_std": 0.22612185776233673, "rewards/itbench_correctness/mean": 0.4518229365348816, "rewards/itbench_correctness/std": 0.41147592663764954, "step": 413, "step_time": 98.12107760738581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 754.125, "completions/mean_terminated_length": 484.25, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.5648930668830872, "epoch": 2.1904761904761907, "frac_reward_zero_std": 0.5, "grad_norm": 1.5234375, "kl": 0.0014875817578285933, "learning_rate": 6.809835583053715e-07, "loss": 0.0, "num_tokens": 8047225.0, "reward": 0.6000000238418579, "reward_std": 0.13887301087379456, "rewards/itbench_correctness/mean": 0.6000000238418579, "rewards/itbench_correctness/std": 0.4546060562133789, "step": 414, "step_time": 132.7436649715528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 362.0625, "completions/mean_terminated_length": 362.0625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.4474365711212158, "epoch": 2.195767195767196, "frac_reward_zero_std": 0.5, "grad_norm": 1.0546875, "kl": 0.0019536821637302637, "learning_rate": 6.794412220535425e-07, "loss": 0.0011, "num_tokens": 8055514.0, "reward": 0.21875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.2561737895011902, "step": 415, "step_time": 80.36343740858138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 892.25, "completions/mean_terminated_length": 760.5, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "entropy": 0.542448878288269, "epoch": 2.201058201058201, "frac_reward_zero_std": 0.5, "grad_norm": 1.4921875, "kl": 0.001819648896344006, "learning_rate": 6.778969234612583e-07, "loss": 0.0001, "num_tokens": 8082102.0, "reward": 0.640625, "reward_std": 0.0867956355214119, "rewards/itbench_correctness/mean": 0.640625, "rewards/itbench_correctness/std": 0.3896446228027344, "step": 416, "step_time": 243.75771763175726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 1010.1875, "completions/mean_terminated_length": 913.5, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "entropy": 0.6018684506416321, "epoch": 2.2063492063492065, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0012090131640434265, "learning_rate": 6.763506794167206e-07, "loss": 0.0067, "num_tokens": 8108297.0, "reward": 0.46875, "reward_std": 0.2346404492855072, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.4905354380607605, "step": 417, "step_time": 88.06844450253993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 861.75, "completions/mean_terminated_length": 764.4000244140625, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "entropy": 0.5616478323936462, "epoch": 2.2116402116402116, "frac_reward_zero_std": 0.5, "grad_norm": 1.375, "kl": 0.0016445523360744119, "learning_rate": 6.748025068294067e-07, "loss": -0.0042, "num_tokens": 8138485.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 418, "step_time": 170.92238603066653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 637.4375, "completions/mean_terminated_length": 611.6666870117188, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.5302480459213257, "epoch": 2.2169312169312168, "frac_reward_zero_std": 0.5, "grad_norm": 1.3359375, "kl": 0.0013207794399932027, "learning_rate": 6.732524226298841e-07, "loss": -0.0056, "num_tokens": 8171140.0, "reward": 0.21875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.2561737895011902, "step": 419, "step_time": 438.8370144786313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 541.875, "completions/mean_terminated_length": 541.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.44844290614128113, "epoch": 2.2222222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.001750895637087524, "learning_rate": 6.717004437696249e-07, "loss": -0.0712, "num_tokens": 8183074.0, "reward": 0.796875, "reward_std": 0.1886717677116394, "rewards/itbench_correctness/mean": 0.796875, "rewards/itbench_correctness/std": 0.25634312629699707, "step": 420, "step_time": 80.88844703137875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 709.4375, "completions/mean_terminated_length": 520.7000122070312, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "entropy": 0.28755176067352295, "epoch": 2.2275132275132274, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.001124854083172977, "learning_rate": 6.701465872208216e-07, "loss": 0.0004, "num_tokens": 8200969.0, "reward": 0.59375, "reward_std": 0.36201947927474976, "rewards/itbench_correctness/mean": 0.59375, "rewards/itbench_correctness/std": 0.3598804175853729, "step": 421, "step_time": 135.98304109089077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 456.5625, "completions/mean_terminated_length": 456.5625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.37234771251678467, "epoch": 2.2328042328042326, "frac_reward_zero_std": 0.5, "grad_norm": 0.7578125, "kl": 0.001798869576305151, "learning_rate": 6.685908699762001e-07, "loss": -0.0517, "num_tokens": 8211418.0, "reward": 0.3270833492279053, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.3270833492279053, "rewards/itbench_correctness/std": 0.24227891862392426, "step": 422, "step_time": 66.6123378733173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 658.6875, "completions/mean_terminated_length": 658.6875, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "entropy": 0.30059778690338135, "epoch": 2.238095238095238, "frac_reward_zero_std": 0.5, "grad_norm": 1.34375, "kl": 0.00184349634218961, "learning_rate": 6.670333090488356e-07, "loss": -0.0076, "num_tokens": 8227349.0, "reward": 0.65625, "reward_std": 0.0578637570142746, "rewards/itbench_correctness/mean": 0.65625, "rewards/itbench_correctness/std": 0.08539126068353653, "step": 423, "step_time": 148.7072524903342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 991.8125, "completions/mean_terminated_length": 766.5, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "entropy": 0.25408029556274414, "epoch": 2.2433862433862433, "frac_reward_zero_std": 0.5, "grad_norm": 1.296875, "kl": 0.001193932956084609, "learning_rate": 6.654739214719641e-07, "loss": -0.0169, "num_tokens": 8252114.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 424, "step_time": 569.5789128560573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 811.375, "completions/mean_terminated_length": 598.75, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "entropy": 0.4436912536621094, "epoch": 2.248677248677249, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0011822007363662124, "learning_rate": 6.639127242987987e-07, "loss": -0.0078, "num_tokens": 8271904.0, "reward": 0.65625, "reward_std": 0.3243582546710968, "rewards/itbench_correctness/mean": 0.65625, "rewards/itbench_correctness/std": 0.3966001570224762, "step": 425, "step_time": 137.0775876250118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 793.625, "completions/mean_terminated_length": 688.9091186523438, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "entropy": 0.44857457280158997, "epoch": 2.253968253968254, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0012536750873550773, "learning_rate": 6.623497346023417e-07, "loss": 0.0162, "num_tokens": 8290466.0, "reward": 0.2544642686843872, "reward_std": 0.20485526323318481, "rewards/itbench_correctness/mean": 0.2544642686843872, "rewards/itbench_correctness/std": 0.2508065104484558, "step": 426, "step_time": 381.0265443623066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 668.5625, "completions/mean_terminated_length": 668.5625, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.4487239420413971, "epoch": 2.259259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.0014325481606647372, "learning_rate": 6.607849694751977e-07, "loss": -0.0007, "num_tokens": 8311259.0, "reward": 0.800000011920929, "reward_std": 0.09974324703216553, "rewards/itbench_correctness/mean": 0.800000011920929, "rewards/itbench_correctness/std": 0.10954451560974121, "step": 427, "step_time": 78.88445997610688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 581.875, "completions/mean_terminated_length": 581.875, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "entropy": 0.36090224981307983, "epoch": 2.2645502645502646, "frac_reward_zero_std": 0.0, "grad_norm": 13.25, "kl": 0.0013826474314555526, "learning_rate": 6.592184460293877e-07, "loss": 0.0109, "num_tokens": 8325825.0, "reward": 0.4765625, "reward_std": 0.3056884706020355, "rewards/itbench_correctness/mean": 0.4765625, "rewards/itbench_correctness/std": 0.4062500298023224, "step": 428, "step_time": 880.099565721117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 642.125, "completions/mean_terminated_length": 587.5714721679688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3862176239490509, "epoch": 2.2698412698412698, "frac_reward_zero_std": 0.5, "grad_norm": 1.21875, "kl": 0.0022476972080767155, "learning_rate": 6.576501813961608e-07, "loss": -0.0962, "num_tokens": 8351099.0, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 429, "step_time": 397.64244225714356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 775.5, "completions/mean_terminated_length": 718.1538696289062, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "entropy": 0.5390070676803589, "epoch": 2.2751322751322753, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0013746700715273619, "learning_rate": 6.560801927258079e-07, "loss": 0.0052, "num_tokens": 8370627.0, "reward": 0.6875, "reward_std": 0.44403791427612305, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 430, "step_time": 87.13009965512902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 880.875, "completions/mean_terminated_length": 737.75, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "entropy": 0.5494536757469177, "epoch": 2.2804232804232805, "frac_reward_zero_std": 0.5, "grad_norm": 1.546875, "kl": 0.0012544175842776895, "learning_rate": 6.545084971874736e-07, "loss": 0.0, "num_tokens": 8397801.0, "reward": 0.11249999701976776, "reward_std": 0.09449111670255661, "rewards/itbench_correctness/mean": 0.11249999701976776, "rewards/itbench_correctness/std": 0.12974333763122559, "step": 431, "step_time": 98.64099729061127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 733.5625, "completions/mean_terminated_length": 559.2999877929688, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "entropy": 0.42532163858413696, "epoch": 2.2857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0014970600605010986, "learning_rate": 6.529351119689687e-07, "loss": 0.0263, "num_tokens": 8413058.0, "reward": 0.4921875, "reward_std": 0.10436524450778961, "rewards/itbench_correctness/mean": 0.4921875, "rewards/itbench_correctness/std": 0.47096699476242065, "step": 432, "step_time": 579.7829250898212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 803.1875, "completions/mean_terminated_length": 729.5833740234375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.3336705267429352, "epoch": 2.291005291005291, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0011378336930647492, "learning_rate": 6.513600542765816e-07, "loss": -0.0063, "num_tokens": 8433925.0, "reward": 0.78125, "reward_std": 0.13837619125843048, "rewards/itbench_correctness/mean": 0.78125, "rewards/itbench_correctness/std": 0.19924625754356384, "step": 433, "step_time": 182.92570608016104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 910.25, "completions/mean_terminated_length": 796.5, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "entropy": 0.3559461832046509, "epoch": 2.2962962962962963, "frac_reward_zero_std": 0.5, "grad_norm": 1.4140625, "kl": 0.0012446820037439466, "learning_rate": 6.497833413348909e-07, "loss": -0.0338, "num_tokens": 8462945.0, "reward": 0.5625, "reward_std": 0.03857583925127983, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.454911470413208, "step": 434, "step_time": 330.48511962778866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 839.0625, "completions/mean_terminated_length": 654.125, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "entropy": 0.464804470539093, "epoch": 2.3015873015873014, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.0014192892704159021, "learning_rate": 6.482049903865768e-07, "loss": 0.008, "num_tokens": 8487602.0, "reward": 0.8541666865348816, "reward_std": 0.3027648329734802, "rewards/itbench_correctness/mean": 0.8541666865348816, "rewards/itbench_correctness/std": 0.2973649799823761, "step": 435, "step_time": 126.97447157558054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 963.5, "completions/mean_terminated_length": 916.4444580078125, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "entropy": 0.39647120237350464, "epoch": 2.306878306878307, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0012867730110883713, "learning_rate": 6.466250186922324e-07, "loss": 0.0023, "num_tokens": 8509778.0, "reward": 0.697578489780426, "reward_std": 0.291044145822525, "rewards/itbench_correctness/mean": 0.697578489780426, "rewards/itbench_correctness/std": 0.3142254650592804, "step": 436, "step_time": 449.84700517356396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 460.5625, "completions/mean_terminated_length": 460.5625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.4451078772544861, "epoch": 2.312169312169312, "frac_reward_zero_std": 1.0, "grad_norm": 0.02685546875, "kl": 0.0015331042231991887, "learning_rate": 6.450434435301751e-07, "loss": 0.0, "num_tokens": 8519963.0, "reward": 0.3333333432674408, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.3333333432674408, "rewards/itbench_correctness/std": 0.17213259637355804, "step": 437, "step_time": 796.3606786699966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 611.5, "completions/mean_terminated_length": 424.0, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "entropy": 0.4153720438480377, "epoch": 2.317460317460317, "frac_reward_zero_std": 0.5, "grad_norm": 0.9453125, "kl": 0.0015486880438402295, "learning_rate": 6.43460282196257e-07, "loss": 0.008, "num_tokens": 8541179.0, "reward": 0.2395833432674408, "reward_std": 0.0883883535861969, "rewards/itbench_correctness/mean": 0.2395833432674408, "rewards/itbench_correctness/std": 0.27533650398254395, "step": 438, "step_time": 145.90149160753936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 577.0625, "completions/mean_terminated_length": 577.0625, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.5198743343353271, "epoch": 2.322751322751323, "frac_reward_zero_std": 0.5, "grad_norm": 1.1171875, "kl": 0.0017036347417160869, "learning_rate": 6.418755520036774e-07, "loss": 0.0056, "num_tokens": 8558452.0, "reward": 0.765625, "reward_std": 0.09300297498703003, "rewards/itbench_correctness/mean": 0.765625, "rewards/itbench_correctness/std": 0.2733854353427887, "step": 439, "step_time": 154.49192036502063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 362.25, "completions/mean_terminated_length": 362.25, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.4306418299674988, "epoch": 2.328042328042328, "frac_reward_zero_std": 0.5, "grad_norm": 0.94140625, "kl": 0.0017033821204677224, "learning_rate": 6.402892702827916e-07, "loss": -0.0083, "num_tokens": 8566496.0, "reward": 0.1953125, "reward_std": 0.08956430107355118, "rewards/itbench_correctness/mean": 0.1953125, "rewards/itbench_correctness/std": 0.2359323352575302, "step": 440, "step_time": 84.76937860064209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 722.875, "completions/mean_terminated_length": 622.5, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "entropy": 0.40670931339263916, "epoch": 2.3333333333333335, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0013241568813100457, "learning_rate": 6.387014543809223e-07, "loss": 0.0764, "num_tokens": 8586822.0, "reward": 0.4375, "reward_std": 0.3339453935623169, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.35420751571655273, "step": 441, "step_time": 146.5532330982387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.48046875, "epoch": 2.3386243386243386, "frac_reward_zero_std": 1.0, "grad_norm": 0.0238037109375, "kl": 0.0011454581981524825, "learning_rate": 6.371121216621697e-07, "loss": 0.0, "num_tokens": 8615486.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 442, "step_time": 117.81919787544757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 531.1875, "completions/mean_terminated_length": 531.1875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "entropy": 0.3595717251300812, "epoch": 2.3439153439153437, "frac_reward_zero_std": 0.5, "grad_norm": 1.0625, "kl": 0.001713753561489284, "learning_rate": 6.355212895072222e-07, "loss": -0.0025, "num_tokens": 8627873.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 443, "step_time": 1035.1622464098036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 728.5, "completions/mean_terminated_length": 594.1818237304688, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.3816060423851013, "epoch": 2.3492063492063493, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0015201023779809475, "learning_rate": 6.339289753131648e-07, "loss": 0.0442, "num_tokens": 8645001.0, "reward": 0.109375, "reward_std": 0.30935919284820557, "rewards/itbench_correctness/mean": 0.109375, "rewards/itbench_correctness/std": 0.30233466625213623, "step": 444, "step_time": 933.9612167160958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 916.4375, "completions/mean_terminated_length": 808.875, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.49976131319999695, "epoch": 2.3544973544973544, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0014402419328689575, "learning_rate": 6.323351964932908e-07, "loss": 0.0092, "num_tokens": 8666528.0, "reward": 0.44999998807907104, "reward_std": 0.09669842571020126, "rewards/itbench_correctness/mean": 0.44999998807907104, "rewards/itbench_correctness/std": 0.41733282804489136, "step": 445, "step_time": 141.34655232075602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 649.5, "completions/mean_terminated_length": 358.22222900390625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4064665138721466, "epoch": 2.35978835978836, "frac_reward_zero_std": 0.0, "grad_norm": 6.28125, "kl": 0.0020702362526208162, "learning_rate": 6.307399704769098e-07, "loss": -0.1482, "num_tokens": 8692528.0, "reward": 0.2447916716337204, "reward_std": 0.1262161284685135, "rewards/itbench_correctness/mean": 0.2447916716337204, "rewards/itbench_correctness/std": 0.16020458936691284, "step": 446, "step_time": 118.47124487534165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 942.625, "completions/mean_terminated_length": 861.25, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "entropy": 0.50921630859375, "epoch": 2.365079365079365, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0014145068125799298, "learning_rate": 6.291433147091583e-07, "loss": 0.0104, "num_tokens": 8714466.0, "reward": 0.4854166805744171, "reward_std": 0.3144327402114868, "rewards/itbench_correctness/mean": 0.4854166805744171, "rewards/itbench_correctness/std": 0.393459290266037, "step": 447, "step_time": 369.4135863818228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 899.375, "completions/mean_terminated_length": 691.6666870117188, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.5092425346374512, "epoch": 2.3703703703703702, "frac_reward_zero_std": 0.5, "grad_norm": 1.484375, "kl": 0.00447537936270237, "learning_rate": 6.275452466508075e-07, "loss": -0.0622, "num_tokens": 8750192.0, "reward": 0.09375, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.09375, "rewards/itbench_correctness/std": 0.20155644416809082, "step": 448, "step_time": 469.61162946000695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 550.5, "completions/mean_terminated_length": 550.5, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "entropy": 0.4396003484725952, "epoch": 2.375661375661376, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.0023843871895223856, "learning_rate": 6.259457837780741e-07, "loss": 0.0034, "num_tokens": 8762320.0, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 449, "step_time": 616.3559736898169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 805.8125, "completions/mean_terminated_length": 587.625, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "entropy": 0.6304196119308472, "epoch": 2.380952380952381, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0017924520652741194, "learning_rate": 6.243449435824276e-07, "loss": -0.0157, "num_tokens": 8783789.0, "reward": 0.5416666865348816, "reward_std": 0.2629520893096924, "rewards/itbench_correctness/mean": 0.5416666865348816, "rewards/itbench_correctness/std": 0.43885374069213867, "step": 450, "step_time": 88.87441652361304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 751.1875, "completions/mean_terminated_length": 539.0, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.4606040418148041, "epoch": 2.386243386243386, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0016598474467173219, "learning_rate": 6.227427435703995e-07, "loss": -0.0021, "num_tokens": 8807136.0, "reward": 0.5546875, "reward_std": 0.2294243574142456, "rewards/itbench_correctness/mean": 0.5546875, "rewards/itbench_correctness/std": 0.3080184757709503, "step": 451, "step_time": 986.2314578304067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 817.625, "completions/mean_terminated_length": 770.0, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "entropy": 0.5087907314300537, "epoch": 2.3915343915343916, "frac_reward_zero_std": 0.5, "grad_norm": 1.359375, "kl": 0.001573887187987566, "learning_rate": 6.211392012633931e-07, "loss": -0.0123, "num_tokens": 8831826.0, "reward": 0.59375, "reward_std": 0.18600594997406006, "rewards/itbench_correctness/mean": 0.59375, "rewards/itbench_correctness/std": 0.4905354380607605, "step": 452, "step_time": 369.9421289321035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 602.0, "completions/mean_terminated_length": 602.0, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "entropy": 0.49501660466194153, "epoch": 2.3968253968253967, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.0018204387743026018, "learning_rate": 6.1953433419749e-07, "loss": 0.0146, "num_tokens": 8844602.0, "reward": 0.4765625, "reward_std": 0.10795740783214569, "rewards/itbench_correctness/mean": 0.4765625, "rewards/itbench_correctness/std": 0.2781464755535126, "step": 453, "step_time": 103.37166160158813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 449.25, "completions/mean_terminated_length": 449.25, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.5052865743637085, "epoch": 2.402116402116402, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0016118157655000687, "learning_rate": 6.17928159923259e-07, "loss": 0.0095, "num_tokens": 8854590.0, "reward": 0.71875, "reward_std": 0.35564959049224854, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.44604745507240295, "step": 454, "step_time": 980.6566639961675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 527.625, "completions/mean_terminated_length": 527.625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "entropy": 0.48140251636505127, "epoch": 2.4074074074074074, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078125, "kl": 0.0013566534034907818, "learning_rate": 6.163206960055652e-07, "loss": -0.0056, "num_tokens": 8868488.0, "reward": 0.21875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.2561737895011902, "step": 455, "step_time": 94.81741558108479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 838.3125, "completions/mean_terminated_length": 652.625, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "entropy": 0.37217625975608826, "epoch": 2.4126984126984126, "frac_reward_zero_std": 0.5, "grad_norm": 1.3515625, "kl": 0.0014726277440786362, "learning_rate": 6.147119600233758e-07, "loss": -0.0124, "num_tokens": 8892141.0, "reward": 0.5520833134651184, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.5520833134651184, "rewards/itbench_correctness/std": 0.4781087338924408, "step": 456, "step_time": 774.9578263629228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 805.3125, "completions/mean_terminated_length": 732.4166870117188, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.4619324803352356, "epoch": 2.417989417989418, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.0020939442329108715, "learning_rate": 6.131019695695702e-07, "loss": 0.0001, "num_tokens": 8917394.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 457, "step_time": 160.94475755654275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 415.625, "completions/mean_terminated_length": 415.625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.4980451166629791, "epoch": 2.4232804232804233, "frac_reward_zero_std": 0.5, "grad_norm": 1.2421875, "kl": 0.002369649475440383, "learning_rate": 6.114907422507459e-07, "loss": 0.0046, "num_tokens": 8926548.0, "reward": 0.625, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.3535533845424652, "step": 458, "step_time": 130.5550601184368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 761.1875, "completions/mean_terminated_length": 641.727294921875, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "entropy": 0.3757287263870239, "epoch": 2.4285714285714284, "frac_reward_zero_std": 0.5, "grad_norm": 1.4765625, "kl": 0.0018944862531498075, "learning_rate": 6.098782956870265e-07, "loss": 0.0028, "num_tokens": 8944471.0, "reward": 0.1875, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 459, "step_time": 351.48624353297055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 730.5, "completions/mean_terminated_length": 502.22222900390625, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "entropy": 0.5448322892189026, "epoch": 2.433862433862434, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.0037839075084775686, "learning_rate": 6.082646475118699e-07, "loss": 0.0032, "num_tokens": 8962455.0, "reward": 0.10880681872367859, "reward_std": 0.1294855922460556, "rewards/itbench_correctness/mean": 0.10880681872367859, "rewards/itbench_correctness/std": 0.12925373017787933, "step": 460, "step_time": 137.66368599049747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 897.5, "completions/mean_terminated_length": 799.1111450195312, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "entropy": 0.41671308875083923, "epoch": 2.439153439153439, "frac_reward_zero_std": 1.0, "grad_norm": 0.041015625, "kl": 0.0014645819319412112, "learning_rate": 6.066498153718734e-07, "loss": 0.0001, "num_tokens": 8993351.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 461, "step_time": 331.1069351742044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 638.3125, "completions/mean_terminated_length": 638.3125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.333692342042923, "epoch": 2.4444444444444446, "frac_reward_zero_std": 0.5, "grad_norm": 0.9453125, "kl": 0.0018866208847612143, "learning_rate": 6.05033816926583e-07, "loss": -0.0577, "num_tokens": 9008684.0, "reward": 0.0520833358168602, "reward_std": 0.043129097670316696, "rewards/itbench_correctness/mean": 0.0520833358168602, "rewards/itbench_correctness/std": 0.07978560030460358, "step": 462, "step_time": 83.101976220496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 986.3125, "completions/mean_terminated_length": 873.25, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "entropy": 0.42785629630088806, "epoch": 2.4497354497354498, "frac_reward_zero_std": 0.5, "grad_norm": 1.3828125, "kl": 0.0011906343279406428, "learning_rate": 6.034166698482983e-07, "loss": 0.0241, "num_tokens": 9031329.0, "reward": 0.1875, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 463, "step_time": 6071.444487111643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 587.5, "completions/mean_terminated_length": 442.0, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.5753191709518433, "epoch": 2.455026455026455, "frac_reward_zero_std": 0.5, "grad_norm": 1.484375, "kl": 0.0014363499358296394, "learning_rate": 6.017983918218811e-07, "loss": -0.0264, "num_tokens": 9064257.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 464, "step_time": 156.23662452865392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 489.0, "completions/mean_terminated_length": 453.3333435058594, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.5644171833992004, "epoch": 2.4603174603174605, "frac_reward_zero_std": 0.5, "grad_norm": 1.09375, "kl": 0.0021656756289303303, "learning_rate": 6.001790005445606e-07, "loss": 0.0027, "num_tokens": 9093129.0, "reward": 0.359375, "reward_std": 0.19408094882965088, "rewards/itbench_correctness/mean": 0.359375, "rewards/itbench_correctness/std": 0.4561501145362854, "step": 465, "step_time": 114.18916879687458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 454.1875, "completions/mean_terminated_length": 454.1875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.45355716347694397, "epoch": 2.4656084656084656, "frac_reward_zero_std": 0.5, "grad_norm": 0.7421875, "kl": 0.0023789138067513704, "learning_rate": 5.985585137257401e-07, "loss": -0.0632, "num_tokens": 9103764.0, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 466, "step_time": 111.49137642700225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 459.8125, "completions/mean_terminated_length": 459.8125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "entropy": 0.3784151077270508, "epoch": 2.4708994708994707, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0017901118844747543, "learning_rate": 5.969369490868042e-07, "loss": 0.0151, "num_tokens": 9114825.0, "reward": 0.6927083730697632, "reward_std": 0.1530819982290268, "rewards/itbench_correctness/mean": 0.6927083730697632, "rewards/itbench_correctness/std": 0.26652559638023376, "step": 467, "step_time": 45.78145207092166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 631.375, "completions/mean_terminated_length": 631.375, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "entropy": 0.46565037965774536, "epoch": 2.4761904761904763, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.0021383543498814106, "learning_rate": 5.953143243609234e-07, "loss": 0.0001, "num_tokens": 9128071.0, "reward": 1.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 1.0, "rewards/itbench_correctness/std": 0.0, "step": 468, "step_time": 75.04701119381934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 431.6875, "completions/mean_terminated_length": 431.6875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5073114037513733, "epoch": 2.4814814814814814, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.0021981364116072655, "learning_rate": 5.936906572928624e-07, "loss": -0.0696, "num_tokens": 9143002.0, "reward": 0.871874988079071, "reward_std": 0.20840224623680115, "rewards/itbench_correctness/mean": 0.871874988079071, "rewards/itbench_correctness/std": 0.2529616057872772, "step": 469, "step_time": 78.34151318110526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 646.0625, "completions/mean_terminated_length": 419.3000183105469, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.5819870233535767, "epoch": 2.4867724867724865, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.002163654426112771, "learning_rate": 5.920659656387836e-07, "loss": 0.072, "num_tokens": 9169283.0, "reward": 0.28125, "reward_std": 0.2651650309562683, "rewards/itbench_correctness/mean": 0.28125, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 470, "step_time": 197.0865554632619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 707.25, "completions/mean_terminated_length": 563.2727661132812, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "entropy": 0.41003888845443726, "epoch": 2.492063492063492, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.001396682346239686, "learning_rate": 5.90440267166055e-07, "loss": -0.0062, "num_tokens": 9187327.0, "reward": 0.875, "reward_std": 0.2619796097278595, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.273861289024353, "step": 471, "step_time": 86.19406038243324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 556.1875, "completions/mean_terminated_length": 556.1875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "entropy": 0.3613889217376709, "epoch": 2.497354497354497, "frac_reward_zero_std": 0.5, "grad_norm": 1.0625, "kl": 0.001245600637048483, "learning_rate": 5.888135796530544e-07, "loss": 0.0086, "num_tokens": 9200090.0, "reward": 0.5729166865348816, "reward_std": 0.0294627845287323, "rewards/itbench_correctness/mean": 0.5729166865348816, "rewards/itbench_correctness/std": 0.4429227113723755, "step": 472, "step_time": 72.5158723751083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 477.25, "completions/mean_terminated_length": 477.25, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.45049765706062317, "epoch": 2.502645502645503, "frac_reward_zero_std": 0.5, "grad_norm": 1.15625, "kl": 0.001709087984636426, "learning_rate": 5.871859208889758e-07, "loss": -0.0137, "num_tokens": 9210406.0, "reward": 0.640625, "reward_std": 0.11451567709445953, "rewards/itbench_correctness/mean": 0.640625, "rewards/itbench_correctness/std": 0.40278977155685425, "step": 473, "step_time": 132.74163577985018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 835.9375, "completions/mean_terminated_length": 773.25, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.521570086479187, "epoch": 2.507936507936508, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.0017986115999519825, "learning_rate": 5.855573086736349e-07, "loss": -0.1102, "num_tokens": 9243021.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 474, "step_time": 366.73758555483073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 830.5, "completions/mean_terminated_length": 742.5454711914062, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "entropy": 0.2576760947704315, "epoch": 2.5132275132275135, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.0009246127447113395, "learning_rate": 5.839277608172738e-07, "loss": 0.0089, "num_tokens": 9265781.0, "reward": 0.42500001192092896, "reward_std": 0.026726115494966507, "rewards/itbench_correctness/mean": 0.42500001192092896, "rewards/itbench_correctness/std": 0.44045430421829224, "step": 475, "step_time": 242.08502481784672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 789.1875, "completions/mean_terminated_length": 648.2999877929688, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.3066444993019104, "epoch": 2.5185185185185186, "frac_reward_zero_std": 0.5, "grad_norm": 1.296875, "kl": 0.0016043871873989701, "learning_rate": 5.82297295140367e-07, "loss": 0.0033, "num_tokens": 9285040.0, "reward": 0.75, "reward_std": 0.15430335700511932, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.3333333432674408, "step": 476, "step_time": 477.0931530073285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 922.125, "completions/mean_terminated_length": 820.25, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "entropy": 0.37738919258117676, "epoch": 2.5238095238095237, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0015391347697004676, "learning_rate": 5.806659294734255e-07, "loss": -0.0201, "num_tokens": 9310586.0, "reward": 0.6581439971923828, "reward_std": 0.30061405897140503, "rewards/itbench_correctness/mean": 0.6581439971923828, "rewards/itbench_correctness/std": 0.3035711646080017, "step": 477, "step_time": 111.43491127341986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 737.75, "completions/mean_terminated_length": 718.6666870117188, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "entropy": 0.21823111176490784, "epoch": 2.5291005291005293, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0010663650464266539, "learning_rate": 5.790336816568032e-07, "loss": -0.0021, "num_tokens": 9327982.0, "reward": 0.4713541865348816, "reward_std": 0.4539119601249695, "rewards/itbench_correctness/mean": 0.4713541865348816, "rewards/itbench_correctness/std": 0.4463635981082916, "step": 478, "step_time": 79.33199557475746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 493.125, "completions/mean_terminated_length": 493.125, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.559695839881897, "epoch": 2.5343915343915344, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.0020555509254336357, "learning_rate": 5.774005695405007e-07, "loss": 0.0, "num_tokens": 9340472.0, "reward": 0.6666666269302368, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.6666666269302368, "rewards/itbench_correctness/std": 0.17213258147239685, "step": 479, "step_time": 60.24873013421893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 519.625, "completions/mean_terminated_length": 519.625, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "entropy": 0.49651190638542175, "epoch": 2.5396825396825395, "frac_reward_zero_std": 1.0, "grad_norm": 0.03466796875, "kl": 0.0015541493194177747, "learning_rate": 5.757666109839702e-07, "loss": 0.0, "num_tokens": 9370882.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 480, "step_time": 1150.014605092816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 575.5, "completions/mean_terminated_length": 575.5, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "entropy": 0.4587315320968628, "epoch": 2.544973544973545, "frac_reward_zero_std": 0.5, "grad_norm": 1.3515625, "kl": 0.0013555011246353388, "learning_rate": 5.741318238559209e-07, "loss": -0.0073, "num_tokens": 9383698.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 481, "step_time": 187.38160399720073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 647.4375, "completions/mean_terminated_length": 647.4375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "entropy": 0.3753257989883423, "epoch": 2.5502645502645502, "frac_reward_zero_std": 1.0, "grad_norm": 0.037841796875, "kl": 0.0015313706826418638, "learning_rate": 5.724962260341229e-07, "loss": 0.0, "num_tokens": 9398977.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 482, "step_time": 765.347533389926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 583.375, "completions/mean_terminated_length": 583.375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.5416755676269531, "epoch": 2.5555555555555554, "frac_reward_zero_std": 0.5, "grad_norm": 1.15625, "kl": 0.0014298075111582875, "learning_rate": 5.708598354052121e-07, "loss": 0.0256, "num_tokens": 9415967.0, "reward": 0.8541666865348816, "reward_std": 0.049801189452409744, "rewards/itbench_correctness/mean": 0.8541666865348816, "rewards/itbench_correctness/std": 0.16527193784713745, "step": 483, "step_time": 99.65433174744248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 448.9375, "completions/mean_terminated_length": 448.9375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "entropy": 0.4521787464618683, "epoch": 2.560846560846561, "frac_reward_zero_std": 0.5, "grad_norm": 1.15625, "kl": 0.0014567336766049266, "learning_rate": 5.692226698644937e-07, "loss": -0.0088, "num_tokens": 9425990.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 484, "step_time": 159.2589992955327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 837.875, "completions/mean_terminated_length": 775.8333740234375, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "entropy": 0.38430553674697876, "epoch": 2.566137566137566, "frac_reward_zero_std": 1.0, "grad_norm": 0.033935546875, "kl": 0.0012451084330677986, "learning_rate": 5.675847473157485e-07, "loss": 0.0, "num_tokens": 9444988.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 485, "step_time": 265.1822805535048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 574.375, "completions/mean_terminated_length": 574.375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "entropy": 0.4108814001083374, "epoch": 2.571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0019665309228003025, "learning_rate": 5.659460856710345e-07, "loss": -0.008, "num_tokens": 9458426.0, "reward": 0.8883928656578064, "reward_std": 0.16245228052139282, "rewards/itbench_correctness/mean": 0.8883928656578064, "rewards/itbench_correctness/std": 0.1985812783241272, "step": 486, "step_time": 129.54102603532374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 775.9375, "completions/mean_terminated_length": 583.0, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "entropy": 0.32863470911979675, "epoch": 2.5767195767195767, "frac_reward_zero_std": 0.5, "grad_norm": 1.125, "kl": 0.0015423308359459043, "learning_rate": 5.643067028504931e-07, "loss": -0.0219, "num_tokens": 9485297.0, "reward": 0.375, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.4281744360923767, "step": 487, "step_time": 71.69435486476868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 509.4375, "completions/mean_terminated_length": 509.4375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3533308804035187, "epoch": 2.582010582010582, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0020819108467549086, "learning_rate": 5.626666167821521e-07, "loss": 0.0039, "num_tokens": 9497008.0, "reward": 0.78125, "reward_std": 0.3471629321575165, "rewards/itbench_correctness/mean": 0.78125, "rewards/itbench_correctness/std": 0.4069705307483673, "step": 488, "step_time": 89.01397905871272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 567.0625, "completions/mean_terminated_length": 567.0625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.5466769337654114, "epoch": 2.5873015873015874, "frac_reward_zero_std": 1.0, "grad_norm": 0.035400390625, "kl": 0.0019230879843235016, "learning_rate": 5.6102584540173e-07, "loss": 0.0, "num_tokens": 9519441.0, "reward": 0.25, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 489, "step_time": 71.94888481497765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 768.8125, "completions/mean_terminated_length": 768.8125, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "entropy": 0.2939598262310028, "epoch": 2.5925925925925926, "frac_reward_zero_std": 0.5, "grad_norm": 1.203125, "kl": 0.0013061386998742819, "learning_rate": 5.5938440665244e-07, "loss": 0.0038, "num_tokens": 9537934.0, "reward": 0.84375, "reward_std": 0.0578637570142746, "rewards/itbench_correctness/mean": 0.84375, "rewards/itbench_correctness/std": 0.17969882488250732, "step": 490, "step_time": 106.79387213569134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 944.9375, "completions/mean_terminated_length": 771.0, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "entropy": 0.2941993474960327, "epoch": 2.597883597883598, "frac_reward_zero_std": 1.0, "grad_norm": 0.043212890625, "kl": 0.0013492838479578495, "learning_rate": 5.577423184847931e-07, "loss": 0.0, "num_tokens": 9561381.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 491, "step_time": 823.8002629633993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 816.3125, "completions/mean_terminated_length": 786.6428833007812, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "entropy": 0.4140571057796478, "epoch": 2.6031746031746033, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.001417625928297639, "learning_rate": 5.560995988564023e-07, "loss": 0.0223, "num_tokens": 9581962.0, "reward": 0.109375, "reward_std": 0.2414703369140625, "rewards/itbench_correctness/mean": 0.109375, "rewards/itbench_correctness/std": 0.2576940953731537, "step": 492, "step_time": 97.65742574445903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 701.125, "completions/mean_terminated_length": 450.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4649670124053955, "epoch": 2.6084656084656084, "frac_reward_zero_std": 0.5, "grad_norm": 1.1953125, "kl": 0.0015433584339916706, "learning_rate": 5.544562657317863e-07, "loss": -0.014, "num_tokens": 9616972.0, "reward": 0.3125, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.35939764976501465, "step": 493, "step_time": 147.00880005117506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 735.75, "completions/mean_terminated_length": 694.5714721679688, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "entropy": 0.4131838381290436, "epoch": 2.613756613756614, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.001086304779164493, "learning_rate": 5.528123370821729e-07, "loss": 0.0469, "num_tokens": 9632520.0, "reward": 0.6505681872367859, "reward_std": 0.23257695138454437, "rewards/itbench_correctness/mean": 0.6505681872367859, "rewards/itbench_correctness/std": 0.45813921093940735, "step": 494, "step_time": 73.31144659873098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 768.3125, "completions/mean_terminated_length": 1.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.640364408493042, "epoch": 2.619047619047619, "frac_reward_zero_std": 0.5, "grad_norm": 1.46875, "kl": 0.0018688439158722758, "learning_rate": 5.511678308853025e-07, "loss": -0.1102, "num_tokens": 9660781.0, "reward": 0.0520833358168602, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.0520833358168602, "rewards/itbench_correctness/std": 0.13220004737377167, "step": 495, "step_time": 85.44449219666421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 750.5, "completions/mean_terminated_length": 659.3333740234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.46902066469192505, "epoch": 2.624338624338624, "frac_reward_zero_std": 0.5, "grad_norm": 1.265625, "kl": 0.0013028380926698446, "learning_rate": 5.495227651252315e-07, "loss": 0.0103, "num_tokens": 9677765.0, "reward": 0.59375, "reward_std": 0.1735912710428238, "rewards/itbench_correctness/mean": 0.59375, "rewards/itbench_correctness/std": 0.48196646571159363, "step": 496, "step_time": 781.4977411162108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 390.75, "completions/mean_terminated_length": 390.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4273832440376282, "epoch": 2.6296296296296298, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0021745527628809214, "learning_rate": 5.478771577921351e-07, "loss": -0.0118, "num_tokens": 9686945.0, "reward": 0.627480149269104, "reward_std": 0.2220388650894165, "rewards/itbench_correctness/mean": 0.627480149269104, "rewards/itbench_correctness/std": 0.3782695233821869, "step": 497, "step_time": 134.51860492676497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 764.5, "completions/mean_terminated_length": 505.0, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "entropy": 0.3531720042228699, "epoch": 2.634920634920635, "frac_reward_zero_std": 0.5, "grad_norm": 1.2421875, "kl": 0.0014397338964045048, "learning_rate": 5.462310268821117e-07, "loss": 0.0163, "num_tokens": 9713425.0, "reward": 0.265625, "reward_std": 0.1724265068769455, "rewards/itbench_correctness/mean": 0.265625, "rewards/itbench_correctness/std": 0.3616048991680145, "step": 498, "step_time": 152.6961117470637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 576.0625, "completions/mean_terminated_length": 576.0625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.28642725944519043, "epoch": 2.64021164021164, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0037036272697150707, "learning_rate": 5.445843903969854e-07, "loss": -0.0221, "num_tokens": 9727450.0, "reward": 0.2291666716337204, "reward_std": 0.2048145830631256, "rewards/itbench_correctness/mean": 0.2291666716337204, "rewards/itbench_correctness/std": 0.22669117152690887, "step": 499, "step_time": 78.76068393606693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 794.4375, "completions/mean_terminated_length": 690.0909423828125, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.3776256740093231, "epoch": 2.6455026455026456, "frac_reward_zero_std": 0.5, "grad_norm": 1.40625, "kl": 0.0019214467611163855, "learning_rate": 5.429372663441085e-07, "loss": 0.0057, "num_tokens": 9745281.0, "reward": 0.40625, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.20155644416809082, "step": 500, "step_time": 542.0401397850364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 570.9375, "completions/mean_terminated_length": 570.9375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "entropy": 0.3800766170024872, "epoch": 2.6507936507936507, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0011877365177497268, "learning_rate": 5.412896727361662e-07, "loss": 0.0347, "num_tokens": 9757936.0, "reward": 0.8323863744735718, "reward_std": 0.1946326196193695, "rewards/itbench_correctness/mean": 0.8323863744735718, "rewards/itbench_correctness/std": 0.28190067410469055, "step": 501, "step_time": 66.27912161499262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 658.25, "completions/mean_terminated_length": 492.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.30991265177726746, "epoch": 2.656084656084656, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.001489053014665842, "learning_rate": 5.396416275909779e-07, "loss": 0.0148, "num_tokens": 9773348.0, "reward": 0.21875, "reward_std": 0.3061639666557312, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 502, "step_time": 1172.296461245045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 537.6875, "completions/mean_terminated_length": 537.6875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "entropy": 0.3291874825954437, "epoch": 2.6613756613756614, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0015724250115454197, "learning_rate": 5.379931489313015e-07, "loss": 0.0265, "num_tokens": 9786871.0, "reward": 0.45625001192092896, "reward_std": 0.05625351518392563, "rewards/itbench_correctness/mean": 0.45625001192092896, "rewards/itbench_correctness/std": 0.34699106216430664, "step": 503, "step_time": 66.85712667554617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 372.875, "completions/mean_terminated_length": 372.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4425075352191925, "epoch": 2.6666666666666665, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.0019720701966434717, "learning_rate": 5.363442547846355e-07, "loss": -0.043, "num_tokens": 9800949.0, "reward": 0.3671875, "reward_std": 0.3643017113208771, "rewards/itbench_correctness/mean": 0.3671875, "rewards/itbench_correctness/std": 0.4119788408279419, "step": 504, "step_time": 72.04895468428731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 784.4375, "completions/mean_terminated_length": 704.5833740234375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.47422516345977783, "epoch": 2.671957671957672, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.001217540237121284, "learning_rate": 5.34694963183022e-07, "loss": -0.0013, "num_tokens": 9817452.0, "reward": 0.8229166269302368, "reward_std": 0.2745841145515442, "rewards/itbench_correctness/mean": 0.8229166269302368, "rewards/itbench_correctness/std": 0.3303687572479248, "step": 505, "step_time": 82.16243282984942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 512.4375, "completions/mean_terminated_length": 512.4375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5346993803977966, "epoch": 2.677248677248677, "frac_reward_zero_std": 0.0, "grad_norm": 0.7890625, "kl": 0.0038210356142371893, "learning_rate": 5.330452921628497e-07, "loss": -0.1759, "num_tokens": 9828603.0, "reward": 0.5416666865348816, "reward_std": 0.21967849135398865, "rewards/itbench_correctness/mean": 0.5416666865348816, "rewards/itbench_correctness/std": 0.24152295291423798, "step": 506, "step_time": 130.50164964888245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 873.9375, "completions/mean_terminated_length": 757.2222290039062, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "entropy": 0.446256160736084, "epoch": 2.682539682539683, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.0013827559305354953, "learning_rate": 5.313952597646567e-07, "loss": -0.0479, "num_tokens": 9851170.0, "reward": 0.1875, "reward_std": 0.4082317352294922, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 507, "step_time": 549.6535976743326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 426.5, "completions/mean_terminated_length": 426.5, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.46189919114112854, "epoch": 2.687830687830688, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0022915503941476345, "learning_rate": 5.297448840329328e-07, "loss": -0.0217, "num_tokens": 9860442.0, "reward": 0.2698863744735718, "reward_std": 0.15896323323249817, "rewards/itbench_correctness/mean": 0.2698863744735718, "rewards/itbench_correctness/std": 0.2320520281791687, "step": 508, "step_time": 67.97736590728164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 781.4375, "completions/mean_terminated_length": 746.7857666015625, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.2853715121746063, "epoch": 2.693121693121693, "frac_reward_zero_std": 0.5, "grad_norm": 1.203125, "kl": 0.0011138498084619641, "learning_rate": 5.280941830159227e-07, "loss": 0.0167, "num_tokens": 9880081.0, "reward": 0.34375, "reward_std": 0.18600594997406006, "rewards/itbench_correctness/mean": 0.34375, "rewards/itbench_correctness/std": 0.4366062581539154, "step": 509, "step_time": 240.96920191589743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 581.5625, "completions/mean_terminated_length": 581.5625, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "entropy": 0.5089736580848694, "epoch": 2.6984126984126986, "frac_reward_zero_std": 0.5, "grad_norm": 1.0546875, "kl": 0.001449383096769452, "learning_rate": 5.264431747654283e-07, "loss": 0.0141, "num_tokens": 9910954.0, "reward": 0.34375, "reward_std": 0.18600594997406006, "rewards/itbench_correctness/mean": 0.34375, "rewards/itbench_correctness/std": 0.4366062581539154, "step": 510, "step_time": 138.8211078811437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 600.4375, "completions/mean_terminated_length": 600.4375, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "entropy": 0.3697304129600525, "epoch": 2.7037037037037037, "frac_reward_zero_std": 0.5, "grad_norm": 1.1015625, "kl": 0.001372107770293951, "learning_rate": 5.247918773366111e-07, "loss": 0.0158, "num_tokens": 9925225.0, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 511, "step_time": 78.60712255910039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 443.125, "completions/mean_terminated_length": 443.125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "entropy": 0.36784201860427856, "epoch": 2.708994708994709, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.001885003293864429, "learning_rate": 5.231403087877955e-07, "loss": -0.0034, "num_tokens": 9936491.0, "reward": 0.5052083730697632, "reward_std": 0.29658451676368713, "rewards/itbench_correctness/mean": 0.5052083730697632, "rewards/itbench_correctness/std": 0.4892064332962036, "step": 512, "step_time": 1101.7817776547745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 506.5, "completions/mean_terminated_length": 506.5, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "entropy": 0.5034551024436951, "epoch": 2.7142857142857144, "frac_reward_zero_std": 0.5, "grad_norm": 1.296875, "kl": 0.0027517788112163544, "learning_rate": 5.214884871802703e-07, "loss": -0.0104, "num_tokens": 9958027.0, "reward": 0.5333333015441895, "reward_std": 0.24348656833171844, "rewards/itbench_correctness/mean": 0.5333333015441895, "rewards/itbench_correctness/std": 0.3538151979446411, "step": 513, "step_time": 115.8853734144941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 503.375, "completions/mean_terminated_length": 468.66668701171875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3277874290943146, "epoch": 2.7195767195767195, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.005129899829626083, "learning_rate": 5.198364305780921e-07, "loss": -0.1331, "num_tokens": 9970817.0, "reward": 0.4270833432674408, "reward_std": 0.3061639964580536, "rewards/itbench_correctness/mean": 0.4270833432674408, "rewards/itbench_correctness/std": 0.32185083627700806, "step": 514, "step_time": 81.30817873775959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 523.0, "completions/mean_terminated_length": 523.0, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.41108986735343933, "epoch": 2.7248677248677247, "frac_reward_zero_std": 0.5, "grad_norm": 1.34375, "kl": 0.0014420569641515613, "learning_rate": 5.181841570478872e-07, "loss": 0.0416, "num_tokens": 9982881.0, "reward": 0.9943181872367859, "reward_std": 0.016070598736405373, "rewards/itbench_correctness/mean": 0.9943181872367859, "rewards/itbench_correctness/std": 0.02272726595401764, "step": 515, "step_time": 76.78445727284998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 978.5, "completions/mean_terminated_length": 878.4000244140625, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "entropy": 0.33316299319267273, "epoch": 2.7301587301587302, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0013756535481661558, "learning_rate": 5.165316846586541e-07, "loss": -0.0065, "num_tokens": 10008745.0, "reward": 0.2053571343421936, "reward_std": 0.28105252981185913, "rewards/itbench_correctness/mean": 0.2053571343421936, "rewards/itbench_correctness/std": 0.32667672634124756, "step": 516, "step_time": 816.1864080894738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 624.875, "completions/mean_terminated_length": 624.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5921184420585632, "epoch": 2.7354497354497354, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0018502407474443316, "learning_rate": 5.148790314815662e-07, "loss": -0.0101, "num_tokens": 10025191.0, "reward": 0.5062500238418579, "reward_std": 0.25888073444366455, "rewards/itbench_correctness/mean": 0.5062500238418579, "rewards/itbench_correctness/std": 0.4753507673740387, "step": 517, "step_time": 72.83533152658492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 823.0625, "completions/mean_terminated_length": 622.125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.5637481808662415, "epoch": 2.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.0017002089880406857, "learning_rate": 5.132262155897738e-07, "loss": -0.1006, "num_tokens": 10054440.0, "reward": 0.4895833134651184, "reward_std": 0.3517908453941345, "rewards/itbench_correctness/mean": 0.4895833134651184, "rewards/itbench_correctness/std": 0.5072392821311951, "step": 518, "step_time": 96.34473600052297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 994.25, "completions/mean_terminated_length": 786.0, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "entropy": 0.3198390603065491, "epoch": 2.746031746031746, "frac_reward_zero_std": 1.0, "grad_norm": 0.043212890625, "kl": 0.0014571960782632232, "learning_rate": 5.115732550582069e-07, "loss": 0.0001, "num_tokens": 10079516.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 519, "step_time": 8666.9965882916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 863.5, "completions/mean_terminated_length": 703.0, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "entropy": 0.5211349129676819, "epoch": 2.751322751322751, "frac_reward_zero_std": 0.5, "grad_norm": 1.3515625, "kl": 0.0015487850178033113, "learning_rate": 5.099201679633768e-07, "loss": 0.0001, "num_tokens": 10101236.0, "reward": 0.46875, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.185404971241951, "step": 520, "step_time": 312.3351803580299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 511.4375, "completions/mean_terminated_length": 511.4375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4203837215900421, "epoch": 2.7566137566137567, "frac_reward_zero_std": 0.5, "grad_norm": 0.56640625, "kl": 0.0017837980994954705, "learning_rate": 5.082669723831793e-07, "loss": -0.0992, "num_tokens": 10119339.0, "reward": 0.37708336114883423, "reward_std": 0.0176776684820652, "rewards/itbench_correctness/mean": 0.37708336114883423, "rewards/itbench_correctness/std": 0.30005404353141785, "step": 521, "step_time": 109.74898790102452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 974.6875, "completions/mean_terminated_length": 866.2000122070312, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "entropy": 0.3508816957473755, "epoch": 2.761904761904762, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.0012084582122042775, "learning_rate": 5.066136863966962e-07, "loss": 0.0043, "num_tokens": 10142334.0, "reward": 0.46875, "reward_std": 0.28270021080970764, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.4035433530807495, "step": 522, "step_time": 106.33708533085883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 875.5625, "completions/mean_terminated_length": 760.1111450195312, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "entropy": 0.5824826955795288, "epoch": 2.7671957671957674, "frac_reward_zero_std": 0.5, "grad_norm": 1.484375, "kl": 0.0019700033590197563, "learning_rate": 5.049603280839982e-07, "loss": -0.0028, "num_tokens": 10170263.0, "reward": 0.015625, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.015625, "rewards/itbench_correctness/std": 0.0625, "step": 523, "step_time": 73.45079297944903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 543.125, "completions/mean_terminated_length": 543.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.5413118600845337, "epoch": 2.7724867724867726, "frac_reward_zero_std": 0.5, "grad_norm": 1.46875, "kl": 0.00216845516115427, "learning_rate": 5.033069155259471e-07, "loss": -0.0028, "num_tokens": 10185017.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 524, "step_time": 89.7467988235876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 734.1875, "completions/mean_terminated_length": 692.7857666015625, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 0.6020260453224182, "epoch": 2.7777777777777777, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.0015728548169136047, "learning_rate": 5.016534668039976e-07, "loss": -0.0002, "num_tokens": 10209820.0, "reward": 0.5416666865348816, "reward_std": 0.235702246427536, "rewards/itbench_correctness/mean": 0.5416666865348816, "rewards/itbench_correctness/std": 0.5, "step": 525, "step_time": 208.850717083551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 1010.375, "completions/mean_terminated_length": 951.3333740234375, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "entropy": 0.522578239440918, "epoch": 2.7830687830687832, "frac_reward_zero_std": 0.5, "grad_norm": 1.484375, "kl": 0.00148448022082448, "learning_rate": 5e-07, "loss": 0.0001, "num_tokens": 10238170.0, "reward": 0.171875, "reward_std": 0.188242569565773, "rewards/itbench_correctness/mean": 0.171875, "rewards/itbench_correctness/std": 0.3125, "step": 526, "step_time": 942.4442430688068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 841.5, "completions/mean_terminated_length": 659.0, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "entropy": 0.45632797479629517, "epoch": 2.7883597883597884, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.001308864215388894, "learning_rate": 4.983465331960023e-07, "loss": -0.0154, "num_tokens": 10258074.0, "reward": 0.5531250238418579, "reward_std": 0.24483326077461243, "rewards/itbench_correctness/mean": 0.5531250238418579, "rewards/itbench_correctness/std": 0.421295166015625, "step": 527, "step_time": 177.91924435272813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 325.75, "completions/mean_terminated_length": 325.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.33614733815193176, "epoch": 2.7936507936507935, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.00386231392621994, "learning_rate": 4.96693084474053e-07, "loss": -0.0765, "num_tokens": 10269622.0, "reward": 0.5625, "reward_std": 0.3471825420856476, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 528, "step_time": 59.13615032006055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 629.375, "completions/mean_terminated_length": 450.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.46077457070350647, "epoch": 2.798941798941799, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0026442273519933224, "learning_rate": 4.950396719160018e-07, "loss": -0.0582, "num_tokens": 10287964.0, "reward": 0.3645833432674408, "reward_std": 0.3061639964580536, "rewards/itbench_correctness/mean": 0.3645833432674408, "rewards/itbench_correctness/std": 0.3507597744464874, "step": 529, "step_time": 269.48390776105225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 713.5, "completions/mean_terminated_length": 527.2000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6839523315429688, "epoch": 2.804232804232804, "frac_reward_zero_std": 0.5, "grad_norm": 0.65234375, "kl": 0.0018830286571756005, "learning_rate": 4.933863136033039e-07, "loss": -0.1144, "num_tokens": 10317900.0, "reward": 0.3072916567325592, "reward_std": 0.19150808453559875, "rewards/itbench_correctness/mean": 0.3072916567325592, "rewards/itbench_correctness/std": 0.4113198518753052, "step": 530, "step_time": 151.5169429546222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 717.125, "completions/mean_terminated_length": 410.25, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "entropy": 0.5047934651374817, "epoch": 2.8095238095238093, "frac_reward_zero_std": 1.0, "grad_norm": 0.0322265625, "kl": 0.0017540534026920795, "learning_rate": 4.917330276168208e-07, "loss": 0.0, "num_tokens": 10342214.0, "reward": 0.699999988079071, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.699999988079071, "rewards/itbench_correctness/std": 0.3098386824131012, "step": 531, "step_time": 203.65224741771817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 629.5625, "completions/mean_terminated_length": 629.5625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 0.3589794635772705, "epoch": 2.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.0014826505212113261, "learning_rate": 4.900798320366232e-07, "loss": -0.017, "num_tokens": 10355927.0, "reward": 0.7332720756530762, "reward_std": 0.21437928080558777, "rewards/itbench_correctness/mean": 0.7332720756530762, "rewards/itbench_correctness/std": 0.37859466671943665, "step": 532, "step_time": 348.23073250520974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.392578125, "epoch": 2.82010582010582, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.0014328202232718468, "learning_rate": 4.88426744941793e-07, "loss": 0.0001, "num_tokens": 10384407.0, "reward": 0.8541666269302368, "reward_std": 0.2482243776321411, "rewards/itbench_correctness/mean": 0.8541666269302368, "rewards/itbench_correctness/std": 0.26440009474754333, "step": 533, "step_time": 118.49520284496248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 498.25, "completions/mean_terminated_length": 498.25, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "entropy": 0.38534870743751526, "epoch": 2.825396825396825, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.002330408664420247, "learning_rate": 4.86773784410226e-07, "loss": 0.0162, "num_tokens": 10399339.0, "reward": 0.46875, "reward_std": 0.2609178125858307, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.4312717914581299, "step": 534, "step_time": 567.9226626912132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 921.625, "completions/mean_terminated_length": 790.0000610351562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.2951308786869049, "epoch": 2.8306878306878307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.0014564162120223045, "learning_rate": 4.851209685184338e-07, "loss": 0.0001, "num_tokens": 10424093.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 535, "step_time": 216.71312026213855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 733.0, "completions/mean_terminated_length": 733.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.45839017629623413, "epoch": 2.835978835978836, "frac_reward_zero_std": 0.5, "grad_norm": 1.0234375, "kl": 0.0023340799380093813, "learning_rate": 4.834683153413459e-07, "loss": -0.1065, "num_tokens": 10440501.0, "reward": 0.856249988079071, "reward_std": 0.17614421248435974, "rewards/itbench_correctness/mean": 0.856249988079071, "rewards/itbench_correctness/std": 0.2827690541744232, "step": 536, "step_time": 74.85195223800838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 563.875, "completions/mean_terminated_length": 563.875, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "entropy": 0.48237642645835876, "epoch": 2.8412698412698414, "frac_reward_zero_std": 1.0, "grad_norm": 0.01373291015625, "kl": 0.001148638199083507, "learning_rate": 4.818158429521129e-07, "loss": 0.0, "num_tokens": 10452811.0, "reward": 1.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 1.0, "rewards/itbench_correctness/std": 0.0, "step": 537, "step_time": 238.9646631795913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 613.0, "completions/mean_terminated_length": 613.0, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "entropy": 0.48939642310142517, "epoch": 2.8465608465608465, "frac_reward_zero_std": 0.5, "grad_norm": 1.25, "kl": 0.0019839375745505095, "learning_rate": 4.801635694219079e-07, "loss": 0.0307, "num_tokens": 10466963.0, "reward": 0.484375, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.484375, "rewards/itbench_correctness/std": 0.503891110420227, "step": 538, "step_time": 778.2056272830814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 868.4375, "completions/mean_terminated_length": 747.4444580078125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.27981287240982056, "epoch": 2.851851851851852, "frac_reward_zero_std": 0.5, "grad_norm": 0.62890625, "kl": 0.0014373651938512921, "learning_rate": 4.785115128197298e-07, "loss": -0.1155, "num_tokens": 10487634.0, "reward": 0.3897058963775635, "reward_std": 0.16693422198295593, "rewards/itbench_correctness/mean": 0.3897058963775635, "rewards/itbench_correctness/std": 0.46261632442474365, "step": 539, "step_time": 163.97635082527995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 695.375, "completions/mean_terminated_length": 546.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5637246370315552, "epoch": 2.857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.734375, "kl": 0.0018984549678862095, "learning_rate": 4.768596912122045e-07, "loss": -0.1365, "num_tokens": 10528456.0, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.5, "step": 540, "step_time": 148.45136263035238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 971.5, "completions/mean_terminated_length": 856.0, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "entropy": 0.4776119291782379, "epoch": 2.8624338624338623, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.0011911564506590366, "learning_rate": 4.752081226633888e-07, "loss": 0.0568, "num_tokens": 10571880.0, "reward": 0.5, "reward_std": 0.3535533845424652, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 541, "step_time": 150.04778977762908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 988.6875, "completions/mean_terminated_length": 882.75, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "entropy": 0.5259498357772827, "epoch": 2.867724867724868, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052734375, "kl": 0.001533015980385244, "learning_rate": 4.7355682523457173e-07, "loss": 0.0001, "num_tokens": 10605523.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 542, "step_time": 117.57136417739093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 550.3125, "completions/mean_terminated_length": 550.3125, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "entropy": 0.523339033126831, "epoch": 2.873015873015873, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.002573953941464424, "learning_rate": 4.719058169840772e-07, "loss": 0.0001, "num_tokens": 10628208.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 543, "step_time": 99.59436613786966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 829.5, "completions/mean_terminated_length": 635.0, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "entropy": 0.3833634853363037, "epoch": 2.878306878306878, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.0010798868024721742, "learning_rate": 4.702551159670672e-07, "loss": -0.0133, "num_tokens": 10649920.0, "reward": 0.4388020932674408, "reward_std": 0.24691906571388245, "rewards/itbench_correctness/mean": 0.4388020932674408, "rewards/itbench_correctness/std": 0.3513753414154053, "step": 544, "step_time": 112.22929359227419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 786.3125, "completions/mean_terminated_length": 643.7000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5061600804328918, "epoch": 2.8835978835978837, "frac_reward_zero_std": 0.5, "grad_norm": 0.56640625, "kl": 0.001771087758243084, "learning_rate": 4.686047402353433e-07, "loss": -0.1004, "num_tokens": 10672789.0, "reward": 0.3177083432674408, "reward_std": 0.129746213555336, "rewards/itbench_correctness/mean": 0.3177083432674408, "rewards/itbench_correctness/std": 0.37294963002204895, "step": 545, "step_time": 111.85205744486302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 618.625, "completions/mean_terminated_length": 618.625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.2796524465084076, "epoch": 2.888888888888889, "frac_reward_zero_std": 0.5, "grad_norm": 1.203125, "kl": 0.0013817870058119297, "learning_rate": 4.669547078371503e-07, "loss": -0.0018, "num_tokens": 10688151.0, "reward": 0.6770833730697632, "reward_std": 0.08258593082427979, "rewards/itbench_correctness/mean": 0.6770833730697632, "rewards/itbench_correctness/std": 0.3520771563053131, "step": 546, "step_time": 128.49444034136832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 718.5, "completions/mean_terminated_length": 698.1333618164062, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "entropy": 0.5622825622558594, "epoch": 2.894179894179894, "frac_reward_zero_std": 0.5, "grad_norm": 1.3203125, "kl": 0.0019919220358133316, "learning_rate": 4.6530503681697796e-07, "loss": 0.0318, "num_tokens": 10714559.0, "reward": 0.17499999701976776, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.17499999701976776, "rewards/itbench_correctness/std": 0.19832633435726166, "step": 547, "step_time": 282.7451619775966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 453.4375, "completions/mean_terminated_length": 453.4375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.4057891070842743, "epoch": 2.8994708994708995, "frac_reward_zero_std": 0.5, "grad_norm": 1.140625, "kl": 0.0013302209554240108, "learning_rate": 4.6365574521536446e-07, "loss": 0.0067, "num_tokens": 10725190.0, "reward": 0.234375, "reward_std": 0.12387890368700027, "rewards/itbench_correctness/mean": 0.234375, "rewards/itbench_correctness/std": 0.29536348581314087, "step": 548, "step_time": 440.6484692748636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 755.5625, "completions/mean_terminated_length": 594.5, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "entropy": 0.5479361414909363, "epoch": 2.9047619047619047, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.002523271832615137, "learning_rate": 4.620068510686984e-07, "loss": 0.0625, "num_tokens": 10743759.0, "reward": 0.4166666865348816, "reward_std": 0.3667176067829132, "rewards/itbench_correctness/mean": 0.4166666865348816, "rewards/itbench_correctness/std": 0.38005849719047546, "step": 549, "step_time": 73.56386850681156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 692.0, "completions/mean_terminated_length": 433.77777099609375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.3309248685836792, "epoch": 2.91005291005291, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.001961898524314165, "learning_rate": 4.60358372409022e-07, "loss": 0.0117, "num_tokens": 10761847.0, "reward": 0.5104166865348816, "reward_std": 0.2609047293663025, "rewards/itbench_correctness/mean": 0.5104166865348816, "rewards/itbench_correctness/std": 0.27533650398254395, "step": 550, "step_time": 100.34413592051715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 539.6875, "completions/mean_terminated_length": 539.6875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "entropy": 0.4928778111934662, "epoch": 2.9153439153439153, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.002205328783020377, "learning_rate": 4.5871032726383385e-07, "loss": -0.008, "num_tokens": 10785042.0, "reward": 0.3125, "reward_std": 0.44403791427612305, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 551, "step_time": 102.3933826405555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 950.9375, "completions/mean_terminated_length": 894.1111450195312, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "entropy": 0.3764705955982208, "epoch": 2.9206349206349205, "frac_reward_zero_std": 0.5, "grad_norm": 1.5859375, "kl": 0.0018417153041809797, "learning_rate": 4.5706273365589144e-07, "loss": -0.0023, "num_tokens": 10806801.0, "reward": 0.2447916567325592, "reward_std": 0.20343953371047974, "rewards/itbench_correctness/mean": 0.2447916567325592, "rewards/itbench_correctness/std": 0.37573233246803284, "step": 552, "step_time": 254.7854423839599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 713.875, "completions/mean_terminated_length": 713.875, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.4454561471939087, "epoch": 2.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.0015814114594832063, "learning_rate": 4.554156096030148e-07, "loss": 0.0134, "num_tokens": 10823391.0, "reward": 0.859375, "reward_std": 0.17926117777824402, "rewards/itbench_correctness/mean": 0.859375, "rewards/itbench_correctness/std": 0.17405499517917633, "step": 553, "step_time": 135.78249835129827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 1012.5625, "completions/mean_terminated_length": 978.25, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "entropy": 0.30615395307540894, "epoch": 2.931216931216931, "frac_reward_zero_std": 0.5, "grad_norm": 1.6328125, "kl": 0.001216786215081811, "learning_rate": 4.5376897311788825e-07, "loss": -0.002, "num_tokens": 10847512.0, "reward": 0.1041666716337204, "reward_std": 0.12400396913290024, "rewards/itbench_correctness/mean": 0.1041666716337204, "rewards/itbench_correctness/std": 0.2006932497024536, "step": 554, "step_time": 7353.796993748285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 883.5625, "completions/mean_terminated_length": 743.125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "entropy": 0.6835962533950806, "epoch": 2.9365079365079367, "frac_reward_zero_std": 0.5, "grad_norm": 1.5625, "kl": 0.001729651470668614, "learning_rate": 4.521228422078649e-07, "loss": 0.0001, "num_tokens": 10879377.0, "reward": 0.1484375, "reward_std": 0.17971175909042358, "rewards/itbench_correctness/mean": 0.1484375, "rewards/itbench_correctness/std": 0.2894634008407593, "step": 555, "step_time": 211.0395448282361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 859.25, "completions/mean_terminated_length": 731.1111450195312, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.4469013810157776, "epoch": 2.941798941798942, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.001122955116443336, "learning_rate": 4.5047723487476864e-07, "loss": 0.0002, "num_tokens": 10909653.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 556, "step_time": 91.0195178175345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 452.875, "completions/mean_terminated_length": 452.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.47474467754364014, "epoch": 2.947089947089947, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0027064280584454536, "learning_rate": 4.488321691146975e-07, "loss": -0.0516, "num_tokens": 10919539.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 557, "step_time": 96.5944811757654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 443.125, "completions/mean_terminated_length": 443.125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.4874471127986908, "epoch": 2.9523809523809526, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0014341843780130148, "learning_rate": 4.4718766291782723e-07, "loss": 0.0231, "num_tokens": 10928989.0, "reward": 0.1875, "reward_std": 0.4082317352294922, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 558, "step_time": 76.11392251215875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 683.625, "completions/mean_terminated_length": 528.9091186523438, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.38032546639442444, "epoch": 2.9576719576719577, "frac_reward_zero_std": 0.5, "grad_norm": 1.4375, "kl": 0.0016363576287403703, "learning_rate": 4.4554373426821367e-07, "loss": -0.0025, "num_tokens": 10944967.0, "reward": 0.4375, "reward_std": 0.2077372521162033, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.291070818901062, "step": 559, "step_time": 134.31548726093024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 681.75, "completions/mean_terminated_length": 681.75, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "entropy": 0.3021635413169861, "epoch": 2.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.0013805434573441744, "learning_rate": 4.439004011435979e-07, "loss": 0.0081, "num_tokens": 10961347.0, "reward": 0.5416666865348816, "reward_std": 0.21535253524780273, "rewards/itbench_correctness/mean": 0.5416666865348816, "rewards/itbench_correctness/std": 0.4238273799419403, "step": 560, "step_time": 93.65936294849962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 731.3125, "completions/mean_terminated_length": 633.75, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 0.43756943941116333, "epoch": 2.9682539682539684, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.0011807921109721065, "learning_rate": 4.4225768151520694e-07, "loss": 0.0115, "num_tokens": 10977016.0, "reward": 0.5282738208770752, "reward_std": 0.09272660315036774, "rewards/itbench_correctness/mean": 0.5282738208770752, "rewards/itbench_correctness/std": 0.390090674161911, "step": 561, "step_time": 83.44914623722434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 700.75, "completions/mean_terminated_length": 700.75, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "entropy": 0.54513019323349, "epoch": 2.9735449735449735, "frac_reward_zero_std": 0.5, "grad_norm": 0.98828125, "kl": 0.0026250071823596954, "learning_rate": 4.406155933475599e-07, "loss": 0.0199, "num_tokens": 11001308.0, "reward": 0.9479166269302368, "reward_std": 0.043129097670316696, "rewards/itbench_correctness/mean": 0.9479166269302368, "rewards/itbench_correctness/std": 0.07978560030460358, "step": 562, "step_time": 114.21156205888838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 708.8125, "completions/mean_terminated_length": 636.0769653320312, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "entropy": 0.2962701618671417, "epoch": 2.9788359788359786, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.001674781320616603, "learning_rate": 4.389741545982699e-07, "loss": -0.0206, "num_tokens": 11017705.0, "reward": 0.2916666865348816, "reward_std": 0.4096291959285736, "rewards/itbench_correctness/mean": 0.2916666865348816, "rewards/itbench_correctness/std": 0.4013864994049072, "step": 563, "step_time": 173.21209927741438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 483.6875, "completions/mean_terminated_length": 483.6875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.6161002516746521, "epoch": 2.984126984126984, "frac_reward_zero_std": 0.5, "grad_norm": 1.3203125, "kl": 0.0027410590555518866, "learning_rate": 4.3733338321784777e-07, "loss": 0.0192, "num_tokens": 11029732.0, "reward": 0.40625, "reward_std": 0.08258593827486038, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.43448033928871155, "step": 564, "step_time": 86.51283952593803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 822.0625, "completions/mean_terminated_length": 665.0, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "entropy": 0.3941306173801422, "epoch": 2.9894179894179893, "frac_reward_zero_std": 0.5, "grad_norm": 1.28125, "kl": 0.0017499164678156376, "learning_rate": 4.3569329714950703e-07, "loss": -0.0189, "num_tokens": 11051933.0, "reward": 0.21875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.20155644416809082, "step": 565, "step_time": 424.9380031451583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 931.5, "completions/mean_terminated_length": 839.0, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "entropy": 0.5625335574150085, "epoch": 2.9947089947089944, "frac_reward_zero_std": 0.5, "grad_norm": 1.4453125, "kl": 0.0013327021151781082, "learning_rate": 4.340539143289655e-07, "loss": 0.0, "num_tokens": 11079021.0, "reward": 0.11249999701976776, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.11249999701976776, "rewards/itbench_correctness/std": 0.24186775088310242, "step": 566, "step_time": 103.31370590813458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 806.8125, "completions/mean_terminated_length": 756.6923217773438, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "entropy": 0.5825393199920654, "epoch": 3.0, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0027419670950621367, "learning_rate": 4.324152526842517e-07, "loss": 0.0188, "num_tokens": 11105082.0, "reward": 0.3854166865348816, "reward_std": 0.46477773785591125, "rewards/itbench_correctness/mean": 0.3854166865348816, "rewards/itbench_correctness/std": 0.4702983796596527, "step": 567, "step_time": 128.40436456073076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 836.625, "completions/mean_terminated_length": 649.25, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "entropy": 0.3920513987541199, "epoch": 3.005291005291005, "frac_reward_zero_std": 0.5, "grad_norm": 1.3359375, "kl": 0.0014088655589148402, "learning_rate": 4.307773301355062e-07, "loss": 0.0, "num_tokens": 11133620.0, "reward": 0.34166669845581055, "reward_std": 0.1725163757801056, "rewards/itbench_correctness/mean": 0.34166669845581055, "rewards/itbench_correctness/std": 0.3432955741882324, "step": 568, "step_time": 109.38008708879352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 684.5625, "completions/mean_terminated_length": 345.125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.5375696420669556, "epoch": 3.0105820105820107, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0021559942979365587, "learning_rate": 4.2914016459478786e-07, "loss": 0.0188, "num_tokens": 11152141.0, "reward": 0.4140625, "reward_std": 0.24306795001029968, "rewards/itbench_correctness/mean": 0.4140625, "rewards/itbench_correctness/std": 0.2446032166481018, "step": 569, "step_time": 244.70517920982093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 738.1875, "completions/mean_terminated_length": 608.2727661132812, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "entropy": 0.5039370059967041, "epoch": 3.015873015873016, "frac_reward_zero_std": 0.5, "grad_norm": 1.25, "kl": 0.0019608521834015846, "learning_rate": 4.275037739658771e-07, "loss": 0.0055, "num_tokens": 11174272.0, "reward": 0.0989583358168602, "reward_std": 0.03100099228322506, "rewards/itbench_correctness/mean": 0.0989583358168602, "rewards/itbench_correctness/std": 0.11063265055418015, "step": 570, "step_time": 694.2117186943069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 720.5625, "completions/mean_terminated_length": 538.5, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.5023853182792664, "epoch": 3.0211640211640214, "frac_reward_zero_std": 1.0, "grad_norm": 0.026611328125, "kl": 0.0014148685149848461, "learning_rate": 4.258681761440789e-07, "loss": 0.0, "num_tokens": 11210273.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 571, "step_time": 249.2116071432829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 720.6875, "completions/mean_terminated_length": 484.77777099609375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "entropy": 0.3996184170246124, "epoch": 3.0264550264550265, "frac_reward_zero_std": 0.5, "grad_norm": 1.40625, "kl": 0.0012311309110373259, "learning_rate": 4.2423338901602983e-07, "loss": 0.0147, "num_tokens": 11229236.0, "reward": 0.7708333730697632, "reward_std": 0.19795581698417664, "rewards/itbench_correctness/mean": 0.7708333730697632, "rewards/itbench_correctness/std": 0.35939764976501465, "step": 572, "step_time": 247.85055056307465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 520.125, "completions/mean_terminated_length": 520.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.35183849930763245, "epoch": 3.0317460317460316, "frac_reward_zero_std": 0.5, "grad_norm": 1.03125, "kl": 0.002790507161989808, "learning_rate": 4.225994304594993e-07, "loss": -0.0321, "num_tokens": 11242278.0, "reward": 0.1875, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.25, "step": 573, "step_time": 426.5937115754932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 835.625, "completions/mean_terminated_length": 792.1538696289062, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.48107704520225525, "epoch": 3.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.0021311482414603233, "learning_rate": 4.2096631834319687e-07, "loss": -0.0252, "num_tokens": 11264776.0, "reward": 0.265625, "reward_std": 0.2204262614250183, "rewards/itbench_correctness/mean": 0.265625, "rewards/itbench_correctness/std": 0.2183031290769577, "step": 574, "step_time": 419.11753554455936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 638.6875, "completions/mean_terminated_length": 638.6875, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "entropy": 0.375770628452301, "epoch": 3.0423280423280423, "frac_reward_zero_std": 0.5, "grad_norm": 1.40625, "kl": 0.0015504560433328152, "learning_rate": 4.193340705265745e-07, "loss": 0.0162, "num_tokens": 11285235.0, "reward": 0.47968751192092896, "reward_std": 0.17499202489852905, "rewards/itbench_correctness/mean": 0.47968751192092896, "rewards/itbench_correctness/std": 0.4592764973640442, "step": 575, "step_time": 152.97607036307454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 607.8125, "completions/mean_terminated_length": 607.8125, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "entropy": 0.39650386571884155, "epoch": 3.0476190476190474, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.0011127422330901027, "learning_rate": 4.1770270485963294e-07, "loss": -0.0156, "num_tokens": 11298640.0, "reward": 0.6644607782363892, "reward_std": 0.04214790090918541, "rewards/itbench_correctness/mean": 0.6644607782363892, "rewards/itbench_correctness/std": 0.2791382670402527, "step": 576, "step_time": 166.16624604724348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 698.3125, "completions/mean_terminated_length": 698.3125, "completions/min_length": 625.0, "completions/min_terminated_length": 625.0, "entropy": 0.3837823271751404, "epoch": 3.052910052910053, "frac_reward_zero_std": 0.5, "grad_norm": 1.140625, "kl": 0.0011828916613012552, "learning_rate": 4.1607223918272614e-07, "loss": -0.002, "num_tokens": 11313829.0, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.5, "step": 577, "step_time": 81.87011110130697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 517.3125, "completions/mean_terminated_length": 517.3125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.3595505654811859, "epoch": 3.058201058201058, "frac_reward_zero_std": 0.5, "grad_norm": 1.125, "kl": 0.001632296247407794, "learning_rate": 4.1444269132636494e-07, "loss": 0.0011, "num_tokens": 11325674.0, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.96875, "rewards/itbench_correctness/std": 0.125, "step": 578, "step_time": 853.527063309215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 753.4375, "completions/mean_terminated_length": 630.45458984375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "entropy": 0.5415180325508118, "epoch": 3.0634920634920633, "frac_reward_zero_std": 0.5, "grad_norm": 2.390625, "kl": 0.016372833400964737, "learning_rate": 4.1281407911102424e-07, "loss": 0.0314, "num_tokens": 11359377.0, "reward": 0.0234375, "reward_std": 0.03234682232141495, "rewards/itbench_correctness/mean": 0.0234375, "rewards/itbench_correctness/std": 0.050389111042022705, "step": 579, "step_time": 93.2713974667713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 592.8125, "completions/mean_terminated_length": 531.2142944335938, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.4048497676849365, "epoch": 3.068783068783069, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.0021097399294376373, "learning_rate": 4.1118642034694565e-07, "loss": 0.0827, "num_tokens": 11371702.0, "reward": 0.34375, "reward_std": 0.21564549207687378, "rewards/itbench_correctness/mean": 0.34375, "rewards/itbench_correctness/std": 0.46135368943214417, "step": 580, "step_time": 78.31074696686119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 500.0, "completions/mean_terminated_length": 500.0, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.4180000126361847, "epoch": 3.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0022473859135061502, "learning_rate": 4.095597328339452e-07, "loss": 0.003, "num_tokens": 11395878.0, "reward": 0.75, "reward_std": 0.4355512857437134, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 581, "step_time": 106.9057395812124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 953.8125, "completions/mean_terminated_length": 462.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5346962809562683, "epoch": 3.0793650793650795, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.0019432251574471593, "learning_rate": 4.079340343612164e-07, "loss": -0.0605, "num_tokens": 11434283.0, "reward": 0.1145833358168602, "reward_std": 0.17747542262077332, "rewards/itbench_correctness/mean": 0.1145833358168602, "rewards/itbench_correctness/std": 0.17969882488250732, "step": 582, "step_time": 153.69186680205166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 930.5, "completions/mean_terminated_length": 810.2857666015625, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "entropy": 0.42557764053344727, "epoch": 3.0846560846560847, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.001963236602023244, "learning_rate": 4.0630934270713755e-07, "loss": 0.015, "num_tokens": 11456923.0, "reward": 0.20416668057441711, "reward_std": 0.2299290895462036, "rewards/itbench_correctness/mean": 0.20416668057441711, "rewards/itbench_correctness/std": 0.2864534258842468, "step": 583, "step_time": 133.70225734543055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 510.6875, "completions/mean_terminated_length": 510.6875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "entropy": 0.5052013397216797, "epoch": 3.0899470899470898, "frac_reward_zero_std": 1.0, "grad_norm": 0.0240478515625, "kl": 0.001320964889600873, "learning_rate": 4.046856756390766e-07, "loss": 0.0, "num_tokens": 11468238.0, "reward": 0.5833333134651184, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5833333134651184, "rewards/itbench_correctness/std": 0.4303314983844757, "step": 584, "step_time": 56.649503622204065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 657.5625, "completions/mean_terminated_length": 657.5625, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "entropy": 0.5353103280067444, "epoch": 3.0952380952380953, "frac_reward_zero_std": 0.5, "grad_norm": 1.6328125, "kl": 0.0017962680431082845, "learning_rate": 4.030630509131959e-07, "loss": -0.0069, "num_tokens": 11496463.0, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 585, "step_time": 126.4090378023684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.63671875, "epoch": 3.1005291005291005, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.0013208300806581974, "learning_rate": 4.0144148627425986e-07, "loss": 0.0001, "num_tokens": 11522143.0, "reward": 0.1875, "reward_std": 0.27381423115730286, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.273861289024353, "step": 586, "step_time": 592.935981715098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 712.0625, "completions/mean_terminated_length": 400.125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.252786785364151, "epoch": 3.105820105820106, "frac_reward_zero_std": 0.5, "grad_norm": 0.85546875, "kl": 0.001402621390298009, "learning_rate": 3.998209994554394e-07, "loss": -0.0316, "num_tokens": 11543664.0, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.5, "step": 587, "step_time": 146.66224777232856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 716.3125, "completions/mean_terminated_length": 695.800048828125, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "entropy": 0.5612075924873352, "epoch": 3.111111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.022705078125, "kl": 0.0013308442430570722, "learning_rate": 3.9820160817811887e-07, "loss": 0.0, "num_tokens": 11582781.0, "reward": 0.05000000074505806, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.05000000074505806, "rewards/itbench_correctness/std": 0.05163978040218353, "step": 588, "step_time": 122.08641688153148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 505.375, "completions/mean_terminated_length": 505.375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.31263911724090576, "epoch": 3.1164021164021163, "frac_reward_zero_std": 0.5, "grad_norm": 1.40625, "kl": 0.0014801534125581384, "learning_rate": 3.965833301517016e-07, "loss": 0.0609, "num_tokens": 11594227.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 589, "step_time": 936.1484231920913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 464.875, "completions/mean_terminated_length": 464.875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "entropy": 0.46248990297317505, "epoch": 3.121693121693122, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.0018392398487776518, "learning_rate": 3.9496618307341713e-07, "loss": 0.0127, "num_tokens": 11610305.0, "reward": 0.24289773404598236, "reward_std": 0.14991973340511322, "rewards/itbench_correctness/mean": 0.24289773404598236, "rewards/itbench_correctness/std": 0.15188638865947723, "step": 590, "step_time": 92.19220882095397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 608.3125, "completions/mean_terminated_length": 608.3125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.5589232444763184, "epoch": 3.126984126984127, "frac_reward_zero_std": 0.5, "grad_norm": 1.4609375, "kl": 0.0023250363301485777, "learning_rate": 3.9335018462812664e-07, "loss": 0.0092, "num_tokens": 11631278.0, "reward": 0.3999999761581421, "reward_std": 0.16903084516525269, "rewards/itbench_correctness/mean": 0.3999999761581421, "rewards/itbench_correctness/std": 0.47328636050224304, "step": 591, "step_time": 87.29508406948298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 611.375, "completions/mean_terminated_length": 611.375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "entropy": 0.5332242846488953, "epoch": 3.132275132275132, "frac_reward_zero_std": 1.0, "grad_norm": 0.05029296875, "kl": 0.0023294584825634956, "learning_rate": 3.9173535248813017e-07, "loss": 0.0001, "num_tokens": 11654876.0, "reward": 0.0833333358168602, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0833333358168602, "rewards/itbench_correctness/std": 0.08606629818677902, "step": 592, "step_time": 78.46097278501838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 482.6875, "completions/mean_terminated_length": 482.6875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4537096917629242, "epoch": 3.1375661375661377, "frac_reward_zero_std": 0.5, "grad_norm": 0.416015625, "kl": 0.0019261433044448495, "learning_rate": 3.901217043129734e-07, "loss": -0.0825, "num_tokens": 11666455.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.25, "step": 593, "step_time": 112.39422312192619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 745.5, "completions/mean_terminated_length": 618.9091186523438, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.4077800214290619, "epoch": 3.142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0012227533152326941, "learning_rate": 3.885092577492542e-07, "loss": 0.0307, "num_tokens": 11693759.0, "reward": 0.5625, "reward_std": 0.49022960662841797, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 594, "step_time": 92.09622034989297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 869.4375, "completions/mean_terminated_length": 847.357177734375, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "entropy": 0.33354899287223816, "epoch": 3.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.0010007465025410056, "learning_rate": 3.8689803043042996e-07, "loss": -0.0018, "num_tokens": 11713342.0, "reward": 0.5208333730697632, "reward_std": 0.4459637701511383, "rewards/itbench_correctness/mean": 0.5208333730697632, "rewards/itbench_correctness/std": 0.438325971364975, "step": 595, "step_time": 515.5870687887073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 442.25, "completions/mean_terminated_length": 442.25, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.4250989258289337, "epoch": 3.1534391534391535, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.001579253701493144, "learning_rate": 3.8528803997662423e-07, "loss": -0.0061, "num_tokens": 11723002.0, "reward": 0.48750001192092896, "reward_std": 0.1636853665113449, "rewards/itbench_correctness/mean": 0.48750001192092896, "rewards/itbench_correctness/std": 0.16683325171470642, "step": 596, "step_time": 60.05524417478591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 720.0, "completions/mean_terminated_length": 416.0, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 0.3097222149372101, "epoch": 3.1587301587301586, "frac_reward_zero_std": 0.5, "grad_norm": 1.1484375, "kl": 0.0024181241169571877, "learning_rate": 3.8367930399443486e-07, "loss": -0.0056, "num_tokens": 11740490.0, "reward": 0.2395833432674408, "reward_std": 0.0883883386850357, "rewards/itbench_correctness/mean": 0.2395833432674408, "rewards/itbench_correctness/std": 0.27533650398254395, "step": 597, "step_time": 7237.127478616312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 860.0, "completions/mean_terminated_length": 805.3333740234375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "entropy": 0.43023255467414856, "epoch": 3.164021164021164, "frac_reward_zero_std": 0.5, "grad_norm": 1.3671875, "kl": 0.001607713638804853, "learning_rate": 3.8207184007674085e-07, "loss": 0.001, "num_tokens": 11764610.0, "reward": 0.7083333730697632, "reward_std": 0.19416078925132751, "rewards/itbench_correctness/mean": 0.7083333730697632, "rewards/itbench_correctness/std": 0.40138646960258484, "step": 598, "step_time": 144.42800151277333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 423.9375, "completions/mean_terminated_length": 423.9375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 0.35382574796676636, "epoch": 3.1693121693121693, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.003284444333985448, "learning_rate": 3.8046566580250995e-07, "loss": 0.0187, "num_tokens": 11774265.0, "reward": 0.6006944179534912, "reward_std": 0.2023771107196808, "rewards/itbench_correctness/mean": 0.6006944179534912, "rewards/itbench_correctness/std": 0.25677546858787537, "step": 599, "step_time": 91.41469971835613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 745.9375, "completions/mean_terminated_length": 467.875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.4826141595840454, "epoch": 3.1746031746031744, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0022031215485185385, "learning_rate": 3.788607987366069e-07, "loss": -0.0035, "num_tokens": 11791280.0, "reward": 0.6674107313156128, "reward_std": 0.29377472400665283, "rewards/itbench_correctness/mean": 0.6674107313156128, "rewards/itbench_correctness/std": 0.40818971395492554, "step": 600, "step_time": 727.5520837632939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 637.4375, "completions/mean_terminated_length": 637.4375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.6149622797966003, "epoch": 3.17989417989418, "frac_reward_zero_std": 0.5, "grad_norm": 1.1953125, "kl": 0.0012068739160895348, "learning_rate": 3.772572564296004e-07, "loss": 0.0218, "num_tokens": 11804407.0, "reward": 0.8541666865348816, "reward_std": 0.022271769121289253, "rewards/itbench_correctness/mean": 0.8541666865348816, "rewards/itbench_correctness/std": 0.15365907549858093, "step": 601, "step_time": 197.79692050255835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 553.5, "completions/mean_terminated_length": 553.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4065040647983551, "epoch": 3.185185185185185, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.0014250976964831352, "learning_rate": 3.7565505641757266e-07, "loss": -0.106, "num_tokens": 11817495.0, "reward": 0.5593750476837158, "reward_std": 0.18626472353935242, "rewards/itbench_correctness/mean": 0.5593750476837158, "rewards/itbench_correctness/std": 0.26154589653015137, "step": 602, "step_time": 91.9841024801135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 829.5, "completions/mean_terminated_length": 816.5333862304688, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "entropy": 0.44605183601379395, "epoch": 3.1904761904761907, "frac_reward_zero_std": 0.5, "grad_norm": 1.296875, "kl": 0.0014674547128379345, "learning_rate": 3.74054216221926e-07, "loss": -0.0149, "num_tokens": 11837095.0, "reward": 0.1666666716337204, "reward_std": 0.2182178944349289, "rewards/itbench_correctness/mean": 0.1666666716337204, "rewards/itbench_correctness/std": 0.3442651927471161, "step": 603, "step_time": 129.12120711896569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 551.9375, "completions/mean_terminated_length": 551.9375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.24821650981903076, "epoch": 3.195767195767196, "frac_reward_zero_std": 0.5, "grad_norm": 1.46875, "kl": 0.0011176143307238817, "learning_rate": 3.724547533491924e-07, "loss": -0.0278, "num_tokens": 11850606.0, "reward": 0.2187500149011612, "reward_std": 0.0883883535861969, "rewards/itbench_correctness/mean": 0.2187500149011612, "rewards/itbench_correctness/std": 0.2561737895011902, "step": 604, "step_time": 94.76566615886986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 537.6875, "completions/mean_terminated_length": 537.6875, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "entropy": 0.4314773976802826, "epoch": 3.201058201058201, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.0012784149730578065, "learning_rate": 3.708566852908418e-07, "loss": 0.0069, "num_tokens": 11862497.0, "reward": 0.21875, "reward_std": 0.2609178125858307, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.3204091787338257, "step": 605, "step_time": 816.5052793165669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 623.4375, "completions/mean_terminated_length": 623.4375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.3528822064399719, "epoch": 3.2063492063492065, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.001327100908383727, "learning_rate": 3.692600295230901e-07, "loss": 0.0131, "num_tokens": 11878112.0, "reward": 0.6583333611488342, "reward_std": 0.11426578462123871, "rewards/itbench_correctness/mean": 0.6583333611488342, "rewards/itbench_correctness/std": 0.22377237677574158, "step": 606, "step_time": 69.29318222776055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 989.8125, "completions/mean_terminated_length": 887.25, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "entropy": 0.6627517938613892, "epoch": 3.2116402116402116, "frac_reward_zero_std": 0.5, "grad_norm": 1.609375, "kl": 0.0013202429981902242, "learning_rate": 3.6766480350670925e-07, "loss": 0.0134, "num_tokens": 11904909.0, "reward": 0.3958333432674408, "reward_std": 0.19795581698417664, "rewards/itbench_correctness/mean": 0.3958333432674408, "rewards/itbench_correctness/std": 0.4901813864707947, "step": 607, "step_time": 82.23766458127648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 1011.5625, "completions/mean_terminated_length": 957.6666870117188, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "entropy": 0.29854804277420044, "epoch": 3.2169312169312168, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.00097948859911412, "learning_rate": 3.660710246868352e-07, "loss": -0.0092, "num_tokens": 11932430.0, "reward": 0.6073908805847168, "reward_std": 0.2576354146003723, "rewards/itbench_correctness/mean": 0.6073908805847168, "rewards/itbench_correctness/std": 0.26400262117385864, "step": 608, "step_time": 116.86947522684932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 644.625, "completions/mean_terminated_length": 619.3333740234375, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.5987977385520935, "epoch": 3.2222222222222223, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.001607781508937478, "learning_rate": 3.6447871049277796e-07, "loss": 0.0406, "num_tokens": 11946784.0, "reward": 0.6499999761581421, "reward_std": 0.3926178812980652, "rewards/itbench_correctness/mean": 0.6499999761581421, "rewards/itbench_correctness/std": 0.43204939365386963, "step": 609, "step_time": 80.47608442325145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 678.625, "completions/mean_terminated_length": 678.625, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.6159513592720032, "epoch": 3.2275132275132274, "frac_reward_zero_std": 1.0, "grad_norm": 0.045654296875, "kl": 0.001993054524064064, "learning_rate": 3.6288787833783016e-07, "loss": 0.0001, "num_tokens": 11972402.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 610, "step_time": 71.49992215260863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 725.8125, "completions/mean_terminated_length": 683.2142944335938, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.410574346780777, "epoch": 3.2328042328042326, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.001559928059577942, "learning_rate": 3.612985456190778e-07, "loss": 0.0063, "num_tokens": 11988567.0, "reward": 0.3571428656578064, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.3571428656578064, "rewards/itbench_correctness/std": 0.39382997155189514, "step": 611, "step_time": 165.54018260445446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 925.25, "completions/mean_terminated_length": 826.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5879492163658142, "epoch": 3.238095238095238, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0012924325419589877, "learning_rate": 3.597107297172084e-07, "loss": -0.0471, "num_tokens": 12017675.0, "reward": 0.265625, "reward_std": 0.3114553987979889, "rewards/itbench_correctness/mean": 0.265625, "rewards/itbench_correctness/std": 0.4422362744808197, "step": 612, "step_time": 94.39502456784248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 862.375, "completions/mean_terminated_length": 700.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.3385998010635376, "epoch": 3.2433862433862433, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.001366258948110044, "learning_rate": 3.5812444799632247e-07, "loss": -0.0171, "num_tokens": 12043681.0, "reward": 0.5, "reward_std": 0.5175491571426392, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 613, "step_time": 669.0384511752054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 880.625, "completions/mean_terminated_length": 832.8333740234375, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "entropy": 0.5132718086242676, "epoch": 3.248677248677249, "frac_reward_zero_std": 0.5, "grad_norm": 1.6015625, "kl": 0.0020796628668904305, "learning_rate": 3.565397178037429e-07, "loss": 0.0037, "num_tokens": 12073499.0, "reward": 0.25, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 614, "step_time": 113.26465024612844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 495.875, "completions/mean_terminated_length": 495.875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "entropy": 0.45777666568756104, "epoch": 3.253968253968254, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.0019308277405798435, "learning_rate": 3.5495655646982503e-07, "loss": -0.0023, "num_tokens": 12093601.0, "reward": 0.5227272510528564, "reward_std": 0.32154878973960876, "rewards/itbench_correctness/mean": 0.5227272510528564, "rewards/itbench_correctness/std": 0.41261112689971924, "step": 615, "step_time": 105.98786111921072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 480.8125, "completions/mean_terminated_length": 480.8125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "entropy": 0.29949304461479187, "epoch": 3.259259259259259, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.001420930726453662, "learning_rate": 3.533749813077677e-07, "loss": -0.0137, "num_tokens": 12104318.0, "reward": 0.43359375, "reward_std": 0.2698231339454651, "rewards/itbench_correctness/mean": 0.43359375, "rewards/itbench_correctness/std": 0.35901251435279846, "step": 616, "step_time": 130.85422169603407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 700.875, "completions/mean_terminated_length": 507.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5279115438461304, "epoch": 3.2645502645502646, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0030069290660321712, "learning_rate": 3.517950096134232e-07, "loss": -0.0693, "num_tokens": 12120292.0, "reward": 0.5572916269302368, "reward_std": 0.408902645111084, "rewards/itbench_correctness/mean": 0.5572916269302368, "rewards/itbench_correctness/std": 0.4146828055381775, "step": 617, "step_time": 76.69993899855763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 681.9375, "completions/mean_terminated_length": 681.9375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "entropy": 0.3578040599822998, "epoch": 3.2698412698412698, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.001377190463244915, "learning_rate": 3.502166586651092e-07, "loss": 0.0161, "num_tokens": 12136035.0, "reward": 0.78125, "reward_std": 0.2896047830581665, "rewards/itbench_correctness/mean": 0.78125, "rewards/itbench_correctness/std": 0.28321075439453125, "step": 618, "step_time": 78.15438072942197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 502.5625, "completions/mean_terminated_length": 502.5625, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "entropy": 0.39796045422554016, "epoch": 3.2751322751322753, "frac_reward_zero_std": 0.5, "grad_norm": 1.2734375, "kl": 0.0014053636696189642, "learning_rate": 3.4863994572341843e-07, "loss": -0.0057, "num_tokens": 12146932.0, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 619, "step_time": 814.3009329754859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 677.75, "completions/mean_terminated_length": 408.4444580078125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.4455920457839966, "epoch": 3.2804232804232805, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0013953611487522721, "learning_rate": 3.470648880310313e-07, "loss": 0.0023, "num_tokens": 12164472.0, "reward": 0.39375001192092896, "reward_std": 0.23028594255447388, "rewards/itbench_correctness/mean": 0.39375001192092896, "rewards/itbench_correctness/std": 0.2535580098628998, "step": 620, "step_time": 774.6395965730771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.318359375, "epoch": 3.2857142857142856, "frac_reward_zero_std": 0.5, "grad_norm": 1.609375, "kl": 0.0011473577469587326, "learning_rate": 3.454915028125263e-07, "loss": 0.0, "num_tokens": 12192136.0, "reward": 0.2708333432674408, "reward_std": 0.19795581698417664, "rewards/itbench_correctness/mean": 0.2708333432674408, "rewards/itbench_correctness/std": 0.3890872597694397, "step": 621, "step_time": 147.70375349000096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 504.4375, "completions/mean_terminated_length": 504.4375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.4182876944541931, "epoch": 3.291005291005291, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.002045721746981144, "learning_rate": 3.43919807274192e-07, "loss": -0.0186, "num_tokens": 12203111.0, "reward": 0.47727274894714355, "reward_std": 0.29765012860298157, "rewards/itbench_correctness/mean": 0.47727274894714355, "rewards/itbench_correctness/std": 0.40613409876823425, "step": 622, "step_time": 49.89000040013343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 515.8125, "completions/mean_terminated_length": 515.8125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "entropy": 0.4788561761379242, "epoch": 3.2962962962962963, "frac_reward_zero_std": 0.5, "grad_norm": 1.1875, "kl": 0.0019279435509815812, "learning_rate": 3.4234981860383927e-07, "loss": 0.014, "num_tokens": 12215068.0, "reward": 0.375, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.5, "step": 623, "step_time": 93.9447353342548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 831.125, "completions/mean_terminated_length": 681.1111450195312, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "entropy": 0.4283351004123688, "epoch": 3.3015873015873014, "frac_reward_zero_std": 0.5, "grad_norm": 1.234375, "kl": 0.0011250257957726717, "learning_rate": 3.407815539706124e-07, "loss": 0.003, "num_tokens": 12235334.0, "reward": 0.5703125, "reward_std": 0.05964459478855133, "rewards/itbench_correctness/mean": 0.5703125, "rewards/itbench_correctness/std": 0.4511992335319519, "step": 624, "step_time": 102.94410282652825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 864.875, "completions/mean_terminated_length": 741.1111450195312, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "entropy": 0.3584333062171936, "epoch": 3.306878306878307, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0012643268564715981, "learning_rate": 3.3921503052480236e-07, "loss": 0.0147, "num_tokens": 12256380.0, "reward": 0.5694444179534912, "reward_std": 0.30291885137557983, "rewards/itbench_correctness/mean": 0.5694444179534912, "rewards/itbench_correctness/std": 0.3557291328907013, "step": 625, "step_time": 484.2877219989896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 779.9375, "completions/mean_terminated_length": 779.9375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.41028928756713867, "epoch": 3.312169312169312, "frac_reward_zero_std": 0.5, "grad_norm": 1.484375, "kl": 0.0014163429150357842, "learning_rate": 3.3765026539765827e-07, "loss": -0.0162, "num_tokens": 12274915.0, "reward": 0.30000001192092896, "reward_std": 0.2507132589817047, "rewards/itbench_correctness/mean": 0.30000001192092896, "rewards/itbench_correctness/std": 0.46188023686408997, "step": 626, "step_time": 97.47150356322527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 437.3125, "completions/mean_terminated_length": 437.3125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.541946530342102, "epoch": 3.317460317460317, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0028036432340741158, "learning_rate": 3.360872757012011e-07, "loss": 0.0022, "num_tokens": 12284448.0, "reward": 0.3958333432674408, "reward_std": 0.10767625272274017, "rewards/itbench_correctness/mean": 0.3958333432674408, "rewards/itbench_correctness/std": 0.3657817840576172, "step": 627, "step_time": 93.49457087833434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 665.375, "completions/mean_terminated_length": 545.8333740234375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 0.3877512812614441, "epoch": 3.322751322751323, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0014740098267793655, "learning_rate": 3.345260785280358e-07, "loss": 0.0046, "num_tokens": 12301662.0, "reward": 0.609375, "reward_std": 0.11860001087188721, "rewards/itbench_correctness/mean": 0.609375, "rewards/itbench_correctness/std": 0.19654129445552826, "step": 628, "step_time": 155.64716604631394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 985.625, "completions/mean_terminated_length": 921.6666870117188, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "entropy": 0.3896005153656006, "epoch": 3.328042328042328, "frac_reward_zero_std": 0.5, "grad_norm": 1.4375, "kl": 0.0012615715386345983, "learning_rate": 3.329666909511645e-07, "loss": 0.0, "num_tokens": 12328376.0, "reward": 0.07500000298023224, "reward_std": 0.09161254018545151, "rewards/itbench_correctness/mean": 0.07500000298023224, "rewards/itbench_correctness/std": 0.14719600975513458, "step": 629, "step_time": 188.39432869665325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 747.5, "completions/mean_terminated_length": 621.8181762695312, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.32107022404670715, "epoch": 3.3333333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.0018245604587718844, "learning_rate": 3.314091300237999e-07, "loss": 0.0001, "num_tokens": 12347128.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 630, "step_time": 1010.9434453165159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 508.3125, "completions/mean_terminated_length": 508.3125, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "entropy": 0.38755688071250916, "epoch": 3.3386243386243386, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0018675376195460558, "learning_rate": 3.2985341277917846e-07, "loss": -0.0092, "num_tokens": 12363957.0, "reward": 0.640625, "reward_std": 0.1446593999862671, "rewards/itbench_correctness/mean": 0.640625, "rewards/itbench_correctness/std": 0.29181545972824097, "step": 631, "step_time": 96.36415668576956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 822.125, "completions/mean_terminated_length": 822.125, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.29314276576042175, "epoch": 3.3439153439153437, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.002633322263136506, "learning_rate": 3.282995562303753e-07, "loss": 0.025, "num_tokens": 12384663.0, "reward": 0.6741071343421936, "reward_std": 0.29771745204925537, "rewards/itbench_correctness/mean": 0.6741071343421936, "rewards/itbench_correctness/std": 0.3393692374229431, "step": 632, "step_time": 81.67576451133937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 741.4375, "completions/mean_terminated_length": 521.6666870117188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.49633312225341797, "epoch": 3.3492063492063493, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.0015913519309833646, "learning_rate": 3.2674757737011606e-07, "loss": 0.0059, "num_tokens": 12407326.0, "reward": 0.4937500059604645, "reward_std": 0.01767767034471035, "rewards/itbench_correctness/mean": 0.4937500059604645, "rewards/itbench_correctness/std": 0.510514497756958, "step": 633, "step_time": 96.86188104748726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 776.0625, "completions/mean_terminated_length": 740.6428833007812, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "entropy": 0.6081984639167786, "epoch": 3.3544973544973544, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.0018657725304365158, "learning_rate": 3.2519749317059327e-07, "loss": 0.0253, "num_tokens": 12442815.0, "reward": 0.4375, "reward_std": 0.49022960662841797, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 634, "step_time": 115.92930174898356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 700.625, "completions/mean_terminated_length": 654.4285888671875, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 0.3297056257724762, "epoch": 3.35978835978836, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.0020810659043490887, "learning_rate": 3.236493205832794e-07, "loss": 0.0001, "num_tokens": 12459393.0, "reward": 0.0833333358168602, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0833333358168602, "rewards/itbench_correctness/std": 0.08606629818677902, "step": 635, "step_time": 450.1781229842454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 707.625, "completions/mean_terminated_length": 563.8181762695312, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "entropy": 0.48048049211502075, "epoch": 3.365079365079365, "frac_reward_zero_std": 0.5, "grad_norm": 1.0546875, "kl": 0.001259908196516335, "learning_rate": 3.221030765387417e-07, "loss": -0.0114, "num_tokens": 12495003.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 636, "step_time": 161.46651719231158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 671.0, "completions/mean_terminated_length": 318.0, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.42324888706207275, "epoch": 3.3703703703703702, "frac_reward_zero_std": 0.5, "grad_norm": 1.03125, "kl": 0.0016913211438804865, "learning_rate": 3.205587779464576e-07, "loss": 0.008, "num_tokens": 12512171.0, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 637, "step_time": 103.04572070110589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 563.5, "completions/mean_terminated_length": 563.5, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 0.41703638434410095, "epoch": 3.375661375661376, "frac_reward_zero_std": 0.5, "grad_norm": 1.0859375, "kl": 0.0013338474091142416, "learning_rate": 3.190164416946285e-07, "loss": -0.022, "num_tokens": 12524003.0, "reward": 0.9166666269302368, "reward_std": 0.044543541967868805, "rewards/itbench_correctness/mean": 0.9166666269302368, "rewards/itbench_correctness/std": 0.10540926456451416, "step": 638, "step_time": 89.99323462788016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 590.5625, "completions/mean_terminated_length": 590.5625, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.3149539530277252, "epoch": 3.380952380952381, "frac_reward_zero_std": 0.5, "grad_norm": 1.28125, "kl": 0.0011156360851600766, "learning_rate": 3.174760846499972e-07, "loss": 0.0079, "num_tokens": 12539556.0, "reward": 0.40625, "reward_std": 0.01767767034471035, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.420267790555954, "step": 639, "step_time": 1139.211639557965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 881.3125, "completions/mean_terminated_length": 567.4000244140625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "entropy": 0.3472094237804413, "epoch": 3.386243386243386, "frac_reward_zero_std": 0.5, "grad_norm": 1.34375, "kl": 0.0015863279113546014, "learning_rate": 3.15937723657661e-07, "loss": 0.0695, "num_tokens": 12568353.0, "reward": 0.34375, "reward_std": 0.16925080120563507, "rewards/itbench_correctness/mean": 0.34375, "rewards/itbench_correctness/std": 0.4236907958984375, "step": 640, "step_time": 180.37901693582535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 970.4375, "completions/mean_terminated_length": 881.1666870117188, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "entropy": 0.6265215277671814, "epoch": 3.3915343915343916, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0014786267420277, "learning_rate": 3.1440137554088953e-07, "loss": 0.0438, "num_tokens": 12603128.0, "reward": 0.19999998807907104, "reward_std": 0.3343248665332794, "rewards/itbench_correctness/mean": 0.19999998807907104, "rewards/itbench_correctness/std": 0.3326660096645355, "step": 641, "step_time": 174.49532955139875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 603.75, "completions/mean_terminated_length": 603.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4306418299674988, "epoch": 3.3968253968253967, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.002129571046680212, "learning_rate": 3.1286705710093984e-07, "loss": -0.0944, "num_tokens": 12617060.0, "reward": 0.737500011920929, "reward_std": 0.25792384147644043, "rewards/itbench_correctness/mean": 0.737500011920929, "rewards/itbench_correctness/std": 0.26884526014328003, "step": 642, "step_time": 78.10744374617934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 557.0, "completions/mean_terminated_length": 557.0, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.35727110505104065, "epoch": 3.402116402116402, "frac_reward_zero_std": 1.0, "grad_norm": 0.041015625, "kl": 0.0019330759532749653, "learning_rate": 3.113347851168721e-07, "loss": 0.0, "num_tokens": 12629716.0, "reward": 0.75, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 643, "step_time": 797.8743101553991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 699.75, "completions/mean_terminated_length": 552.3636474609375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "entropy": 0.3372633159160614, "epoch": 3.4074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.00145020114723593, "learning_rate": 3.0980457634536774e-07, "loss": 0.0122, "num_tokens": 12654240.0, "reward": 0.46875, "reward_std": 0.3471629321575165, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.3859512209892273, "step": 644, "step_time": 142.23401138465852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 412.625, "completions/mean_terminated_length": 412.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.43380793929100037, "epoch": 3.4126984126984126, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.003082259325310588, "learning_rate": 3.082764475205442e-07, "loss": -0.0126, "num_tokens": 12663858.0, "reward": 0.828125, "reward_std": 0.3143535256385803, "rewards/itbench_correctness/mean": 0.828125, "rewards/itbench_correctness/std": 0.3502231538295746, "step": 645, "step_time": 1102.395908644423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 635.9375, "completions/mean_terminated_length": 635.9375, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.4497297406196594, "epoch": 3.417989417989418, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.0015347811859101057, "learning_rate": 3.06750415353774e-07, "loss": 0.0033, "num_tokens": 12680857.0, "reward": 0.5078125, "reward_std": 0.19887377321720123, "rewards/itbench_correctness/mean": 0.5078125, "rewards/itbench_correctness/std": 0.4642843008041382, "step": 646, "step_time": 96.87032896187156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 726.5625, "completions/mean_terminated_length": 657.923095703125, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 0.3275699019432068, "epoch": 3.4232804232804233, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.001753173884935677, "learning_rate": 3.052264965335e-07, "loss": -0.0118, "num_tokens": 12701762.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 647, "step_time": 256.6041612662375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 966.4375, "completions/mean_terminated_length": 870.5, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "entropy": 0.5587531328201294, "epoch": 3.4285714285714284, "frac_reward_zero_std": 0.5, "grad_norm": 1.3984375, "kl": 0.0016219299286603928, "learning_rate": 3.037047077250543e-07, "loss": 0.0127, "num_tokens": 12725865.0, "reward": 0.3958333432674408, "reward_std": 0.0862581878900528, "rewards/itbench_correctness/mean": 0.3958333432674408, "rewards/itbench_correctness/std": 0.4254627227783203, "step": 648, "step_time": 119.4891459485516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 857.0, "completions/mean_terminated_length": 727.1111450195312, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "entropy": 0.3640606701374054, "epoch": 3.433862433862434, "frac_reward_zero_std": 0.5, "grad_norm": 1.5234375, "kl": 0.001420785440132022, "learning_rate": 3.02185065570476e-07, "loss": -0.0097, "num_tokens": 12745521.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 649, "step_time": 151.27136832941324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 518.3125, "completions/mean_terminated_length": 518.3125, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 0.3357048034667969, "epoch": 3.439153439153439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0135498046875, "kl": 0.0010722068836912513, "learning_rate": 3.006675866883275e-07, "loss": 0.0, "num_tokens": 12758590.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 650, "step_time": 951.2108103726059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 753.5, "completions/mean_terminated_length": 735.4666748046875, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.5069674849510193, "epoch": 3.4444444444444446, "frac_reward_zero_std": 0.5, "grad_norm": 1.3984375, "kl": 0.0015233854064717889, "learning_rate": 2.9915228767351535e-07, "loss": 0.0302, "num_tokens": 12775142.0, "reward": 0.6875, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.3095695972442627, "step": 651, "step_time": 164.39702508877963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 945.8125, "completions/mean_terminated_length": 711.25, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "entropy": 0.6090002059936523, "epoch": 3.4497354497354498, "frac_reward_zero_std": 0.5, "grad_norm": 1.8359375, "kl": 0.0016115251928567886, "learning_rate": 2.9763918509710647e-07, "loss": 0.0001, "num_tokens": 12806947.0, "reward": 0.5625, "reward_std": 0.09449111670255661, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.4699290990829468, "step": 652, "step_time": 204.87098419014364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 806.375, "completions/mean_terminated_length": 707.45458984375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.5605332255363464, "epoch": 3.455026455026455, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.001556203467771411, "learning_rate": 2.961282955061483e-07, "loss": -0.0919, "num_tokens": 12831673.0, "reward": 0.8125, "reward_std": 0.4082317352294922, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 653, "step_time": 89.19978978857398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 883.1875, "completions/mean_terminated_length": 819.1818237304688, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "entropy": 0.33967873454093933, "epoch": 3.4603174603174605, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0015498069114983082, "learning_rate": 2.9461963542348733e-07, "loss": 0.0158, "num_tokens": 12860092.0, "reward": 0.6000000238418579, "reward_std": 0.28192007541656494, "rewards/itbench_correctness/mean": 0.6000000238418579, "rewards/itbench_correctness/std": 0.4242640733718872, "step": 654, "step_time": 85.3845539437607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 841.75, "completions/mean_terminated_length": 829.6000366210938, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "entropy": 0.4942084848880768, "epoch": 3.4656084656084656, "frac_reward_zero_std": 0.5, "grad_norm": 1.4765625, "kl": 0.0017516858642920852, "learning_rate": 2.931132213475884e-07, "loss": 0.0093, "num_tokens": 12887824.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.25, "step": 655, "step_time": 330.32038860116154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 744.625, "completions/mean_terminated_length": 704.7142944335938, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "entropy": 0.4163169264793396, "epoch": 3.4708994708994707, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.0016559087671339512, "learning_rate": 2.916090697523549e-07, "loss": 0.0156, "num_tokens": 12904234.0, "reward": 0.875, "reward_std": 0.25583362579345703, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.26457512378692627, "step": 656, "step_time": 138.49934119079262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 745.0625, "completions/mean_terminated_length": 618.2727661132812, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4966026246547699, "epoch": 3.4761904761904763, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0017499123932793736, "learning_rate": 2.901071970869472e-07, "loss": -0.0134, "num_tokens": 12932827.0, "reward": 0.515625, "reward_std": 0.32311493158340454, "rewards/itbench_correctness/mean": 0.515625, "rewards/itbench_correctness/std": 0.436970591545105, "step": 657, "step_time": 79.10722716152668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 691.5, "completions/mean_terminated_length": 691.5, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "entropy": 0.37310194969177246, "epoch": 3.4814814814814814, "frac_reward_zero_std": 0.5, "grad_norm": 1.3203125, "kl": 0.0011259306920692325, "learning_rate": 2.8860761977560433e-07, "loss": -0.0094, "num_tokens": 12948403.0, "reward": 0.8828125, "reward_std": 0.07790146768093109, "rewards/itbench_correctness/mean": 0.8828125, "rewards/itbench_correctness/std": 0.16117246448993683, "step": 658, "step_time": 615.1303538642824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 474.0, "completions/mean_terminated_length": 474.0, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.5063291192054749, "epoch": 3.4867724867724865, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.0021006267052143812, "learning_rate": 2.8711035421746363e-07, "loss": 0.004, "num_tokens": 12967043.0, "reward": 0.2265625, "reward_std": 0.24306795001029968, "rewards/itbench_correctness/mean": 0.2265625, "rewards/itbench_correctness/std": 0.2784583568572998, "step": 659, "step_time": 88.2505495576188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 750.375, "completions/mean_terminated_length": 537.5555419921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5357321500778198, "epoch": 3.492063492063492, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.0018287766724824905, "learning_rate": 2.856154167863814e-07, "loss": -0.2463, "num_tokens": 12990473.0, "reward": 0.4114583134651184, "reward_std": 0.1860596239566803, "rewards/itbench_correctness/mean": 0.4114583134651184, "rewards/itbench_correctness/std": 0.3502231538295746, "step": 660, "step_time": 113.05205366853625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 691.875, "completions/mean_terminated_length": 615.2307739257812, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.5694670081138611, "epoch": 3.497354497354497, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0017705087084323168, "learning_rate": 2.841228238307536e-07, "loss": -0.0069, "num_tokens": 13014367.0, "reward": 0.4375, "reward_std": 0.3532657027244568, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.38188132643699646, "step": 661, "step_time": 138.94573136605322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 796.875, "completions/mean_terminated_length": 764.4285888671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.40658822655677795, "epoch": 3.502645502645503, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.001931068836711347, "learning_rate": 2.8263259167333774e-07, "loss": 0.0253, "num_tokens": 13039493.0, "reward": 0.10625000298023224, "reward_std": 0.14168164134025574, "rewards/itbench_correctness/mean": 0.10625000298023224, "rewards/itbench_correctness/std": 0.1722267121076584, "step": 662, "step_time": 135.8809123178944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 669.3125, "completions/mean_terminated_length": 669.3125, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "entropy": 0.4213278591632843, "epoch": 3.507936507936508, "frac_reward_zero_std": 0.5, "grad_norm": 1.625, "kl": 0.001925744814798236, "learning_rate": 2.811447366110741e-07, "loss": -0.0154, "num_tokens": 13055098.0, "reward": 0.4001736044883728, "reward_std": 0.14328064024448395, "rewards/itbench_correctness/mean": 0.4001736044883728, "rewards/itbench_correctness/std": 0.45731422305107117, "step": 663, "step_time": 1172.1156589342281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 654.1875, "completions/mean_terminated_length": 366.5555725097656, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4555268883705139, "epoch": 3.5132275132275135, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.0016494187293574214, "learning_rate": 2.7965927491490704e-07, "loss": -0.1445, "num_tokens": 13078749.0, "reward": 0.890625, "reward_std": 0.2414703369140625, "rewards/itbench_correctness/mean": 0.890625, "rewards/itbench_correctness/std": 0.2576940953731537, "step": 664, "step_time": 790.0815438805148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 565.625, "completions/mean_terminated_length": 565.625, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 0.43668508529663086, "epoch": 3.5185185185185186, "frac_reward_zero_std": 1.0, "grad_norm": 0.041259765625, "kl": 0.0019728399347513914, "learning_rate": 2.7817622282960813e-07, "loss": 0.0, "num_tokens": 13091575.0, "reward": 1.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 1.0, "rewards/itbench_correctness/std": 0.0, "step": 665, "step_time": 94.42669316660613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 603.8125, "completions/mean_terminated_length": 603.8125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "entropy": 0.4007866680622101, "epoch": 3.5238095238095237, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.001383933937177062, "learning_rate": 2.7669559657359673e-07, "loss": -0.0116, "num_tokens": 13105124.0, "reward": 0.78125, "reward_std": 0.3471629321575165, "rewards/itbench_correctness/mean": 0.78125, "rewards/itbench_correctness/std": 0.4069705307483673, "step": 666, "step_time": 72.2391463033855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 718.625, "completions/mean_terminated_length": 616.8333740234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4369455575942993, "epoch": 3.5291005291005293, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0016333801904693246, "learning_rate": 2.7521741233876493e-07, "loss": -0.1036, "num_tokens": 13134246.0, "reward": 0.42500001192092896, "reward_std": 0.3273707628250122, "rewards/itbench_correctness/mean": 0.42500001192092896, "rewards/itbench_correctness/std": 0.44347113370895386, "step": 667, "step_time": 192.8937590336427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 604.125, "completions/mean_terminated_length": 507.23077392578125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.3542313277721405, "epoch": 3.5343915343915344, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.0024427901953458786, "learning_rate": 2.737416862902981e-07, "loss": -0.0391, "num_tokens": 13147840.0, "reward": 0.2569444477558136, "reward_std": 0.12661024928092957, "rewards/itbench_correctness/mean": 0.2569444477558136, "rewards/itbench_correctness/std": 0.23537467420101166, "step": 668, "step_time": 92.6653502555564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 689.0, "completions/mean_terminated_length": 354.0, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.42670536041259766, "epoch": 3.5396825396825395, "frac_reward_zero_std": 0.5, "grad_norm": 1.6640625, "kl": 0.0015613737050443888, "learning_rate": 2.722684345665003e-07, "loss": 0.0, "num_tokens": 13163432.0, "reward": 0.5, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.3651483952999115, "step": 669, "step_time": 71.54559296742082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 584.25, "completions/mean_terminated_length": 584.25, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.45186135172843933, "epoch": 3.544973544973545, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0028362891171127558, "learning_rate": 2.707976732786166e-07, "loss": -0.0174, "num_tokens": 13176020.0, "reward": 0.3971354365348816, "reward_std": 0.3280077576637268, "rewards/itbench_correctness/mean": 0.3971354365348816, "rewards/itbench_correctness/std": 0.4046509563922882, "step": 670, "step_time": 115.24186983983964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 513.3125, "completions/mean_terminated_length": 513.3125, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "entropy": 0.4909290075302124, "epoch": 3.5502645502645502, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0017170688370242715, "learning_rate": 2.6932941851065615e-07, "loss": -0.0309, "num_tokens": 13189441.0, "reward": 0.4270833432674408, "reward_std": 0.36084234714508057, "rewards/itbench_correctness/mean": 0.4270833432674408, "rewards/itbench_correctness/std": 0.40583136677742004, "step": 671, "step_time": 68.62113481201231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 781.125, "completions/mean_terminated_length": 592.2222290039062, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.35333654284477234, "epoch": 3.5555555555555554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0257568359375, "kl": 0.0011492978082969785, "learning_rate": 2.6786368631921834e-07, "loss": 0.0, "num_tokens": 13207635.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 672, "step_time": 7667.425364185125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 563.0625, "completions/mean_terminated_length": 532.3333740234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5292485356330872, "epoch": 3.560846560846561, "frac_reward_zero_std": 0.5, "grad_norm": 1.2109375, "kl": 0.0018245880492031574, "learning_rate": 2.664004927333151e-07, "loss": 0.0172, "num_tokens": 13229564.0, "reward": 0.34166666865348816, "reward_std": 0.13930098712444305, "rewards/itbench_correctness/mean": 0.34166666865348816, "rewards/itbench_correctness/std": 0.4009248614311218, "step": 673, "step_time": 94.37730458006263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 1013.6875, "completions/mean_terminated_length": 969.0, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "entropy": 0.3610580265522003, "epoch": 3.566137566137566, "frac_reward_zero_std": 0.5, "grad_norm": 1.609375, "kl": 0.0011879053199663758, "learning_rate": 2.6493985375419775e-07, "loss": 0.0, "num_tokens": 13256847.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 674, "step_time": 230.72378408256918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 416.375, "completions/mean_terminated_length": 416.375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5355749130249023, "epoch": 3.571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.65625, "kl": 0.0020900224335491657, "learning_rate": 2.6348178535517965e-07, "loss": -0.1134, "num_tokens": 13268133.0, "reward": 0.5729166269302368, "reward_std": 0.20013636350631714, "rewards/itbench_correctness/mean": 0.5729166269302368, "rewards/itbench_correctness/std": 0.28361913561820984, "step": 675, "step_time": 56.21729406807572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 439.875, "completions/mean_terminated_length": 439.875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.4046604037284851, "epoch": 3.5767195767195767, "frac_reward_zero_std": 0.5, "grad_norm": 0.9609375, "kl": 0.0022217335645109415, "learning_rate": 2.620263034814632e-07, "loss": -0.0015, "num_tokens": 13278187.0, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 676, "step_time": 1194.6907618306577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 816.25, "completions/mean_terminated_length": 608.5, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.45329248905181885, "epoch": 3.582010582010582, "frac_reward_zero_std": 0.5, "grad_norm": 1.09375, "kl": 0.0015461933799088001, "learning_rate": 2.605734240499652e-07, "loss": -0.0181, "num_tokens": 13301351.0, "reward": 0.4479166567325592, "reward_std": 0.06200198084115982, "rewards/itbench_correctness/mean": 0.4479166567325592, "rewards/itbench_correctness/std": 0.4702983796596527, "step": 677, "step_time": 130.93884664587677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 537.1875, "completions/mean_terminated_length": 537.1875, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "entropy": 0.4207097291946411, "epoch": 3.5873015873015874, "frac_reward_zero_std": 1.0, "grad_norm": 0.052734375, "kl": 0.001912532257847488, "learning_rate": 2.591231629491423e-07, "loss": 0.0, "num_tokens": 13314330.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 678, "step_time": 95.27887518052012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 665.8125, "completions/mean_terminated_length": 665.8125, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "entropy": 0.36646953225135803, "epoch": 3.5925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.002286773407831788, "learning_rate": 2.5767553603881764e-07, "loss": -0.0025, "num_tokens": 13329519.0, "reward": 0.625, "reward_std": 0.3104073107242584, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.3979112207889557, "step": 679, "step_time": 807.2301516216248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 451.3125, "completions/mean_terminated_length": 451.3125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "entropy": 0.3833264112472534, "epoch": 3.597883597883598, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0012348750606179237, "learning_rate": 2.5623055915000686e-07, "loss": 0.01, "num_tokens": 13339652.0, "reward": 0.6193181872367859, "reward_std": 0.13179811835289001, "rewards/itbench_correctness/mean": 0.6193181872367859, "rewards/itbench_correctness/std": 0.4134864807128906, "step": 680, "step_time": 135.08529091719538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 646.125, "completions/mean_terminated_length": 558.923095703125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.4178757965564728, "epoch": 3.6031746031746033, "frac_reward_zero_std": 0.5, "grad_norm": 1.40625, "kl": 0.0014911647886037827, "learning_rate": 2.547882480847461e-07, "loss": 0.0, "num_tokens": 13353918.0, "reward": 0.38749998807907104, "reward_std": 0.1157275065779686, "rewards/itbench_correctness/mean": 0.38749998807907104, "rewards/itbench_correctness/std": 0.4303099811077118, "step": 681, "step_time": 99.35422214772552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 769.75, "completions/mean_terminated_length": 572.0, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 0.7534914016723633, "epoch": 3.6084656084656084, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0021665135864168406, "learning_rate": 2.533486186159175e-07, "loss": 0.0043, "num_tokens": 13375346.0, "reward": 0.390625, "reward_std": 0.27564918994903564, "rewards/itbench_correctness/mean": 0.390625, "rewards/itbench_correctness/std": 0.4913311004638672, "step": 682, "step_time": 92.11163073871285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 768.75, "completions/mean_terminated_length": 513.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3460162580013275, "epoch": 3.613756613756614, "frac_reward_zero_std": 0.5, "grad_norm": 1.0234375, "kl": 0.0016589618753641844, "learning_rate": 2.5191168648707884e-07, "loss": -0.0447, "num_tokens": 13402526.0, "reward": 0.11328125, "reward_std": 0.08341467380523682, "rewards/itbench_correctness/mean": 0.11328125, "rewards/itbench_correctness/std": 0.16332921385765076, "step": 683, "step_time": 159.6508161853999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 546.1875, "completions/mean_terminated_length": 546.1875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "entropy": 0.44124042987823486, "epoch": 3.619047619047619, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0013375874841585755, "learning_rate": 2.5047746741228977e-07, "loss": -0.0346, "num_tokens": 13415305.0, "reward": 0.5416666865348816, "reward_std": 0.21535253524780273, "rewards/itbench_correctness/mean": 0.5416666865348816, "rewards/itbench_correctness/std": 0.4238273799419403, "step": 684, "step_time": 587.8082643058151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 800.875, "completions/mean_terminated_length": 786.0000610351562, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "entropy": 0.34712034463882446, "epoch": 3.624338624338624, "frac_reward_zero_std": 0.5, "grad_norm": 1.0234375, "kl": 0.0008109880145639181, "learning_rate": 2.490459770759398e-07, "loss": -0.008, "num_tokens": 13434959.0, "reward": 0.296875, "reward_std": 0.0646936446428299, "rewards/itbench_correctness/mean": 0.296875, "rewards/itbench_correctness/std": 0.31909704208374023, "step": 685, "step_time": 89.69222616031766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 470.125, "completions/mean_terminated_length": 470.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.45094388723373413, "epoch": 3.6296296296296298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0274658203125, "kl": 0.0016529399435967207, "learning_rate": 2.476172311325783e-07, "loss": 0.0, "num_tokens": 13445337.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 686, "step_time": 106.66353179235011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 804.6875, "completions/mean_terminated_length": 673.1000366210938, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.4324660301208496, "epoch": 3.634920634920635, "frac_reward_zero_std": 0.5, "grad_norm": 1.2109375, "kl": 0.0018160316394641995, "learning_rate": 2.4619124520674145e-07, "loss": 0.0029, "num_tokens": 13466644.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 687, "step_time": 257.70799226593226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 765.375, "completions/mean_terminated_length": 610.2000122070312, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "entropy": 0.535685122013092, "epoch": 3.64021164021164, "frac_reward_zero_std": 0.5, "grad_norm": 1.515625, "kl": 0.0018447366310283542, "learning_rate": 2.447680348927837e-07, "loss": 0.0164, "num_tokens": 13488698.0, "reward": 0.1875, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.25, "step": 688, "step_time": 100.56179421767592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 680.0625, "completions/mean_terminated_length": 473.70001220703125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.36467236280441284, "epoch": 3.6455026455026456, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.0013507335679605603, "learning_rate": 2.4334761575470434e-07, "loss": 0.0176, "num_tokens": 13505539.0, "reward": 0.518750011920929, "reward_std": 0.2103695124387741, "rewards/itbench_correctness/mean": 0.518750011920929, "rewards/itbench_correctness/std": 0.38929542899131775, "step": 689, "step_time": 76.81764477398247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 653.4375, "completions/mean_terminated_length": 628.7333374023438, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.4254423677921295, "epoch": 3.6507936507936507, "frac_reward_zero_std": 0.5, "grad_norm": 1.28125, "kl": 0.0013152090832591057, "learning_rate": 2.419300033259798e-07, "loss": 0.0775, "num_tokens": 13524146.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.25, "step": 690, "step_time": 80.11625996977091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 765.9375, "completions/mean_terminated_length": 565.2222290039062, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.3042023777961731, "epoch": 3.656084656084656, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.002678963588550687, "learning_rate": 2.4051521310939254e-07, "loss": 0.0039, "num_tokens": 13544377.0, "reward": 0.375, "reward_std": 0.49871626496315, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.5, "step": 691, "step_time": 937.4359912928194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 990.8125, "completions/mean_terminated_length": 847.0, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "entropy": 0.4723396301269531, "epoch": 3.6613756613756614, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.001547610154375434, "learning_rate": 2.3910326057686124e-07, "loss": 0.0235, "num_tokens": 13570094.0, "reward": 0.2622767686843872, "reward_std": 0.17326994240283966, "rewards/itbench_correctness/mean": 0.2622767686843872, "rewards/itbench_correctness/std": 0.22186197340488434, "step": 692, "step_time": 241.44549081102014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 634.1875, "completions/mean_terminated_length": 457.0, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "entropy": 0.39735883474349976, "epoch": 3.6666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.0194091796875, "kl": 0.0012317921500653028, "learning_rate": 2.3769416116927333e-07, "loss": 0.0, "num_tokens": 13591193.0, "reward": 0.3333333432674408, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.3333333432674408, "rewards/itbench_correctness/std": 0.3442651927471161, "step": 693, "step_time": 156.18573713861406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 772.6875, "completions/mean_terminated_length": 521.375, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.41672733426094055, "epoch": 3.671957671957672, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.0018979725427925587, "learning_rate": 2.362879302963135e-07, "loss": 0.0086, "num_tokens": 13612116.0, "reward": 0.53125, "reward_std": 0.29986464977264404, "rewards/itbench_correctness/mean": 0.53125, "rewards/itbench_correctness/std": 0.4366062581539154, "step": 694, "step_time": 268.16689282283187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 817.3125, "completions/mean_terminated_length": 610.625, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.5040911436080933, "epoch": 3.677248677248677, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0017554876394569874, "learning_rate": 2.3488458333629773e-07, "loss": 0.0111, "num_tokens": 13636593.0, "reward": 0.768750011920929, "reward_std": 0.3954527974128723, "rewards/itbench_correctness/mean": 0.768750011920929, "rewards/itbench_correctness/std": 0.39355337619781494, "step": 695, "step_time": 83.74904467258602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 973.5, "completions/mean_terminated_length": 822.0, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "entropy": 0.44170519709587097, "epoch": 3.682539682539683, "frac_reward_zero_std": 0.5, "grad_norm": 1.3515625, "kl": 0.0014584577875211835, "learning_rate": 2.3348413563600323e-07, "loss": 0.0101, "num_tokens": 13659681.0, "reward": 0.6041666865348816, "reward_std": 0.19287918508052826, "rewards/itbench_correctness/mean": 0.6041666865348816, "rewards/itbench_correctness/std": 0.48638883233070374, "step": 696, "step_time": 80.2720007058233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 911.125, "completions/mean_terminated_length": 766.0000610351562, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "entropy": 0.4477980434894562, "epoch": 3.687830687830688, "frac_reward_zero_std": 1.0, "grad_norm": 0.04833984375, "kl": 0.0018148425733670592, "learning_rate": 2.3208660251050156e-07, "loss": 0.0001, "num_tokens": 13701451.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 697, "step_time": 590.3981730565429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 921.625, "completions/mean_terminated_length": 478.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.5208191871643066, "epoch": 3.693121693121693, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.003547599073499441, "learning_rate": 2.306919992429917e-07, "loss": -0.0535, "num_tokens": 13724541.0, "reward": 0.48284316062927246, "reward_std": 0.2941243052482605, "rewards/itbench_correctness/mean": 0.48284316062927246, "rewards/itbench_correctness/std": 0.4697091579437256, "step": 698, "step_time": 156.73526183422655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 654.875, "completions/mean_terminated_length": 654.875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 0.3054018020629883, "epoch": 3.6984126984126986, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0012958102161064744, "learning_rate": 2.2930034108463097e-07, "loss": -0.0132, "num_tokens": 13740411.0, "reward": 0.6156250238418579, "reward_std": 0.1601249873638153, "rewards/itbench_correctness/mean": 0.6156250238418579, "rewards/itbench_correctness/std": 0.16301201283931732, "step": 699, "step_time": 99.30126603785902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 779.1875, "completions/mean_terminated_length": 534.375, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.6083259582519531, "epoch": 3.7037037037037037, "frac_reward_zero_std": 1.0, "grad_norm": 0.020751953125, "kl": 0.0015235234750434756, "learning_rate": 2.2791164325437046e-07, "loss": 0.0, "num_tokens": 13769414.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 700, "step_time": 121.30535170529038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 895.125, "completions/mean_terminated_length": 766.25, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "entropy": 0.33514872193336487, "epoch": 3.708994708994709, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.0013990112347528338, "learning_rate": 2.2652592093878665e-07, "loss": 0.036, "num_tokens": 13794072.0, "reward": 0.125, "reward_std": 0.1746530830860138, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.17743022739887238, "step": 701, "step_time": 441.7110221767798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 561.3125, "completions/mean_terminated_length": 561.3125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "entropy": 0.5237724184989929, "epoch": 3.7142857142857144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0223388671875, "kl": 0.0015220106579363346, "learning_rate": 2.2514318929191706e-07, "loss": 0.0, "num_tokens": 13810757.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 702, "step_time": 111.1009431509301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 699.9375, "completions/mean_terminated_length": 375.875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 0.4943298399448395, "epoch": 3.7195767195767195, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.001631591934710741, "learning_rate": 2.237634634350934e-07, "loss": 0.0044, "num_tokens": 13831820.0, "reward": 0.6041666865348816, "reward_std": 0.49329501390457153, "rewards/itbench_correctness/mean": 0.6041666865348816, "rewards/itbench_correctness/std": 0.4901813864707947, "step": 703, "step_time": 122.44539823755622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 715.6875, "completions/mean_terminated_length": 530.7000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4499170482158661, "epoch": 3.7248677248677247, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0019055476877838373, "learning_rate": 2.223867584567766e-07, "loss": -0.0356, "num_tokens": 13848519.0, "reward": 0.6761363744735718, "reward_std": 0.2798159420490265, "rewards/itbench_correctness/mean": 0.6761363744735718, "rewards/itbench_correctness/std": 0.4717574715614319, "step": 704, "step_time": 72.04201124608517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 997.9375, "completions/mean_terminated_length": 815.5, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "entropy": 0.5070458054542542, "epoch": 3.7301587301587302, "frac_reward_zero_std": 0.5, "grad_norm": 1.4453125, "kl": 0.0012196371098980308, "learning_rate": 2.21013089412392e-07, "loss": 0.0007, "num_tokens": 13874222.0, "reward": 0.27085813879966736, "reward_std": 0.10523707419633865, "rewards/itbench_correctness/mean": 0.27085813879966736, "rewards/itbench_correctness/std": 0.3145284056663513, "step": 705, "step_time": 71.73409328702837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 560.5, "completions/mean_terminated_length": 560.5, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "entropy": 0.4085637927055359, "epoch": 3.7354497354497354, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.001750650000758469, "learning_rate": 2.1964247132416368e-07, "loss": 0.0159, "num_tokens": 13886822.0, "reward": 0.578125, "reward_std": 0.13797250390052795, "rewards/itbench_correctness/mean": 0.578125, "rewards/itbench_correctness/std": 0.3949551582336426, "step": 706, "step_time": 854.83407723438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 760.5, "completions/mean_terminated_length": 497.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4549638330936432, "epoch": 3.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.004151123110204935, "learning_rate": 2.1827491918095177e-07, "loss": -0.0855, "num_tokens": 13909070.0, "reward": 0.4791666567325592, "reward_std": 0.221320241689682, "rewards/itbench_correctness/mean": 0.4791666567325592, "rewards/itbench_correctness/std": 0.47871360182762146, "step": 707, "step_time": 122.79243450798094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 957.75, "completions/mean_terminated_length": 847.3333740234375, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "entropy": 0.48238056898117065, "epoch": 3.746031746031746, "frac_reward_zero_std": 0.5, "grad_norm": 1.4375, "kl": 0.001507254783064127, "learning_rate": 2.1691044793808733e-07, "loss": 0.0161, "num_tokens": 13939002.0, "reward": 0.515625, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.515625, "rewards/itbench_correctness/std": 0.503891110420227, "step": 708, "step_time": 193.1200410258025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 613.0625, "completions/mean_terminated_length": 613.0625, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "entropy": 0.39963299036026, "epoch": 3.751322751322751, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0013470432022586465, "learning_rate": 2.1554907251720945e-07, "loss": -0.0121, "num_tokens": 13952611.0, "reward": 0.6875, "reward_std": 0.33614614605903625, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.4425306022167206, "step": 709, "step_time": 139.45972900651395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 533.5, "completions/mean_terminated_length": 533.5, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "entropy": 0.5510777831077576, "epoch": 3.7566137566137567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0260009765625, "kl": 0.0015925114275887609, "learning_rate": 2.1419080780610122e-07, "loss": 0.0, "num_tokens": 13963859.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 710, "step_time": 515.4728924324736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 950.6875, "completions/mean_terminated_length": 877.375, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "entropy": 0.3702583611011505, "epoch": 3.761904761904762, "frac_reward_zero_std": 1.0, "grad_norm": 0.201171875, "kl": 0.001159166102297604, "learning_rate": 2.128356686585282e-07, "loss": 0.0, "num_tokens": 13988342.0, "reward": 0.875, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.12909944355487823, "step": 711, "step_time": 94.62389472685754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 636.3125, "completions/mean_terminated_length": 546.84619140625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5091837644577026, "epoch": 3.7671957671957674, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.0018951293313875794, "learning_rate": 2.1148366989407497e-07, "loss": -0.0344, "num_tokens": 14007139.0, "reward": 0.1041666716337204, "reward_std": 0.1178511306643486, "rewards/itbench_correctness/mean": 0.1041666716337204, "rewards/itbench_correctness/std": 0.13437096774578094, "step": 712, "step_time": 456.887752013281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 657.875, "completions/mean_terminated_length": 573.3846435546875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.40737220644950867, "epoch": 3.7724867724867726, "frac_reward_zero_std": 0.5, "grad_norm": 1.3984375, "kl": 0.0015902683371677995, "learning_rate": 2.101348262979833e-07, "loss": -0.0082, "num_tokens": 14027257.0, "reward": 0.4742647111415863, "reward_std": 0.0482964813709259, "rewards/itbench_correctness/mean": 0.4742647111415863, "rewards/itbench_correctness/std": 0.4942431151866913, "step": 713, "step_time": 178.64222278352827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 685.0625, "completions/mean_terminated_length": 346.125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.6481160521507263, "epoch": 3.7777777777777777, "frac_reward_zero_std": 0.5, "grad_norm": 7.1875, "kl": 0.0014102908316999674, "learning_rate": 2.0878915262099096e-07, "loss": 0.0, "num_tokens": 14044666.0, "reward": 0.046875, "reward_std": 0.09300297498703003, "rewards/itbench_correctness/mean": 0.046875, "rewards/itbench_correctness/std": 0.1359764039516449, "step": 714, "step_time": 107.09331988729537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 779.3125, "completions/mean_terminated_length": 668.0909423828125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 0.49530836939811707, "epoch": 3.7830687830687832, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0014117928221821785, "learning_rate": 2.0744666357916925e-07, "loss": 0.0542, "num_tokens": 14060735.0, "reward": 0.4479166865348816, "reward_std": 0.20154890418052673, "rewards/itbench_correctness/mean": 0.4479166865348816, "rewards/itbench_correctness/std": 0.4550386667251587, "step": 715, "step_time": 420.523594789207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 562.0625, "completions/mean_terminated_length": 531.2667236328125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "entropy": 0.50172358751297, "epoch": 3.7883597883597884, "frac_reward_zero_std": 0.5, "grad_norm": 1.2421875, "kl": 0.0018034385284408927, "learning_rate": 2.0610737385376348e-07, "loss": 0.0156, "num_tokens": 14075944.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 716, "step_time": 1006.8671864075586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 761.0625, "completions/mean_terminated_length": 556.5555419921875, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "entropy": 0.41783690452575684, "epoch": 3.7936507936507935, "frac_reward_zero_std": 1.0, "grad_norm": 0.049560546875, "kl": 0.0019152258755639195, "learning_rate": 2.0477129809103145e-07, "loss": 0.0001, "num_tokens": 14098977.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 717, "step_time": 450.0135377245024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 786.25, "completions/mean_terminated_length": 643.6000366210938, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.521462619304657, "epoch": 3.798941798941799, "frac_reward_zero_std": 1.0, "grad_norm": 0.0230712890625, "kl": 0.001451818854548037, "learning_rate": 2.0343845090208367e-07, "loss": 0.0, "num_tokens": 14115053.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 718, "step_time": 349.72742245160043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 635.5625, "completions/mean_terminated_length": 635.5625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.4814632833003998, "epoch": 3.804232804232804, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0014020799426361918, "learning_rate": 2.0210884686272367e-07, "loss": 0.0339, "num_tokens": 14129190.0, "reward": 0.7466298937797546, "reward_std": 0.3058916926383972, "rewards/itbench_correctness/mean": 0.7466298937797546, "rewards/itbench_correctness/std": 0.38068902492523193, "step": 719, "step_time": 175.0832874653861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 479.125, "completions/mean_terminated_length": 479.125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.44873467087745667, "epoch": 3.8095238095238093, "frac_reward_zero_std": 0.5, "grad_norm": 1.21875, "kl": 0.0020029842853546143, "learning_rate": 2.0078250051328782e-07, "loss": 0.0077, "num_tokens": 14139808.0, "reward": 0.648809552192688, "reward_std": 0.17577169835567474, "rewards/itbench_correctness/mean": 0.648809552192688, "rewards/itbench_correctness/std": 0.28511843085289, "step": 720, "step_time": 102.81194345280528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 414.3125, "completions/mean_terminated_length": 414.3125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.35721829533576965, "epoch": 3.814814814814815, "frac_reward_zero_std": 0.5, "grad_norm": 0.9375, "kl": 0.0016476555028930306, "learning_rate": 1.9945942635848745e-07, "loss": -0.0019, "num_tokens": 14150685.0, "reward": 0.453125, "reward_std": 0.13258251547813416, "rewards/itbench_correctness/mean": 0.453125, "rewards/itbench_correctness/std": 0.5018196105957031, "step": 721, "step_time": 877.2679685084149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 784.125, "completions/mean_terminated_length": 544.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.45911046862602234, "epoch": 3.82010582010582, "frac_reward_zero_std": 0.5, "grad_norm": 1.328125, "kl": 0.0014677182771265507, "learning_rate": 1.981396388672496e-07, "loss": 0.0, "num_tokens": 14172399.0, "reward": 0.2109375, "reward_std": 0.06629125773906708, "rewards/itbench_correctness/mean": 0.2109375, "rewards/itbench_correctness/std": 0.2359323352575302, "step": 722, "step_time": 208.89351680781692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 920.625, "completions/mean_terminated_length": 817.25, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "entropy": 0.262864887714386, "epoch": 3.825396825396825, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0010848907986655831, "learning_rate": 1.9682315247255892e-07, "loss": 0.0043, "num_tokens": 14196345.0, "reward": 0.3854166865348816, "reward_std": 0.297717809677124, "rewards/itbench_correctness/mean": 0.3854166865348816, "rewards/itbench_correctness/std": 0.3145764470100403, "step": 723, "step_time": 847.0749486461282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 442.375, "completions/mean_terminated_length": 442.375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.47018933296203613, "epoch": 3.8306878306878307, "frac_reward_zero_std": 0.5, "grad_norm": 0.9609375, "kl": 0.0014332140563055873, "learning_rate": 1.9550998157129944e-07, "loss": -0.0054, "num_tokens": 14206399.0, "reward": 0.921875, "reward_std": 0.13258251547813416, "rewards/itbench_correctness/mean": 0.921875, "rewards/itbench_correctness/std": 0.1983000785112381, "step": 724, "step_time": 91.33387219905853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 687.5625, "completions/mean_terminated_length": 425.8888854980469, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "entropy": 0.3752386271953583, "epoch": 3.835978835978836, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0017074119532480836, "learning_rate": 1.942001405240979e-07, "loss": 0.0057, "num_tokens": 14222120.0, "reward": 0.2232142835855484, "reward_std": 0.18483898043632507, "rewards/itbench_correctness/mean": 0.2232142835855484, "rewards/itbench_correctness/std": 0.21329134702682495, "step": 725, "step_time": 93.2281863456592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 709.0, "completions/mean_terminated_length": 604.0, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.5275035500526428, "epoch": 3.8412698412698414, "frac_reward_zero_std": 0.5, "grad_norm": 1.28125, "kl": 0.002380735008046031, "learning_rate": 1.9289364365516607e-07, "loss": 0.0403, "num_tokens": 14258400.0, "reward": 0.453125, "reward_std": 0.13258251547813416, "rewards/itbench_correctness/mean": 0.453125, "rewards/itbench_correctness/std": 0.5018196105957031, "step": 726, "step_time": 588.1754347216338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 655.8125, "completions/mean_terminated_length": 631.2667236328125, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.5062422752380371, "epoch": 3.8465608465608465, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.001622261363081634, "learning_rate": 1.915905052521445e-07, "loss": 0.0287, "num_tokens": 14277157.0, "reward": 0.5, "reward_std": 0.2177756428718567, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.3872983455657959, "step": 727, "step_time": 455.7577704479918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 502.9375, "completions/mean_terminated_length": 502.9375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.3757922351360321, "epoch": 3.851851851851852, "frac_reward_zero_std": 0.5, "grad_norm": 1.1875, "kl": 0.0013325168984010816, "learning_rate": 1.9029073956594604e-07, "loss": -0.0056, "num_tokens": 14288780.0, "reward": 0.90625, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.90625, "rewards/itbench_correctness/std": 0.20155644416809082, "step": 728, "step_time": 82.46759236324579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 541.125, "completions/mean_terminated_length": 541.125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.3862323760986328, "epoch": 3.857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.033935546875, "kl": 0.0012876316905021667, "learning_rate": 1.8899436081059972e-07, "loss": 0.0, "num_tokens": 14300838.0, "reward": 0.75, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 729, "step_time": 74.73872442170978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 920.1875, "completions/mean_terminated_length": 786.7142944335938, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "entropy": 0.22712762653827667, "epoch": 3.8624338624338623, "frac_reward_zero_std": 0.5, "grad_norm": 10.0, "kl": 0.0014261262258514762, "learning_rate": 1.877013831630961e-07, "loss": -0.022, "num_tokens": 14323721.0, "reward": 0.1458333432674408, "reward_std": 0.0589255690574646, "rewards/itbench_correctness/mean": 0.1458333432674408, "rewards/itbench_correctness/std": 0.17078252136707306, "step": 730, "step_time": 134.0663373246789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 542.5, "completions/mean_terminated_length": 542.5, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.26359447836875916, "epoch": 3.867724867724868, "frac_reward_zero_std": 0.5, "grad_norm": 1.265625, "kl": 0.0024193329736590385, "learning_rate": 1.8641182076323148e-07, "loss": -0.0109, "num_tokens": 14337273.0, "reward": 0.4166666865348816, "reward_std": 0.08908706903457642, "rewards/itbench_correctness/mean": 0.4166666865348816, "rewards/itbench_correctness/std": 0.14907118678092957, "step": 731, "step_time": 69.43494361732155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 759.0625, "completions/mean_terminated_length": 494.125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "entropy": 0.3925895392894745, "epoch": 3.873015873015873, "frac_reward_zero_std": 0.5, "grad_norm": 1.5078125, "kl": 0.0015624676598235965, "learning_rate": 1.8512568771345378e-07, "loss": 0.0, "num_tokens": 14358434.0, "reward": 0.9166666865348816, "reward_std": 0.17817416787147522, "rewards/itbench_correctness/mean": 0.9166666865348816, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 732, "step_time": 105.75646356213838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 645.6875, "completions/mean_terminated_length": 620.4666748046875, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.6411770582199097, "epoch": 3.878306878306878, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.0023670424707233906, "learning_rate": 1.8384299807870805e-07, "loss": -0.0062, "num_tokens": 14375989.0, "reward": 0.4270833134651184, "reward_std": 0.019287927076220512, "rewards/itbench_correctness/mean": 0.4270833134651184, "rewards/itbench_correctness/std": 0.44187626242637634, "step": 733, "step_time": 85.2570496154949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 710.0625, "completions/mean_terminated_length": 396.125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 0.4816477298736572, "epoch": 3.8835978835978837, "frac_reward_zero_std": 1.0, "grad_norm": 0.0220947265625, "kl": 0.0012989024398848414, "learning_rate": 1.8256376588628235e-07, "loss": 0.0, "num_tokens": 14392414.0, "reward": 0.6785714626312256, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.6785714626312256, "rewards/itbench_correctness/std": 0.18442778289318085, "step": 734, "step_time": 1033.4855123637244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 602.5625, "completions/mean_terminated_length": 411.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.44476714730262756, "epoch": 3.888888888888889, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0017304952489212155, "learning_rate": 1.812880051256551e-07, "loss": -0.0347, "num_tokens": 14405887.0, "reward": 0.3031249940395355, "reward_std": 0.3322408199310303, "rewards/itbench_correctness/mean": 0.3031249940395355, "rewards/itbench_correctness/std": 0.3288711905479431, "step": 735, "step_time": 81.48759328760207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 607.625, "completions/mean_terminated_length": 548.1428833007812, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.3587739169597626, "epoch": 3.894179894179894, "frac_reward_zero_std": 1.0, "grad_norm": 0.04931640625, "kl": 0.0017037625657394528, "learning_rate": 1.8001572974834168e-07, "loss": 0.0, "num_tokens": 14424505.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 736, "step_time": 320.6608420452103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 610.8125, "completions/mean_terminated_length": 583.2667236328125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4322111904621124, "epoch": 3.8994708994708995, "frac_reward_zero_std": 0.5, "grad_norm": 0.498046875, "kl": 0.0016470147529616952, "learning_rate": 1.787469536677419e-07, "loss": -0.1173, "num_tokens": 14461894.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 737, "step_time": 313.65273729898036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 715.9375, "completions/mean_terminated_length": 476.3333435058594, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 0.7011785507202148, "epoch": 3.9047619047619047, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.001664389856159687, "learning_rate": 1.7748169075898727e-07, "loss": 0.0427, "num_tokens": 14482421.0, "reward": 0.2447916716337204, "reward_std": 0.235783189535141, "rewards/itbench_correctness/mean": 0.2447916716337204, "rewards/itbench_correctness/std": 0.3475692868232727, "step": 738, "step_time": 84.38055996689945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 416.9375, "completions/mean_terminated_length": 416.9375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.4149302840232849, "epoch": 3.91005291005291, "frac_reward_zero_std": 0.5, "grad_norm": 1.09375, "kl": 0.0016000322066247463, "learning_rate": 1.762199548587906e-07, "loss": -0.0046, "num_tokens": 14491636.0, "reward": 0.546875, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.546875, "rewards/itbench_correctness/std": 0.07739239931106567, "step": 739, "step_time": 123.19425270985812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 759.5, "completions/mean_terminated_length": 639.2727661132812, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.35286372900009155, "epoch": 3.9153439153439153, "frac_reward_zero_std": 0.5, "grad_norm": 1.84375, "kl": 0.0021646912209689617, "learning_rate": 1.7496175976529337e-07, "loss": -0.0035, "num_tokens": 14509100.0, "reward": 0.75, "reward_std": 0.1259881556034088, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.31031644344329834, "step": 740, "step_time": 139.75384074263275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 614.625, "completions/mean_terminated_length": 614.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5694529414176941, "epoch": 3.9206349206349205, "frac_reward_zero_std": 0.5, "grad_norm": 1.0546875, "kl": 0.0038674459792673588, "learning_rate": 1.7370711923791564e-07, "loss": 0.0097, "num_tokens": 14532734.0, "reward": 0.265625, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.265625, "rewards/itbench_correctness/std": 0.28090256452560425, "step": 741, "step_time": 372.90891189686954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 923.75, "completions/mean_terminated_length": 623.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5044655203819275, "epoch": 3.925925925925926, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.0016132418531924486, "learning_rate": 1.7245604699720535e-07, "loss": -0.069, "num_tokens": 14559434.0, "reward": 0.18854166567325592, "reward_std": 0.15936720371246338, "rewards/itbench_correctness/mean": 0.18854166567325592, "rewards/itbench_correctness/std": 0.15572041273117065, "step": 742, "step_time": 162.81722828093916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 870.875, "completions/mean_terminated_length": 819.8333740234375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "entropy": 0.4018946588039398, "epoch": 3.931216931216931, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.0014265145873650908, "learning_rate": 1.7120855672468776e-07, "loss": -0.0312, "num_tokens": 14580296.0, "reward": 0.5625, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 743, "step_time": 541.2288927352056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 771.0625, "completions/mean_terminated_length": 574.3333129882812, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.5732349753379822, "epoch": 3.9365079365079367, "frac_reward_zero_std": 0.5, "grad_norm": 1.4296875, "kl": 0.0019290586933493614, "learning_rate": 1.6996466206271675e-07, "loss": -0.0139, "num_tokens": 14620177.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 744, "step_time": 638.7111993785948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 472.5, "completions/mean_terminated_length": 472.5, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "entropy": 0.40634921193122864, "epoch": 3.941798941798942, "frac_reward_zero_std": 0.5, "grad_norm": 1.1875, "kl": 0.0012795755174010992, "learning_rate": 1.6872437661432516e-07, "loss": 0.0089, "num_tokens": 14630313.0, "reward": 0.734375, "reward_std": 0.15738674998283386, "rewards/itbench_correctness/mean": 0.734375, "rewards/itbench_correctness/std": 0.34856685996055603, "step": 745, "step_time": 94.9421982690692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 483.375, "completions/mean_terminated_length": 483.375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "entropy": 0.39513835310935974, "epoch": 3.947089947089947, "frac_reward_zero_std": 0.5, "grad_norm": 1.125, "kl": 0.004012894816696644, "learning_rate": 1.674877139430758e-07, "loss": -0.015, "num_tokens": 14646847.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 746, "step_time": 83.17668206058443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 630.6875, "completions/mean_terminated_length": 630.6875, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "entropy": 0.4281042516231537, "epoch": 3.9523809523809526, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.001534756040200591, "learning_rate": 1.6625468757291377e-07, "loss": -0.012, "num_tokens": 14660506.0, "reward": 0.640625, "reward_std": 0.27564918994903564, "rewards/itbench_correctness/mean": 0.640625, "rewards/itbench_correctness/std": 0.341183602809906, "step": 747, "step_time": 418.7796147307381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 585.625, "completions/mean_terminated_length": 439.5, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "entropy": 0.35346850752830505, "epoch": 3.9576719576719577, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0018646334065124393, "learning_rate": 1.6502531098801753e-07, "loss": -0.0214, "num_tokens": 14676340.0, "reward": 0.7291666269302368, "reward_std": 0.3471825420856476, "rewards/itbench_correctness/mean": 0.7291666269302368, "rewards/itbench_correctness/std": 0.3542075455188751, "step": 748, "step_time": 124.75608675274998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 735.3125, "completions/mean_terminated_length": 694.0714721679688, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "entropy": 0.5439863801002502, "epoch": 3.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.0013193426420912147, "learning_rate": 1.6379959763265266e-07, "loss": 0.028, "num_tokens": 14694473.0, "reward": 0.375, "reward_std": 0.249358132481575, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.273861289024353, "step": 749, "step_time": 466.49796204734594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 735.25, "completions/mean_terminated_length": 562.0, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "entropy": 0.4950697124004364, "epoch": 3.9682539682539684, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0012579681351780891, "learning_rate": 1.62577560911024e-07, "loss": -0.0039, "num_tokens": 14711301.0, "reward": 0.65625, "reward_std": 0.30173346400260925, "rewards/itbench_correctness/mean": 0.65625, "rewards/itbench_correctness/std": 0.375, "step": 750, "step_time": 988.6653963262215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 411.375, "completions/mean_terminated_length": 411.375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.47645092010498047, "epoch": 3.9735449735449735, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.002736019669100642, "learning_rate": 1.6135921418712955e-07, "loss": 0.0, "num_tokens": 14720819.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 751, "step_time": 143.75118728913367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 756.6875, "completions/mean_terminated_length": 548.7777709960938, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.5418353080749512, "epoch": 3.9788359788359786, "frac_reward_zero_std": 0.5, "grad_norm": 1.375, "kl": 0.0015679231146350503, "learning_rate": 1.601445707846135e-07, "loss": 0.0288, "num_tokens": 14743110.0, "reward": 0.3697916865348816, "reward_std": 0.014731383882462978, "rewards/itbench_correctness/mean": 0.3697916865348816, "rewards/itbench_correctness/std": 0.3824491500854492, "step": 752, "step_time": 114.25977344904095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 878.625, "completions/mean_terminated_length": 812.5454711914062, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "entropy": 0.29022619128227234, "epoch": 3.984126984126984, "frac_reward_zero_std": 1.0, "grad_norm": 0.041748046875, "kl": 0.001140591804869473, "learning_rate": 1.5893364398662174e-07, "loss": 0.0, "num_tokens": 14769432.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 753, "step_time": 683.4805310554802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 809.3125, "completions/mean_terminated_length": 680.5, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.2903698980808258, "epoch": 3.9894179894179893, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.0009075679117813706, "learning_rate": 1.5772644703565564e-07, "loss": -0.0137, "num_tokens": 14788877.0, "reward": 0.7447916865348816, "reward_std": 0.1927933692932129, "rewards/itbench_correctness/mean": 0.7447916865348816, "rewards/itbench_correctness/std": 0.2643453776836395, "step": 754, "step_time": 105.73342135362327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 823.0, "completions/mean_terminated_length": 666.6666870117188, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "entropy": 0.4835965931415558, "epoch": 3.9947089947089944, "frac_reward_zero_std": 0.5, "grad_norm": 1.453125, "kl": 0.001044503878802061, "learning_rate": 1.565229931334277e-07, "loss": 0.0002, "num_tokens": 14808949.0, "reward": 0.596875011920929, "reward_std": 0.041052017360925674, "rewards/itbench_correctness/mean": 0.596875011920929, "rewards/itbench_correctness/std": 0.4201066493988037, "step": 755, "step_time": 158.0458857798949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 702.8125, "completions/mean_terminated_length": 656.9285888671875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.26465097069740295, "epoch": 4.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.050537109375, "kl": 0.001511210692115128, "learning_rate": 1.553232954407171e-07, "loss": 0.0, "num_tokens": 14833258.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 756, "step_time": 233.77977779414505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 684.9375, "completions/mean_terminated_length": 684.9375, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "entropy": 0.49639564752578735, "epoch": 4.005291005291006, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0017367677064612508, "learning_rate": 1.5412736707722534e-07, "loss": -0.0237, "num_tokens": 14860073.0, "reward": 0.38749998807907104, "reward_std": 0.42026326060295105, "rewards/itbench_correctness/mean": 0.38749998807907104, "rewards/itbench_correctness/std": 0.49244290590286255, "step": 757, "step_time": 126.27328568976372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 696.6875, "completions/mean_terminated_length": 442.1111145019531, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5597918629646301, "epoch": 4.01058201058201, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.0017616557888686657, "learning_rate": 1.529352211214337e-07, "loss": -0.0553, "num_tokens": 14877180.0, "reward": 0.5062500238418579, "reward_std": 0.23287571966648102, "rewards/itbench_correctness/mean": 0.5062500238418579, "rewards/itbench_correctness/std": 0.33042481541633606, "step": 758, "step_time": 145.3167848372832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 701.1875, "completions/mean_terminated_length": 378.375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "entropy": 0.3793564438819885, "epoch": 4.015873015873016, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.0020792728755623102, "learning_rate": 1.517468706104589e-07, "loss": -0.0105, "num_tokens": 14894423.0, "reward": 0.75, "reward_std": 0.1315174549818039, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.2357022613286972, "step": 759, "step_time": 143.815074888058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 579.625, "completions/mean_terminated_length": 579.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4658184051513672, "epoch": 4.021164021164021, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.0024365338031202555, "learning_rate": 1.5056232853991208e-07, "loss": -0.0313, "num_tokens": 14908353.0, "reward": 0.7057291865348816, "reward_std": 0.188043013215065, "rewards/itbench_correctness/mean": 0.7057291865348816, "rewards/itbench_correctness/std": 0.325060099363327, "step": 760, "step_time": 169.37305662687868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 617.875, "completions/mean_terminated_length": 617.875, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "entropy": 0.2735181152820587, "epoch": 4.026455026455026, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0019248781027272344, "learning_rate": 1.493816078637557e-07, "loss": 0.0042, "num_tokens": 14922327.0, "reward": 0.5, "reward_std": 0.4629100561141968, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 761, "step_time": 152.33051012922078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 825.9375, "completions/mean_terminated_length": 707.1000366210938, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 0.5787363052368164, "epoch": 4.031746031746032, "frac_reward_zero_std": 0.5, "grad_norm": 1.828125, "kl": 0.0014200946316123009, "learning_rate": 1.4820472149416153e-07, "loss": 0.022, "num_tokens": 14957710.0, "reward": 0.09375, "reward_std": 0.03788072243332863, "rewards/itbench_correctness/mean": 0.09375, "rewards/itbench_correctness/std": 0.10978876054286957, "step": 762, "step_time": 102.25276782084256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 949.5625, "completions/mean_terminated_length": 875.125, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "entropy": 0.3391035199165344, "epoch": 4.037037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.0015424852026626468, "learning_rate": 1.470316823013707e-07, "loss": 0.0162, "num_tokens": 14984159.0, "reward": 0.4062500298023224, "reward_std": 0.2351749688386917, "rewards/itbench_correctness/mean": 0.4062500298023224, "rewards/itbench_correctness/std": 0.2916666865348816, "step": 763, "step_time": 115.0978871025145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 812.3125, "completions/mean_terminated_length": 782.0714721679688, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "entropy": 0.5022697448730469, "epoch": 4.042328042328043, "frac_reward_zero_std": 0.5, "grad_norm": 2.21875, "kl": 0.0012976002180948853, "learning_rate": 1.4586250311355132e-07, "loss": -0.0125, "num_tokens": 15003988.0, "reward": 0.15625, "reward_std": 0.18600594997406006, "rewards/itbench_correctness/mean": 0.15625, "rewards/itbench_correctness/std": 0.3010398745536804, "step": 764, "step_time": 219.78546244930476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 883.4375, "completions/mean_terminated_length": 742.875, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "entropy": 0.4867350459098816, "epoch": 4.0476190476190474, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.00221477122977376, "learning_rate": 1.4469719671666043e-07, "loss": -0.0099, "num_tokens": 15032371.0, "reward": 0.7265625, "reward_std": 0.3056884706020355, "rewards/itbench_correctness/mean": 0.7265625, "rewards/itbench_correctness/std": 0.3329750895500183, "step": 765, "step_time": 196.3956578373909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 634.0, "completions/mean_terminated_length": 634.0, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "entropy": 0.44164037704467773, "epoch": 4.052910052910053, "frac_reward_zero_std": 0.5, "grad_norm": 1.078125, "kl": 0.0014499113894999027, "learning_rate": 1.435357758543015e-07, "loss": -0.0066, "num_tokens": 15046147.0, "reward": 0.8051470518112183, "reward_std": 0.252979040145874, "rewards/itbench_correctness/mean": 0.8051470518112183, "rewards/itbench_correctness/std": 0.399953156709671, "step": 766, "step_time": 170.79877135157585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 731.25, "completions/mean_terminated_length": 438.5, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.2762393057346344, "epoch": 4.058201058201059, "frac_reward_zero_std": 1.0, "grad_norm": 0.033447265625, "kl": 0.0011394057655707002, "learning_rate": 1.4237825322758735e-07, "loss": 0.0, "num_tokens": 15063599.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 767, "step_time": 863.5087349172682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 764.25, "completions/mean_terminated_length": 608.4000244140625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.3663722574710846, "epoch": 4.063492063492063, "frac_reward_zero_std": 0.5, "grad_norm": 1.1953125, "kl": 0.0017420370131731033, "learning_rate": 1.412246414949997e-07, "loss": -0.0017, "num_tokens": 15080899.0, "reward": 0.234375, "reward_std": 0.1953546553850174, "rewards/itbench_correctness/mean": 0.234375, "rewards/itbench_correctness/std": 0.36032232642173767, "step": 768, "step_time": 277.51467712502927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 764.5625, "completions/mean_terminated_length": 704.6923217773438, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "entropy": 0.34791138768196106, "epoch": 4.068783068783069, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0010995686752721667, "learning_rate": 1.400749532722516e-07, "loss": 0.0227, "num_tokens": 15098204.0, "reward": 0.46875, "reward_std": 0.353828489780426, "rewards/itbench_correctness/mean": 0.46875, "rewards/itbench_correctness/std": 0.4181916415691376, "step": 769, "step_time": 572.5676989480853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 687.0625, "completions/mean_terminated_length": 350.125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.5006822347640991, "epoch": 4.074074074074074, "frac_reward_zero_std": 0.5, "grad_norm": 1.5703125, "kl": 0.0013974824687466025, "learning_rate": 1.389292011321498e-07, "loss": 0.0, "num_tokens": 15122029.0, "reward": 0.6875, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.3095695972442627, "step": 770, "step_time": 206.3093525590375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 523.125, "completions/mean_terminated_length": 523.125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.4262843430042267, "epoch": 4.079365079365079, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0012783248675987124, "learning_rate": 1.3778739760445552e-07, "loss": -0.0522, "num_tokens": 15132895.0, "reward": 0.31534090638160706, "reward_std": 0.3429919481277466, "rewards/itbench_correctness/mean": 0.31534090638160706, "rewards/itbench_correctness/std": 0.34952497482299805, "step": 771, "step_time": 136.4253909336403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 935.0, "completions/mean_terminated_length": 905.3333740234375, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "entropy": 0.498395711183548, "epoch": 4.084656084656085, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.001486019929870963, "learning_rate": 1.3664955517574967e-07, "loss": 0.0119, "num_tokens": 15163503.0, "reward": 0.4791666865348816, "reward_std": 0.4529353678226471, "rewards/itbench_correctness/mean": 0.4791666865348816, "rewards/itbench_correctness/std": 0.4549115002155304, "step": 772, "step_time": 149.56670145317912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 700.25, "completions/mean_terminated_length": 506.0, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.4455551505088806, "epoch": 4.08994708994709, "frac_reward_zero_std": 0.5, "grad_norm": 1.40625, "kl": 0.001568189705722034, "learning_rate": 1.3551568628929432e-07, "loss": 0.0036, "num_tokens": 15185307.0, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.273861289024353, "step": 773, "step_time": 821.3875108454376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 763.9375, "completions/mean_terminated_length": 763.9375, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "entropy": 0.44768059253692627, "epoch": 4.095238095238095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0322265625, "kl": 0.0012937538558617234, "learning_rate": 1.3438580334489818e-07, "loss": 0.0, "num_tokens": 15207562.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 774, "step_time": 537.4289036728442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 642.8125, "completions/mean_terminated_length": 617.4000244140625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.4698103964328766, "epoch": 4.1005291005291005, "frac_reward_zero_std": 0.5, "grad_norm": 1.0859375, "kl": 0.0019668727181851864, "learning_rate": 1.3325991869878012e-07, "loss": -0.0025, "num_tokens": 15221975.0, "reward": 0.15625, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.15625, "rewards/itbench_correctness/std": 0.23935678601264954, "step": 775, "step_time": 191.42062663193792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 593.375, "completions/mean_terminated_length": 593.375, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.434800922870636, "epoch": 4.105820105820106, "frac_reward_zero_std": 0.5, "grad_norm": 1.3359375, "kl": 0.0018295013578608632, "learning_rate": 1.321380446634342e-07, "loss": 0.0433, "num_tokens": 15235517.0, "reward": 0.75, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.30276504158973694, "step": 776, "step_time": 495.94140707794577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 683.5625, "completions/mean_terminated_length": 418.77777099609375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "entropy": 0.5295785069465637, "epoch": 4.111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.002165441866964102, "learning_rate": 1.3102019350749527e-07, "loss": 0.0164, "num_tokens": 15252526.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 777, "step_time": 161.9596445625648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 830.625, "completions/mean_terminated_length": 637.25, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "entropy": 0.4839729070663452, "epoch": 4.116402116402116, "frac_reward_zero_std": 0.5, "grad_norm": 1.4921875, "kl": 0.0012055831030011177, "learning_rate": 1.299063774556042e-07, "loss": 0.0, "num_tokens": 15273064.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.25, "step": 778, "step_time": 104.92770658805966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 831.1875, "completions/mean_terminated_length": 681.2222290039062, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "entropy": 0.42589667439460754, "epoch": 4.121693121693122, "frac_reward_zero_std": 0.5, "grad_norm": 1.203125, "kl": 0.001626756857149303, "learning_rate": 1.287966086882751e-07, "loss": 0.0145, "num_tokens": 15292003.0, "reward": 0.8660714626312256, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.8660714626312256, "rewards/itbench_correctness/std": 0.24169892072677612, "step": 779, "step_time": 1019.1612946912646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 657.125, "completions/mean_terminated_length": 657.125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.366749107837677, "epoch": 4.1269841269841265, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078125, "kl": 0.0010745770996436477, "learning_rate": 1.2769089934176126e-07, "loss": 0.0175, "num_tokens": 15308181.0, "reward": 0.9187500476837158, "reward_std": 0.0258774496614933, "rewards/itbench_correctness/mean": 0.9187500476837158, "rewards/itbench_correctness/std": 0.09105858951807022, "step": 780, "step_time": 172.30875083897263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 626.6875, "completions/mean_terminated_length": 600.2000122070312, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "entropy": 0.3909444510936737, "epoch": 4.132275132275132, "frac_reward_zero_std": 0.5, "grad_norm": 1.0703125, "kl": 0.0016821923200041056, "learning_rate": 1.2658926150792322e-07, "loss": 0.0125, "num_tokens": 15333040.0, "reward": 0.640625, "reward_std": 0.0867956355214119, "rewards/itbench_correctness/mean": 0.640625, "rewards/itbench_correctness/std": 0.3896446228027344, "step": 781, "step_time": 372.3970377044752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 615.75, "completions/mean_terminated_length": 588.5333862304688, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 0.43524158000946045, "epoch": 4.137566137566138, "frac_reward_zero_std": 0.5, "grad_norm": 1.5390625, "kl": 0.0014195973053574562, "learning_rate": 1.2549170723409547e-07, "loss": 0.0509, "num_tokens": 15354348.0, "reward": 0.04375000298023224, "reward_std": 0.0176776684820652, "rewards/itbench_correctness/mean": 0.04375000298023224, "rewards/itbench_correctness/std": 0.05123475566506386, "step": 782, "step_time": 1192.5468442188576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 385.75, "completions/mean_terminated_length": 385.75, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.3966299295425415, "epoch": 4.142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.0015992495464161038, "learning_rate": 1.243982485229559e-07, "loss": -0.0488, "num_tokens": 15367896.0, "reward": 0.6400861740112305, "reward_std": 0.4733222424983978, "rewards/itbench_correctness/mean": 0.6400861740112305, "rewards/itbench_correctness/std": 0.4832458794116974, "step": 783, "step_time": 120.91243299655616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 658.6875, "completions/mean_terminated_length": 658.6875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "entropy": 0.28237974643707275, "epoch": 4.148148148148148, "frac_reward_zero_std": 0.5, "grad_norm": 1.375, "kl": 0.0011543561704456806, "learning_rate": 1.2330889733239368e-07, "loss": -0.006, "num_tokens": 15385411.0, "reward": 0.359375, "reward_std": 0.04419417306780815, "rewards/itbench_correctness/mean": 0.359375, "rewards/itbench_correctness/std": 0.3760402202606201, "step": 784, "step_time": 991.2049660263583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 839.375, "completions/mean_terminated_length": 654.75, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.3037974536418915, "epoch": 4.1534391534391535, "frac_reward_zero_std": 1.0, "grad_norm": 0.032958984375, "kl": 0.0011951366905122995, "learning_rate": 1.222236655753791e-07, "loss": 0.0, "num_tokens": 15404521.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 785, "step_time": 491.68448298610747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 748.6875, "completions/mean_terminated_length": 685.1538696289062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.28850486874580383, "epoch": 4.158730158730159, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.0016689749900251627, "learning_rate": 1.2114256511983274e-07, "loss": -0.1494, "num_tokens": 15422436.0, "reward": 0.5833333730697632, "reward_std": 0.3327338695526123, "rewards/itbench_correctness/mean": 0.5833333730697632, "rewards/itbench_correctness/std": 0.3648312985897064, "step": 786, "step_time": 87.55616814736277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 728.3125, "completions/mean_terminated_length": 498.3333435058594, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.549214780330658, "epoch": 4.164021164021164, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0020776954479515553, "learning_rate": 1.2006560778849579e-07, "loss": -0.0303, "num_tokens": 15441321.0, "reward": 0.4000000059604645, "reward_std": 0.302165687084198, "rewards/itbench_correctness/mean": 0.4000000059604645, "rewards/itbench_correctness/std": 0.4898979663848877, "step": 787, "step_time": 132.10281661339104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 955.3125, "completions/mean_terminated_length": 474.5, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.3663722574710846, "epoch": 4.169312169312169, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.0013061100617051125, "learning_rate": 1.1899280535880119e-07, "loss": 0.0, "num_tokens": 15465854.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 788, "step_time": 161.56111018918455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 747.875, "completions/mean_terminated_length": 655.8333740234375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.26207587122917175, "epoch": 4.174603174603175, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.0017630105139687657, "learning_rate": 1.1792416956274443e-07, "loss": 0.0001, "num_tokens": 15485668.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 789, "step_time": 249.84946880768985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 826.875, "completions/mean_terminated_length": 673.5555419921875, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "entropy": 0.5635676383972168, "epoch": 4.1798941798941796, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0018597254529595375, "learning_rate": 1.1685971208675538e-07, "loss": -0.0061, "num_tokens": 15514546.0, "reward": 0.40625, "reward_std": 0.3471629321575165, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.4366062581539154, "step": 790, "step_time": 128.95786687266082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 678.5, "completions/mean_terminated_length": 678.5, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "entropy": 0.5630066394805908, "epoch": 4.185185185185185, "frac_reward_zero_std": 0.5, "grad_norm": 1.2265625, "kl": 0.0021489164792001247, "learning_rate": 1.1579944457157059e-07, "loss": -0.0285, "num_tokens": 15538698.0, "reward": 0.34375, "reward_std": 0.22903135418891907, "rewards/itbench_correctness/mean": 0.34375, "rewards/itbench_correctness/std": 0.4732423722743988, "step": 791, "step_time": 140.31763851176947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 693.5625, "completions/mean_terminated_length": 646.357177734375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "entropy": 0.48157158493995667, "epoch": 4.190476190476191, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.002031001029536128, "learning_rate": 1.1474337861210543e-07, "loss": 0.0427, "num_tokens": 15560955.0, "reward": 0.375, "reward_std": 0.4355512857437134, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.5, "step": 792, "step_time": 343.5579600026831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 522.625, "completions/mean_terminated_length": 522.625, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "entropy": 0.5089691281318665, "epoch": 4.195767195767195, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.00155110121704638, "learning_rate": 1.1369152575732821e-07, "loss": -0.0072, "num_tokens": 15572493.0, "reward": 0.7232142686843872, "reward_std": 0.3042290210723877, "rewards/itbench_correctness/mean": 0.7232142686843872, "rewards/itbench_correctness/std": 0.4347764849662781, "step": 793, "step_time": 130.16915812157094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 901.4375, "completions/mean_terminated_length": 845.727294921875, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "entropy": 0.4392983317375183, "epoch": 4.201058201058201, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0012185334926471114, "learning_rate": 1.1264389751013325e-07, "loss": 0.0073, "num_tokens": 15592028.0, "reward": 0.7326388955116272, "reward_std": 0.3273431062698364, "rewards/itbench_correctness/mean": 0.7326388955116272, "rewards/itbench_correctness/std": 0.33346834778785706, "step": 794, "step_time": 521.8481605676934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 563.3125, "completions/mean_terminated_length": 563.3125, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.39232221245765686, "epoch": 4.2063492063492065, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.0014798998599871993, "learning_rate": 1.1160050532721527e-07, "loss": 0.0313, "num_tokens": 15605457.0, "reward": 0.8125, "reward_std": 0.3657589256763458, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.35939764976501465, "step": 795, "step_time": 94.33376376517117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 926.625, "completions/mean_terminated_length": 882.3636474609375, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "entropy": 0.40145689249038696, "epoch": 4.211640211640212, "frac_reward_zero_std": 0.5, "grad_norm": 1.3671875, "kl": 0.0010374147677794099, "learning_rate": 1.1056136061894384e-07, "loss": -0.0077, "num_tokens": 15626107.0, "reward": 0.9632353186607361, "reward_std": 0.051545556634664536, "rewards/itbench_correctness/mean": 0.9632353186607361, "rewards/itbench_correctness/std": 0.08000864833593369, "step": 796, "step_time": 205.19225138891488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 572.9375, "completions/mean_terminated_length": 572.9375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "entropy": 0.45729246735572815, "epoch": 4.216931216931217, "frac_reward_zero_std": 0.5, "grad_norm": 1.375, "kl": 0.0012878067791461945, "learning_rate": 1.095264747492391e-07, "loss": 0.0172, "num_tokens": 15639266.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.25, "step": 797, "step_time": 114.62835809681565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 729.6875, "completions/mean_terminated_length": 710.0667114257812, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "entropy": 0.3563169240951538, "epoch": 4.222222222222222, "frac_reward_zero_std": 0.5, "grad_norm": 1.171875, "kl": 0.0015947025967761874, "learning_rate": 1.0849585903544706e-07, "loss": -0.0083, "num_tokens": 15655869.0, "reward": 0.4285714328289032, "reward_std": 0.19342948496341705, "rewards/itbench_correctness/mean": 0.4285714328289032, "rewards/itbench_correctness/std": 0.27437829971313477, "step": 798, "step_time": 143.3468729155138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 741.5, "completions/mean_terminated_length": 572.0, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "entropy": 0.5232636332511902, "epoch": 4.227513227513228, "frac_reward_zero_std": 0.5, "grad_norm": 1.28125, "kl": 0.002720025833696127, "learning_rate": 1.0746952474821613e-07, "loss": -0.0039, "num_tokens": 15678173.0, "reward": 0.3645833432674408, "reward_std": 0.01928791031241417, "rewards/itbench_correctness/mean": 0.3645833432674408, "rewards/itbench_correctness/std": 0.3774610757827759, "step": 799, "step_time": 129.29889920540154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 640.9375, "completions/mean_terminated_length": 411.1000061035156, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.37913212180137634, "epoch": 4.232804232804233, "frac_reward_zero_std": 0.5, "grad_norm": 1.015625, "kl": 0.0011215686099603772, "learning_rate": 1.0644748311137375e-07, "loss": 0.0096, "num_tokens": 15702932.0, "reward": 0.21875, "reward_std": 0.0883883461356163, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.2561737895011902, "step": 800, "step_time": 164.42878744658083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 705.5, "completions/mean_terminated_length": 684.2667236328125, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "entropy": 0.3798724412918091, "epoch": 4.238095238095238, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0012407073518261313, "learning_rate": 1.0542974530180327e-07, "loss": -0.0115, "num_tokens": 15719212.0, "reward": 0.905024528503418, "reward_std": 0.16057346761226654, "rewards/itbench_correctness/mean": 0.905024528503418, "rewards/itbench_correctness/std": 0.17238253355026245, "step": 801, "step_time": 167.65833072923124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 873.875, "completions/mean_terminated_length": 783.7999877929688, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "entropy": 0.5355457067489624, "epoch": 4.243386243386244, "frac_reward_zero_std": 0.5, "grad_norm": 1.4296875, "kl": 0.001486996770836413, "learning_rate": 1.0441632244932235e-07, "loss": 0.0187, "num_tokens": 15744658.0, "reward": 0.2916666865348816, "reward_std": 0.18722420930862427, "rewards/itbench_correctness/mean": 0.2916666865348816, "rewards/itbench_correctness/std": 0.395187109708786, "step": 802, "step_time": 429.37837726902217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.466796875, "epoch": 4.248677248677248, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0013433246640488505, "learning_rate": 1.0340722563656107e-07, "loss": 0.0001, "num_tokens": 15774058.0, "reward": 0.5989583730697632, "reward_std": 0.2046467512845993, "rewards/itbench_correctness/mean": 0.5989583730697632, "rewards/itbench_correctness/std": 0.3842606544494629, "step": 803, "step_time": 261.3252537054941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 631.0625, "completions/mean_terminated_length": 631.0625, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "entropy": 0.4817272424697876, "epoch": 4.253968253968254, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0022742159198969603, "learning_rate": 1.0240246589884045e-07, "loss": 0.0008, "num_tokens": 15797947.0, "reward": 0.40833333134651184, "reward_std": 0.37586042284965515, "rewards/itbench_correctness/mean": 0.40833333134651184, "rewards/itbench_correctness/std": 0.392994225025177, "step": 804, "step_time": 447.9019009033218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 759.8125, "completions/mean_terminated_length": 759.8125, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "entropy": 0.5580323934555054, "epoch": 4.2592592592592595, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005859375, "kl": 0.002108614193275571, "learning_rate": 1.0140205422405212e-07, "loss": 0.0001, "num_tokens": 15830432.0, "reward": 0.05000000074505806, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.05000000074505806, "rewards/itbench_correctness/std": 0.05163978040218353, "step": 805, "step_time": 109.95679971016943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 816.0, "completions/mean_terminated_length": 608.0, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "entropy": 0.44117647409439087, "epoch": 4.264550264550264, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.0013343130704015493, "learning_rate": 1.0040600155253764e-07, "loss": -0.0138, "num_tokens": 15850416.0, "reward": 0.9583333730697632, "reward_std": 0.11785111576318741, "rewards/itbench_correctness/mean": 0.9583333730697632, "rewards/itbench_correctness/std": 0.1666666567325592, "step": 806, "step_time": 525.2538241520524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 573.125, "completions/mean_terminated_length": 573.125, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "entropy": 0.3297709822654724, "epoch": 4.26984126984127, "frac_reward_zero_std": 0.5, "grad_norm": 1.125, "kl": 0.0016436699079349637, "learning_rate": 9.941431877696954e-08, "loss": -0.0137, "num_tokens": 15863410.0, "reward": 0.828125, "reward_std": 0.22097086906433105, "rewards/itbench_correctness/mean": 0.828125, "rewards/itbench_correctness/std": 0.3502231538295746, "step": 807, "step_time": 648.7502203145996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 924.8125, "completions/mean_terminated_length": 825.625, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "entropy": 0.27573156356811523, "epoch": 4.275132275132275, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.0010158391669392586, "learning_rate": 9.842701674223187e-08, "loss": 0.0033, "num_tokens": 15890943.0, "reward": 0.7604166865348816, "reward_std": 0.12147815525531769, "rewards/itbench_correctness/mean": 0.7604166865348816, "rewards/itbench_correctness/std": 0.2979482412338257, "step": 808, "step_time": 365.2914238469675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 530.375, "completions/mean_terminated_length": 530.375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.41480085253715515, "epoch": 4.28042328042328, "frac_reward_zero_std": 0.5, "grad_norm": 1.0703125, "kl": 0.0014939939137548208, "learning_rate": 9.744410624530147e-08, "loss": 0.009, "num_tokens": 15902429.0, "reward": 0.6875, "reward_std": 0.03857584670186043, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.28463754057884216, "step": 809, "step_time": 69.47060746885836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 745.8125, "completions/mean_terminated_length": 467.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4344255328178406, "epoch": 4.285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0032775227446109056, "learning_rate": 9.646559803512993e-08, "loss": -0.0804, "num_tokens": 15922018.0, "reward": 0.6015625, "reward_std": 0.24306795001029968, "rewards/itbench_correctness/mean": 0.6015625, "rewards/itbench_correctness/std": 0.3824775218963623, "step": 810, "step_time": 242.85561118088663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 685.3125, "completions/mean_terminated_length": 685.3125, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "entropy": 0.3764705955982208, "epoch": 4.291005291005291, "frac_reward_zero_std": 0.5, "grad_norm": 1.359375, "kl": 0.001442611450329423, "learning_rate": 9.549150281252632e-08, "loss": -0.0122, "num_tokens": 15938535.0, "reward": 0.8571428656578064, "reward_std": 0.11921755969524384, "rewards/itbench_correctness/mean": 0.8571428656578064, "rewards/itbench_correctness/std": 0.21977105736732483, "step": 811, "step_time": 366.22836083732545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 713.875, "completions/mean_terminated_length": 713.875, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.36981263756752014, "epoch": 4.296296296296296, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.0017484528943896294, "learning_rate": 9.452183123003999e-08, "loss": 0.0069, "num_tokens": 15954229.0, "reward": 0.6102941036224365, "reward_std": 0.3698710799217224, "rewards/itbench_correctness/mean": 0.6102941036224365, "rewards/itbench_correctness/std": 0.455075204372406, "step": 812, "step_time": 313.4723123824224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 814.5, "completions/mean_terminated_length": 766.1538696289062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.586863100528717, "epoch": 4.301587301587301, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0016130534932017326, "learning_rate": 9.355659389184394e-08, "loss": -0.0401, "num_tokens": 15987341.0, "reward": 0.22499999403953552, "reward_std": 0.28192007541656494, "rewards/itbench_correctness/mean": 0.22499999403953552, "rewards/itbench_correctness/std": 0.3872983455657959, "step": 813, "step_time": 213.50641488097608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 672.9375, "completions/mean_terminated_length": 321.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.3982539176940918, "epoch": 4.306878306878307, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.0019140745280310512, "learning_rate": 9.259580135361927e-08, "loss": 0.0014, "num_tokens": 16004804.0, "reward": 0.53125, "reward_std": 0.23289713263511658, "rewards/itbench_correctness/mean": 0.53125, "rewards/itbench_correctness/std": 0.3326033651828766, "step": 814, "step_time": 924.7196069033816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 823.25, "completions/mean_terminated_length": 667.1111450195312, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 0.590343177318573, "epoch": 4.3121693121693125, "frac_reward_zero_std": 0.5, "grad_norm": 1.484375, "kl": 0.0014230167726054788, "learning_rate": 9.163946412243895e-08, "loss": -0.0126, "num_tokens": 16059928.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 815, "step_time": 286.4796100119129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 703.25, "completions/mean_terminated_length": 681.86669921875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 0.27444010972976685, "epoch": 4.317460317460317, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.0018459860002622008, "learning_rate": 9.068759265665382e-08, "loss": 0.0001, "num_tokens": 16078292.0, "reward": 0.25, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 816, "step_time": 321.2267580414191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 578.3125, "completions/mean_terminated_length": 578.3125, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "entropy": 0.4253755509853363, "epoch": 4.322751322751323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0595703125, "kl": 0.001751541974954307, "learning_rate": 8.974019736577775e-08, "loss": 0.0, "num_tokens": 16090817.0, "reward": 0.8333333730697632, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.8333333730697632, "rewards/itbench_correctness/std": 0.17213258147239685, "step": 817, "step_time": 207.34377425536513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 811.9375, "completions/mean_terminated_length": 684.7000122070312, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.330074667930603, "epoch": 4.328042328042328, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.00094667385565117, "learning_rate": 8.879728861037383e-08, "loss": 0.0025, "num_tokens": 16109496.0, "reward": 0.8690475821495056, "reward_std": 0.13734711706638336, "rewards/itbench_correctness/mean": 0.8690475821495056, "rewards/itbench_correctness/std": 0.15356682240962982, "step": 818, "step_time": 157.14436247292906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 744.25, "completions/mean_terminated_length": 651.0, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "entropy": 0.3815922141075134, "epoch": 4.333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.0016716917743906379, "learning_rate": 8.785887670194136e-08, "loss": -0.0363, "num_tokens": 16129908.0, "reward": 0.7552083730697632, "reward_std": 0.12075783312320709, "rewards/itbench_correctness/mean": 0.7552083730697632, "rewards/itbench_correctness/std": 0.16796371340751648, "step": 819, "step_time": 186.5824106996879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 825.875, "completions/mean_terminated_length": 671.7777709960938, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "entropy": 0.5860450863838196, "epoch": 4.338624338624339, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0019729058258235455, "learning_rate": 8.692497190280224e-08, "loss": -0.0117, "num_tokens": 16162090.0, "reward": 0.171875, "reward_std": 0.3820367455482483, "rewards/itbench_correctness/mean": 0.171875, "rewards/itbench_correctness/std": 0.37325987219810486, "step": 820, "step_time": 353.5038594137877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 909.6875, "completions/mean_terminated_length": 795.375, "completions/min_length": 625.0, "completions/min_terminated_length": 625.0, "entropy": 0.6024046540260315, "epoch": 4.343915343915344, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.001822230638936162, "learning_rate": 8.599558442598998e-08, "loss": 0.035, "num_tokens": 16191157.0, "reward": 0.8125, "reward_std": 0.3458075523376465, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.3403429687023163, "step": 821, "step_time": 505.8641969123855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.46897411346435547, "epoch": 4.349206349206349, "frac_reward_zero_std": 0.5, "grad_norm": 0.515625, "kl": 0.002614083932712674, "learning_rate": 8.507072443513702e-08, "loss": -0.0603, "num_tokens": 16206291.0, "reward": 0.171875, "reward_std": 0.07281029224395752, "rewards/itbench_correctness/mean": 0.171875, "rewards/itbench_correctness/std": 0.20348526537418365, "step": 822, "step_time": 195.5613472936675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 786.8125, "completions/mean_terminated_length": 752.9285888671875, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "entropy": 0.4295813739299774, "epoch": 4.354497354497354, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.0021104791667312384, "learning_rate": 8.415040204436425e-08, "loss": 0.0155, "num_tokens": 16229032.0, "reward": 0.8999999761581421, "reward_std": 0.09258200973272324, "rewards/itbench_correctness/mean": 0.8999999761581421, "rewards/itbench_correctness/std": 0.1632993221282959, "step": 823, "step_time": 379.2500517424196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 892.75, "completions/mean_terminated_length": 499.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5667880177497864, "epoch": 4.35978835978836, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.0018997631268575788, "learning_rate": 8.32346273181696e-08, "loss": -0.0691, "num_tokens": 16254500.0, "reward": 0.125, "reward_std": 0.1674824357032776, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.17320507764816284, "step": 824, "step_time": 547.5418346459046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 736.0625, "completions/mean_terminated_length": 512.1111450195312, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.41572555899620056, "epoch": 4.365079365079365, "frac_reward_zero_std": 1.0, "grad_norm": 0.040283203125, "kl": 0.0017400278011336923, "learning_rate": 8.232341027131883e-08, "loss": 0.0001, "num_tokens": 16277821.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 825, "step_time": 1151.8339996775612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 460.5, "completions/mean_terminated_length": 460.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.41910967230796814, "epoch": 4.37037037037037, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0028239088132977486, "learning_rate": 8.141676086873573e-08, "loss": -0.0546, "num_tokens": 16288837.0, "reward": 0.375, "reward_std": 0.19918900728225708, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.20412415266036987, "step": 826, "step_time": 428.2450467739254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 784.5, "completions/mean_terminated_length": 598.2222290039062, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.4869343638420105, "epoch": 4.375661375661376, "frac_reward_zero_std": 0.5, "grad_norm": 1.515625, "kl": 0.001279774820432067, "learning_rate": 8.051468902539271e-08, "loss": 0.0081, "num_tokens": 16312141.0, "reward": 0.75, "reward_std": 0.2182178944349289, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.394405335187912, "step": 827, "step_time": 134.5602146498859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 912.3125, "completions/mean_terminated_length": 825.4444580078125, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "entropy": 0.5195587873458862, "epoch": 4.380952380952381, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.002217257162556052, "learning_rate": 7.961720460620319e-08, "loss": 0.0265, "num_tokens": 16336338.0, "reward": 0.5885416269302368, "reward_std": 0.38563019037246704, "rewards/itbench_correctness/mean": 0.5885416269302368, "rewards/itbench_correctness/std": 0.4767450988292694, "step": 828, "step_time": 96.03808457683772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 416.9375, "completions/mean_terminated_length": 416.9375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.3237895369529724, "epoch": 4.386243386243386, "frac_reward_zero_std": 0.5, "grad_norm": 1.109375, "kl": 0.001336383749730885, "learning_rate": 7.872431742591267e-08, "loss": -0.009, "num_tokens": 16345745.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.25, "step": 829, "step_time": 79.61548331100494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 697.0625, "completions/mean_terminated_length": 588.0833740234375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "entropy": 0.39594727754592896, "epoch": 4.391534391534392, "frac_reward_zero_std": 0.5, "grad_norm": 1.2890625, "kl": 0.00119964184705168, "learning_rate": 7.783603724899257e-08, "loss": 0.0261, "num_tokens": 16370170.0, "reward": 0.25, "reward_std": 0.26726123690605164, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.44721361994743347, "step": 830, "step_time": 94.27793037891388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 753.0625, "completions/mean_terminated_length": 735.0000610351562, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "entropy": 0.33463358879089355, "epoch": 4.396825396825397, "frac_reward_zero_std": 0.5, "grad_norm": 1.25, "kl": 0.0012139711761847138, "learning_rate": 7.695237378953224e-08, "loss": -0.0273, "num_tokens": 16387731.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 831, "step_time": 357.8091874551028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 880.25, "completions/mean_terminated_length": 794.0, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "entropy": 0.48168134689331055, "epoch": 4.402116402116402, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.0016746832989156246, "learning_rate": 7.607333671113408e-08, "loss": 0.0418, "num_tokens": 16408031.0, "reward": 0.375, "reward_std": 0.20927216112613678, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.4249182939529419, "step": 832, "step_time": 128.6728245029226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 810.9375, "completions/mean_terminated_length": 780.5000610351562, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "entropy": 0.3551445007324219, "epoch": 4.407407407407407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.0015545395435765386, "learning_rate": 7.519893562680663e-08, "loss": 0.0, "num_tokens": 16426366.0, "reward": 0.0833333358168602, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0833333358168602, "rewards/itbench_correctness/std": 0.08606629818677902, "step": 833, "step_time": 679.0917965397239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 399.3125, "completions/mean_terminated_length": 399.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.3430896997451782, "epoch": 4.412698412698413, "frac_reward_zero_std": 0.5, "grad_norm": 1.0546875, "kl": 0.0014030004385858774, "learning_rate": 7.432918009885996e-08, "loss": 0.0148, "num_tokens": 16436691.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 834, "step_time": 1008.2919538905844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 666.6875, "completions/mean_terminated_length": 642.86669921875, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "entropy": 0.4679853618144989, "epoch": 4.417989417989418, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.001401036512106657, "learning_rate": 7.346407963880136e-08, "loss": 0.0403, "num_tokens": 16456902.0, "reward": 0.5125000476837158, "reward_std": 0.1552647352218628, "rewards/itbench_correctness/mean": 0.5125000476837158, "rewards/itbench_correctness/std": 0.38100746273994446, "step": 835, "step_time": 458.4377150340006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 636.4375, "completions/mean_terminated_length": 507.25, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.5530786514282227, "epoch": 4.423280423280423, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.0014363399241119623, "learning_rate": 7.260364370723043e-08, "loss": -0.003, "num_tokens": 16482653.0, "reward": 0.3046875, "reward_std": 0.19887377321720123, "rewards/itbench_correctness/mean": 0.3046875, "rewards/itbench_correctness/std": 0.3060798943042755, "step": 836, "step_time": 170.9447146616876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 629.3125, "completions/mean_terminated_length": 572.9285888671875, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.45764225721359253, "epoch": 4.428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.0012081885943189263, "learning_rate": 7.17478817137373e-08, "loss": 0.0681, "num_tokens": 16500194.0, "reward": 0.84375, "reward_std": 0.2893187999725342, "rewards/itbench_correctness/mean": 0.84375, "rewards/itbench_correctness/std": 0.286865234375, "step": 837, "step_time": 777.2624794654548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 558.375, "completions/mean_terminated_length": 558.375, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.40116408467292786, "epoch": 4.4338624338624335, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.002113268943503499, "learning_rate": 7.089680301679751e-08, "loss": -0.0104, "num_tokens": 16512536.0, "reward": 0.90625, "reward_std": 0.2346404492855072, "rewards/itbench_correctness/mean": 0.90625, "rewards/itbench_correctness/std": 0.2561737895011902, "step": 838, "step_time": 86.38017075136304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 955.75, "completions/mean_terminated_length": 660.0, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.31388962268829346, "epoch": 4.439153439153439, "frac_reward_zero_std": 1.0, "grad_norm": 0.037109375, "kl": 0.001120888045988977, "learning_rate": 7.005041692367153e-08, "loss": 0.0, "num_tokens": 16542196.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 839, "step_time": 386.5617507044226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 622.1875, "completions/mean_terminated_length": 622.1875, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "entropy": 0.40020090341567993, "epoch": 4.444444444444445, "frac_reward_zero_std": 0.5, "grad_norm": 1.1875, "kl": 0.0012403958244249225, "learning_rate": 6.92087326903022e-08, "loss": 0.0021, "num_tokens": 16556511.0, "reward": 0.4296875, "reward_std": 0.17499202489852905, "rewards/itbench_correctness/mean": 0.4296875, "rewards/itbench_correctness/std": 0.5040848851203918, "step": 840, "step_time": 135.079236516729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 765.9375, "completions/mean_terminated_length": 611.1000366210938, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.5144022703170776, "epoch": 4.449735449735449, "frac_reward_zero_std": 0.5, "grad_norm": 1.765625, "kl": 0.0017959108809009194, "learning_rate": 6.837175952121304e-08, "loss": 0.0412, "num_tokens": 16577070.0, "reward": 0.2395833432674408, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.2395833432674408, "rewards/itbench_correctness/std": 0.19214914739131927, "step": 841, "step_time": 980.2466245274991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 794.0, "completions/mean_terminated_length": 656.0, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "entropy": 0.6423173546791077, "epoch": 4.455026455026455, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.001680102082900703, "learning_rate": 6.753950656940905e-08, "loss": 0.0577, "num_tokens": 16599126.0, "reward": 0.25, "reward_std": 0.1462520956993103, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.24152295291423798, "step": 842, "step_time": 89.48983163572848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 873.75, "completions/mean_terminated_length": 756.888916015625, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "entropy": 0.4738197326660156, "epoch": 4.4603174603174605, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0010829598177224398, "learning_rate": 6.671198293627479e-08, "loss": 0.0027, "num_tokens": 16620106.0, "reward": 0.4750000238418579, "reward_std": 0.19992218911647797, "rewards/itbench_correctness/mean": 0.4750000238418579, "rewards/itbench_correctness/std": 0.4358898997306824, "step": 843, "step_time": 68.92372180242091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 608.5, "completions/mean_terminated_length": 608.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4272801876068115, "epoch": 4.465608465608465, "frac_reward_zero_std": 0.5, "grad_norm": 0.55078125, "kl": 0.0015739505179226398, "learning_rate": 6.588919767147638e-08, "loss": -0.0799, "num_tokens": 16633634.0, "reward": 0.8125, "reward_std": 0.1298656165599823, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.2626432776451111, "step": 844, "step_time": 799.5955674275756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 709.375, "completions/mean_terminated_length": 636.7692260742188, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.38625550270080566, "epoch": 4.470899470899471, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078125, "kl": 0.0017984964651986957, "learning_rate": 6.507115977286143e-08, "loss": 0.0032, "num_tokens": 16650712.0, "reward": 0.421875, "reward_std": 0.09300297498703003, "rewards/itbench_correctness/mean": 0.421875, "rewards/itbench_correctness/std": 0.4538607597351074, "step": 845, "step_time": 264.149626750499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 482.4375, "completions/mean_terminated_length": 482.4375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.4415079653263092, "epoch": 4.476190476190476, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.001339221140369773, "learning_rate": 6.42578781863613e-08, "loss": 0.0102, "num_tokens": 16661415.0, "reward": 0.484375, "reward_std": 0.16849708557128906, "rewards/itbench_correctness/mean": 0.484375, "rewards/itbench_correctness/std": 0.17001838982105255, "step": 846, "step_time": 347.23770444560796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 730.5625, "completions/mean_terminated_length": 554.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.533835232257843, "epoch": 4.481481481481482, "frac_reward_zero_std": 0.5, "grad_norm": 1.5390625, "kl": 0.0018466972978785634, "learning_rate": 6.34493618058935e-08, "loss": -0.0754, "num_tokens": 16700320.0, "reward": 0.03125, "reward_std": 0.043129101395606995, "rewards/itbench_correctness/mean": 0.03125, "rewards/itbench_correctness/std": 0.06718548387289047, "step": 847, "step_time": 213.22028856538236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 455.5, "completions/mean_terminated_length": 455.5, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.40175631642341614, "epoch": 4.4867724867724865, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0014747100649401546, "learning_rate": 6.26456194732633e-08, "loss": -0.0161, "num_tokens": 16710784.0, "reward": 0.4895833432674408, "reward_std": 0.21374498307704926, "rewards/itbench_correctness/mean": 0.4895833432674408, "rewards/itbench_correctness/std": 0.20983901619911194, "step": 848, "step_time": 55.42429989017546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 442.25, "completions/mean_terminated_length": 442.25, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.35048049688339233, "epoch": 4.492063492063492, "frac_reward_zero_std": 0.5, "grad_norm": 1.015625, "kl": 0.003505163826048374, "learning_rate": 6.184665997806831e-08, "loss": 0.0185, "num_tokens": 16721036.0, "reward": 0.4943181872367859, "reward_std": 0.016070598736405373, "rewards/itbench_correctness/mean": 0.4943181872367859, "rewards/itbench_correctness/std": 0.5110015273094177, "step": 849, "step_time": 1021.3690116815269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 910.4375, "completions/mean_terminated_length": 660.6000366210938, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "entropy": 0.4964646100997925, "epoch": 4.497354497354498, "frac_reward_zero_std": 0.5, "grad_norm": 8.875, "kl": 0.0017396226758137345, "learning_rate": 6.105249205760127e-08, "loss": 0.0387, "num_tokens": 16749483.0, "reward": 0.4375, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.48733973503112793, "step": 850, "step_time": 576.2763332147151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 593.75, "completions/mean_terminated_length": 593.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3183158040046692, "epoch": 4.502645502645502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0289306640625, "kl": 0.001586488215252757, "learning_rate": 6.026312439675551e-08, "loss": 0.0, "num_tokens": 16763647.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 851, "step_time": 732.8523011729121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 584.375, "completions/mean_terminated_length": 555.0667114257812, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "entropy": 0.5373262166976929, "epoch": 4.507936507936508, "frac_reward_zero_std": 0.5, "grad_norm": 1.1953125, "kl": 0.0018168711103498936, "learning_rate": 5.9478565627929244e-08, "loss": 0.0319, "num_tokens": 16794397.0, "reward": 0.0729166716337204, "reward_std": 0.0294627845287323, "rewards/itbench_correctness/mean": 0.0729166716337204, "rewards/itbench_correctness/std": 0.08539126068353653, "step": 852, "step_time": 84.26783776376396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 935.625, "completions/mean_terminated_length": 741.2000122070312, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "entropy": 0.5600534677505493, "epoch": 4.5132275132275135, "frac_reward_zero_std": 1.0, "grad_norm": 0.034912109375, "kl": 0.0015273833414539695, "learning_rate": 5.869882433093154e-08, "loss": 0.0001, "num_tokens": 16832975.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 853, "step_time": 153.46809213608503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 617.25, "completions/mean_terminated_length": 481.66668701171875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "entropy": 0.37748077511787415, "epoch": 4.518518518518518, "frac_reward_zero_std": 0.5, "grad_norm": 2.90625, "kl": 0.0018203725339844823, "learning_rate": 5.7923909032888295e-08, "loss": -0.0292, "num_tokens": 16855027.0, "reward": 0.2395833432674408, "reward_std": 0.13684004545211792, "rewards/itbench_correctness/mean": 0.2395833432674408, "rewards/itbench_correctness/std": 0.31012991070747375, "step": 854, "step_time": 189.47853012941778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 370.75, "completions/mean_terminated_length": 370.75, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.34794336557388306, "epoch": 4.523809523809524, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.002147156512364745, "learning_rate": 5.7153828208148846e-08, "loss": -0.0062, "num_tokens": 16863943.0, "reward": 0.53125, "reward_std": 0.405046284198761, "rewards/itbench_correctness/mean": 0.53125, "rewards/itbench_correctness/std": 0.4905354380607605, "step": 855, "step_time": 1156.329422229901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 743.25, "completions/mean_terminated_length": 615.6363525390625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.44399595260620117, "epoch": 4.529100529100529, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0015257069608196616, "learning_rate": 5.638859027819409e-08, "loss": -0.0599, "num_tokens": 16885899.0, "reward": 0.3812499940395355, "reward_std": 0.3087776303291321, "rewards/itbench_correctness/mean": 0.3812499940395355, "rewards/itbench_correctness/std": 0.46219584345817566, "step": 856, "step_time": 461.6809099484235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 481.375, "completions/mean_terminated_length": 481.375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.4964944124221802, "epoch": 4.534391534391535, "frac_reward_zero_std": 0.5, "grad_norm": 1.203125, "kl": 0.0020852303132414818, "learning_rate": 5.562820361154313e-08, "loss": -0.0059, "num_tokens": 16896417.0, "reward": 0.359375, "reward_std": 0.031000997871160507, "rewards/itbench_correctness/mean": 0.359375, "rewards/itbench_correctness/std": 0.3735698163509369, "step": 857, "step_time": 93.96854640357196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 994.9375, "completions/mean_terminated_length": 869.0, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "entropy": 0.4140963554382324, "epoch": 4.5396825396825395, "frac_reward_zero_std": 0.5, "grad_norm": 1.4296875, "kl": 0.001920070848427713, "learning_rate": 5.48726765236629e-08, "loss": 0.0154, "num_tokens": 16921448.0, "reward": 0.2678571343421936, "reward_std": 0.04959750175476074, "rewards/itbench_correctness/mean": 0.2678571343421936, "rewards/itbench_correctness/std": 0.12975645065307617, "step": 858, "step_time": 6701.7141972742975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 783.1875, "completions/mean_terminated_length": 595.888916015625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "entropy": 0.43923071026802063, "epoch": 4.544973544973545, "frac_reward_zero_std": 0.5, "grad_norm": 1.140625, "kl": 0.001265035243704915, "learning_rate": 5.412201727687643e-08, "loss": -0.0124, "num_tokens": 16938763.0, "reward": 0.8556547164916992, "reward_std": 0.053405825048685074, "rewards/itbench_correctness/mean": 0.8556547164916992, "rewards/itbench_correctness/std": 0.07298243790864944, "step": 859, "step_time": 1027.743779040873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 712.5625, "completions/mean_terminated_length": 401.125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "entropy": 0.5192527174949646, "epoch": 4.550264550264551, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.0014884049305692315, "learning_rate": 5.337623408027292e-08, "loss": -0.0094, "num_tokens": 16957572.0, "reward": 0.6171875, "reward_std": 0.37874263525009155, "rewards/itbench_correctness/mean": 0.6171875, "rewards/itbench_correctness/std": 0.3991364538669586, "step": 860, "step_time": 262.18378533329815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 984.5625, "completions/mean_terminated_length": 708.5, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "entropy": 0.5484669804573059, "epoch": 4.555555555555555, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.0018317234935238957, "learning_rate": 5.263533508961826e-08, "loss": 0.0379, "num_tokens": 16981197.0, "reward": 0.265625, "reward_std": 0.39435434341430664, "rewards/itbench_correctness/mean": 0.265625, "rewards/itbench_correctness/std": 0.4096280336380005, "step": 861, "step_time": 77.7364522125572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 457.0625, "completions/mean_terminated_length": 457.0625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.345685750246048, "epoch": 4.560846560846561, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.002134555485099554, "learning_rate": 5.1899328407264855e-08, "loss": -0.087, "num_tokens": 16993086.0, "reward": 0.3812499940395355, "reward_std": 0.318040132522583, "rewards/itbench_correctness/mean": 0.3812499940395355, "rewards/itbench_correctness/std": 0.4069705307483673, "step": 862, "step_time": 827.9984636185691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 547.8125, "completions/mean_terminated_length": 547.8125, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "entropy": 0.46731317043304443, "epoch": 4.5661375661375665, "frac_reward_zero_std": 0.5, "grad_norm": 1.234375, "kl": 0.0013622003607451916, "learning_rate": 5.116822208206395e-08, "loss": -0.0166, "num_tokens": 17005059.0, "reward": 0.5588235259056091, "reward_std": 0.16637806594371796, "rewards/itbench_correctness/mean": 0.5588235259056091, "rewards/itbench_correctness/std": 0.5092002749443054, "step": 863, "step_time": 190.58081929571927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 952.5, "completions/mean_terminated_length": 452.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.2813648283481598, "epoch": 4.571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 1.421875, "kl": 0.001091470243409276, "learning_rate": 5.044202410927706e-08, "loss": -0.0038, "num_tokens": 17038027.0, "reward": 0.048076923936605453, "reward_std": 0.13598206639289856, "rewards/itbench_correctness/mean": 0.048076923936605453, "rewards/itbench_correctness/std": 0.192307710647583, "step": 864, "step_time": 116.58771913684905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 526.375, "completions/mean_terminated_length": 526.375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "entropy": 0.391355961561203, "epoch": 4.576719576719577, "frac_reward_zero_std": 0.5, "grad_norm": 1.03125, "kl": 0.001298400922678411, "learning_rate": 4.972074243048896e-08, "loss": -0.0148, "num_tokens": 17049241.0, "reward": 0.71875, "reward_std": 0.18196186423301697, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.3823356628417969, "step": 865, "step_time": 239.0914654675871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 552.75, "completions/mean_terminated_length": 552.75, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "entropy": 0.510176420211792, "epoch": 4.582010582010582, "frac_reward_zero_std": 0.5, "grad_norm": 1.3125, "kl": 0.0015060057630762458, "learning_rate": 4.9004384933520547e-08, "loss": -0.0102, "num_tokens": 17061309.0, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 866, "step_time": 120.828508451581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 870.375, "completions/mean_terminated_length": 750.888916015625, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.25850924849510193, "epoch": 4.587301587301587, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.001840650918893516, "learning_rate": 4.829295945234257e-08, "loss": 0.01, "num_tokens": 17083635.0, "reward": 0.6875, "reward_std": 0.12400396168231964, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.28463754057884216, "step": 867, "step_time": 78.40537928510457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 641.875, "completions/mean_terminated_length": 641.875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 0.5577409863471985, "epoch": 4.592592592592593, "frac_reward_zero_std": 0.5, "grad_norm": 1.5, "kl": 0.0019760173745453358, "learning_rate": 4.758647376699032e-08, "loss": -0.0254, "num_tokens": 17104393.0, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.5, "step": 868, "step_time": 90.65404016617686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 561.0, "completions/mean_terminated_length": 561.0, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.35650622844696045, "epoch": 4.597883597883598, "frac_reward_zero_std": 0.5, "grad_norm": 1.40625, "kl": 0.0018372276099398732, "learning_rate": 4.6884935603477724e-08, "loss": -0.0127, "num_tokens": 17119761.0, "reward": 0.3125, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.3095695972442627, "step": 869, "step_time": 567.832233437337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 735.75, "completions/mean_terminated_length": 562.7999877929688, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4512402415275574, "epoch": 4.603174603174603, "frac_reward_zero_std": 0.5, "grad_norm": 1.3125, "kl": 0.0015587429516017437, "learning_rate": 4.6188352633713956e-08, "loss": -0.0225, "num_tokens": 17156341.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 870, "step_time": 147.64252108428627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 688.875, "completions/mean_terminated_length": 666.5333862304688, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.4325893521308899, "epoch": 4.608465608465608, "frac_reward_zero_std": 0.5, "grad_norm": 1.171875, "kl": 0.0013199535897001624, "learning_rate": 4.549673247541874e-08, "loss": -0.0139, "num_tokens": 17174155.0, "reward": 0.40625, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.4552929699420929, "step": 871, "step_time": 94.71705105807632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 609.0625, "completions/mean_terminated_length": 420.4545593261719, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.36613649129867554, "epoch": 4.613756613756614, "frac_reward_zero_std": 0.5, "grad_norm": 0.71484375, "kl": 0.001597036374732852, "learning_rate": 4.48100826920394e-08, "loss": -0.1975, "num_tokens": 17194796.0, "reward": 0.0625, "reward_std": 0.03857583925127983, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.0833333358168602, "step": 872, "step_time": 804.2430580342188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "entropy": 0.296875, "epoch": 4.619047619047619, "frac_reward_zero_std": 0.5, "grad_norm": 1.4765625, "kl": 0.001162711065262556, "learning_rate": 4.412841079266777e-08, "loss": 0.0, "num_tokens": 17221364.0, "reward": 0.2708333432674408, "reward_std": 0.19795581698417664, "rewards/itbench_correctness/mean": 0.2708333432674408, "rewards/itbench_correctness/std": 0.3890872597694397, "step": 873, "step_time": 150.55902750603855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 616.125, "completions/mean_terminated_length": 616.125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.5778048038482666, "epoch": 4.624338624338624, "frac_reward_zero_std": 0.5, "grad_norm": 1.09375, "kl": 0.002010586205869913, "learning_rate": 4.3451724231958645e-08, "loss": 0.0149, "num_tokens": 17239318.0, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.5, "step": 874, "step_time": 94.01774641126394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 537.5, "completions/mean_terminated_length": 537.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.26232558488845825, "epoch": 4.62962962962963, "frac_reward_zero_std": 0.5, "grad_norm": 1.109375, "kl": 0.0017294755671173334, "learning_rate": 4.2780030410047796e-08, "loss": -0.0713, "num_tokens": 17252630.0, "reward": 0.375, "reward_std": 0.08908706903457642, "rewards/itbench_correctness/mean": 0.375, "rewards/itbench_correctness/std": 0.17743021249771118, "step": 875, "step_time": 71.65096860099584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 792.8125, "completions/mean_terminated_length": 715.75, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "entropy": 0.47930628061294556, "epoch": 4.634920634920634, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0011707143858075142, "learning_rate": 4.2113336672471245e-08, "loss": 0.0018, "num_tokens": 17273627.0, "reward": 0.84375, "reward_std": 0.2088201940059662, "rewards/itbench_correctness/mean": 0.84375, "rewards/itbench_correctness/std": 0.24757154285907745, "step": 876, "step_time": 72.07463994249701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 585.375, "completions/mean_terminated_length": 585.375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "entropy": 0.5500747561454773, "epoch": 4.64021164021164, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.0019222473492845893, "learning_rate": 4.145165031008507e-08, "loss": 0.0357, "num_tokens": 17291033.0, "reward": 0.800000011920929, "reward_std": 0.2777460217475891, "rewards/itbench_correctness/mean": 0.800000011920929, "rewards/itbench_correctness/std": 0.3265986442565918, "step": 877, "step_time": 86.38187370076776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 519.375, "completions/mean_terminated_length": 402.923095703125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.379302054643631, "epoch": 4.645502645502646, "frac_reward_zero_std": 0.5, "grad_norm": 1.546875, "kl": 0.00152775296010077, "learning_rate": 4.0794978558985e-08, "loss": 0.0501, "num_tokens": 17305767.0, "reward": 0.2395833432674408, "reward_std": 0.10386862605810165, "rewards/itbench_correctness/mean": 0.2395833432674408, "rewards/itbench_correctness/std": 0.2852468192577362, "step": 878, "step_time": 181.77928131632507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 672.125, "completions/mean_terminated_length": 554.8333740234375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.5118095874786377, "epoch": 4.650793650793651, "frac_reward_zero_std": 0.5, "grad_norm": 1.65625, "kl": 0.0015323893167078495, "learning_rate": 4.0143328600428294e-08, "loss": 0.0114, "num_tokens": 17320513.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 879, "step_time": 106.95045015309006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 721.0625, "completions/mean_terminated_length": 539.2999877929688, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "entropy": 0.3689000606536865, "epoch": 4.656084656084656, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0019987330306321383, "learning_rate": 3.949670756075446e-08, "loss": 0.0123, "num_tokens": 17337194.0, "reward": 0.5625, "reward_std": 0.3098883032798767, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.4466957151889801, "step": 880, "step_time": 1010.3906643372029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 663.1875, "completions/mean_terminated_length": 302.375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5669588446617126, "epoch": 4.661375661375661, "frac_reward_zero_std": 0.5, "grad_norm": 1.40625, "kl": 0.0028189942240715027, "learning_rate": 3.8855122511307626e-08, "loss": 0.0001, "num_tokens": 17354189.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 881, "step_time": 113.85438270866871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 442.6875, "completions/mean_terminated_length": 442.6875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.4585627615451813, "epoch": 4.666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.0014935840154066682, "learning_rate": 3.821858046835913e-08, "loss": 0.0196, "num_tokens": 17363480.0, "reward": 0.5653409361839294, "reward_std": 0.1779802441596985, "rewards/itbench_correctness/mean": 0.5653409361839294, "rewards/itbench_correctness/std": 0.3175182640552521, "step": 882, "step_time": 61.071511584334075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 825.4375, "completions/mean_terminated_length": 779.6154174804688, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "entropy": 0.32225334644317627, "epoch": 4.671957671957672, "frac_reward_zero_std": 0.5, "grad_norm": 1.4453125, "kl": 0.0013399338349699974, "learning_rate": 3.75870883930306e-08, "loss": 0.0614, "num_tokens": 17384615.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 883, "step_time": 223.48245067708194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 802.8125, "completions/mean_terminated_length": 581.625, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "entropy": 0.5904242992401123, "epoch": 4.677248677248677, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0015930193476378918, "learning_rate": 3.6960653191218324e-08, "loss": -0.0112, "num_tokens": 17405748.0, "reward": 0.8020833134651184, "reward_std": 0.25074294209480286, "rewards/itbench_correctness/mean": 0.8020833134651184, "rewards/itbench_correctness/std": 0.32185086607933044, "step": 884, "step_time": 80.26837155316025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 697.0, "completions/mean_terminated_length": 548.3636474609375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "entropy": 0.42180773615837097, "epoch": 4.682539682539683, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0014244532212615013, "learning_rate": 3.63392817135173e-08, "loss": 0.0199, "num_tokens": 17421772.0, "reward": 0.3482142686843872, "reward_std": 0.14384004473686218, "rewards/itbench_correctness/mean": 0.3482142686843872, "rewards/itbench_correctness/std": 0.2632541060447693, "step": 885, "step_time": 135.67724260222167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 652.875, "completions/mean_terminated_length": 652.875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3706682026386261, "epoch": 4.6878306878306875, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.001711677061393857, "learning_rate": 3.572298075514652e-08, "loss": -0.118, "num_tokens": 17441162.0, "reward": 0.5, "reward_std": 0.3535533845424652, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 886, "step_time": 319.866651549004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 697.0, "completions/mean_terminated_length": 675.2000122070312, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.4418938159942627, "epoch": 4.693121693121693, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.0013350816443562508, "learning_rate": 3.5111757055874326e-08, "loss": 0.0371, "num_tokens": 17455858.0, "reward": 0.9236111640930176, "reward_std": 0.21606040000915527, "rewards/itbench_correctness/mean": 0.9236111640930176, "rewards/itbench_correctness/std": 0.20971761643886566, "step": 887, "step_time": 798.9065483696759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 778.75, "completions/mean_terminated_length": 722.1538696289062, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "entropy": 0.4057784974575043, "epoch": 4.698412698412699, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.0015928231878206134, "learning_rate": 3.450561729994533e-08, "loss": 0.0253, "num_tokens": 17493110.0, "reward": 0.5625, "reward_std": 0.5260357856750488, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 888, "step_time": 112.42098965961486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 633.0, "completions/mean_terminated_length": 633.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.32543444633483887, "epoch": 4.703703703703704, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0019525340758264065, "learning_rate": 3.390456811600673e-08, "loss": -0.145, "num_tokens": 17514366.0, "reward": 0.4375, "reward_std": 0.23927490413188934, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.30276504158973694, "step": 889, "step_time": 95.3800596492365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 672.375, "completions/mean_terminated_length": 672.375, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.5919315814971924, "epoch": 4.708994708994709, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.002438145689666271, "learning_rate": 3.330861607703611e-08, "loss": 0.0001, "num_tokens": 17529404.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 890, "step_time": 194.32226402964443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 910.875, "completions/mean_terminated_length": 797.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6191848516464233, "epoch": 4.714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.002367243403568864, "learning_rate": 3.271776770026963e-08, "loss": 0.0057, "num_tokens": 17554098.0, "reward": 0.1015625, "reward_std": 0.21758441627025604, "rewards/itbench_correctness/mean": 0.1015625, "rewards/itbench_correctness/std": 0.2550275921821594, "step": 891, "step_time": 93.79412244167179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 419.75, "completions/mean_terminated_length": 419.75, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.38832637667655945, "epoch": 4.71957671957672, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0022835221607238054, "learning_rate": 3.213202944713023e-08, "loss": 0.0096, "num_tokens": 17563774.0, "reward": 0.4837239682674408, "reward_std": 0.10496115684509277, "rewards/itbench_correctness/mean": 0.4837239682674408, "rewards/itbench_correctness/std": 0.12295603007078171, "step": 892, "step_time": 44.90997119899839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 433.8125, "completions/mean_terminated_length": 433.8125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.4886903762817383, "epoch": 4.724867724867725, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0014157530386000872, "learning_rate": 3.155140772315773e-08, "loss": 0.0198, "num_tokens": 17573195.0, "reward": 0.4437499940395355, "reward_std": 0.11511446535587311, "rewards/itbench_correctness/mean": 0.4437499940395355, "rewards/itbench_correctness/std": 0.1263263076543808, "step": 893, "step_time": 62.78798679355532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 990.5, "completions/mean_terminated_length": 890.0, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "entropy": 0.43210500478744507, "epoch": 4.73015873015873, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0013504911912605166, "learning_rate": 3.097590887793827e-08, "loss": 0.0194, "num_tokens": 17598411.0, "reward": 0.6432291865348816, "reward_std": 0.2726758122444153, "rewards/itbench_correctness/mean": 0.6432291865348816, "rewards/itbench_correctness/std": 0.4446098804473877, "step": 894, "step_time": 102.70882797706872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 671.25, "completions/mean_terminated_length": 553.6666870117188, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.3515828549861908, "epoch": 4.735449735449736, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.0012724708067253232, "learning_rate": 3.040553920503502e-08, "loss": 0.0152, "num_tokens": 17614191.0, "reward": 0.3125, "reward_std": 0.2335786372423172, "rewards/itbench_correctness/mean": 0.3125, "rewards/itbench_correctness/std": 0.23471811413764954, "step": 895, "step_time": 131.4134486299008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 748.6875, "completions/mean_terminated_length": 473.375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "entropy": 0.44878536462783813, "epoch": 4.7407407407407405, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0016055708983913064, "learning_rate": 2.9840304941919416e-08, "loss": -0.0042, "num_tokens": 17634162.0, "reward": 0.625, "reward_std": 0.31586384773254395, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.40138646960258484, "step": 896, "step_time": 121.16828937549144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 698.125, "completions/mean_terminated_length": 651.5714721679688, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.4755595326423645, "epoch": 4.746031746031746, "frac_reward_zero_std": 0.5, "grad_norm": 6.84375, "kl": 0.0011872922768816352, "learning_rate": 2.9280212269902628e-08, "loss": 0.0145, "num_tokens": 17650676.0, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.8125, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 897, "step_time": 88.3342649359256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 678.5, "completions/mean_terminated_length": 333.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.38319823145866394, "epoch": 4.751322751322752, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0016045935917645693, "learning_rate": 2.872526731406849e-08, "loss": -0.0215, "num_tokens": 17670076.0, "reward": 0.5572916865348816, "reward_std": 0.398481547832489, "rewards/itbench_correctness/mean": 0.5572916865348816, "rewards/itbench_correctness/std": 0.38934746384620667, "step": 898, "step_time": 123.173085459508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 656.1875, "completions/mean_terminated_length": 571.3077392578125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 0.55167156457901, "epoch": 4.756613756613756, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0021755106281489134, "learning_rate": 2.8175476143206145e-08, "loss": 0.0149, "num_tokens": 17684807.0, "reward": 0.778124988079071, "reward_std": 0.3087267279624939, "rewards/itbench_correctness/mean": 0.778124988079071, "rewards/itbench_correctness/std": 0.3087441027164459, "step": 899, "step_time": 81.62243656814098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 475.625, "completions/mean_terminated_length": 475.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3910643756389618, "epoch": 4.761904761904762, "frac_reward_zero_std": 0.5, "grad_norm": 0.4921875, "kl": 0.0022366566117852926, "learning_rate": 2.7630844769743756e-08, "loss": -0.0554, "num_tokens": 17695441.0, "reward": 0.6812499761581421, "reward_std": 0.0752970278263092, "rewards/itbench_correctness/mean": 0.6812499761581421, "rewards/itbench_correctness/std": 0.3449033796787262, "step": 900, "step_time": 96.98080993723124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 706.625, "completions/mean_terminated_length": 661.2857666015625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5575800538063049, "epoch": 4.767195767195767, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.0022240527905523777, "learning_rate": 2.7091379149682682e-08, "loss": -0.0212, "num_tokens": 17720731.0, "reward": 0.484375, "reward_std": 0.46940183639526367, "rewards/itbench_correctness/mean": 0.484375, "rewards/itbench_correctness/std": 0.4696519374847412, "step": 901, "step_time": 119.61820879764855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 772.5625, "completions/mean_terminated_length": 577.0, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "entropy": 0.5074023008346558, "epoch": 4.772486772486772, "frac_reward_zero_std": 0.0, "grad_norm": 5.0625, "kl": 0.0013676214730367064, "learning_rate": 2.655708518253258e-08, "loss": -0.0031, "num_tokens": 17737724.0, "reward": 0.5364583134651184, "reward_std": 0.3415539562702179, "rewards/itbench_correctness/mean": 0.5364583134651184, "rewards/itbench_correctness/std": 0.4584280252456665, "step": 902, "step_time": 71.03902994468808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 556.0625, "completions/mean_terminated_length": 556.0625, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.467573344707489, "epoch": 4.777777777777778, "frac_reward_zero_std": 0.5, "grad_norm": 1.4375, "kl": 0.0027739950455725193, "learning_rate": 2.6027968711246627e-08, "loss": -0.0162, "num_tokens": 17756685.0, "reward": 0.6875, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.4787135720252991, "step": 903, "step_time": 73.05204797629267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 432.375, "completions/mean_terminated_length": 432.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.33304423093795776, "epoch": 4.783068783068783, "frac_reward_zero_std": 1.0, "grad_norm": 0.040771484375, "kl": 0.0020940680988132954, "learning_rate": 2.550403552215785e-08, "loss": 0.0, "num_tokens": 17766963.0, "reward": 0.75, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 904, "step_time": 75.88487505353987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 543.375, "completions/mean_terminated_length": 543.375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "entropy": 0.36438924074172974, "epoch": 4.788359788359788, "frac_reward_zero_std": 0.5, "grad_norm": 1.0625, "kl": 0.0024348387960344553, "learning_rate": 2.4985291344915673e-08, "loss": 0.0049, "num_tokens": 17780065.0, "reward": 0.22499999403953552, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.22499999403953552, "rewards/itbench_correctness/std": 0.29552215337753296, "step": 905, "step_time": 420.1394767453894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 911.4375, "completions/mean_terminated_length": 766.7142944335938, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "entropy": 0.6890214681625366, "epoch": 4.7936507936507935, "frac_reward_zero_std": 0.5, "grad_norm": 1.65625, "kl": 0.00153902149759233, "learning_rate": 2.4471741852423233e-08, "loss": 0.0055, "num_tokens": 17825752.0, "reward": 0.03125, "reward_std": 0.0578637570142746, "rewards/itbench_correctness/mean": 0.03125, "rewards/itbench_correctness/std": 0.08539126068353653, "step": 906, "step_time": 132.87724316772074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 648.3125, "completions/mean_terminated_length": 648.3125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4966740608215332, "epoch": 4.798941798941799, "frac_reward_zero_std": 0.5, "grad_norm": 0.4765625, "kl": 0.0030572270043194294, "learning_rate": 2.396339266077557e-08, "loss": -0.1071, "num_tokens": 17839621.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.25, "step": 907, "step_time": 503.1076301559806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 917.625, "completions/mean_terminated_length": 598.5, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 0.3029560148715973, "epoch": 4.804232804232804, "frac_reward_zero_std": 0.5, "grad_norm": 1.0234375, "kl": 0.0009918678551912308, "learning_rate": 2.3460249329197823e-08, "loss": -0.0133, "num_tokens": 17868991.0, "reward": 0.3541666865348816, "reward_std": 0.058925561606884, "rewards/itbench_correctness/mean": 0.3541666865348816, "rewards/itbench_correctness/std": 0.37453675270080566, "step": 908, "step_time": 243.62568031344563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 767.5625, "completions/mean_terminated_length": 750.4667358398438, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "entropy": 0.2605651021003723, "epoch": 4.809523809523809, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.001077577588148415, "learning_rate": 2.2962317359985107e-08, "loss": -0.0185, "num_tokens": 17888648.0, "reward": 0.3333333432674408, "reward_std": 0.2766174077987671, "rewards/itbench_correctness/mean": 0.3333333432674408, "rewards/itbench_correctness/std": 0.3162277638912201, "step": 909, "step_time": 128.46637521497905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 711.375, "completions/mean_terminated_length": 690.5333862304688, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "entropy": 0.4948163628578186, "epoch": 4.814814814814815, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.002276236657053232, "learning_rate": 2.2469602198441573e-08, "loss": 0.0748, "num_tokens": 17906702.0, "reward": 0.5833333134651184, "reward_std": 0.19287919998168945, "rewards/itbench_correctness/mean": 0.5833333134651184, "rewards/itbench_correctness/std": 0.28706690669059753, "step": 910, "step_time": 451.41445366758853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 479.5, "completions/mean_terminated_length": 479.5, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.31699687242507935, "epoch": 4.8201058201058204, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.001171560725197196, "learning_rate": 2.1982109232821176e-08, "loss": -0.0032, "num_tokens": 17917550.0, "reward": 0.9453125, "reward_std": 0.08679073303937912, "rewards/itbench_correctness/mean": 0.9453125, "rewards/itbench_correctness/std": 0.10174263268709183, "step": 911, "step_time": 838.8527243016288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 508.5625, "completions/mean_terminated_length": 508.5625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5309081673622131, "epoch": 4.825396825396825, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.0025923862121999264, "learning_rate": 2.1499843794269058e-08, "loss": -0.1172, "num_tokens": 17931191.0, "reward": 0.5625, "reward_std": 0.4082317352294922, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 912, "step_time": 216.96005523204803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 761.25, "completions/mean_terminated_length": 673.6666870117188, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.47027915716171265, "epoch": 4.830687830687831, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.0013219509273767471, "learning_rate": 2.1022811156762576e-08, "loss": 0.0432, "num_tokens": 17947115.0, "reward": 0.25, "reward_std": 0.30284827947616577, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.3259601294994354, "step": 913, "step_time": 100.75746689084917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 631.75, "completions/mean_terminated_length": 396.3999938964844, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3593193590641022, "epoch": 4.835978835978836, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.002075113356113434, "learning_rate": 2.055101653705449e-08, "loss": -0.0584, "num_tokens": 17963415.0, "reward": 0.671875, "reward_std": 0.4432469606399536, "rewards/itbench_correctness/mean": 0.671875, "rewards/itbench_correctness/std": 0.4718646705150604, "step": 914, "step_time": 378.09901642706245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 865.3125, "completions/mean_terminated_length": 741.888916015625, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "entropy": 0.40216684341430664, "epoch": 4.841269841269841, "frac_reward_zero_std": 0.5, "grad_norm": 1.2265625, "kl": 0.0013008101377636194, "learning_rate": 2.008446509461498e-08, "loss": -0.0024, "num_tokens": 17983796.0, "reward": 0.8459821343421936, "reward_std": 0.10143714398145676, "rewards/itbench_correctness/mean": 0.8459821343421936, "rewards/itbench_correctness/std": 0.21097390353679657, "step": 915, "step_time": 85.97270075790584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 609.9375, "completions/mean_terminated_length": 287.8888854980469, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5607131719589233, "epoch": 4.8465608465608465, "frac_reward_zero_std": 0.5, "grad_norm": 1.3125, "kl": 0.0020648783538490534, "learning_rate": 1.9623161931575926e-08, "loss": -0.0784, "num_tokens": 18000307.0, "reward": 0.171875, "reward_std": 0.1367267221212387, "rewards/itbench_correctness/mean": 0.171875, "rewards/itbench_correctness/std": 0.2576940953731537, "step": 916, "step_time": 217.30383673589677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 639.0625, "completions/mean_terminated_length": 639.0625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.4193643033504486, "epoch": 4.851851851851852, "frac_reward_zero_std": 1.0, "grad_norm": 0.048095703125, "kl": 0.0019990454893559217, "learning_rate": 1.9167112092674796e-08, "loss": 0.0, "num_tokens": 18025980.0, "reward": 0.05000000074505806, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.05000000074505806, "rewards/itbench_correctness/std": 0.05163978040218353, "step": 917, "step_time": 113.52076997049153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 583.625, "completions/mean_terminated_length": 583.625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.24159349501132965, "epoch": 4.857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0012394418008625507, "learning_rate": 1.8716320565199618e-08, "loss": 0.0073, "num_tokens": 18040222.0, "reward": 0.4027777910232544, "reward_std": 0.11368955671787262, "rewards/itbench_correctness/mean": 0.4027777910232544, "rewards/itbench_correctness/std": 0.17033012211322784, "step": 918, "step_time": 70.6566086569801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 733.75, "completions/mean_terminated_length": 559.6000366210938, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "entropy": 0.5805792212486267, "epoch": 4.862433862433862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0245361328125, "kl": 0.0014999855775386095, "learning_rate": 1.82707922789343e-08, "loss": 0.0, "num_tokens": 18068698.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 919, "step_time": 638.5085713258013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 725.3125, "completions/mean_terminated_length": 725.3125, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "entropy": 0.6038776636123657, "epoch": 4.867724867724868, "frac_reward_zero_std": 1.0, "grad_norm": 0.046142578125, "kl": 0.0026757661253213882, "learning_rate": 1.7830532106104746e-08, "loss": 0.0001, "num_tokens": 18099127.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 920, "step_time": 105.40813992917538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 492.125, "completions/mean_terminated_length": 492.125, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "entropy": 0.4124968349933624, "epoch": 4.8730158730158735, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.0012788712047040462, "learning_rate": 1.7395544861325718e-08, "loss": -0.0021, "num_tokens": 18109953.0, "reward": 0.20928031206130981, "reward_std": 0.10235221683979034, "rewards/itbench_correctness/mean": 0.20928031206130981, "rewards/itbench_correctness/std": 0.1258237361907959, "step": 921, "step_time": 53.33936434518546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 424.125, "completions/mean_terminated_length": 424.125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.38903623819351196, "epoch": 4.878306878306878, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.0015015568351373076, "learning_rate": 1.6965835301547936e-08, "loss": -0.0055, "num_tokens": 18119051.0, "reward": 0.5625, "reward_std": 0.2177756428718567, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.25, "step": 922, "step_time": 53.09215545654297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 631.75, "completions/mean_terminated_length": 631.75, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "entropy": 0.42105263471603394, "epoch": 4.883597883597884, "frac_reward_zero_std": 0.5, "grad_norm": 1.4765625, "kl": 0.0011097542010247707, "learning_rate": 1.654140812600646e-08, "loss": 0.0024, "num_tokens": 18132919.0, "reward": 0.8697916865348816, "reward_std": 0.014731383882462978, "rewards/itbench_correctness/mean": 0.8697916865348816, "rewards/itbench_correctness/std": 0.1359764039516449, "step": 923, "step_time": 800.9228875609115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 391.125, "completions/mean_terminated_length": 391.125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.3630552887916565, "epoch": 4.888888888888889, "frac_reward_zero_std": 0.5, "grad_norm": 0.96875, "kl": 0.0026786988601088524, "learning_rate": 1.612226797616878e-08, "loss": -0.0025, "num_tokens": 18141865.0, "reward": 0.71875, "reward_std": 0.0578637570142746, "rewards/itbench_correctness/mean": 0.71875, "rewards/itbench_correctness/std": 0.23935678601264954, "step": 924, "step_time": 811.8248612135649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 992.0, "completions/mean_terminated_length": 921.6000366210938, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "entropy": 0.44556450843811035, "epoch": 4.894179894179894, "frac_reward_zero_std": 0.0, "grad_norm": 3.796875, "kl": 0.0014187112683430314, "learning_rate": 1.570841943568446e-08, "loss": -0.0001, "num_tokens": 18173001.0, "reward": 0.30000001192092896, "reward_std": 0.32691311836242676, "rewards/itbench_correctness/mean": 0.30000001192092896, "rewards/itbench_correctness/std": 0.37372004985809326, "step": 925, "step_time": 140.93405285663903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 780.3125, "completions/mean_terminated_length": 590.7777709960938, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "entropy": 0.5049259066581726, "epoch": 4.8994708994708995, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0018744791159406304, "learning_rate": 1.5299867030334813e-08, "loss": 0.0123, "num_tokens": 18191094.0, "reward": 0.6193181872367859, "reward_std": 0.19284729659557343, "rewards/itbench_correctness/mean": 0.6193181872367859, "rewards/itbench_correctness/std": 0.4520004689693451, "step": 926, "step_time": 850.4311718912795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 744.375, "completions/mean_terminated_length": 576.6000366210938, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "entropy": 0.30495381355285645, "epoch": 4.904761904761905, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.0011643210891634226, "learning_rate": 1.4896615227983466e-08, "loss": 0.0384, "num_tokens": 18211444.0, "reward": 0.1875, "reward_std": 0.2700308561325073, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.32702362537384033, "step": 927, "step_time": 924.7460502795875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 743.0625, "completions/mean_terminated_length": 524.5555419921875, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.2853057384490967, "epoch": 4.91005291005291, "frac_reward_zero_std": 1.0, "grad_norm": 0.026123046875, "kl": 0.0013434689026325941, "learning_rate": 1.4498668438527595e-08, "loss": 0.0, "num_tokens": 18229109.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 928, "step_time": 146.66280045732856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 827.8125, "completions/mean_terminated_length": 675.2222290039062, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "entropy": 0.5242733359336853, "epoch": 4.915343915343915, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0013374233385547996, "learning_rate": 1.4106031013849496e-08, "loss": -0.0194, "num_tokens": 18248834.0, "reward": 0.40312498807907104, "reward_std": 0.18545761704444885, "rewards/itbench_correctness/mean": 0.40312498807907104, "rewards/itbench_correctness/std": 0.21328286826610565, "step": 929, "step_time": 373.5206086365506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 554.125, "completions/mean_terminated_length": 554.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5053011775016785, "epoch": 4.920634920634921, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.002205699449405074, "learning_rate": 1.3718707247769134e-08, "loss": -0.0714, "num_tokens": 18262628.0, "reward": 0.4375, "reward_std": 0.22226819396018982, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.32221025228500366, "step": 930, "step_time": 99.51419737841934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 370.1875, "completions/mean_terminated_length": 370.1875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.35117340087890625, "epoch": 4.925925925925926, "frac_reward_zero_std": 0.5, "grad_norm": 0.40625, "kl": 0.00277713593095541, "learning_rate": 1.3336701375997127e-08, "loss": -0.0628, "num_tokens": 18275143.0, "reward": 0.6875, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.3095695972442627, "step": 931, "step_time": 64.9049273962155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 748.5, "completions/mean_terminated_length": 473.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.3607214391231537, "epoch": 4.931216931216931, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0014514221111312509, "learning_rate": 1.2960017576088444e-08, "loss": -0.0523, "num_tokens": 18299639.0, "reward": 0.4375, "reward_std": 0.290380597114563, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.45896419882774353, "step": 932, "step_time": 117.33534361980855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 515.25, "completions/mean_terminated_length": 515.25, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "entropy": 0.3842794895172119, "epoch": 4.936507936507937, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0029691443778574467, "learning_rate": 1.2588659967396997e-08, "loss": -0.0036, "num_tokens": 18311107.0, "reward": 0.8636363744735718, "reward_std": 0.09819302707910538, "rewards/itbench_correctness/mean": 0.8636363744735718, "rewards/itbench_correctness/std": 0.1603485643863678, "step": 933, "step_time": 64.65582219231874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 692.875, "completions/mean_terminated_length": 692.875, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "entropy": 0.3896806836128235, "epoch": 4.941798941798941, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.001258630771189928, "learning_rate": 1.2222632611029848e-08, "loss": 0.0216, "num_tokens": 18326641.0, "reward": 0.545036792755127, "reward_std": 0.15080596506595612, "rewards/itbench_correctness/mean": 0.545036792755127, "rewards/itbench_correctness/std": 0.3676183223724365, "step": 934, "step_time": 169.38152172323316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 851.6875, "completions/mean_terminated_length": 717.6666870117188, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5119248628616333, "epoch": 4.947089947089947, "frac_reward_zero_std": 0.5, "grad_norm": 1.265625, "kl": 0.0015083615435287356, "learning_rate": 1.1861939509803686e-08, "loss": 0.0377, "num_tokens": 18350692.0, "reward": 0.0390625, "reward_std": 0.05725783854722977, "rewards/itbench_correctness/mean": 0.0390625, "rewards/itbench_correctness/std": 0.08801929652690887, "step": 935, "step_time": 103.84523029625416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 659.0625, "completions/mean_terminated_length": 659.0625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "entropy": 0.2715979218482971, "epoch": 4.9523809523809526, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.000976603594608605, "learning_rate": 1.1506584608200364e-08, "loss": -0.0178, "num_tokens": 18366501.0, "reward": 0.6177083253860474, "reward_std": 0.17081069946289062, "rewards/itbench_correctness/mean": 0.6177083253860474, "rewards/itbench_correctness/std": 0.21085968613624573, "step": 936, "step_time": 92.50707028061152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 699.0625, "completions/mean_terminated_length": 504.1000061035156, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4091193675994873, "epoch": 4.957671957671957, "frac_reward_zero_std": 0.5, "grad_norm": 4.21875, "kl": 0.002065515611320734, "learning_rate": 1.115657179232421e-08, "loss": -0.065, "num_tokens": 18387390.0, "reward": 0.1875, "reward_std": 0.10681165754795074, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.14751020073890686, "step": 937, "step_time": 594.838815539144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 817.4375, "completions/mean_terminated_length": 787.9285888671875, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "entropy": 0.4061472713947296, "epoch": 4.962962962962963, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0015314513584598899, "learning_rate": 1.0811904889859335e-08, "loss": 0.0224, "num_tokens": 18405933.0, "reward": 0.854687511920929, "reward_std": 0.22741259634494781, "rewards/itbench_correctness/mean": 0.854687511920929, "rewards/itbench_correctness/std": 0.24986976385116577, "step": 938, "step_time": 84.86788581125438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 857.3125, "completions/mean_terminated_length": 801.75, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "entropy": 0.5902165174484253, "epoch": 4.968253968253968, "frac_reward_zero_std": 0.5, "grad_norm": 1.5078125, "kl": 0.0016059105983003974, "learning_rate": 1.0472587670027678e-08, "loss": 0.0006, "num_tokens": 18446842.0, "reward": 0.1875, "reward_std": 0.2587745785713196, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 939, "step_time": 161.18546231649816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 533.8125, "completions/mean_terminated_length": 533.8125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.5657417178153992, "epoch": 4.973544973544973, "frac_reward_zero_std": 1.0, "grad_norm": 0.01708984375, "kl": 0.0015139211900532246, "learning_rate": 1.0138623843548078e-08, "loss": 0.0, "num_tokens": 18468391.0, "reward": 0.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.0, "rewards/itbench_correctness/std": 0.0, "step": 940, "step_time": 92.19830699265003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 734.0, "completions/mean_terminated_length": 444.0, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "entropy": 0.3760218024253845, "epoch": 4.978835978835979, "frac_reward_zero_std": 0.5, "grad_norm": 1.2109375, "kl": 0.001119913998991251, "learning_rate": 9.810017062595321e-09, "loss": 0.0, "num_tokens": 18486999.0, "reward": 0.734375, "reward_std": 0.19408094882965088, "rewards/itbench_correctness/mean": 0.734375, "rewards/itbench_correctness/std": 0.3815402090549469, "step": 941, "step_time": 845.4121110225096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 763.5625, "completions/mean_terminated_length": 503.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5107637047767639, "epoch": 4.984126984126984, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.002076751785352826, "learning_rate": 9.486770920760667e-09, "loss": -0.0707, "num_tokens": 18506976.0, "reward": 0.53125, "reward_std": 0.41746097803115845, "rewards/itbench_correctness/mean": 0.53125, "rewards/itbench_correctness/std": 0.4989572763442993, "step": 942, "step_time": 94.76647205464542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 720.125, "completions/mean_terminated_length": 582.0, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.5610136985778809, "epoch": 4.98941798941799, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.0018183528445661068, "learning_rate": 9.168888953011989e-09, "loss": 0.0444, "num_tokens": 18531946.0, "reward": 0.1979166716337204, "reward_std": 0.2609178125858307, "rewards/itbench_correctness/mean": 0.1979166716337204, "rewards/itbench_correctness/std": 0.3232860863208771, "step": 943, "step_time": 101.69731870479882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 479.875, "completions/mean_terminated_length": 479.875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.4459494650363922, "epoch": 4.994708994708994, "frac_reward_zero_std": 0.5, "grad_norm": 1.1796875, "kl": 0.0016703108558431268, "learning_rate": 8.856374635655695e-09, "loss": -0.0015, "num_tokens": 18542184.0, "reward": 0.4583333432674408, "reward_std": 0.1178511306643486, "rewards/itbench_correctness/mean": 0.4583333432674408, "rewards/itbench_correctness/std": 0.5, "step": 944, "step_time": 465.4684395249933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 718.6875, "completions/mean_terminated_length": 481.22222900390625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 0.5816158056259155, "epoch": 5.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.0013618582161143422, "learning_rate": 8.54923138629815e-09, "loss": 0.013, "num_tokens": 18560147.0, "reward": 0.40625, "reward_std": 0.19776971638202667, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.306526243686676, "step": 945, "step_time": 143.76468984037638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 575.8125, "completions/mean_terminated_length": 545.933349609375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.4619559347629547, "epoch": 5.005291005291006, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0013607589062303305, "learning_rate": 8.247462563808816e-09, "loss": 0.0691, "num_tokens": 18571720.0, "reward": 0.8020833134651184, "reward_std": 0.4064691960811615, "rewards/itbench_correctness/mean": 0.8020833134651184, "rewards/itbench_correctness/std": 0.40008679032325745, "step": 946, "step_time": 82.60051180887967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 862.5625, "completions/mean_terminated_length": 701.125, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "entropy": 0.5077893137931824, "epoch": 5.01058201058201, "frac_reward_zero_std": 0.5, "grad_norm": 1.421875, "kl": 0.0012988558737561107, "learning_rate": 7.951071468283166e-09, "loss": 0.0, "num_tokens": 18591313.0, "reward": 0.796875, "reward_std": 0.13258251547813416, "rewards/itbench_correctness/mean": 0.796875, "rewards/itbench_correctness/std": 0.27716949582099915, "step": 947, "step_time": 1016.9221306946129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 920.5625, "completions/mean_terminated_length": 787.5714721679688, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "entropy": 0.4605879485607147, "epoch": 5.015873015873016, "frac_reward_zero_std": 0.5, "grad_norm": 1.5390625, "kl": 0.001742081018164754, "learning_rate": 7.660061341006718e-09, "loss": 0.0001, "num_tokens": 18623898.0, "reward": 0.25, "reward_std": 0.15430334210395813, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.3333333432674408, "step": 948, "step_time": 110.10245905164629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 639.3125, "completions/mean_terminated_length": 511.0833435058594, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4192003011703491, "epoch": 5.021164021164021, "frac_reward_zero_std": 0.5, "grad_norm": 0.443359375, "kl": 0.0027288675773888826, "learning_rate": 7.374435364419673e-09, "loss": -0.0643, "num_tokens": 18638975.0, "reward": 0.17500001192092896, "reward_std": 0.0707106739282608, "rewards/itbench_correctness/mean": 0.17500001192092896, "rewards/itbench_correctness/std": 0.20493902266025543, "step": 949, "step_time": 263.15858253091574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 859.5625, "completions/mean_terminated_length": 821.6154174804688, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "entropy": 0.3839162290096283, "epoch": 5.026455026455026, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.000997141352854669, "learning_rate": 7.09419666208183e-09, "loss": -0.0236, "num_tokens": 18657616.0, "reward": 0.6669219732284546, "reward_std": 0.24255049228668213, "rewards/itbench_correctness/mean": 0.6669219732284546, "rewards/itbench_correctness/std": 0.4117698669433594, "step": 950, "step_time": 535.0438889786601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 944.6875, "completions/mean_terminated_length": 883.0, "completions/min_length": 833.0, "completions/min_terminated_length": 833.0, "entropy": 0.39801523089408875, "epoch": 5.031746031746032, "frac_reward_zero_std": 0.5, "grad_norm": 1.0703125, "kl": 0.001400322187691927, "learning_rate": 6.819348298638839e-09, "loss": 0.0019, "num_tokens": 18678923.0, "reward": 0.328125, "reward_std": 0.0646936446428299, "rewards/itbench_correctness/mean": 0.328125, "rewards/itbench_correctness/std": 0.3502231538295746, "step": 951, "step_time": 90.80604922864586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 787.0625, "completions/mean_terminated_length": 753.2142944335938, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "entropy": 0.36845865845680237, "epoch": 5.037037037037037, "frac_reward_zero_std": 0.5, "grad_norm": 1.3515625, "kl": 0.0017281656619161367, "learning_rate": 6.549893279788277e-09, "loss": 0.007, "num_tokens": 18701348.0, "reward": 0.9375, "reward_std": 0.1157275140285492, "rewards/itbench_correctness/mean": 0.9375, "rewards/itbench_correctness/std": 0.17078252136707306, "step": 952, "step_time": 321.3709614155814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 685.25, "completions/mean_terminated_length": 685.25, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "entropy": 0.4961692690849304, "epoch": 5.042328042328043, "frac_reward_zero_std": 0.5, "grad_norm": 1.265625, "kl": 0.0015087542124092579, "learning_rate": 6.2858345522471265e-09, "loss": 0.0125, "num_tokens": 18722424.0, "reward": 0.34687501192092896, "reward_std": 0.0646936446428299, "rewards/itbench_correctness/mean": 0.34687501192092896, "rewards/itbench_correctness/std": 0.2698571979999542, "step": 953, "step_time": 409.55402624513954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 433.375, "completions/mean_terminated_length": 433.375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.5030285716056824, "epoch": 5.0476190476190474, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.0032849370036274195, "learning_rate": 6.0271750037193534e-09, "loss": 0.0001, "num_tokens": 18731814.0, "reward": 0.25, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.25, "rewards/itbench_correctness/std": 0.25819888710975647, "step": 954, "step_time": 96.037104123272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 762.25, "completions/mean_terminated_length": 701.84619140625, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "entropy": 0.3961954712867737, "epoch": 5.052910052910053, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0015258269850164652, "learning_rate": 5.773917462864264e-09, "loss": 0.0162, "num_tokens": 18749154.0, "reward": 0.5750000476837158, "reward_std": 0.24238379299640656, "rewards/itbench_correctness/mean": 0.5750000476837158, "rewards/itbench_correctness/std": 0.4028027057647705, "step": 955, "step_time": 67.30359940230846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 587.3125, "completions/mean_terminated_length": 558.2000122070312, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3524529039859772, "epoch": 5.058201058201059, "frac_reward_zero_std": 1.0, "grad_norm": 0.05322265625, "kl": 0.002124128630384803, "learning_rate": 5.526064699265753e-09, "loss": 0.0, "num_tokens": 18767295.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 956, "step_time": 319.00149345304817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 828.9375, "completions/mean_terminated_length": 633.875, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "entropy": 0.35708361864089966, "epoch": 5.063492063492063, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0012050803052261472, "learning_rate": 5.283619423401997e-09, "loss": -0.0121, "num_tokens": 18791526.0, "reward": 0.1875, "reward_std": 0.4082317352294922, "rewards/itbench_correctness/mean": 0.1875, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 957, "step_time": 935.1128796143457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 1020.0, "completions/mean_terminated_length": 960.0, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "entropy": 0.31568628549575806, "epoch": 5.068783068783069, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.00110468955244869, "learning_rate": 5.046584286615696e-09, "loss": 0.0017, "num_tokens": 18820270.0, "reward": 0.3437500298023224, "reward_std": 0.23224487900733948, "rewards/itbench_correctness/mean": 0.3437500298023224, "rewards/itbench_correctness/std": 0.30712980031967163, "step": 958, "step_time": 187.4145915368572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 732.625, "completions/mean_terminated_length": 732.625, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "entropy": 0.3330489695072174, "epoch": 5.074074074074074, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0017922122497111559, "learning_rate": 4.8149618810850444e-09, "loss": 0.0026, "num_tokens": 18846408.0, "reward": 0.5416666865348816, "reward_std": 0.3205420970916748, "rewards/itbench_correctness/mean": 0.5416666865348816, "rewards/itbench_correctness/std": 0.40138646960258484, "step": 959, "step_time": 115.98179497290403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 747.9375, "completions/mean_terminated_length": 582.2999877929688, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.4786496162414551, "epoch": 5.079365079365079, "frac_reward_zero_std": 1.0, "grad_norm": 0.0233154296875, "kl": 0.0014122002758085728, "learning_rate": 4.588754739795586e-09, "loss": 0.0, "num_tokens": 18866879.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 960, "step_time": 447.1240088623017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 872.0625, "completions/mean_terminated_length": 780.9000244140625, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "entropy": 0.36006593704223633, "epoch": 5.084656084656085, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078125, "kl": 0.0014327183598652482, "learning_rate": 4.367965336512403e-09, "loss": 0.0033, "num_tokens": 18892360.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.5625, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 961, "step_time": 565.5308811077848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 447.9375, "completions/mean_terminated_length": 447.9375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "entropy": 0.4866750240325928, "epoch": 5.08994708994709, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.0028053594287484884, "learning_rate": 4.152596085753024e-09, "loss": -0.0055, "num_tokens": 18904239.0, "reward": 0.2922794222831726, "reward_std": 0.34673309326171875, "rewards/itbench_correctness/mean": 0.2922794222831726, "rewards/itbench_correctness/std": 0.35596761107444763, "step": 962, "step_time": 62.929508111439645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 579.75, "completions/mean_terminated_length": 431.66668701171875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 0.3777490258216858, "epoch": 5.095238095238095, "frac_reward_zero_std": 0.5, "grad_norm": 1.03125, "kl": 0.0031987950205802917, "learning_rate": 3.9426493427611175e-09, "loss": 0.0033, "num_tokens": 18924067.0, "reward": 0.1171875, "reward_std": 0.1269381046295166, "rewards/itbench_correctness/mean": 0.1171875, "rewards/itbench_correctness/std": 0.2114865630865097, "step": 963, "step_time": 148.80320667196065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 530.5625, "completions/mean_terminated_length": 530.5625, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "entropy": 0.5051242709159851, "epoch": 5.1005291005291005, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0027365966234356165, "learning_rate": 3.7381274034805066e-09, "loss": -0.0151, "num_tokens": 18945556.0, "reward": 0.5562499761581421, "reward_std": 0.3939805328845978, "rewards/itbench_correctness/mean": 0.5562499761581421, "rewards/itbench_correctness/std": 0.4657878875732422, "step": 964, "step_time": 113.64905078150332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 575.75, "completions/mean_terminated_length": 545.86669921875, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.35084670782089233, "epoch": 5.105820105820106, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0013689551269635558, "learning_rate": 3.53903250453047e-09, "loss": -0.0007, "num_tokens": 18957760.0, "reward": 0.6875, "reward_std": 0.3613206446170807, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.36830443143844604, "step": 965, "step_time": 143.10534042678773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 667.4375, "completions/mean_terminated_length": 453.5, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.4554733633995056, "epoch": 5.111111111111111, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.001402169349603355, "learning_rate": 3.3453668231809283e-09, "loss": 0.0134, "num_tokens": 18976887.0, "reward": 0.7916666865348816, "reward_std": 0.3177001476287842, "rewards/itbench_correctness/mean": 0.7916666865348816, "rewards/itbench_correctness/std": 0.4013864994049072, "step": 966, "step_time": 106.49516909942031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 808.4375, "completions/mean_terminated_length": 640.7777709960938, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.4626207947731018, "epoch": 5.116402116402116, "frac_reward_zero_std": 1.0, "grad_norm": 0.03515625, "kl": 0.0013422613264992833, "learning_rate": 3.1571324773286278e-09, "loss": 0.0, "num_tokens": 19000374.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 967, "step_time": 106.7643784377724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 628.375, "completions/mean_terminated_length": 448.54547119140625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.4742391109466553, "epoch": 5.121693121693122, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.001217088894918561, "learning_rate": 2.9743315254743828e-09, "loss": 0.0213, "num_tokens": 19014292.0, "reward": 0.5, "reward_std": 0.3745020925998688, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.40824830532073975, "step": 968, "step_time": 76.93132317159325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 427.0625, "completions/mean_terminated_length": 427.0625, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "entropy": 0.4425581693649292, "epoch": 5.1269841269841265, "frac_reward_zero_std": 0.5, "grad_norm": 1.1015625, "kl": 0.0016664480790495872, "learning_rate": 2.7969659666999267e-09, "loss": -0.0014, "num_tokens": 19023773.0, "reward": 0.6875, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.6875, "rewards/itbench_correctness/std": 0.370809942483902, "step": 969, "step_time": 200.08597892336547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 686.0, "completions/mean_terminated_length": 483.20001220703125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3600583076477051, "epoch": 5.132275132275132, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.0012360253604128957, "learning_rate": 2.6250377406467627e-09, "loss": -0.0297, "num_tokens": 19050061.0, "reward": 0.515625, "reward_std": 0.2414703369140625, "rewards/itbench_correctness/mean": 0.515625, "rewards/itbench_correctness/std": 0.4784414768218994, "step": 970, "step_time": 540.8324056314304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 750.4375, "completions/mean_terminated_length": 537.6666870117188, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "entropy": 0.4610643684864044, "epoch": 5.137566137566138, "frac_reward_zero_std": 0.5, "grad_norm": 1.1171875, "kl": 0.0013813948025926948, "learning_rate": 2.458548727494292e-09, "loss": 0.0043, "num_tokens": 19075564.0, "reward": 0.9921875, "reward_std": 0.022097086533904076, "rewards/itbench_correctness/mean": 0.9921875, "rewards/itbench_correctness/std": 0.03125, "step": 971, "step_time": 217.15286646224558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 657.375, "completions/mean_terminated_length": 657.375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.40768206119537354, "epoch": 5.142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 1.640625, "kl": 0.0014334998559206724, "learning_rate": 2.2975007479397733e-09, "loss": 0.0096, "num_tokens": 19090834.0, "reward": 0.75, "reward_std": 0.13363061845302582, "rewards/itbench_correctness/mean": 0.75, "rewards/itbench_correctness/std": 0.3162277936935425, "step": 972, "step_time": 432.4331463770941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 740.25, "completions/mean_terminated_length": 699.7142944335938, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.5214454531669617, "epoch": 5.148148148148148, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0014667396899312735, "learning_rate": 2.14189556317812e-09, "loss": -0.0047, "num_tokens": 19107054.0, "reward": 0.8541666865348816, "reward_std": 0.290380597114563, "rewards/itbench_correctness/mean": 0.8541666865348816, "rewards/itbench_correctness/std": 0.3435921370983124, "step": 973, "step_time": 205.37443487346172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 665.625, "completions/mean_terminated_length": 665.625, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "entropy": 0.39962440729141235, "epoch": 5.1534391534391535, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.001995999598875642, "learning_rate": 1.9917348748826334e-09, "loss": -0.0209, "num_tokens": 19121896.0, "reward": 0.4375, "reward_std": 0.13969546556472778, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.2922613024711609, "step": 974, "step_time": 112.39083941001445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 617.75, "completions/mean_terminated_length": 617.75, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "entropy": 0.2751922309398651, "epoch": 5.158730158730159, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0011612976668402553, "learning_rate": 1.8470203251865768e-09, "loss": -0.0003, "num_tokens": 19137156.0, "reward": 0.578125, "reward_std": 0.3319548964500427, "rewards/itbench_correctness/mean": 0.578125, "rewards/itbench_correctness/std": 0.3842606544494629, "step": 975, "step_time": 101.55415380187333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 901.0, "completions/mean_terminated_length": 778.0, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "entropy": 0.4217536151409149, "epoch": 5.164021164021164, "frac_reward_zero_std": 0.5, "grad_norm": 1.4375, "kl": 0.0018101928289979696, "learning_rate": 1.7077534966650765e-09, "loss": 0.0001, "num_tokens": 19166732.0, "reward": 0.40416669845581055, "reward_std": 0.2077372521162033, "rewards/itbench_correctness/mean": 0.40416669845581055, "rewards/itbench_correctness/std": 0.423368364572525, "step": 976, "step_time": 115.02839307207614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 664.25, "completions/mean_terminated_length": 304.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.5600301027297974, "epoch": 5.169312169312169, "frac_reward_zero_std": 0.5, "grad_norm": 1.09375, "kl": 0.002377049997448921, "learning_rate": 1.5739359123178585e-09, "loss": -0.0112, "num_tokens": 19183648.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 977, "step_time": 76.0952754272148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 735.0, "completions/mean_terminated_length": 638.6666870117188, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.38639456033706665, "epoch": 5.174603174603175, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0020921749528497458, "learning_rate": 1.4455690355525963e-09, "loss": 0.0233, "num_tokens": 19205040.0, "reward": 0.5208333730697632, "reward_std": 0.347861647605896, "rewards/itbench_correctness/mean": 0.5208333730697632, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 978, "step_time": 102.6859831251204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 640.625, "completions/mean_terminated_length": 615.0667114257812, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.3012682795524597, "epoch": 5.1798941798941796, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.0016695463564246893, "learning_rate": 1.3226542701689214e-09, "loss": 0.0201, "num_tokens": 19219874.0, "reward": 0.40625, "reward_std": 0.24511480331420898, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.23935678601264954, "step": 979, "step_time": 449.7205182630569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 882.25, "completions/mean_terminated_length": 740.5, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "entropy": 0.5644658803939819, "epoch": 5.185185185185185, "frac_reward_zero_std": 0.5, "grad_norm": 1.5234375, "kl": 0.0016784444451332092, "learning_rate": 1.2051929603428823e-09, "loss": 0.0001, "num_tokens": 19257654.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 980, "step_time": 202.28774461336434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 706.5625, "completions/mean_terminated_length": 633.3077392578125, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.5095090866088867, "epoch": 5.190476190476191, "frac_reward_zero_std": 0.5, "grad_norm": 1.53125, "kl": 0.0016119469655677676, "learning_rate": 1.0931863906127325e-09, "loss": -0.0168, "num_tokens": 19297487.0, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/itbench_correctness/mean": 0.625, "rewards/itbench_correctness/std": 0.5, "step": 981, "step_time": 162.3620089488104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 797.4375, "completions/mean_terminated_length": 721.9166870117188, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "entropy": 0.4715103209018707, "epoch": 5.195767195767195, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0011810938594862819, "learning_rate": 9.866357858642205e-10, "loss": 0.0159, "num_tokens": 19314862.0, "reward": 0.8062499761581421, "reward_std": 0.2764522433280945, "rewards/itbench_correctness/mean": 0.8062499761581421, "rewards/itbench_correctness/std": 0.40078049898147583, "step": 982, "step_time": 87.88412514608353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 870.625, "completions/mean_terminated_length": 751.3333129882812, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "entropy": 0.45254844427108765, "epoch": 5.201058201058201, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224609375, "kl": 0.001551373046822846, "learning_rate": 8.855423113177662e-10, "loss": 0.0001, "num_tokens": 19335648.0, "reward": 1.0, "reward_std": 0.0, "rewards/itbench_correctness/mean": 1.0, "rewards/itbench_correctness/std": 0.0, "step": 983, "step_time": 624.4646268095821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 983.9375, "completions/mean_terminated_length": 895.7999877929688, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "entropy": 0.5325541496276855, "epoch": 5.2063492063492065, "frac_reward_zero_std": 0.5, "grad_norm": 1.4921875, "kl": 0.0016566679114475846, "learning_rate": 7.899070725153611e-10, "loss": -0.0178, "num_tokens": 19373543.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.4375, "rewards/itbench_correctness/std": 0.5123475790023804, "step": 984, "step_time": 265.6073463913053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 461.125, "completions/mean_terminated_length": 461.125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.4120357930660248, "epoch": 5.211640211640212, "frac_reward_zero_std": 0.5, "grad_norm": 1.0234375, "kl": 0.002073045587167144, "learning_rate": 6.997311153086882e-10, "loss": 0.0054, "num_tokens": 19384705.0, "reward": 0.28125, "reward_std": 0.0294627845287323, "rewards/itbench_correctness/mean": 0.28125, "rewards/itbench_correctness/std": 0.145535409450531, "step": 985, "step_time": 46.38171513937414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 612.9375, "completions/mean_terminated_length": 612.9375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.5286020040512085, "epoch": 5.216931216931217, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0022879249881953, "learning_rate": 6.150154258476314e-10, "loss": -0.0764, "num_tokens": 19399360.0, "reward": 0.7708333730697632, "reward_std": 0.25392836332321167, "rewards/itbench_correctness/mean": 0.7708333730697632, "rewards/itbench_correctness/std": 0.26440009474754333, "step": 986, "step_time": 232.74916400574148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 668.0, "completions/mean_terminated_length": 668.0, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.5179640650749207, "epoch": 5.222222222222222, "frac_reward_zero_std": 0.5, "grad_norm": 1.28125, "kl": 0.0015953207621350884, "learning_rate": 5.35760930569229e-10, "loss": 0.0063, "num_tokens": 19417072.0, "reward": 0.0625, "reward_std": 0.1767766922712326, "rewards/itbench_correctness/mean": 0.0625, "rewards/itbench_correctness/std": 0.25, "step": 987, "step_time": 90.40588045120239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 559.8125, "completions/mean_terminated_length": 559.8125, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "entropy": 0.5394663214683533, "epoch": 5.227513227513228, "frac_reward_zero_std": 0.5, "grad_norm": 1.140625, "kl": 0.0017291223630309105, "learning_rate": 4.619684961881254e-10, "loss": -0.0123, "num_tokens": 19447357.0, "reward": 0.40625, "reward_std": 0.1293872892856598, "rewards/itbench_correctness/mean": 0.40625, "rewards/itbench_correctness/std": 0.4552929699420929, "step": 988, "step_time": 91.7427905248478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 719.4375, "completions/mean_terminated_length": 617.9166870117188, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.3725132346153259, "epoch": 5.232804232804233, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0019212639890611172, "learning_rate": 3.9363892968641287e-10, "loss": 0.0219, "num_tokens": 19463452.0, "reward": 0.21875, "reward_std": 0.1978391408920288, "rewards/itbench_correctness/mean": 0.21875, "rewards/itbench_correctness/std": 0.22219711542129517, "step": 989, "step_time": 508.81171389855444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 719.0, "completions/mean_terminated_length": 675.4285888671875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "entropy": 0.2767733037471771, "epoch": 5.238095238095238, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.0014671377139165998, "learning_rate": 3.3077297830541585e-10, "loss": 0.0246, "num_tokens": 19480260.0, "reward": 0.5416666865348816, "reward_std": 0.37473249435424805, "rewards/itbench_correctness/mean": 0.5416666865348816, "rewards/itbench_correctness/std": 0.5, "step": 990, "step_time": 126.7575543159619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 516.25, "completions/mean_terminated_length": 516.25, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.46295398473739624, "epoch": 5.243386243386244, "frac_reward_zero_std": 0.5, "grad_norm": 1.0078125, "kl": 0.0014694234123453498, "learning_rate": 2.733713295369755e-10, "loss": -0.0213, "num_tokens": 19491424.0, "reward": 0.875, "reward_std": 0.18898223340511322, "rewards/itbench_correctness/mean": 0.875, "rewards/itbench_correctness/std": 0.28867512941360474, "step": 991, "step_time": 357.908637705259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 658.0625, "completions/mean_terminated_length": 658.0625, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "entropy": 0.6351979970932007, "epoch": 5.248677248677248, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.0013824838679283857, "learning_rate": 2.2143461111645556e-10, "loss": 0.0117, "num_tokens": 19507433.0, "reward": 0.6458333730697632, "reward_std": 0.2946278154850006, "rewards/itbench_correctness/mean": 0.6458333730697632, "rewards/itbench_correctness/std": 0.40311288833618164, "step": 992, "step_time": 522.8660918865353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 678.25, "completions/mean_terminated_length": 655.2000122070312, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 0.4423147737979889, "epoch": 5.253968253968254, "frac_reward_zero_std": 1.0, "grad_norm": 0.024169921875, "kl": 0.0016678622923791409, "learning_rate": 1.7496339101535918e-10, "loss": 0.0, "num_tokens": 19545629.0, "reward": 0.5, "reward_std": 0.0, "rewards/itbench_correctness/mean": 0.5, "rewards/itbench_correctness/std": 0.5163977742195129, "step": 993, "step_time": 334.44221889507025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 698.9375, "completions/mean_terminated_length": 677.2667236328125, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.39774659276008606, "epoch": 5.2592592592592595, "frac_reward_zero_std": 0.5, "grad_norm": 1.2265625, "kl": 0.0019350027432665229, "learning_rate": 1.3395817743561132e-10, "loss": 0.0034, "num_tokens": 19562180.0, "reward": 0.1041666716337204, "reward_std": 0.03857583925127983, "rewards/itbench_correctness/mean": 0.1041666716337204, "rewards/itbench_correctness/std": 0.11979921907186508, "step": 994, "step_time": 805.2065543290228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 563.8125, "completions/mean_terminated_length": 533.1333618164062, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4930717349052429, "epoch": 5.264550264550264, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.0037524921353906393, "learning_rate": 9.841941880361914e-11, "loss": -0.0262, "num_tokens": 19574545.0, "reward": 0.5104166269302368, "reward_std": 0.32622629404067993, "rewards/itbench_correctness/mean": 0.5104166269302368, "rewards/itbench_correctness/std": 0.3812578022480011, "step": 995, "step_time": 363.2403373187408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 771.625, "completions/mean_terminated_length": 575.3333129882812, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.5106106996536255, "epoch": 5.26984126984127, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0013155718334019184, "learning_rate": 6.834750376549791e-11, "loss": -0.011, "num_tokens": 19592107.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/itbench_correctness/mean": 0.125, "rewards/itbench_correctness/std": 0.3415650427341461, "step": 996, "step_time": 159.3905362924561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 999.8125, "completions/mean_terminated_length": 895.0, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "entropy": 0.6281177997589111, "epoch": 5.275132275132275, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.001502976636402309, "learning_rate": 4.3742761183018783e-11, "loss": 0.0133, "num_tokens": 19617248.0, "reward": 0.21250000596046448, "reward_std": 0.1787744164466858, "rewards/itbench_correctness/mean": 0.21250000596046448, "rewards/itbench_correctness/std": 0.20124614238739014, "step": 997, "step_time": 138.13474278803915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 651.625, "completions/mean_terminated_length": 651.625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.40821024775505066, "epoch": 5.28042328042328, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.0013937059557065368, "learning_rate": 2.4605460129556442e-11, "loss": -0.0343, "num_tokens": 19632762.0, "reward": 0.8848214149475098, "reward_std": 0.04293148219585419, "rewards/itbench_correctness/mean": 0.8848214149475098, "rewards/itbench_correctness/std": 0.10898028314113617, "step": 998, "step_time": 133.00229213759303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 707.75, "completions/mean_terminated_length": 391.5, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "entropy": 0.6329918503761292, "epoch": 5.285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.0015219503547996283, "learning_rate": 1.0935809887702152e-11, "loss": 0.0067, "num_tokens": 19651310.0, "reward": 0.5859375, "reward_std": 0.2041938304901123, "rewards/itbench_correctness/mean": 0.5859375, "rewards/itbench_correctness/std": 0.4557931423187256, "step": 999, "step_time": 188.53237317036837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 665.75, "completions/mean_terminated_length": 546.3333740234375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "entropy": 0.42959070205688477, "epoch": 5.291005291005291, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.001996886683627963, "learning_rate": 2.7339599464326622e-12, "loss": 0.021, "num_tokens": 19667050.0, "reward": 0.1160714328289032, "reward_std": 0.23404696583747864, "rewards/itbench_correctness/mean": 0.1160714328289032, "rewards/itbench_correctness/std": 0.25404882431030273, "step": 1000, "step_time": 844.8066724454984 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 19667050, "num_train_epochs": 6, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }