diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,28034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.291005291005291, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 723.75, + "completions/mean_terminated_length": 490.22222900390625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5195164084434509, + "epoch": 0.005291005291005291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.0010675216326490045, + "learning_rate": 0.0, + "loss": -0.1441, + "num_tokens": 18452.0, + "reward": 0.828125, + "reward_std": 0.3463020324707031, + "rewards/itbench_correctness/mean": 0.828125, + "rewards/itbench_correctness/std": 0.33811673521995544, + "step": 1, + "step_time": 91.14044637419283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 604.75, + "completions/mean_terminated_length": 544.857177734375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3654402792453766, + "epoch": 0.010582010582010581, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8046875, + "kl": 0.0009289107983931899, + "learning_rate": 2e-08, + "loss": -0.0658, + "num_tokens": 33008.0, + "reward": 0.3645833432674408, + "reward_std": 0.1873345822095871, + "rewards/itbench_correctness/mean": 0.3645833432674408, + "rewards/itbench_correctness/std": 0.4552929401397705, + "step": 2, + "step_time": 828.1970858396962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 957.5625, + "completions/mean_terminated_length": 905.888916015625, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "entropy": 0.5472227931022644, + "epoch": 0.015873015873015872, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.001058843918144703, + "learning_rate": 4e-08, + "loss": 0.0343, + "num_tokens": 55673.0, + "reward": 0.34687501192092896, + "reward_std": 0.3456803262233734, + "rewards/itbench_correctness/mean": 0.34687501192092896, + "rewards/itbench_correctness/std": 0.4120957851409912, + "step": 3, + "step_time": 151.529059112072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 532.5625, + "completions/mean_terminated_length": 532.5625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5482924580574036, + "epoch": 0.021164021164021163, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.0013292429503053427, + "learning_rate": 6e-08, + "loss": -0.1305, + "num_tokens": 68794.0, + "reward": 0.7916666865348816, + "reward_std": 0.32439103722572327, + "rewards/itbench_correctness/mean": 0.7916666865348816, + "rewards/itbench_correctness/std": 0.34960296750068665, + "step": 4, + "step_time": 417.0535086672753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 711.4375, + "completions/mean_terminated_length": 468.3333435058594, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.47509443759918213, + "epoch": 0.026455026455026454, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.0010377272265031934, + "learning_rate": 8e-08, + "loss": -0.0456, + "num_tokens": 83449.0, + "reward": 0.3515625, + "reward_std": 0.2974616289138794, + "rewards/itbench_correctness/mean": 0.3515625, + "rewards/itbench_correctness/std": 0.32021722197532654, + "step": 5, + "step_time": 128.02622807957232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 951.0, + "completions/mean_terminated_length": 440.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5846477150917053, + "epoch": 0.031746031746031744, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5625, + "kl": 0.000945708598010242, + "learning_rate": 1e-07, + "loss": 0.0034, + "num_tokens": 122025.0, + "reward": 0.25, + "reward_std": 0.2182178944349289, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.394405335187912, + "step": 6, + "step_time": 145.1888073068112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 544.4375, + "completions/mean_terminated_length": 544.4375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.27000343799591064, + "epoch": 0.037037037037037035, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.001091663958504796, + "learning_rate": 1.2e-07, + "loss": -0.0527, + "num_tokens": 135296.0, + "reward": 0.3736979365348816, + "reward_std": 0.31324487924575806, + "rewards/itbench_correctness/mean": 0.3736979365348816, + "rewards/itbench_correctness/std": 0.3162706792354584, + "step": 7, + "step_time": 83.35824911855161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 683.25, + "completions/mean_terminated_length": 660.5333862304688, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.47713136672973633, + "epoch": 0.042328042328042326, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1953125, + "kl": 0.0009320093668065965, + "learning_rate": 1.4e-07, + "loss": 0.0447, + "num_tokens": 150748.0, + "reward": 0.9322916269302368, + "reward_std": 0.062747523188591, + "rewards/itbench_correctness/mean": 0.9322916269302368, + "rewards/itbench_correctness/std": 0.11063265055418015, + "step": 8, + "step_time": 179.78012859076262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 518.6875, + "completions/mean_terminated_length": 446.5000305175781, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.42993131279945374, + "epoch": 0.047619047619047616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.001125591923482716, + "learning_rate": 1.6e-07, + "loss": -0.0315, + "num_tokens": 165327.0, + "reward": 0.578125, + "reward_std": 0.24882009625434875, + "rewards/itbench_correctness/mean": 0.578125, + "rewards/itbench_correctness/std": 0.2660909593105316, + "step": 9, + "step_time": 145.98578487429768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 700.0625, + "completions/mean_terminated_length": 552.8181762695312, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 0.4028211832046509, + "epoch": 0.05291005291005291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.0010170488385483623, + "learning_rate": 1.8e-07, + "loss": -0.0257, + "num_tokens": 181176.0, + "reward": 0.5083333253860474, + "reward_std": 0.3309464454650879, + "rewards/itbench_correctness/mean": 0.5083333253860474, + "rewards/itbench_correctness/std": 0.3380225598812103, + "step": 10, + "step_time": 135.02681362256408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 466.4375, + "completions/mean_terminated_length": 466.4375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.37089642882347107, + "epoch": 0.0582010582010582, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.15625, + "kl": 0.0011742996284738183, + "learning_rate": 2e-07, + "loss": -0.0049, + "num_tokens": 192455.0, + "reward": 0.46875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.4989572763442993, + "step": 11, + "step_time": 994.5879717040807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 501.25, + "completions/mean_terminated_length": 327.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.49077308177948, + "epoch": 0.06349206349206349, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.890625, + "kl": 0.0011536800302565098, + "learning_rate": 2.1999999999999998e-07, + "loss": -0.0581, + "num_tokens": 210611.0, + "reward": 0.09375, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.09375, + "rewards/itbench_correctness/std": 0.20155644416809082, + "step": 12, + "step_time": 106.51383402384818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 420.9375, + "completions/mean_terminated_length": 420.9375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5107646584510803, + "epoch": 0.06878306878306878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.0012960818130522966, + "learning_rate": 2.4e-07, + "loss": -0.1036, + "num_tokens": 220666.0, + "reward": 0.5572916865348816, + "reward_std": 0.2719196677207947, + "rewards/itbench_correctness/mean": 0.5572916865348816, + "rewards/itbench_correctness/std": 0.2750736474990845, + "step": 13, + "step_time": 78.42556338571012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 553.4375, + "completions/mean_terminated_length": 486.21429443359375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6179559826850891, + "epoch": 0.07407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.0013870123075321317, + "learning_rate": 2.6e-07, + "loss": -0.1253, + "num_tokens": 237537.0, + "reward": 0.5, + "reward_std": 0.3650856614112854, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 14, + "step_time": 266.15765621792525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 504.9375, + "completions/mean_terminated_length": 504.9375, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "entropy": 0.5228369832038879, + "epoch": 0.07936507936507936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.0013084918027743697, + "learning_rate": 2.8e-07, + "loss": 0.0175, + "num_tokens": 253968.0, + "reward": 0.9035714864730835, + "reward_std": 0.06060914695262909, + "rewards/itbench_correctness/mean": 0.9035714864730835, + "rewards/itbench_correctness/std": 0.10054273903369904, + "step": 15, + "step_time": 132.4059884781018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 420.5, + "completions/mean_terminated_length": 420.5, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 0.4851367473602295, + "epoch": 0.08465608465608465, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.001197831821627915, + "learning_rate": 3e-07, + "loss": 0.0638, + "num_tokens": 263192.0, + "reward": 0.4375, + "reward_std": 0.38298875093460083, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 16, + "step_time": 94.08578859362751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 573.8125, + "completions/mean_terminated_length": 573.8125, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 0.3694586753845215, + "epoch": 0.08994708994708994, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.28125, + "kl": 0.0009872535010799766, + "learning_rate": 3.2e-07, + "loss": -0.0021, + "num_tokens": 276349.0, + "reward": 0.7132352590560913, + "reward_std": 0.24745365977287292, + "rewards/itbench_correctness/mean": 0.7132352590560913, + "rewards/itbench_correctness/std": 0.44946467876434326, + "step": 17, + "step_time": 803.3225803021342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 918.625, + "completions/mean_terminated_length": 783.1428833007812, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.3722955584526062, + "epoch": 0.09523809523809523, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7421875, + "kl": 0.0011494335485622287, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0019, + "num_tokens": 305519.0, + "reward": 0.7291666865348816, + "reward_std": 0.23464766144752502, + "rewards/itbench_correctness/mean": 0.7291666865348816, + "rewards/itbench_correctness/std": 0.4254627227783203, + "step": 18, + "step_time": 293.30187319312245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 638.5, + "completions/mean_terminated_length": 638.5, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "entropy": 0.4792482256889343, + "epoch": 0.10052910052910052, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.390625, + "kl": 0.001083478331565857, + "learning_rate": 3.6e-07, + "loss": -0.0201, + "num_tokens": 319703.0, + "reward": 0.71875, + "reward_std": 0.09797047078609467, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.16520188748836517, + "step": 19, + "step_time": 138.92694834899157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 680.625, + "completions/mean_terminated_length": 524.5454711914062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.39669421315193176, + "epoch": 0.10582010582010581, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0011998838745057583, + "learning_rate": 3.7999999999999996e-07, + "loss": -0.1584, + "num_tokens": 342409.0, + "reward": 0.1770833432674408, + "reward_std": 0.3077988028526306, + "rewards/itbench_correctness/mean": 0.1770833432674408, + "rewards/itbench_correctness/std": 0.3413955569267273, + "step": 20, + "step_time": 374.13402384892106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 655.375, + "completions/mean_terminated_length": 655.375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4669082462787628, + "epoch": 0.1111111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.0013768852222710848, + "learning_rate": 4e-07, + "loss": -0.115, + "num_tokens": 365695.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 21, + "step_time": 114.49692635703832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 969.0, + "completions/mean_length": 880.375, + "completions/mean_terminated_length": 847.2307739257812, + "completions/min_length": 599.0, + "completions/min_terminated_length": 599.0, + "entropy": 0.511145830154419, + "epoch": 0.1164021164021164, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4609375, + "kl": 0.0011111602652817965, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0192, + "num_tokens": 389429.0, + "reward": 0.59375, + "reward_std": 0.03788072243332863, + "rewards/itbench_correctness/mean": 0.59375, + "rewards/itbench_correctness/std": 0.4227531850337982, + "step": 22, + "step_time": 103.79572249855846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 653.375, + "completions/mean_terminated_length": 653.375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.5203749537467957, + "epoch": 0.12169312169312169, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021240234375, + "kl": 0.0011424734257161617, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "num_tokens": 405051.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 23, + "step_time": 158.34662247169763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 565.0, + "completions/mean_terminated_length": 565.0, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.5274336338043213, + "epoch": 0.12698412698412698, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0010382338659837842, + "learning_rate": 4.6e-07, + "loss": -0.0043, + "num_tokens": 417107.0, + "reward": 0.578125, + "reward_std": 0.1099528968334198, + "rewards/itbench_correctness/mean": 0.578125, + "rewards/itbench_correctness/std": 0.19116783142089844, + "step": 24, + "step_time": 93.74338541273028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 806.9375, + "completions/mean_terminated_length": 734.5833740234375, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.4733947813510895, + "epoch": 0.13227513227513227, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.0012686237460002303, + "learning_rate": 4.8e-07, + "loss": -0.027, + "num_tokens": 443242.0, + "reward": 0.5, + "reward_std": 0.39511844515800476, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.4654746949672699, + "step": 25, + "step_time": 117.60556835308671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 717.125, + "completions/mean_terminated_length": 673.2857666015625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.46017083525657654, + "epoch": 0.13756613756613756, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.234375, + "kl": 0.0011198390275239944, + "learning_rate": 5e-07, + "loss": -0.0578, + "num_tokens": 460940.0, + "reward": 0.15625, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.15625, + "rewards/itbench_correctness/std": 0.23935678601264954, + "step": 26, + "step_time": 435.78113711997867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 709.125, + "completions/mean_terminated_length": 664.1428833007812, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.38639166951179504, + "epoch": 0.14285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.001125096925534308, + "learning_rate": 5.2e-07, + "loss": 0.0106, + "num_tokens": 477014.0, + "reward": 0.3166666626930237, + "reward_std": 0.175833061337471, + "rewards/itbench_correctness/mean": 0.3166666626930237, + "rewards/itbench_correctness/std": 0.2388242930173874, + "step": 27, + "step_time": 136.88797108456492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 603.9375, + "completions/mean_terminated_length": 543.9285888671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.42057332396507263, + "epoch": 0.14814814814814814, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0390625, + "kl": 0.0010505876271054149, + "learning_rate": 5.4e-07, + "loss": -0.012, + "num_tokens": 491261.0, + "reward": 0.46875, + "reward_std": 0.04312910512089729, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.4876958429813385, + "step": 28, + "step_time": 450.98891491629183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 822.4375, + "completions/mean_terminated_length": 755.25, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.4498822093009949, + "epoch": 0.15343915343915343, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0008466003928333521, + "learning_rate": 5.6e-07, + "loss": -0.0265, + "num_tokens": 510580.0, + "reward": 0.831250011920929, + "reward_std": 0.2088201940059662, + "rewards/itbench_correctness/mean": 0.831250011920929, + "rewards/itbench_correctness/std": 0.24958299100399017, + "step": 29, + "step_time": 82.54267377220094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 790.4375, + "completions/mean_terminated_length": 650.2999877929688, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.33146199584007263, + "epoch": 0.15873015873015872, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3125, + "kl": 0.000913652591407299, + "learning_rate": 5.8e-07, + "loss": -0.0192, + "num_tokens": 529675.0, + "reward": 0.46875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.4989572763442993, + "step": 30, + "step_time": 153.0279028210789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 852.0625, + "completions/mean_terminated_length": 773.9091186523438, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 0.624367356300354, + "epoch": 0.164021164021164, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.0011790527496486902, + "learning_rate": 6e-07, + "loss": 0.0451, + "num_tokens": 554588.0, + "reward": 0.17812499403953552, + "reward_std": 0.21488739550113678, + "rewards/itbench_correctness/mean": 0.17812499403953552, + "rewards/itbench_correctness/std": 0.21210749447345734, + "step": 31, + "step_time": 496.71210376080126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 464.375, + "completions/mean_terminated_length": 464.375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4242261052131653, + "epoch": 0.1693121693121693, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.001077913912013173, + "learning_rate": 6.2e-07, + "loss": -0.0869, + "num_tokens": 565106.0, + "reward": 0.53125, + "reward_std": 0.12696418166160583, + "rewards/itbench_correctness/mean": 0.53125, + "rewards/itbench_correctness/std": 0.43977582454681396, + "step": 32, + "step_time": 62.5571150816977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 790.375, + "completions/mean_terminated_length": 490.0000305175781, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5136802196502686, + "epoch": 0.1746031746031746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.34375, + "kl": 0.0010994027834385633, + "learning_rate": 6.4e-07, + "loss": -0.0752, + "num_tokens": 590992.0, + "reward": 0.2723214328289032, + "reward_std": 0.22582654654979706, + "rewards/itbench_correctness/mean": 0.2723214328289032, + "rewards/itbench_correctness/std": 0.417490690946579, + "step": 33, + "step_time": 873.944114420563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 382.3125, + "completions/mean_terminated_length": 382.3125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.41327446699142456, + "epoch": 0.17989417989417988, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.38671875, + "kl": 0.0016043909126892686, + "learning_rate": 6.6e-07, + "loss": -0.05, + "num_tokens": 599933.0, + "reward": 0.71875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 34, + "step_time": 811.917650568299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 961.25, + "completions/mean_terminated_length": 856.6666870117188, + "completions/min_length": 705.0, + "completions/min_terminated_length": 705.0, + "entropy": 0.6283485293388367, + "epoch": 0.18518518518518517, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.515625, + "kl": 0.0012013108935207129, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0029, + "num_tokens": 628801.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 35, + "step_time": 104.12166160158813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 765.4375, + "completions/mean_terminated_length": 506.875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.29917532205581665, + "epoch": 0.19047619047619047, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.390625, + "kl": 0.0010058790212497115, + "learning_rate": 7e-07, + "loss": 0.0029, + "num_tokens": 648056.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 36, + "step_time": 884.992473276332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 540.0625, + "completions/mean_terminated_length": 540.0625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 0.4203217327594757, + "epoch": 0.19576719576719576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0106201171875, + "kl": 0.001015704357996583, + "learning_rate": 7.2e-07, + "loss": 0.0, + "num_tokens": 660377.0, + "reward": 0.5833333134651184, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5833333134651184, + "rewards/itbench_correctness/std": 0.4303314983844757, + "step": 37, + "step_time": 85.28049738146365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 456.1875, + "completions/mean_terminated_length": 456.1875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5830935835838318, + "epoch": 0.20105820105820105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017822265625, + "kl": 0.0013373795663937926, + "learning_rate": 7.4e-07, + "loss": 0.0, + "num_tokens": 688692.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 38, + "step_time": 211.86649047024548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 711.375, + "completions/mean_terminated_length": 639.2307739257812, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "entropy": 0.2656826674938202, + "epoch": 0.20634920634920634, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.265625, + "kl": 0.0009032340021803975, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0308, + "num_tokens": 707554.0, + "reward": 0.484375, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.484375, + "rewards/itbench_correctness/std": 0.503891110420227, + "step": 39, + "step_time": 823.7539153788239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 455.6875, + "completions/mean_terminated_length": 455.6875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.4432862401008606, + "epoch": 0.21164021164021163, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9609375, + "kl": 0.0011500748805701733, + "learning_rate": 7.799999999999999e-07, + "loss": -0.0166, + "num_tokens": 717741.0, + "reward": 0.2395833432674408, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.2395833432674408, + "rewards/itbench_correctness/std": 0.19214914739131927, + "step": 40, + "step_time": 798.0437586428598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 644.25, + "completions/mean_terminated_length": 416.3999938964844, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6643383502960205, + "epoch": 0.21693121693121692, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3984375, + "kl": 0.0012484320905059576, + "learning_rate": 8e-07, + "loss": 0.0101, + "num_tokens": 743129.0, + "reward": 0.015625, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.015625, + "rewards/itbench_correctness/std": 0.0625, + "step": 41, + "step_time": 98.21953046228737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 725.875, + "completions/mean_terminated_length": 547.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5951437950134277, + "epoch": 0.2222222222222222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.0012599489418789744, + "learning_rate": 8.199999999999999e-07, + "loss": -0.1275, + "num_tokens": 779247.0, + "reward": 0.4375, + "reward_std": 0.4082317352294922, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 42, + "step_time": 374.9474004274234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 411.625, + "completions/mean_terminated_length": 411.625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.449438214302063, + "epoch": 0.2275132275132275, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.0011913544731214643, + "learning_rate": 8.399999999999999e-07, + "loss": -0.063, + "num_tokens": 794473.0, + "reward": 0.2911931872367859, + "reward_std": 0.16020165383815765, + "rewards/itbench_correctness/mean": 0.2911931872367859, + "rewards/itbench_correctness/std": 0.1646159142255783, + "step": 43, + "step_time": 82.59138822741807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 500.0625, + "completions/mean_terminated_length": 500.0625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5279340147972107, + "epoch": 0.2328042328042328, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.91796875, + "kl": 0.0011128420010209084, + "learning_rate": 8.599999999999999e-07, + "loss": -0.0494, + "num_tokens": 805986.0, + "reward": 0.65625, + "reward_std": 0.09643959254026413, + "rewards/itbench_correctness/mean": 0.65625, + "rewards/itbench_correctness/std": 0.3786855936050415, + "step": 44, + "step_time": 486.3739328915253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 970.5625, + "completions/mean_terminated_length": 596.5, + "completions/min_length": 579.0, + "completions/min_terminated_length": 579.0, + "entropy": 0.5398930907249451, + "epoch": 0.23809523809523808, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5625, + "kl": 0.0010154710616916418, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "num_tokens": 836355.0, + "reward": 0.109375, + "reward_std": 0.14074896275997162, + "rewards/itbench_correctness/mean": 0.109375, + "rewards/itbench_correctness/std": 0.22302372753620148, + "step": 45, + "step_time": 134.77385379187763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 522.5, + "completions/mean_terminated_length": 522.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.45741626620292664, + "epoch": 0.24338624338624337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02490234375, + "kl": 0.0011397113557904959, + "learning_rate": 9e-07, + "loss": 0.0, + "num_tokens": 849099.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 46, + "step_time": 88.64014313649386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 596.6875, + "completions/mean_terminated_length": 596.6875, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "entropy": 0.3787577152252197, + "epoch": 0.24867724867724866, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0011281240731477737, + "learning_rate": 9.2e-07, + "loss": -0.0062, + "num_tokens": 862838.0, + "reward": 0.7421875, + "reward_std": 0.3093565106391907, + "rewards/itbench_correctness/mean": 0.7421875, + "rewards/itbench_correctness/std": 0.3337562382221222, + "step": 47, + "step_time": 70.17125954851508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 423.0625, + "completions/mean_terminated_length": 423.0625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.45146992802619934, + "epoch": 0.25396825396825395, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2109375, + "kl": 0.0013216013321653008, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0073, + "num_tokens": 872559.0, + "reward": 0.11328125, + "reward_std": 0.08985587954521179, + "rewards/itbench_correctness/mean": 0.11328125, + "rewards/itbench_correctness/std": 0.16958704590797424, + "step": 48, + "step_time": 87.19072807300836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 815.375, + "completions/mean_terminated_length": 653.1111450195312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.39981603622436523, + "epoch": 0.25925925925925924, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.0008777660550549626, + "learning_rate": 9.6e-07, + "loss": 0.0344, + "num_tokens": 891605.0, + "reward": 0.32207342982292175, + "reward_std": 0.2425267994403839, + "rewards/itbench_correctness/mean": 0.32207342982292175, + "rewards/itbench_correctness/std": 0.32837510108947754, + "step": 49, + "step_time": 145.8219982078299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 576.25, + "completions/mean_terminated_length": 546.4000244140625, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.5032538175582886, + "epoch": 0.26455026455026454, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.0010894184233620763, + "learning_rate": 9.8e-07, + "loss": 0.002, + "num_tokens": 903569.0, + "reward": 0.3849431872367859, + "reward_std": 0.13158553838729858, + "rewards/itbench_correctness/mean": 0.3849431872367859, + "rewards/itbench_correctness/std": 0.20182853937149048, + "step": 50, + "step_time": 374.8412516852841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 721.0625, + "completions/mean_terminated_length": 539.2999877929688, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.3356158435344696, + "epoch": 0.2698412698412698, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0703125, + "kl": 0.0009243504609912634, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 926762.0, + "reward": 0.4732142686843872, + "reward_std": 0.07576144486665726, + "rewards/itbench_correctness/mean": 0.4732142686843872, + "rewards/itbench_correctness/std": 0.4995746612548828, + "step": 51, + "step_time": 99.96388372033834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 644.8125, + "completions/mean_terminated_length": 590.6428833007812, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "entropy": 0.43733644485473633, + "epoch": 0.2751322751322751, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.09375, + "kl": 0.0011704964563250542, + "learning_rate": 9.999972660400534e-07, + "loss": -0.0123, + "num_tokens": 941111.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 52, + "step_time": 114.90833497233689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 588.4375, + "completions/mean_terminated_length": 588.4375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.42825278639793396, + "epoch": 0.2804232804232804, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.0011681180913001299, + "learning_rate": 9.999890641901124e-07, + "loss": -0.0885, + "num_tokens": 953494.0, + "reward": 0.8244047164916992, + "reward_std": 0.27204394340515137, + "rewards/itbench_correctness/mean": 0.8244047164916992, + "rewards/itbench_correctness/std": 0.2754608690738678, + "step": 53, + "step_time": 127.88445741310716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 788.75, + "completions/mean_terminated_length": 647.6000366210938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3549920618534088, + "epoch": 0.2857142857142857, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3359375, + "kl": 0.0009949615923687816, + "learning_rate": 9.999753945398703e-07, + "loss": -0.1075, + "num_tokens": 981738.0, + "reward": 0.5416666865348816, + "reward_std": 0.044543541967868805, + "rewards/itbench_correctness/mean": 0.5416666865348816, + "rewards/itbench_correctness/std": 0.4772607088088989, + "step": 54, + "step_time": 270.70763381849974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 641.1875, + "completions/mean_terminated_length": 615.6666870117188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4897163510322571, + "epoch": 0.291005291005291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.0010646218433976173, + "learning_rate": 9.99956257238817e-07, + "loss": -0.0731, + "num_tokens": 996773.0, + "reward": 0.6666666865348816, + "reward_std": 0.35634830594062805, + "rewards/itbench_correctness/mean": 0.6666666865348816, + "rewards/itbench_correctness/std": 0.42163705825805664, + "step": 55, + "step_time": 172.69540655519813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 365.875, + "completions/mean_terminated_length": 365.875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3881106972694397, + "epoch": 0.2962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.001151208532974124, + "learning_rate": 9.999316524962345e-07, + "loss": -0.0416, + "num_tokens": 1010651.0, + "reward": 0.359375, + "reward_std": 0.1751839816570282, + "rewards/itbench_correctness/mean": 0.359375, + "rewards/itbench_correctness/std": 0.1875, + "step": 56, + "step_time": 88.2503134328872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 699.6875, + "completions/mean_terminated_length": 552.2727661132812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.523090660572052, + "epoch": 0.30158730158730157, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.001052780426107347, + "learning_rate": 9.999015805811963e-07, + "loss": -0.1995, + "num_tokens": 1031726.0, + "reward": 0.2864583432674408, + "reward_std": 0.1927037239074707, + "rewards/itbench_correctness/mean": 0.2864583432674408, + "rewards/itbench_correctness/std": 0.2652195990085602, + "step": 57, + "step_time": 354.65077784564346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 657.0625, + "completions/mean_terminated_length": 490.2727355957031, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "entropy": 0.47179681062698364, + "epoch": 0.30687830687830686, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.03125, + "kl": 0.0011720317415893078, + "learning_rate": 9.998660418225644e-07, + "loss": 0.0026, + "num_tokens": 1048359.0, + "reward": 0.6420454978942871, + "reward_std": 0.07464002817869186, + "rewards/itbench_correctness/mean": 0.6420454978942871, + "rewards/itbench_correctness/std": 0.38350099325180054, + "step": 58, + "step_time": 612.9088207762688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 757.1875, + "completions/mean_terminated_length": 597.1000366210938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5388361811637878, + "epoch": 0.31216931216931215, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.001013660104945302, + "learning_rate": 9.998250366089846e-07, + "loss": -0.0625, + "num_tokens": 1067810.0, + "reward": 0.375, + "reward_std": 0.33407655358314514, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.3979112207889557, + "step": 59, + "step_time": 368.7298939973116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 921.625, + "completions/mean_terminated_length": 696.4000244140625, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 0.49694833159446716, + "epoch": 0.31746031746031744, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.0011713014682754874, + "learning_rate": 9.997785653888834e-07, + "loss": 0.0514, + "num_tokens": 1093876.0, + "reward": 0.140625, + "reward_std": 0.26977968215942383, + "rewards/itbench_correctness/mean": 0.140625, + "rewards/itbench_correctness/std": 0.2733854353427887, + "step": 60, + "step_time": 751.7327463729307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 780.5625, + "completions/mean_terminated_length": 634.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4586436152458191, + "epoch": 0.32275132275132273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.0011096000671386719, + "learning_rate": 9.99726628670463e-07, + "loss": -0.0469, + "num_tokens": 1113397.0, + "reward": 0.5833333134651184, + "reward_std": 0.3903999924659729, + "rewards/itbench_correctness/mean": 0.5833333134651184, + "rewards/itbench_correctness/std": 0.3884918689727783, + "step": 61, + "step_time": 380.8789173979312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 782.6875, + "completions/mean_terminated_length": 702.25, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5136149525642395, + "epoch": 0.328042328042328, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.0011364103993400931, + "learning_rate": 9.996692270216946e-07, + "loss": -0.0176, + "num_tokens": 1133432.0, + "reward": 0.581250011920929, + "reward_std": 0.37959763407707214, + "rewards/itbench_correctness/mean": 0.581250011920929, + "rewards/itbench_correctness/std": 0.4445503354072571, + "step": 62, + "step_time": 86.81900852825493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 403.1875, + "completions/mean_terminated_length": 403.1875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3745155930519104, + "epoch": 0.3333333333333333, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0078125, + "kl": 0.0013223676942288876, + "learning_rate": 9.996063610703135e-07, + "loss": 0.0032, + "num_tokens": 1148299.0, + "reward": 0.4635416865348816, + "reward_std": 0.06842001527547836, + "rewards/itbench_correctness/mean": 0.4635416865348816, + "rewards/itbench_correctness/std": 0.48778483271598816, + "step": 63, + "step_time": 649.0491365483031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 997.625, + "completions/mean_terminated_length": 883.3333740234375, + "completions/min_length": 725.0, + "completions/min_terminated_length": 725.0, + "entropy": 0.3588522672653198, + "epoch": 0.3386243386243386, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4765625, + "kl": 0.0009991804836317897, + "learning_rate": 9.995380315038117e-07, + "loss": 0.0, + "num_tokens": 1174997.0, + "reward": 0.2083333432674408, + "reward_std": 0.11785111576318741, + "rewards/itbench_correctness/mean": 0.2083333432674408, + "rewards/itbench_correctness/std": 0.2687419056892395, + "step": 64, + "step_time": 110.76476481370628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 454.0625, + "completions/mean_terminated_length": 454.0625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.5461803078651428, + "epoch": 0.3439153439153439, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2421875, + "kl": 0.0015210240380838513, + "learning_rate": 9.994642390694308e-07, + "loss": -0.0216, + "num_tokens": 1187070.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 65, + "step_time": 79.26396809145808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 719.8125, + "completions/mean_terminated_length": 581.5454711914062, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "entropy": 0.5056872367858887, + "epoch": 0.3492063492063492, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.0013450310798361897, + "learning_rate": 9.993849845741523e-07, + "loss": -0.0131, + "num_tokens": 1207347.0, + "reward": 0.375, + "reward_std": 0.5175491571426392, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.5, + "step": 66, + "step_time": 337.5469845244661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 396.625, + "completions/mean_terminated_length": 396.625, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "entropy": 0.35297825932502747, + "epoch": 0.3544973544973545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.0014365667011588812, + "learning_rate": 9.993002688846912e-07, + "loss": 0.0055, + "num_tokens": 1216421.0, + "reward": 0.28125, + "reward_std": 0.3061639666557312, + "rewards/itbench_correctness/mean": 0.28125, + "rewards/itbench_correctness/std": 0.3400367796421051, + "step": 67, + "step_time": 1142.9677757564932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 682.5, + "completions/mean_terminated_length": 603.6923217773438, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4981684982776642, + "epoch": 0.35978835978835977, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0014837104827165604, + "learning_rate": 9.992100929274846e-07, + "loss": -0.0849, + "num_tokens": 1231053.0, + "reward": 0.5208333134651184, + "reward_std": 0.4382143020629883, + "rewards/itbench_correctness/mean": 0.5208333134651184, + "rewards/itbench_correctness/std": 0.4326561689376831, + "step": 68, + "step_time": 479.8328125309199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 817.875, + "completions/mean_terminated_length": 694.2000122070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5012990832328796, + "epoch": 0.36507936507936506, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.001043464639224112, + "learning_rate": 9.991144576886822e-07, + "loss": 0.0323, + "num_tokens": 1249699.0, + "reward": 0.4642857015132904, + "reward_std": 0.22637419402599335, + "rewards/itbench_correctness/mean": 0.4642857015132904, + "rewards/itbench_correctness/std": 0.4928053915500641, + "step": 69, + "step_time": 82.86210318095982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 693.5625, + "completions/mean_terminated_length": 646.357177734375, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.2220419943332672, + "epoch": 0.37037037037037035, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.388671875, + "kl": 0.0010405785869807005, + "learning_rate": 9.990133642141357e-07, + "loss": -0.0727, + "num_tokens": 1271964.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 70, + "step_time": 202.81221913732588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 769.125, + "completions/mean_terminated_length": 710.3077392578125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4264586269855499, + "epoch": 0.37566137566137564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.0012975148856639862, + "learning_rate": 9.989068136093872e-07, + "loss": -0.0651, + "num_tokens": 1293950.0, + "reward": 0.6597222089767456, + "reward_std": 0.4048736095428467, + "rewards/itbench_correctness/mean": 0.6597222089767456, + "rewards/itbench_correctness/std": 0.4526442587375641, + "step": 71, + "step_time": 454.2652143603191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 711.125, + "completions/mean_terminated_length": 690.2667236328125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.28124451637268066, + "epoch": 0.38095238095238093, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.51953125, + "kl": 0.0010576838394626975, + "learning_rate": 9.98794807039657e-07, + "loss": -0.085, + "num_tokens": 1311536.0, + "reward": 0.8125, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.25, + "step": 72, + "step_time": 104.54512037336826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 516.25, + "completions/mean_terminated_length": 516.25, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.46295398473739624, + "epoch": 0.3862433862433862, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1640625, + "kl": 0.0009957690490409732, + "learning_rate": 9.98677345729831e-07, + "loss": -0.0093, + "num_tokens": 1322844.0, + "reward": 0.6812499761581421, + "reward_std": 0.062321171164512634, + "rewards/itbench_correctness/mean": 0.6812499761581421, + "rewards/itbench_correctness/std": 0.3400367796421051, + "step": 73, + "step_time": 635.4008999932557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 674.0, + "completions/mean_terminated_length": 674.0, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.48961424827575684, + "epoch": 0.3915343915343915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0283203125, + "kl": 0.0012715591583400965, + "learning_rate": 9.985544309644473e-07, + "loss": 0.0, + "num_tokens": 1342212.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 1.0, + "rewards/itbench_correctness/std": 0.0, + "step": 74, + "step_time": 107.24268661439419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 889.125, + "completions/mean_terminated_length": 844.1666870117188, + "completions/min_length": 570.0, + "completions/min_terminated_length": 570.0, + "entropy": 0.5128637552261353, + "epoch": 0.3968253968253968, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.640625, + "kl": 0.0014187946217134595, + "learning_rate": 9.98426064087682e-07, + "loss": 0.0213, + "num_tokens": 1371870.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 75, + "step_time": 283.4586225701496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 880.9375, + "completions/mean_terminated_length": 566.2000122070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6129833459854126, + "epoch": 0.4021164021164021, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2421875, + "kl": 0.0010527537669986486, + "learning_rate": 9.982922465033348e-07, + "loss": 0.0007, + "num_tokens": 1399837.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 76, + "step_time": 86.58735218271613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 715.6875, + "completions/mean_terminated_length": 715.6875, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "entropy": 0.3325473666191101, + "epoch": 0.4074074074074074, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2265625, + "kl": 0.0010047402465716004, + "learning_rate": 9.981529796748134e-07, + "loss": 0.0174, + "num_tokens": 1423416.0, + "reward": 0.25833335518836975, + "reward_std": 0.07715167105197906, + "rewards/itbench_correctness/mean": 0.25833335518836975, + "rewards/itbench_correctness/std": 0.19455552101135254, + "step": 77, + "step_time": 97.38318173773587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 376.1875, + "completions/mean_terminated_length": 376.1875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4253198206424713, + "epoch": 0.4126984126984127, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.0014126732712611556, + "learning_rate": 9.980082651251174e-07, + "loss": -0.0444, + "num_tokens": 1432859.0, + "reward": 0.5520833730697632, + "reward_std": 0.29967689514160156, + "rewards/itbench_correctness/mean": 0.5520833730697632, + "rewards/itbench_correctness/std": 0.32185083627700806, + "step": 78, + "step_time": 62.12770148552954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 779.1875, + "completions/mean_terminated_length": 744.2142944335938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4029838740825653, + "epoch": 0.41798941798941797, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1953125, + "kl": 0.0012558053713291883, + "learning_rate": 9.978581044368217e-07, + "loss": -0.0309, + "num_tokens": 1461190.0, + "reward": 0.1354166716337204, + "reward_std": 0.07634378224611282, + "rewards/itbench_correctness/mean": 0.1354166716337204, + "rewards/itbench_correctness/std": 0.17447009682655334, + "step": 79, + "step_time": 79.0433895830065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 858.5625, + "completions/mean_terminated_length": 729.888916015625, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.36106863617897034, + "epoch": 0.42328042328042326, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6171875, + "kl": 0.0013509814161807299, + "learning_rate": 9.977024992520601e-07, + "loss": 0.0063, + "num_tokens": 1483655.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 80, + "step_time": 7292.784606534056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 627.6875, + "completions/mean_terminated_length": 627.6875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5671612024307251, + "epoch": 0.42857142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0016267532482743263, + "learning_rate": 9.975414512725056e-07, + "loss": -0.0913, + "num_tokens": 1504522.0, + "reward": 0.5, + "reward_std": 0.3535533845424652, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 81, + "step_time": 89.92690824903548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 600.375, + "completions/mean_terminated_length": 600.375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.264834463596344, + "epoch": 0.43386243386243384, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0011798108462244272, + "learning_rate": 9.973749622593532e-07, + "loss": -0.0018, + "num_tokens": 1519384.0, + "reward": 0.5625, + "reward_std": 0.1462520956993103, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.19364917278289795, + "step": 82, + "step_time": 92.88886137399822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 491.9375, + "completions/mean_terminated_length": 491.9375, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.35370346903800964, + "epoch": 0.43915343915343913, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1484375, + "kl": 0.001843614736571908, + "learning_rate": 9.972030340333e-07, + "loss": 0.0148, + "num_tokens": 1531063.0, + "reward": 0.3020833134651184, + "reward_std": 0.1386406421661377, + "rewards/itbench_correctness/mean": 0.3020833134651184, + "rewards/itbench_correctness/std": 0.36498987674713135, + "step": 83, + "step_time": 1134.5993446996436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 475.75, + "completions/mean_terminated_length": 475.75, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.41828691959381104, + "epoch": 0.4444444444444444, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.046875, + "kl": 0.001323950826190412, + "learning_rate": 9.970256684745255e-07, + "loss": -0.0128, + "num_tokens": 1542371.0, + "reward": 0.75, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 84, + "step_time": 89.19195851124823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 987.0, + "completions/mean_terminated_length": 876.0, + "completions/min_length": 824.0, + "completions/min_terminated_length": 824.0, + "entropy": 0.3343465030193329, + "epoch": 0.4497354497354497, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.0010419428581371903, + "learning_rate": 9.968428675226713e-07, + "loss": 0.0338, + "num_tokens": 1576531.0, + "reward": 0.6875, + "reward_std": 0.32618680596351624, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.42108768224716187, + "step": 85, + "step_time": 85.11601546406746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 709.0625, + "completions/mean_terminated_length": 520.1000366210938, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "entropy": 0.33565446734428406, + "epoch": 0.455026455026455, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.98828125, + "kl": 0.0012508188374340534, + "learning_rate": 9.966546331768192e-07, + "loss": -0.0029, + "num_tokens": 1595508.0, + "reward": 0.5104166865348816, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.5104166865348816, + "rewards/itbench_correctness/std": 0.2543601393699646, + "step": 86, + "step_time": 110.2943638684228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 485.75, + "completions/mean_terminated_length": 485.75, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.5352547764778137, + "epoch": 0.4603174603174603, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4296875, + "kl": 0.0011995767708867788, + "learning_rate": 9.964609674954695e-07, + "loss": 0.0036, + "num_tokens": 1608696.0, + "reward": 0.3125, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 87, + "step_time": 85.32795084360987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 802.3125, + "completions/mean_terminated_length": 751.1538696289062, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.40632545948028564, + "epoch": 0.4656084656084656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.0009902386227622628, + "learning_rate": 9.962618725965194e-07, + "loss": -0.0316, + "num_tokens": 1627885.0, + "reward": 0.4479166865348816, + "reward_std": 0.3577525019645691, + "rewards/itbench_correctness/mean": 0.4479166865348816, + "rewards/itbench_correctness/std": 0.420399934053421, + "step": 88, + "step_time": 81.01259941980243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 542.0625, + "completions/mean_terminated_length": 542.0625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4243053197860718, + "epoch": 0.4708994708994709, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.0011555891251191497, + "learning_rate": 9.960573506572389e-07, + "loss": -0.0988, + "num_tokens": 1640238.0, + "reward": 0.53515625, + "reward_std": 0.2504205107688904, + "rewards/itbench_correctness/mean": 0.53515625, + "rewards/itbench_correctness/std": 0.43777894973754883, + "step": 89, + "step_time": 97.55466525349766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 807.875, + "completions/mean_terminated_length": 709.6364135742188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.48522359132766724, + "epoch": 0.47619047619047616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.001208159956149757, + "learning_rate": 9.958474039142469e-07, + "loss": -0.1015, + "num_tokens": 1668412.0, + "reward": 0.10625000298023224, + "reward_std": 0.1334051787853241, + "rewards/itbench_correctness/mean": 0.10625000298023224, + "rewards/itbench_correctness/std": 0.16111589968204498, + "step": 90, + "step_time": 459.5639867214486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 543.0625, + "completions/mean_terminated_length": 543.0625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.5929335951805115, + "epoch": 0.48148148148148145, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.0014933362836018205, + "learning_rate": 9.956320346634875e-07, + "loss": -0.0536, + "num_tokens": 1681853.0, + "reward": 0.8125, + "reward_std": 0.32946425676345825, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.3256048858165741, + "step": 91, + "step_time": 78.2018728973344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 682.625, + "completions/mean_terminated_length": 527.45458984375, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "entropy": 0.38381248712539673, + "epoch": 0.48677248677248675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.001028747414238751, + "learning_rate": 9.954112452602043e-07, + "loss": 0.0, + "num_tokens": 1707895.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 92, + "step_time": 160.40463780704886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 984.0625, + "completions/mean_terminated_length": 704.5, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.30079391598701477, + "epoch": 0.49206349206349204, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.890625, + "kl": 0.0009068697690963745, + "learning_rate": 9.95185038118915e-07, + "loss": -0.0136, + "num_tokens": 1733104.0, + "reward": 0.53125, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.53125, + "rewards/itbench_correctness/std": 0.4989572763442993, + "step": 93, + "step_time": 135.90597889758646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 646.875, + "completions/mean_terminated_length": 475.4545593261719, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5750724673271179, + "epoch": 0.4973544973544973, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.001511996379122138, + "learning_rate": 9.949534157133844e-07, + "loss": -0.1351, + "num_tokens": 1762622.0, + "reward": 0.4765625, + "reward_std": 0.32506585121154785, + "rewards/itbench_correctness/mean": 0.4765625, + "rewards/itbench_correctness/std": 0.3958607614040375, + "step": 94, + "step_time": 178.96230245847255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 802.5625, + "completions/mean_terminated_length": 751.4615478515625, + "completions/min_length": 542.0, + "completions/min_terminated_length": 542.0, + "entropy": 0.5806401371955872, + "epoch": 0.5026455026455027, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.0013606835855171084, + "learning_rate": 9.947163805765979e-07, + "loss": 0.0764, + "num_tokens": 1795879.0, + "reward": 0.48124998807907104, + "reward_std": 0.1944543570280075, + "rewards/itbench_correctness/mean": 0.48124998807907104, + "rewards/itbench_correctness/std": 0.47359442710876465, + "step": 95, + "step_time": 182.67914429306984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 667.5, + "completions/mean_terminated_length": 667.5, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "entropy": 0.32958802580833435, + "epoch": 0.5079365079365079, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.0008195637492462993, + "learning_rate": 9.944739353007341e-07, + "loss": 0.0178, + "num_tokens": 1811303.0, + "reward": 0.8374999761581421, + "reward_std": 0.09672200679779053, + "rewards/itbench_correctness/mean": 0.8374999761581421, + "rewards/itbench_correctness/std": 0.1031898632645607, + "step": 96, + "step_time": 74.22002993617207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 519.4375, + "completions/mean_terminated_length": 519.4375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3407532274723053, + "epoch": 0.5132275132275133, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.0011502447305247188, + "learning_rate": 9.942260825371357e-07, + "loss": -0.1158, + "num_tokens": 1824454.0, + "reward": 0.5687500238418579, + "reward_std": 0.23231291770935059, + "rewards/itbench_correctness/mean": 0.5687500238418579, + "rewards/itbench_correctness/std": 0.2676284909248352, + "step": 97, + "step_time": 72.25101596303284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 519.75, + "completions/mean_terminated_length": 519.75, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.49639248847961426, + "epoch": 0.5185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.0014472255716100335, + "learning_rate": 9.939728249962806e-07, + "loss": -0.0098, + "num_tokens": 1844642.0, + "reward": 0.8500000238418579, + "reward_std": 0.2121320366859436, + "rewards/itbench_correctness/mean": 0.8500000238418579, + "rewards/itbench_correctness/std": 0.24765567481517792, + "step": 98, + "step_time": 68.29791031684726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 692.9375, + "completions/mean_terminated_length": 542.45458984375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5685938596725464, + "epoch": 0.5238095238095238, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6171875, + "kl": 0.0014234319096431136, + "learning_rate": 9.937141654477528e-07, + "loss": -0.1176, + "num_tokens": 1866377.0, + "reward": 0.375, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.5, + "step": 99, + "step_time": 99.10520203411579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 337.6875, + "completions/mean_terminated_length": 337.6875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3746066987514496, + "epoch": 0.5291005291005291, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.97265625, + "kl": 0.0013704805169254541, + "learning_rate": 9.934501067202117e-07, + "loss": -0.0118, + "num_tokens": 1874500.0, + "reward": 0.3125, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.3095695972442627, + "step": 100, + "step_time": 831.8933219816536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 611.1875, + "completions/mean_terminated_length": 473.5833435058594, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.5955619215965271, + "epoch": 0.5343915343915344, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0012612693244591355, + "learning_rate": 9.931806517013612e-07, + "loss": 0.0328, + "num_tokens": 1899799.0, + "reward": 0.125, + "reward_std": 0.2925041913986206, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.28867512941360474, + "step": 101, + "step_time": 185.49466035328805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 743.0625, + "completions/mean_terminated_length": 724.3333740234375, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "entropy": 0.4737151861190796, + "epoch": 0.5396825396825397, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.609375, + "kl": 0.0012106020003557205, + "learning_rate": 9.929058033379181e-07, + "loss": 0.0185, + "num_tokens": 1915728.0, + "reward": 0.8194444179534912, + "reward_std": 0.20520132780075073, + "rewards/itbench_correctness/mean": 0.8194444179534912, + "rewards/itbench_correctness/std": 0.3367112874984741, + "step": 102, + "step_time": 418.81876328215003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 761.3125, + "completions/mean_terminated_length": 557.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5017650723457336, + "epoch": 0.544973544973545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.0011779210763052106, + "learning_rate": 9.926255646355803e-07, + "loss": -0.1277, + "num_tokens": 1953421.0, + "reward": 0.2708333432674408, + "reward_std": 0.4082317352294922, + "rewards/itbench_correctness/mean": 0.2708333432674408, + "rewards/itbench_correctness/std": 0.4254627227783203, + "step": 103, + "step_time": 131.8819383457303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 631.1875, + "completions/mean_terminated_length": 605.0000610351562, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.41825923323631287, + "epoch": 0.5502645502645502, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3046875, + "kl": 0.001125396229326725, + "learning_rate": 9.923399386589932e-07, + "loss": 0.0027, + "num_tokens": 1967568.0, + "reward": 0.967524528503418, + "reward_std": 0.0356326624751091, + "rewards/itbench_correctness/mean": 0.967524528503418, + "rewards/itbench_correctness/std": 0.059118952602148056, + "step": 104, + "step_time": 237.89590667374432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 437.25, + "completions/mean_terminated_length": 398.13336181640625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.41623786091804504, + "epoch": 0.5555555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.001291301567107439, + "learning_rate": 9.92048928531717e-07, + "loss": -0.0479, + "num_tokens": 1981084.0, + "reward": 0.46875, + "reward_std": 0.1883128434419632, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.2525334656238556, + "step": 105, + "step_time": 178.8811132274568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 725.9375, + "completions/mean_terminated_length": 427.875, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "entropy": 0.5179509520530701, + "epoch": 0.5608465608465608, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.0009696544148027897, + "learning_rate": 9.917525374361911e-07, + "loss": 0.0018, + "num_tokens": 1999387.0, + "reward": 0.546875, + "reward_std": 0.22097086906433105, + "rewards/itbench_correctness/mean": 0.546875, + "rewards/itbench_correctness/std": 0.5018196105957031, + "step": 106, + "step_time": 493.8660353682935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 911.375, + "completions/mean_terminated_length": 843.7999877929688, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "entropy": 0.34014537930488586, + "epoch": 0.5661375661375662, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4453125, + "kl": 0.001088326214812696, + "learning_rate": 9.914507686137017e-07, + "loss": 0.0167, + "num_tokens": 2022945.0, + "reward": 0.35624998807907104, + "reward_std": 0.11475905776023865, + "rewards/itbench_correctness/mean": 0.35624998807907104, + "rewards/itbench_correctness/std": 0.3999479115009308, + "step": 107, + "step_time": 235.87840359471738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 542.4375, + "completions/mean_terminated_length": 542.4375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.4701002538204193, + "epoch": 0.5714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.0014570436906069517, + "learning_rate": 9.911436253643443e-07, + "loss": 0.0162, + "num_tokens": 2036592.0, + "reward": 0.8567708134651184, + "reward_std": 0.19427995383739471, + "rewards/itbench_correctness/mean": 0.8567708134651184, + "rewards/itbench_correctness/std": 0.24054758250713348, + "step": 108, + "step_time": 129.46329625695944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 1023.4375, + "completions/mean_terminated_length": 1015.0, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "entropy": 0.5901679396629333, + "epoch": 0.5767195767195767, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0010806693462654948, + "learning_rate": 9.90831111046988e-07, + "loss": 0.0009, + "num_tokens": 2060871.0, + "reward": 0.15625, + "reward_std": 0.3198433816432953, + "rewards/itbench_correctness/mean": 0.15625, + "rewards/itbench_correctness/std": 0.3520771861076355, + "step": 109, + "step_time": 73.70483169332147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 689.5, + "completions/mean_terminated_length": 689.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.39158812165260315, + "epoch": 0.582010582010582, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2109375, + "kl": 0.001037560636177659, + "learning_rate": 9.905132290792392e-07, + "loss": -0.0033, + "num_tokens": 2076943.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 110, + "step_time": 73.8764311010018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 513.0, + "completions/mean_terminated_length": 206.40000915527344, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5146198868751526, + "epoch": 0.5873015873015873, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.0013582897372543812, + "learning_rate": 9.901899829374047e-07, + "loss": -0.1464, + "num_tokens": 2089871.0, + "reward": 0.3740079402923584, + "reward_std": 0.34763163328170776, + "rewards/itbench_correctness/mean": 0.3740079402923584, + "rewards/itbench_correctness/std": 0.3568885028362274, + "step": 111, + "step_time": 695.7899582823738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 466.9375, + "completions/mean_terminated_length": 466.9375, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "entropy": 0.3662160336971283, + "epoch": 0.5925925925925926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01226806640625, + "kl": 0.0011427823919802904, + "learning_rate": 9.89861376156452e-07, + "loss": 0.0, + "num_tokens": 2100646.0, + "reward": 0.4166666865348816, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.4166666865348816, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 112, + "step_time": 65.8763862894848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 998.875, + "completions/mean_terminated_length": 823.0, + "completions/min_length": 642.0, + "completions/min_terminated_length": 642.0, + "entropy": 0.3023401200771332, + "epoch": 0.5978835978835979, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3046875, + "kl": 0.0008460183744318783, + "learning_rate": 9.895274123299722e-07, + "loss": 0.0013, + "num_tokens": 2126916.0, + "reward": 0.28125, + "reward_std": 0.2086307406425476, + "rewards/itbench_correctness/mean": 0.28125, + "rewards/itbench_correctness/std": 0.4069705307483673, + "step": 113, + "step_time": 870.3144110767171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 618.875, + "completions/mean_terminated_length": 525.3846435546875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "entropy": 0.47182387113571167, + "epoch": 0.6031746031746031, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4609375, + "kl": 0.0011947272578254342, + "learning_rate": 9.891880951101407e-07, + "loss": -0.0027, + "num_tokens": 2140634.0, + "reward": 0.15416666865348816, + "reward_std": 0.21283237636089325, + "rewards/itbench_correctness/mean": 0.15416666865348816, + "rewards/itbench_correctness/std": 0.3315228819847107, + "step": 114, + "step_time": 111.45921329036355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 831.6875, + "completions/mean_terminated_length": 639.375, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "entropy": 0.3174269199371338, + "epoch": 0.6084656084656085, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.296875, + "kl": 0.0009422831353731453, + "learning_rate": 9.888434282076757e-07, + "loss": 0.0093, + "num_tokens": 2159877.0, + "reward": 0.10546875, + "reward_std": 0.07999982684850693, + "rewards/itbench_correctness/mean": 0.10546875, + "rewards/itbench_correctness/std": 0.1543108969926834, + "step": 115, + "step_time": 162.2415656549856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 833.0, + "completions/mean_terminated_length": 718.4000244140625, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.5162065029144287, + "epoch": 0.6137566137566137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2421875, + "kl": 0.0014166326727718115, + "learning_rate": 9.884934153917996e-07, + "loss": 0.0456, + "num_tokens": 2190885.0, + "reward": 0.21875, + "reward_std": 0.1735912710428238, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.3275540769100189, + "step": 116, + "step_time": 763.6827120250091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 753.6875, + "completions/mean_terminated_length": 591.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.41927191615104675, + "epoch": 0.6190476190476191, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.0011654142290353775, + "learning_rate": 9.881380604901963e-07, + "loss": -0.1407, + "num_tokens": 2212584.0, + "reward": 0.2708333134651184, + "reward_std": 0.3443610668182373, + "rewards/itbench_correctness/mean": 0.2708333134651184, + "rewards/itbench_correctness/std": 0.33471935987472534, + "step": 117, + "step_time": 234.95893322955817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 748.8125, + "completions/mean_terminated_length": 623.727294921875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4727485179901123, + "epoch": 0.6243386243386243, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.0017693624831736088, + "learning_rate": 9.8777736738897e-07, + "loss": -0.09, + "num_tokens": 2236157.0, + "reward": 0.2291666716337204, + "reward_std": 0.3471825420856476, + "rewards/itbench_correctness/mean": 0.2291666716337204, + "rewards/itbench_correctness/std": 0.35420751571655273, + "step": 118, + "step_time": 141.18642224557698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 832.25, + "completions/mean_terminated_length": 683.1111450195312, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "entropy": 0.4133373498916626, + "epoch": 0.6296296296296297, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5, + "kl": 0.0010757006239145994, + "learning_rate": 9.87411340032603e-07, + "loss": 0.0049, + "num_tokens": 2259913.0, + "reward": 0.46875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.4989572763442993, + "step": 119, + "step_time": 577.6952238306403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 707.6875, + "completions/mean_terminated_length": 461.6666564941406, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "entropy": 0.5736995339393616, + "epoch": 0.6349206349206349, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.0011166096664965153, + "learning_rate": 9.870399824239114e-07, + "loss": -0.0078, + "num_tokens": 2278228.0, + "reward": 0.3671875, + "reward_std": 0.2785572409629822, + "rewards/itbench_correctness/mean": 0.3671875, + "rewards/itbench_correctness/std": 0.2793920040130615, + "step": 120, + "step_time": 203.33785133063793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 780.75, + "completions/mean_terminated_length": 670.1818237304688, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "entropy": 0.40473902225494385, + "epoch": 0.6402116402116402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.0012949309311807156, + "learning_rate": 9.866632986240029e-07, + "loss": 0.0027, + "num_tokens": 2296336.0, + "reward": 0.4776785671710968, + "reward_std": 0.2322283834218979, + "rewards/itbench_correctness/mean": 0.4776785671710968, + "rewards/itbench_correctness/std": 0.4821428656578064, + "step": 121, + "step_time": 101.13796862587333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 618.75, + "completions/mean_terminated_length": 618.75, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 0.5333333611488342, + "epoch": 0.6455026455026455, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1328125, + "kl": 0.0013620926765725017, + "learning_rate": 9.862812927522308e-07, + "loss": 0.0167, + "num_tokens": 2314388.0, + "reward": 0.6145833134651184, + "reward_std": 0.043129097670316696, + "rewards/itbench_correctness/mean": 0.6145833134651184, + "rewards/itbench_correctness/std": 0.40239447355270386, + "step": 122, + "step_time": 715.118090393953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 589.375, + "completions/mean_terminated_length": 444.5, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 0.38006362318992615, + "epoch": 0.6507936507936508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0010308035416528583, + "learning_rate": 9.858939689861506e-07, + "loss": 0.0628, + "num_tokens": 2330282.0, + "reward": 0.5416666865348816, + "reward_std": 0.17097428441047668, + "rewards/itbench_correctness/mean": 0.5416666865348816, + "rewards/itbench_correctness/std": 0.197202667593956, + "step": 123, + "step_time": 104.44047453720123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 700.125, + "completions/mean_terminated_length": 376.25, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.43992143869400024, + "epoch": 0.656084656084656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0233154296875, + "kl": 0.0014201418962329626, + "learning_rate": 9.855013315614725e-07, + "loss": 0.0, + "num_tokens": 2353412.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 124, + "step_time": 91.80700621567667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 658.0, + "completions/mean_terminated_length": 658.0, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.3844984769821167, + "epoch": 0.6613756613756614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.0017445363337174058, + "learning_rate": 9.851033847720164e-07, + "loss": 0.0, + "num_tokens": 2368164.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 125, + "step_time": 84.32240361534059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 700.6875, + "completions/mean_terminated_length": 449.22222900390625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.6279546618461609, + "epoch": 0.6666666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.001191388233564794, + "learning_rate": 9.847001329696652e-07, + "loss": -0.0546, + "num_tokens": 2386335.0, + "reward": 0.41874998807907104, + "reward_std": 0.2509503960609436, + "rewards/itbench_correctness/mean": 0.41874998807907104, + "rewards/itbench_correctness/std": 0.3046172559261322, + "step": 126, + "step_time": 192.25429659802467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 673.125, + "completions/mean_terminated_length": 400.22222900390625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.6596100330352783, + "epoch": 0.671957671957672, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.0013828243827447295, + "learning_rate": 9.842915805643156e-07, + "loss": -0.0019, + "num_tokens": 2410073.0, + "reward": 0.453125, + "reward_std": 0.13258251547813416, + "rewards/itbench_correctness/mean": 0.453125, + "rewards/itbench_correctness/std": 0.5018196105957031, + "step": 127, + "step_time": 370.60414741840214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 675.125, + "completions/mean_terminated_length": 465.8000183105469, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6161822080612183, + "epoch": 0.6772486772486772, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0390625, + "kl": 0.0011551063507795334, + "learning_rate": 9.838777320238312e-07, + "loss": -0.0151, + "num_tokens": 2430699.0, + "reward": 0.34375, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.34375, + "rewards/itbench_correctness/std": 0.3966001570224762, + "step": 128, + "step_time": 101.63996140938252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 655.9375, + "completions/mean_terminated_length": 655.9375, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.27136731147766113, + "epoch": 0.6825396825396826, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4765625, + "kl": 0.0009132423438131809, + "learning_rate": 9.834585918739934e-07, + "loss": 0.0035, + "num_tokens": 2448146.0, + "reward": 0.34375, + "reward_std": 0.0578637570142746, + "rewards/itbench_correctness/mean": 0.34375, + "rewards/itbench_correctness/std": 0.36371922492980957, + "step": 129, + "step_time": 926.4854553686455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 507.75, + "completions/mean_terminated_length": 507.75, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.486459881067276, + "epoch": 0.6878306878306878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.0013310650829225779, + "learning_rate": 9.83034164698452e-07, + "loss": -0.08, + "num_tokens": 2459726.0, + "reward": 0.8690476417541504, + "reward_std": 0.28752756118774414, + "rewards/itbench_correctness/mean": 0.8690476417541504, + "rewards/itbench_correctness/std": 0.2865068316459656, + "step": 130, + "step_time": 497.3244105326012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 458.625, + "completions/mean_terminated_length": 458.625, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 0.40119925141334534, + "epoch": 0.6931216931216931, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.328125, + "kl": 0.0013875841395929456, + "learning_rate": 9.826044551386742e-07, + "loss": 0.0024, + "num_tokens": 2469992.0, + "reward": 0.4791666865348816, + "reward_std": 0.19795583188533783, + "rewards/itbench_correctness/mean": 0.4791666865348816, + "rewards/itbench_correctness/std": 0.27131369709968567, + "step": 131, + "step_time": 64.11436599586159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 632.5625, + "completions/mean_terminated_length": 606.4666748046875, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "entropy": 0.3730856776237488, + "epoch": 0.6984126984126984, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0234375, + "kl": 0.0012992192059755325, + "learning_rate": 9.821694678938952e-07, + "loss": -0.0026, + "num_tokens": 2484161.0, + "reward": 0.9255682229995728, + "reward_std": 0.17330622673034668, + "rewards/itbench_correctness/mean": 0.9255682229995728, + "rewards/itbench_correctness/std": 0.24894750118255615, + "step": 132, + "step_time": 782.2131289467216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 779.25, + "completions/mean_terminated_length": 762.933349609375, + "completions/min_length": 639.0, + "completions/min_terminated_length": 639.0, + "entropy": 0.6185434460639954, + "epoch": 0.7037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.0011128331534564495, + "learning_rate": 9.817292077210656e-07, + "loss": 0.0277, + "num_tokens": 2503445.0, + "reward": 0.59375, + "reward_std": 0.3061639666557312, + "rewards/itbench_correctness/mean": 0.59375, + "rewards/itbench_correctness/std": 0.41708314418792725, + "step": 133, + "step_time": 234.19261386059225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 657.4375, + "completions/mean_terminated_length": 535.25, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 0.31942200660705566, + "epoch": 0.708994708994709, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.109375, + "kl": 0.0009678892092779279, + "learning_rate": 9.812836794348002e-07, + "loss": 0.0316, + "num_tokens": 2520980.0, + "reward": 0.78125, + "reward_std": 0.1085391715168953, + "rewards/itbench_correctness/mean": 0.78125, + "rewards/itbench_correctness/std": 0.27024510502815247, + "step": 134, + "step_time": 130.00603658426553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 696.0, + "completions/mean_terminated_length": 696.0, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "entropy": 0.36063218116760254, + "epoch": 0.7142857142857143, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1015625, + "kl": 0.0012922929599881172, + "learning_rate": 9.808328879073251e-07, + "loss": -0.024, + "num_tokens": 2537100.0, + "reward": 0.6875, + "reward_std": 0.0862581878900528, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.3435921370983124, + "step": 135, + "step_time": 191.46370885893703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 310.375, + "completions/mean_terminated_length": 310.375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.3898509740829468, + "epoch": 0.7195767195767195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.0024722313974052668, + "learning_rate": 9.803768380684242e-07, + "loss": -0.0114, + "num_tokens": 2544442.0, + "reward": 0.21875, + "reward_std": 0.3061639666557312, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 136, + "step_time": 65.17159292474389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 960.5625, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.4580649435520172, + "epoch": 0.7248677248677249, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.0012847312027588487, + "learning_rate": 9.79915534905385e-07, + "loss": -0.0218, + "num_tokens": 2571915.0, + "reward": 0.3541666865348816, + "reward_std": 0.349293053150177, + "rewards/itbench_correctness/mean": 0.3541666865348816, + "rewards/itbench_correctness/std": 0.4121982753276825, + "step": 137, + "step_time": 95.23527884297073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 655.5, + "completions/mean_terminated_length": 630.933349609375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5095347166061401, + "epoch": 0.7301587301587301, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2109375, + "kl": 0.0019625271670520306, + "learning_rate": 9.794489834629454e-07, + "loss": 0.0004, + "num_tokens": 2596083.0, + "reward": 0.296875, + "reward_std": 0.24944134056568146, + "rewards/itbench_correctness/mean": 0.296875, + "rewards/itbench_correctness/std": 0.4584280252456665, + "step": 138, + "step_time": 73.28423386160284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 568.3125, + "completions/mean_terminated_length": 568.3125, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.47509074211120605, + "epoch": 0.7354497354497355, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.001536766067147255, + "learning_rate": 9.789771888432373e-07, + "loss": 0.0225, + "num_tokens": 2617728.0, + "reward": 0.5104166865348816, + "reward_std": 0.43504026532173157, + "rewards/itbench_correctness/mean": 0.5104166865348816, + "rewards/itbench_correctness/std": 0.43127182126045227, + "step": 139, + "step_time": 116.2228917106986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 554.75, + "completions/mean_terminated_length": 487.71429443359375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.3929698169231415, + "epoch": 0.7407407407407407, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1875, + "kl": 0.001249143504537642, + "learning_rate": 9.78500156205731e-07, + "loss": -0.0021, + "num_tokens": 2630956.0, + "reward": 0.19062501192092896, + "reward_std": 0.0265165064483881, + "rewards/itbench_correctness/mean": 0.19062501192092896, + "rewards/itbench_correctness/std": 0.2001822143793106, + "step": 140, + "step_time": 416.3123774584383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 462.125, + "completions/mean_terminated_length": 462.125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.36353799700737, + "epoch": 0.746031746031746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.125, + "kl": 0.0016349649522453547, + "learning_rate": 9.780178907671788e-07, + "loss": 0.0084, + "num_tokens": 2641358.0, + "reward": 0.375, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.5, + "step": 141, + "step_time": 87.9296273579821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 552.0, + "completions/mean_terminated_length": 552.0, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "entropy": 0.5036231875419617, + "epoch": 0.7513227513227513, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.0012071933597326279, + "learning_rate": 9.775303978015585e-07, + "loss": -0.0368, + "num_tokens": 2652918.0, + "reward": 0.65625, + "reward_std": 0.4532671868801117, + "rewards/itbench_correctness/mean": 0.65625, + "rewards/itbench_correctness/std": 0.4732423722743988, + "step": 142, + "step_time": 125.74961478449404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 690.5, + "completions/mean_terminated_length": 490.3999938964844, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.483707457780838, + "epoch": 0.7566137566137566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.0013088560663163662, + "learning_rate": 9.77037682640015e-07, + "loss": -0.1116, + "num_tokens": 2668606.0, + "reward": 0.5104166269302368, + "reward_std": 0.39774924516677856, + "rewards/itbench_correctness/mean": 0.5104166269302368, + "rewards/itbench_correctness/std": 0.4732423722743988, + "step": 143, + "step_time": 81.490906807594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 869.1875, + "completions/mean_terminated_length": 817.5833740234375, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.36585891246795654, + "epoch": 0.7619047619047619, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.375, + "kl": 0.001102715963497758, + "learning_rate": 9.76539750670802e-07, + "loss": 0.0248, + "num_tokens": 2688489.0, + "reward": 0.29411765933036804, + "reward_std": 0.1618601679801941, + "rewards/itbench_correctness/mean": 0.29411765933036804, + "rewards/itbench_correctness/std": 0.3757345974445343, + "step": 144, + "step_time": 625.0967052578926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 753.125, + "completions/mean_terminated_length": 714.4285888671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6479668021202087, + "epoch": 0.7671957671957672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.0020443897228688, + "learning_rate": 9.760366073392244e-07, + "loss": -0.1502, + "num_tokens": 2719323.0, + "reward": 0.6875, + "reward_std": 0.44403791427612305, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 145, + "step_time": 126.37558931391686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 751.5, + "completions/mean_terminated_length": 751.5, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.39387890696525574, + "epoch": 0.7724867724867724, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.0012558232992887497, + "learning_rate": 9.755282581475767e-07, + "loss": -0.0169, + "num_tokens": 2736931.0, + "reward": 0.84375, + "reward_std": 0.32239729166030884, + "rewards/itbench_correctness/mean": 0.84375, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 146, + "step_time": 86.20990402065217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 764.1875, + "completions/mean_terminated_length": 677.5833740234375, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.5051116347312927, + "epoch": 0.7777777777777778, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1015625, + "kl": 0.0011690112296491861, + "learning_rate": 9.750147086550842e-07, + "loss": 0.0162, + "num_tokens": 2773926.0, + "reward": 0.4734848737716675, + "reward_std": 0.05882110819220543, + "rewards/itbench_correctness/mean": 0.4734848737716675, + "rewards/itbench_correctness/std": 0.4955727159976959, + "step": 147, + "step_time": 137.13953017815948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 518.0, + "completions/mean_terminated_length": 518.0, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.3146718144416809, + "epoch": 0.783068783068783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028564453125, + "kl": 0.001314603490754962, + "learning_rate": 9.744959644778421e-07, + "loss": 0.0, + "num_tokens": 2787054.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 148, + "step_time": 1022.448972039856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 704.8125, + "completions/mean_terminated_length": 513.2999877929688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4767225384712219, + "epoch": 0.7883597883597884, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3984375, + "kl": 0.0012462595477700233, + "learning_rate": 9.739720312887533e-07, + "loss": -0.0812, + "num_tokens": 2813323.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 149, + "step_time": 102.6073711141944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 666.75, + "completions/mean_terminated_length": 584.3077392578125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4049493670463562, + "epoch": 0.7936507936507936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.0011266146320849657, + "learning_rate": 9.734429148174674e-07, + "loss": -0.0562, + "num_tokens": 2830087.0, + "reward": 0.453125, + "reward_std": 0.15026018023490906, + "rewards/itbench_correctness/mean": 0.453125, + "rewards/itbench_correctness/std": 0.413710355758667, + "step": 150, + "step_time": 72.91534078493714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 764.9375, + "completions/mean_terminated_length": 609.5, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.3294386863708496, + "epoch": 0.798941798941799, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2734375, + "kl": 0.0012626381358131766, + "learning_rate": 9.729086208503173e-07, + "loss": -0.0019, + "num_tokens": 2847998.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 151, + "step_time": 135.860564914532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 789.375, + "completions/mean_terminated_length": 755.857177734375, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.6638163328170776, + "epoch": 0.8042328042328042, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4140625, + "kl": 0.0012647550320252776, + "learning_rate": 9.723691552302562e-07, + "loss": 0.006, + "num_tokens": 2892140.0, + "reward": 0.375, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.5, + "step": 152, + "step_time": 128.31029498856515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 438.8125, + "completions/mean_terminated_length": 438.8125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.35778379440307617, + "epoch": 0.8095238095238095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.0011610172223299742, + "learning_rate": 9.718245238567938e-07, + "loss": -0.0117, + "num_tokens": 2901465.0, + "reward": 0.5062500238418579, + "reward_std": 0.1627907156944275, + "rewards/itbench_correctness/mean": 0.5062500238418579, + "rewards/itbench_correctness/std": 0.17308476567268372, + "step": 153, + "step_time": 53.513846694491804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 529.3125, + "completions/mean_terminated_length": 529.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.44775062799453735, + "epoch": 0.8148148148148148, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.0016965508693829179, + "learning_rate": 9.712747326859315e-07, + "loss": 0.0038, + "num_tokens": 2931910.0, + "reward": 0.40625, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.4552929699420929, + "step": 154, + "step_time": 79.1174840349704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 713.0, + "completions/mean_terminated_length": 609.3333740234375, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.41234222054481506, + "epoch": 0.8201058201058201, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.001036427216604352, + "learning_rate": 9.707197877300973e-07, + "loss": -0.0351, + "num_tokens": 2949046.0, + "reward": 0.47181373834609985, + "reward_std": 0.2768261134624481, + "rewards/itbench_correctness/mean": 0.47181373834609985, + "rewards/itbench_correctness/std": 0.45311903953552246, + "step": 155, + "step_time": 1143.2126589166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 572.9375, + "completions/mean_terminated_length": 572.9375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.4380931556224823, + "epoch": 0.8253968253968254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.28125, + "kl": 0.0014040054520592093, + "learning_rate": 9.701596950580807e-07, + "loss": 0.0116, + "num_tokens": 2961597.0, + "reward": 0.953125, + "reward_std": 0.13258251547813416, + "rewards/itbench_correctness/mean": 0.953125, + "rewards/itbench_correctness/std": 0.1875, + "step": 156, + "step_time": 101.3859726889059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 650.625, + "completions/mean_terminated_length": 650.625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.40268972516059875, + "epoch": 0.8306878306878307, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.375, + "kl": 0.0015219611814245582, + "learning_rate": 9.695944607949648e-07, + "loss": -0.0258, + "num_tokens": 2981207.0, + "reward": 0.78125, + "reward_std": 0.2086307406425476, + "rewards/itbench_correctness/mean": 0.78125, + "rewards/itbench_correctness/std": 0.36371922492980957, + "step": 157, + "step_time": 316.9267311077565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 514.25, + "completions/mean_terminated_length": 514.25, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5833738446235657, + "epoch": 0.8359788359788359, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.80078125, + "kl": 0.0016437954036518931, + "learning_rate": 9.690240911220617e-07, + "loss": -0.0919, + "num_tokens": 2994235.0, + "reward": 0.84375, + "reward_std": 0.15866193175315857, + "rewards/itbench_correctness/mean": 0.84375, + "rewards/itbench_correctness/std": 0.27024510502815247, + "step": 158, + "step_time": 80.80979425925761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 591.0, + "completions/mean_terminated_length": 591.0, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "entropy": 0.46023687720298767, + "epoch": 0.8412698412698413, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.34375, + "kl": 0.001493943389505148, + "learning_rate": 9.684485922768421e-07, + "loss": -0.0018, + "num_tokens": 3009803.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 159, + "step_time": 92.85486916080117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 839.375, + "completions/mean_terminated_length": 654.75, + "completions/min_length": 547.0, + "completions/min_terminated_length": 547.0, + "entropy": 0.3979151248931885, + "epoch": 0.8465608465608465, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.0009861888829618692, + "learning_rate": 9.678679705528698e-07, + "loss": 0.0391, + "num_tokens": 3033361.0, + "reward": 0.4520833492279053, + "reward_std": 0.2401251643896103, + "rewards/itbench_correctness/mean": 0.4520833492279053, + "rewards/itbench_correctness/std": 0.3798574209213257, + "step": 160, + "step_time": 113.24223164469004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 866.9375, + "completions/mean_terminated_length": 772.7000122070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4936918616294861, + "epoch": 0.8518518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0016467805253341794, + "learning_rate": 9.672822322997304e-07, + "loss": -0.0031, + "num_tokens": 3052032.0, + "reward": 0.535937488079071, + "reward_std": 0.39822056889533997, + "rewards/itbench_correctness/mean": 0.535937488079071, + "rewards/itbench_correctness/std": 0.4591630697250366, + "step": 161, + "step_time": 73.70399552583694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 524.125, + "completions/mean_terminated_length": 524.125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "entropy": 0.503696620464325, + "epoch": 0.8571428571428571, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1015625, + "kl": 0.0013778236461803317, + "learning_rate": 9.666913839229637e-07, + "loss": -0.0048, + "num_tokens": 3063106.0, + "reward": 0.5, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.3651483952999115, + "step": 162, + "step_time": 143.48580626491457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 587.75, + "completions/mean_terminated_length": 587.75, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.3862186372280121, + "epoch": 0.8624338624338624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0010811445536091924, + "learning_rate": 9.660954318839932e-07, + "loss": 0.0044, + "num_tokens": 3076070.0, + "reward": 0.643750011920929, + "reward_std": 0.20177768170833588, + "rewards/itbench_correctness/mean": 0.643750011920929, + "rewards/itbench_correctness/std": 0.36142081022262573, + "step": 163, + "step_time": 79.29910835064948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 771.5625, + "completions/mean_terminated_length": 620.1000366210938, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 0.6091535091400146, + "epoch": 0.8677248677248677, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.53125, + "kl": 0.0012378692626953125, + "learning_rate": 9.654943827000546e-07, + "loss": 0.0099, + "num_tokens": 3094839.0, + "reward": 0.609375, + "reward_std": 0.1043153703212738, + "rewards/itbench_correctness/mean": 0.609375, + "rewards/itbench_correctness/std": 0.4278702139854431, + "step": 164, + "step_time": 97.59095096122473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 990.5625, + "completions/mean_terminated_length": 934.8333740234375, + "completions/min_length": 858.0, + "completions/min_terminated_length": 858.0, + "entropy": 0.5027446746826172, + "epoch": 0.873015873015873, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.390625, + "kl": 0.0011952045606449246, + "learning_rate": 9.648882429441256e-07, + "loss": 0.0129, + "num_tokens": 3138016.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 165, + "step_time": 121.71588209550828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 771.75, + "completions/mean_terminated_length": 771.75, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.39909297227859497, + "epoch": 0.8783068783068783, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.46875, + "kl": 0.0013839628081768751, + "learning_rate": 9.642770192448535e-07, + "loss": 0.0037, + "num_tokens": 3161268.0, + "reward": 0.47187501192092896, + "reward_std": 0.08010874688625336, + "rewards/itbench_correctness/mean": 0.47187501192092896, + "rewards/itbench_correctness/std": 0.3993614614009857, + "step": 166, + "step_time": 104.31897877063602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 805.625, + "completions/mean_terminated_length": 755.2307739257812, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "entropy": 0.5784329175949097, + "epoch": 0.8835978835978836, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.00185630121268332, + "learning_rate": 9.636607182864826e-07, + "loss": -0.0227, + "num_tokens": 3196606.0, + "reward": 0.25, + "reward_std": 0.4355512857437134, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 167, + "step_time": 113.68263853341341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 573.625, + "completions/mean_terminated_length": 573.625, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.4166485071182251, + "epoch": 0.8888888888888888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.0012832505162805319, + "learning_rate": 9.630393468087817e-07, + "loss": -0.0249, + "num_tokens": 3209872.0, + "reward": 0.2291666716337204, + "reward_std": 0.14026343822479248, + "rewards/itbench_correctness/mean": 0.2291666716337204, + "rewards/itbench_correctness/std": 0.1787301003932953, + "step": 168, + "step_time": 417.7054488658905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 928.125, + "completions/mean_terminated_length": 804.857177734375, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 0.5990572571754456, + "epoch": 0.8941798941798942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03857421875, + "kl": 0.0013898048782721162, + "learning_rate": 9.624129116069694e-07, + "loss": 0.0001, + "num_tokens": 3258930.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 169, + "step_time": 225.11859526112676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 959.5, + "completions/mean_terminated_length": 876.5714721679688, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 0.4815007746219635, + "epoch": 0.8994708994708994, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3515625, + "kl": 0.001060598180629313, + "learning_rate": 9.61781419531641e-07, + "loss": 0.0041, + "num_tokens": 3282762.0, + "reward": 0.625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.4564354717731476, + "step": 170, + "step_time": 735.476375034079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 735.125, + "completions/mean_terminated_length": 446.25, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.5223601460456848, + "epoch": 0.9047619047619048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.0013405587524175644, + "learning_rate": 9.611448774886923e-07, + "loss": 0.0105, + "num_tokens": 3301500.0, + "reward": 0.6875, + "reward_std": 0.22201895713806152, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.33850160241127014, + "step": 171, + "step_time": 763.4565976867452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 626.875, + "completions/mean_terminated_length": 626.875, + "completions/min_length": 567.0, + "completions/min_terminated_length": 567.0, + "entropy": 0.4658025801181793, + "epoch": 0.91005291005291, + "frac_reward_zero_std": 0.0, + "grad_norm": 35.0, + "kl": 0.0011788216652348638, + "learning_rate": 9.605032924392455e-07, + "loss": -0.0153, + "num_tokens": 3315410.0, + "reward": 0.7395833134651184, + "reward_std": 0.16796313226222992, + "rewards/itbench_correctness/mean": 0.7395833134651184, + "rewards/itbench_correctness/std": 0.19924628734588623, + "step": 172, + "step_time": 103.72399638220668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 573.3125, + "completions/mean_terminated_length": 573.3125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.3575711250305176, + "epoch": 0.9153439153439153, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2734375, + "kl": 0.0011289599351584911, + "learning_rate": 9.598566713995717e-07, + "loss": 0.0046, + "num_tokens": 3328047.0, + "reward": 0.3333333432674408, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.3333333432674408, + "rewards/itbench_correctness/std": 0.4036867320537567, + "step": 173, + "step_time": 597.9741206569597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 885.25, + "completions/mean_terminated_length": 802.0, + "completions/min_length": 537.0, + "completions/min_terminated_length": 537.0, + "entropy": 0.4992939829826355, + "epoch": 0.9206349206349206, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.484375, + "kl": 0.0012533684493973851, + "learning_rate": 9.59205021441015e-07, + "loss": 0.0048, + "num_tokens": 3350707.0, + "reward": 0.03750000149011612, + "reward_std": 0.1060660183429718, + "rewards/itbench_correctness/mean": 0.03750000149011612, + "rewards/itbench_correctness/std": 0.15000000596046448, + "step": 174, + "step_time": 158.50677568931133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 715.8125, + "completions/mean_terminated_length": 476.1111145019531, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.5113070607185364, + "epoch": 0.9259259259259259, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.59375, + "kl": 0.0011880681850016117, + "learning_rate": 9.585483496899149e-07, + "loss": -0.0041, + "num_tokens": 3367576.0, + "reward": 0.6875, + "reward_std": 0.20044593513011932, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.42328083515167236, + "step": 175, + "step_time": 881.939713913016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 832.5625, + "completions/mean_terminated_length": 411.3999938964844, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.5116732716560364, + "epoch": 0.9312169312169312, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.0010343582835048437, + "learning_rate": 9.578866633275286e-07, + "loss": 0.0285, + "num_tokens": 3392569.0, + "reward": 0.5052083730697632, + "reward_std": 0.1857735514640808, + "rewards/itbench_correctness/mean": 0.5052083730697632, + "rewards/itbench_correctness/std": 0.2930029034614563, + "step": 176, + "step_time": 269.2045645285398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 568.8125, + "completions/mean_terminated_length": 568.8125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.5203824043273926, + "epoch": 0.9365079365079365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017578125, + "kl": 0.0011270169634371996, + "learning_rate": 9.572199695899521e-07, + "loss": 0.0, + "num_tokens": 3405782.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 177, + "step_time": 226.17386937886477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 958.875, + "completions/mean_terminated_length": 850.3333740234375, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "entropy": 0.39004039764404297, + "epoch": 0.9417989417989417, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.0013279912527650595, + "learning_rate": 9.565482757680414e-07, + "loss": -0.0199, + "num_tokens": 3432116.0, + "reward": 0.625, + "reward_std": 0.28324785828590393, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.3626037836074829, + "step": 178, + "step_time": 150.1005060262978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 558.1875, + "completions/mean_terminated_length": 491.64288330078125, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.4765423834323883, + "epoch": 0.9470899470899471, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.296875, + "kl": 0.0016712526557967067, + "learning_rate": 9.558715892073323e-07, + "loss": 0.0807, + "num_tokens": 3467055.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 179, + "step_time": 91.84085294324905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 847.5625, + "completions/mean_terminated_length": 822.357177734375, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "entropy": 0.3374382555484772, + "epoch": 0.9523809523809523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.002606587251648307, + "learning_rate": 9.551899173079606e-07, + "loss": 0.0001, + "num_tokens": 3486896.0, + "reward": 0.4375, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.4518480598926544, + "step": 180, + "step_time": 250.4422083152458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 560.0, + "completions/mean_terminated_length": 529.0667114257812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5392857193946838, + "epoch": 0.9576719576719577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0206298828125, + "kl": 0.0013467188691720366, + "learning_rate": 9.545032675245813e-07, + "loss": 0.0, + "num_tokens": 3501360.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 181, + "step_time": 231.51458043325692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 722.3125, + "completions/mean_terminated_length": 702.2000122070312, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "entropy": 0.41533270478248596, + "epoch": 0.9629629629629629, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0013854112476110458, + "learning_rate": 9.538116473662861e-07, + "loss": -0.0126, + "num_tokens": 3528605.0, + "reward": 0.765625, + "reward_std": 0.4136722683906555, + "rewards/itbench_correctness/mean": 0.765625, + "rewards/itbench_correctness/std": 0.40278977155685425, + "step": 182, + "step_time": 96.76111165247858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 632.6875, + "completions/mean_terminated_length": 576.7857666015625, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "entropy": 0.46152326464653015, + "epoch": 0.9682539682539683, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0078125, + "kl": 0.001549158594571054, + "learning_rate": 9.531150643965222e-07, + "loss": 0.005, + "num_tokens": 3549936.0, + "reward": 0.3125, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 183, + "step_time": 141.31063493527472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 763.9375, + "completions/mean_terminated_length": 607.9000244140625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 0.3272519111633301, + "epoch": 0.9735449735449735, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1171875, + "kl": 0.0013428620295599103, + "learning_rate": 9.524135262330098e-07, + "loss": -0.0182, + "num_tokens": 3567807.0, + "reward": 0.3645833432674408, + "reward_std": 0.01928791031241417, + "rewards/itbench_correctness/mean": 0.3645833432674408, + "rewards/itbench_correctness/std": 0.3774610757827759, + "step": 184, + "step_time": 143.63786490540951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 427.9375, + "completions/mean_terminated_length": 427.9375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.4136117994785309, + "epoch": 0.9788359788359788, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.359375, + "kl": 0.0012134211137890816, + "learning_rate": 9.517070405476574e-07, + "loss": -0.0009, + "num_tokens": 3577486.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 185, + "step_time": 170.13555748201907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 573.375, + "completions/mean_terminated_length": 543.3333740234375, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.2842816710472107, + "epoch": 0.9841269841269841, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.0015454581007361412, + "learning_rate": 9.509956150664795e-07, + "loss": -0.0815, + "num_tokens": 3591212.0, + "reward": 0.3984375, + "reward_std": 0.28348496556282043, + "rewards/itbench_correctness/mean": 0.3984375, + "rewards/itbench_correctness/std": 0.2954002320766449, + "step": 186, + "step_time": 82.30455144122243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 573.4375, + "completions/mean_terminated_length": 573.4375, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "entropy": 0.5754768252372742, + "epoch": 0.9894179894179894, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0546875, + "kl": 0.002303932560607791, + "learning_rate": 9.502792575695111e-07, + "loss": 0.0049, + "num_tokens": 3614019.0, + "reward": 0.40625, + "reward_std": 0.1735912710428238, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.48196646571159363, + "step": 187, + "step_time": 89.72001887392253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 801.25, + "completions/mean_terminated_length": 667.6000366210938, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "entropy": 0.44430577754974365, + "epoch": 0.9947089947089947, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.0012758101802319288, + "learning_rate": 9.495579758907229e-07, + "loss": 0.0478, + "num_tokens": 3631471.0, + "reward": 0.4765625, + "reward_std": 0.23403453826904297, + "rewards/itbench_correctness/mean": 0.4765625, + "rewards/itbench_correctness/std": 0.4835174083709717, + "step": 188, + "step_time": 79.22767079528421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 716.1875, + "completions/mean_terminated_length": 613.5833740234375, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "entropy": 0.6143642663955688, + "epoch": 1.0, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5234375, + "kl": 0.001404265291057527, + "learning_rate": 9.488317779179361e-07, + "loss": 0.0008, + "num_tokens": 3658762.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 189, + "step_time": 153.7122633298859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 575.1875, + "completions/mean_terminated_length": 575.1875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.46245789527893066, + "epoch": 1.0052910052910053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.001671693054959178, + "learning_rate": 9.481006715927351e-07, + "loss": -0.0487, + "num_tokens": 3673277.0, + "reward": 0.6145833730697632, + "reward_std": 0.2882373631000519, + "rewards/itbench_correctness/mean": 0.6145833730697632, + "rewards/itbench_correctness/std": 0.43341347575187683, + "step": 190, + "step_time": 71.11040670704097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 503.75, + "completions/mean_terminated_length": 503.75, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.35334986448287964, + "epoch": 1.0105820105820107, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.0010900463676080108, + "learning_rate": 9.473646649103817e-07, + "loss": 0.012, + "num_tokens": 3684537.0, + "reward": 0.875, + "reward_std": 0.16866441071033478, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.18257419764995575, + "step": 191, + "step_time": 796.484293489717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 665.6875, + "completions/mean_terminated_length": 502.8182067871094, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.43263542652130127, + "epoch": 1.0158730158730158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.0013347232015803456, + "learning_rate": 9.466237659197269e-07, + "loss": -0.1131, + "num_tokens": 3704212.0, + "reward": 0.5625, + "reward_std": 0.3535533845424652, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.4699290990829468, + "step": 192, + "step_time": 630.9985243473202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 414.125, + "completions/mean_terminated_length": 414.125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3936009705066681, + "epoch": 1.0211640211640212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.0016619176603853703, + "learning_rate": 9.458779827231236e-07, + "loss": -0.0404, + "num_tokens": 3713654.0, + "reward": 0.65625, + "reward_std": 0.1735912710428238, + "rewards/itbench_correctness/mean": 0.65625, + "rewards/itbench_correctness/std": 0.3400367796421051, + "step": 193, + "step_time": 692.6463372064754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 944.5625, + "completions/mean_terminated_length": 600.3333740234375, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "entropy": 0.44041553139686584, + "epoch": 1.0264550264550265, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.484375, + "kl": 0.0013270394410938025, + "learning_rate": 9.451273234763371e-07, + "loss": 0.0, + "num_tokens": 3736343.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 194, + "step_time": 4224.783679332584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 861.5, + "completions/mean_terminated_length": 374.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.5571677088737488, + "epoch": 1.0317460317460316, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.0012215389870107174, + "learning_rate": 9.443717963884568e-07, + "loss": -0.0105, + "num_tokens": 3761855.0, + "reward": 0.375, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 195, + "step_time": 750.6342479139566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 776.3125, + "completions/mean_terminated_length": 740.9285888671875, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 0.3168826997280121, + "epoch": 1.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.0012352862395346165, + "learning_rate": 9.436114097218058e-07, + "loss": 0.0208, + "num_tokens": 3779892.0, + "reward": 0.578125, + "reward_std": 0.25282490253448486, + "rewards/itbench_correctness/mean": 0.578125, + "rewards/itbench_correctness/std": 0.32556042075157166, + "step": 196, + "step_time": 167.4240329694003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 712.4375, + "completions/mean_terminated_length": 525.5, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.30879902839660645, + "epoch": 1.0423280423280423, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.421875, + "kl": 0.001269629574380815, + "learning_rate": 9.42846171791851e-07, + "loss": -0.0334, + "num_tokens": 3798771.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 197, + "step_time": 1535.9739540033042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 782.875, + "completions/mean_terminated_length": 541.75, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "entropy": 0.38831230998039246, + "epoch": 1.0476190476190477, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.421875, + "kl": 0.001182721694931388, + "learning_rate": 9.420760909671118e-07, + "loss": 0.0, + "num_tokens": 3818961.0, + "reward": 0.3333333432674408, + "reward_std": 0.17817413806915283, + "rewards/itbench_correctness/mean": 0.3333333432674408, + "rewards/itbench_correctness/std": 0.42163702845573425, + "step": 198, + "step_time": 118.89587634429336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 746.625, + "completions/mean_terminated_length": 746.625, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.409844309091568, + "epoch": 1.052910052910053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.001056072534993291, + "learning_rate": 9.413011756690684e-07, + "loss": 0.0058, + "num_tokens": 3839107.0, + "reward": 0.46875, + "reward_std": 0.1944543570280075, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.43006783723831177, + "step": 199, + "step_time": 86.62848719768226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 775.25, + "completions/mean_terminated_length": 581.7777709960938, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.611415684223175, + "epoch": 1.0582010582010581, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.515625, + "kl": 0.0012188013643026352, + "learning_rate": 9.405214343720706e-07, + "loss": 0.0098, + "num_tokens": 3858671.0, + "reward": 0.1302083432674408, + "reward_std": 0.09300297498703003, + "rewards/itbench_correctness/mean": 0.1302083432674408, + "rewards/itbench_correctness/std": 0.13252796232700348, + "step": 200, + "step_time": 82.59393281675875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 843.9375, + "completions/mean_terminated_length": 703.888916015625, + "completions/min_length": 547.0, + "completions/min_terminated_length": 547.0, + "entropy": 0.46211951971054077, + "epoch": 1.0634920634920635, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3359375, + "kl": 0.000934995652642101, + "learning_rate": 9.397368756032444e-07, + "loss": -0.001, + "num_tokens": 3879470.0, + "reward": 0.71875, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.34308648109436035, + "step": 201, + "step_time": 210.63156687188894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 485.8125, + "completions/mean_terminated_length": 485.8125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.2428920567035675, + "epoch": 1.0687830687830688, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.002602006308734417, + "learning_rate": 9.389475079423988e-07, + "loss": -0.0939, + "num_tokens": 3892331.0, + "reward": 0.3125, + "reward_std": 0.25763458013534546, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.25730079412460327, + "step": 202, + "step_time": 79.75068347156048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 439.5, + "completions/mean_terminated_length": 439.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.457337886095047, + "epoch": 1.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.0015101981116458774, + "learning_rate": 9.381533400219317e-07, + "loss": -0.0691, + "num_tokens": 3902267.0, + "reward": 0.5852272510528564, + "reward_std": 0.252642422914505, + "rewards/itbench_correctness/mean": 0.5852272510528564, + "rewards/itbench_correctness/std": 0.3983004689216614, + "step": 203, + "step_time": 112.7746303929016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 522.6875, + "completions/mean_terminated_length": 522.6875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.47829726338386536, + "epoch": 1.0793650793650793, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2109375, + "kl": 0.0021134628914296627, + "learning_rate": 9.373543805267367e-07, + "loss": -0.0106, + "num_tokens": 3916214.0, + "reward": 0.4910714328289032, + "reward_std": 0.02525380812585354, + "rewards/itbench_correctness/mean": 0.4910714328289032, + "rewards/itbench_correctness/std": 0.5083487033843994, + "step": 204, + "step_time": 118.03211208153516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 762.875, + "completions/mean_terminated_length": 606.2000122070312, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "entropy": 0.3617892861366272, + "epoch": 1.0846560846560847, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.53125, + "kl": 0.0010750198271125555, + "learning_rate": 9.365506381941065e-07, + "loss": -0.0237, + "num_tokens": 3933452.0, + "reward": 0.4270833134651184, + "reward_std": 0.053405821323394775, + "rewards/itbench_correctness/mean": 0.4270833134651184, + "rewards/itbench_correctness/std": 0.4470841884613037, + "step": 205, + "step_time": 255.03229981381446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 652.625, + "completions/mean_terminated_length": 363.77777099609375, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.4872629642486572, + "epoch": 1.08994708994709, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.0011171969817951322, + "learning_rate": 9.357421218136386e-07, + "loss": -0.0152, + "num_tokens": 3953614.0, + "reward": 0.5843750238418579, + "reward_std": 0.21464183926582336, + "rewards/itbench_correctness/mean": 0.5843750238418579, + "rewards/itbench_correctness/std": 0.29686442017555237, + "step": 206, + "step_time": 127.40266931243241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 552.6875, + "completions/mean_terminated_length": 552.6875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4143390357494354, + "epoch": 1.0952380952380953, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.001612935564480722, + "learning_rate": 9.349288402271387e-07, + "loss": -0.021, + "num_tokens": 3966409.0, + "reward": 0.71875, + "reward_std": 0.35564959049224854, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.44604745507240295, + "step": 207, + "step_time": 76.78453262429684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 522.875, + "completions/mean_terminated_length": 522.875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.2907004654407501, + "epoch": 1.1005291005291005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.001368207624182105, + "learning_rate": 9.341108023285237e-07, + "loss": -0.1923, + "num_tokens": 3980703.0, + "reward": 0.5208333134651184, + "reward_std": 0.25392836332321167, + "rewards/itbench_correctness/mean": 0.5208333134651184, + "rewards/itbench_correctness/std": 0.45082229375839233, + "step": 208, + "step_time": 87.08768197055906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 712.4375, + "completions/mean_terminated_length": 712.4375, + "completions/min_length": 595.0, + "completions/min_terminated_length": 595.0, + "entropy": 0.4154750406742096, + "epoch": 1.1058201058201058, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.15625, + "kl": 0.0015189462574198842, + "learning_rate": 9.332880170637252e-07, + "loss": 0.0093, + "num_tokens": 3996494.0, + "reward": 0.8671875, + "reward_std": 0.07790146768093109, + "rewards/itbench_correctness/mean": 0.8671875, + "rewards/itbench_correctness/std": 0.17361806333065033, + "step": 209, + "step_time": 73.19756223168224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 527.75, + "completions/mean_terminated_length": 527.75, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "entropy": 0.43202275037765503, + "epoch": 1.1111111111111112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.0011430936865508556, + "learning_rate": 9.32460493430591e-07, + "loss": -0.0004, + "num_tokens": 4008082.0, + "reward": 0.9166666865348816, + "reward_std": 0.235702246427536, + "rewards/itbench_correctness/mean": 0.9166666865348816, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 210, + "step_time": 7547.0997234797105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 756.1875, + "completions/mean_terminated_length": 547.888916015625, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.4284651577472687, + "epoch": 1.1164021164021163, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.0010993402684107423, + "learning_rate": 9.316282404787869e-07, + "loss": -0.0121, + "num_tokens": 4028837.0, + "reward": 0.7395833730697632, + "reward_std": 0.28634417057037354, + "rewards/itbench_correctness/mean": 0.7395833730697632, + "rewards/itbench_correctness/std": 0.35988038778305054, + "step": 211, + "step_time": 127.47603439353406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 988.5, + "completions/mean_terminated_length": 740.0, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 0.3925139009952545, + "epoch": 1.1216931216931216, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6640625, + "kl": 0.0009769725147634745, + "learning_rate": 9.307912673096979e-07, + "loss": 0.0022, + "num_tokens": 4061109.0, + "reward": 0.375, + "reward_std": 0.1725163757801056, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.45338237285614014, + "step": 212, + "step_time": 153.4806991070509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 641.0625, + "completions/mean_terminated_length": 586.357177734375, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.4180559515953064, + "epoch": 1.126984126984127, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.00120700488332659, + "learning_rate": 9.299495830763284e-07, + "loss": -0.0587, + "num_tokens": 4076166.0, + "reward": 0.3768939673900604, + "reward_std": 0.29744255542755127, + "rewards/itbench_correctness/mean": 0.3768939673900604, + "rewards/itbench_correctness/std": 0.3607577383518219, + "step": 213, + "step_time": 132.46722139418125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 566.4375, + "completions/mean_terminated_length": 535.933349609375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 0.4042811393737793, + "epoch": 1.1322751322751323, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.171875, + "kl": 0.00115107255987823, + "learning_rate": 9.291031969832025e-07, + "loss": 0.0001, + "num_tokens": 4089029.0, + "reward": 0.38786762952804565, + "reward_std": 0.16254664957523346, + "rewards/itbench_correctness/mean": 0.38786762952804565, + "rewards/itbench_correctness/std": 0.458029180765152, + "step": 214, + "step_time": 364.9181332997978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 452.25, + "completions/mean_terminated_length": 452.25, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "entropy": 0.4731896221637726, + "epoch": 1.1375661375661377, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0016687542665749788, + "learning_rate": 9.282521182862629e-07, + "loss": 0.0181, + "num_tokens": 4103865.0, + "reward": 0.587890625, + "reward_std": 0.31902575492858887, + "rewards/itbench_correctness/mean": 0.587890625, + "rewards/itbench_correctness/std": 0.37728795409202576, + "step": 215, + "step_time": 78.70893874578178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 621.3125, + "completions/mean_terminated_length": 621.3125, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 0.4731918275356293, + "epoch": 1.1428571428571428, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0012563822092488408, + "learning_rate": 9.273963562927694e-07, + "loss": 0.0316, + "num_tokens": 4116998.0, + "reward": 0.875, + "reward_std": 0.2630348801612854, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.2687419056892395, + "step": 216, + "step_time": 189.60282021015882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 981.875, + "completions/mean_terminated_length": 927.71435546875, + "completions/min_length": 883.0, + "completions/min_terminated_length": 883.0, + "entropy": 0.4297899305820465, + "epoch": 1.1481481481481481, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5078125, + "kl": 0.0011373235611245036, + "learning_rate": 9.265359203611987e-07, + "loss": 0.0, + "num_tokens": 4144004.0, + "reward": 0.02500000037252903, + "reward_std": 0.04629100486636162, + "rewards/itbench_correctness/mean": 0.02500000037252903, + "rewards/itbench_correctness/std": 0.06831301003694534, + "step": 217, + "step_time": 195.92587360646576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 956.875, + "completions/mean_terminated_length": 845.0, + "completions/min_length": 667.0, + "completions/min_terminated_length": 667.0, + "entropy": 0.5434356331825256, + "epoch": 1.1534391534391535, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.0011996476678177714, + "learning_rate": 9.2567081990114e-07, + "loss": 0.0111, + "num_tokens": 4186818.0, + "reward": 0.16249999403953552, + "reward_std": 0.25583362579345703, + "rewards/itbench_correctness/mean": 0.16249999403953552, + "rewards/itbench_correctness/std": 0.2673948407173157, + "step": 218, + "step_time": 182.73709686659276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 886.0625, + "completions/mean_terminated_length": 708.7142944335938, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6590957045555115, + "epoch": 1.1587301587301586, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.578125, + "kl": 0.0019986084662377834, + "learning_rate": 9.248010643731934e-07, + "loss": 0.0001, + "num_tokens": 4218627.0, + "reward": 0.171875, + "reward_std": 0.16952534019947052, + "rewards/itbench_correctness/mean": 0.171875, + "rewards/itbench_correctness/std": 0.29181545972824097, + "step": 219, + "step_time": 215.92658524494618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 586.3125, + "completions/mean_terminated_length": 586.3125, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "entropy": 0.48438331484794617, + "epoch": 1.164021164021164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0286865234375, + "kl": 0.0012342262780293822, + "learning_rate": 9.239266632888658e-07, + "loss": 0.0, + "num_tokens": 4232136.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 220, + "step_time": 87.92167458124459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 588.4375, + "completions/mean_terminated_length": 588.4375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.3789697289466858, + "epoch": 1.1693121693121693, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0008935832884162664, + "learning_rate": 9.230476262104676e-07, + "loss": 0.0133, + "num_tokens": 4245863.0, + "reward": 0.6875, + "reward_std": 0.09531004726886749, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.15000000596046448, + "step": 221, + "step_time": 73.37130374461412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 957.1875, + "completions/mean_terminated_length": 489.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.35938623547554016, + "epoch": 1.1746031746031746, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.0009425441385246813, + "learning_rate": 9.221639627510075e-07, + "loss": -0.1453, + "num_tokens": 4274026.0, + "reward": 0.2772817611694336, + "reward_std": 0.13795886933803558, + "rewards/itbench_correctness/mean": 0.2772817611694336, + "rewards/itbench_correctness/std": 0.22852860391139984, + "step": 222, + "step_time": 114.55098836030811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 747.1875, + "completions/mean_terminated_length": 683.3077392578125, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "entropy": 0.36135509610176086, + "epoch": 1.17989417989418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.00118353555444628, + "learning_rate": 9.212756825740872e-07, + "loss": -0.0024, + "num_tokens": 4290805.0, + "reward": 0.4583333432674408, + "reward_std": 0.2527993321418762, + "rewards/itbench_correctness/mean": 0.4583333432674408, + "rewards/itbench_correctness/std": 0.30804041028022766, + "step": 223, + "step_time": 133.77505498286337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 501.3125, + "completions/mean_terminated_length": 466.4667053222656, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4587956666946411, + "epoch": 1.1851851851851851, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0022395735140889883, + "learning_rate": 9.203827953937968e-07, + "loss": -0.0998, + "num_tokens": 4302938.0, + "reward": 0.35208332538604736, + "reward_std": 0.3391679525375366, + "rewards/itbench_correctness/mean": 0.35208332538604736, + "rewards/itbench_correctness/std": 0.3392188847064972, + "step": 224, + "step_time": 86.85009481851012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 717.4375, + "completions/mean_terminated_length": 578.0909423828125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 0.36240091919898987, + "epoch": 1.1904761904761905, + "frac_reward_zero_std": 0.5, + "grad_norm": 15.875, + "kl": 0.0009973702253773808, + "learning_rate": 9.194853109746072e-07, + "loss": -0.0269, + "num_tokens": 4321705.0, + "reward": 0.4114583432674408, + "reward_std": 0.17598573863506317, + "rewards/itbench_correctness/mean": 0.4114583432674408, + "rewards/itbench_correctness/std": 0.3488987386226654, + "step": 225, + "step_time": 695.8362277401611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 332.375, + "completions/mean_terminated_length": 332.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.3760812282562256, + "epoch": 1.1957671957671958, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.09375, + "kl": 0.0032024469692260027, + "learning_rate": 9.185832391312642e-07, + "loss": -0.0008, + "num_tokens": 4329399.0, + "reward": 0.3125, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.3095695972442627, + "step": 226, + "step_time": 71.38310491386801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 811.25, + "completions/mean_terminated_length": 537.7142944335938, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "entropy": 0.4955315887928009, + "epoch": 1.201058201058201, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.21875, + "kl": 0.0015454755630344152, + "learning_rate": 9.176765897286811e-07, + "loss": 0.0, + "num_tokens": 4367643.0, + "reward": 0.171875, + "reward_std": 0.13258251547813416, + "rewards/itbench_correctness/mean": 0.171875, + "rewards/itbench_correctness/std": 0.25361964106559753, + "step": 227, + "step_time": 732.0901973983273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 692.625, + "completions/mean_terminated_length": 434.8888854980469, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "entropy": 0.4908861219882965, + "epoch": 1.2063492063492063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01904296875, + "kl": 0.0013219286920502782, + "learning_rate": 9.167653726818304e-07, + "loss": 0.0, + "num_tokens": 4388877.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 228, + "step_time": 883.394539824687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 532.375, + "completions/mean_terminated_length": 368.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.33810752630233765, + "epoch": 1.2116402116402116, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.0016971830045804381, + "learning_rate": 9.158495979556358e-07, + "loss": -0.1874, + "num_tokens": 4406811.0, + "reward": 0.4947916567325592, + "reward_std": 0.33243152499198914, + "rewards/itbench_correctness/mean": 0.4947916567325592, + "rewards/itbench_correctness/std": 0.3914227783679962, + "step": 229, + "step_time": 297.6574033163488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 516.5625, + "completions/mean_terminated_length": 516.5625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5071990489959717, + "epoch": 1.216931216931217, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1640625, + "kl": 0.0015578863676637411, + "learning_rate": 9.14929275564863e-07, + "loss": -0.0018, + "num_tokens": 4418316.0, + "reward": 0.359375, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.359375, + "rewards/itbench_correctness/std": 0.3760402202606201, + "step": 230, + "step_time": 98.72435673046857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 711.5625, + "completions/mean_terminated_length": 468.5555725097656, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "entropy": 0.34571805596351624, + "epoch": 1.2222222222222223, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3671875, + "kl": 0.0011883811093866825, + "learning_rate": 9.1400441557401e-07, + "loss": 0.0284, + "num_tokens": 4450277.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 231, + "step_time": 109.3037657784298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 506.375, + "completions/mean_terminated_length": 506.375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "entropy": 0.5213527679443359, + "epoch": 1.2275132275132274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.003350482787936926, + "learning_rate": 9.130750280971977e-07, + "loss": -0.0084, + "num_tokens": 4470851.0, + "reward": 0.5249999761581421, + "reward_std": 0.34211215376853943, + "rewards/itbench_correctness/mean": 0.5249999761581421, + "rewards/itbench_correctness/std": 0.3803507089614868, + "step": 232, + "step_time": 117.60546538699418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 696.25, + "completions/mean_terminated_length": 696.25, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "entropy": 0.5285457968711853, + "epoch": 1.2328042328042328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020263671875, + "kl": 0.0014563931617885828, + "learning_rate": 9.121411232980587e-07, + "loss": 0.0, + "num_tokens": 4490551.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 1.0, + "rewards/itbench_correctness/std": 0.0, + "step": 233, + "step_time": 93.69822262041271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 886.25, + "completions/mean_terminated_length": 854.4615478515625, + "completions/min_length": 672.0, + "completions/min_terminated_length": 672.0, + "entropy": 0.2617771625518799, + "epoch": 1.2380952380952381, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.234375, + "kl": 0.00089176808251068, + "learning_rate": 9.112027113896261e-07, + "loss": 0.0017, + "num_tokens": 4513339.0, + "reward": 0.375, + "reward_std": 0.15669579803943634, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.4425306022167206, + "step": 234, + "step_time": 236.43063350580633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 487.125, + "completions/mean_terminated_length": 487.125, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "entropy": 0.4331537187099457, + "epoch": 1.2433862433862433, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.03125, + "kl": 0.0012799181276932359, + "learning_rate": 9.102598026342222e-07, + "loss": -0.0038, + "num_tokens": 4523829.0, + "reward": 0.3333333432674408, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.3333333432674408, + "rewards/itbench_correctness/std": 0.4036867320537567, + "step": 235, + "step_time": 690.8860946493223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 448.0, + "completions/mean_terminated_length": 448.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5066964030265808, + "epoch": 1.2486772486772486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034423828125, + "kl": 0.0018882593140006065, + "learning_rate": 9.093124073433462e-07, + "loss": 0.0, + "num_tokens": 4552069.0, + "reward": 0.3125, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.3227486312389374, + "step": 236, + "step_time": 151.43605288118124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 564.9375, + "completions/mean_terminated_length": 564.9375, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "entropy": 0.4885496199131012, + "epoch": 1.253968253968254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3515625, + "kl": 0.001722036860883236, + "learning_rate": 9.083605358775611e-07, + "loss": -0.0206, + "num_tokens": 4567172.0, + "reward": 0.6875, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 237, + "step_time": 79.38015065714717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 528.6875, + "completions/mean_terminated_length": 528.6875, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "entropy": 0.4085589349269867, + "epoch": 1.2592592592592593, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.0010910117998719215, + "learning_rate": 9.074041986463808e-07, + "loss": 0.0075, + "num_tokens": 4578575.0, + "reward": 0.9047619104385376, + "reward_std": 0.19606643915176392, + "rewards/itbench_correctness/mean": 0.9047619104385376, + "rewards/itbench_correctness/std": 0.2161296308040619, + "step": 238, + "step_time": 126.61944894865155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 696.9375, + "completions/mean_terminated_length": 500.70001220703125, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "entropy": 0.5710698366165161, + "epoch": 1.2645502645502646, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.453125, + "kl": 0.0016381683526560664, + "learning_rate": 9.064434061081561e-07, + "loss": 0.017, + "num_tokens": 4602870.0, + "reward": 0.625, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 239, + "step_time": 184.73187920358032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 644.0625, + "completions/mean_terminated_length": 644.0625, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 0.4098981022834778, + "epoch": 1.2698412698412698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021240234375, + "kl": 0.001283331774175167, + "learning_rate": 9.0547816876996e-07, + "loss": 0.0, + "num_tokens": 4623511.0, + "reward": 0.550000011920929, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.550000011920929, + "rewards/itbench_correctness/std": 0.4647580087184906, + "step": 240, + "step_time": 119.32252531778067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 804.875, + "completions/mean_terminated_length": 705.2727661132812, + "completions/min_length": 579.0, + "completions/min_terminated_length": 579.0, + "entropy": 0.4646684229373932, + "epoch": 1.2751322751322751, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2109375, + "kl": 0.0014155855169519782, + "learning_rate": 9.045084971874737e-07, + "loss": 0.0092, + "num_tokens": 4644965.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 241, + "step_time": 253.224197126925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 761.9375, + "completions/mean_terminated_length": 604.7000122070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5197276473045349, + "epoch": 1.2804232804232805, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.0014071539044380188, + "learning_rate": 9.0353440196487e-07, + "loss": -0.189, + "num_tokens": 4670188.0, + "reward": 0.609375, + "reward_std": 0.3135034143924713, + "rewards/itbench_correctness/mean": 0.609375, + "rewards/itbench_correctness/std": 0.41047483682632446, + "step": 242, + "step_time": 248.56613456085324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 806.9375, + "completions/mean_terminated_length": 756.84619140625, + "completions/min_length": 579.0, + "completions/min_terminated_length": 579.0, + "entropy": 0.4957013428211212, + "epoch": 1.2857142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.0014731662813574076, + "learning_rate": 9.025558937546987e-07, + "loss": 0.0259, + "num_tokens": 4690107.0, + "reward": 0.6979166269302368, + "reward_std": 0.18552666902542114, + "rewards/itbench_correctness/mean": 0.6979166269302368, + "rewards/itbench_correctness/std": 0.18225695192813873, + "step": 243, + "step_time": 141.42184507194906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 674.5625, + "completions/mean_terminated_length": 674.5625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.5336792469024658, + "epoch": 1.291005291005291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035400390625, + "kl": 0.0016851610271260142, + "learning_rate": 9.015729832577681e-07, + "loss": 0.0, + "num_tokens": 4710412.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 1.0, + "rewards/itbench_correctness/std": 0.0, + "step": 244, + "step_time": 105.10308491624892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 527.75, + "completions/mean_terminated_length": 527.75, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 0.5949786901473999, + "epoch": 1.2962962962962963, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.21875, + "kl": 0.001679896144196391, + "learning_rate": 9.005856812230304e-07, + "loss": -0.0189, + "num_tokens": 4723320.0, + "reward": 0.4322916865348816, + "reward_std": 0.031000997871160507, + "rewards/itbench_correctness/mean": 0.4322916865348816, + "rewards/itbench_correctness/std": 0.4484735131263733, + "step": 245, + "step_time": 98.61767490487546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 661.1875, + "completions/mean_terminated_length": 577.4615478515625, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "entropy": 0.3705454170703888, + "epoch": 1.3015873015873016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.0013237885432317853, + "learning_rate": 8.995939984474623e-07, + "loss": 0.0171, + "num_tokens": 4739299.0, + "reward": 0.6763392686843872, + "reward_std": 0.17046323418617249, + "rewards/itbench_correctness/mean": 0.6763392686843872, + "rewards/itbench_correctness/std": 0.29665619134902954, + "step": 246, + "step_time": 81.29718050733209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 945.125, + "completions/mean_terminated_length": 909.2727661132812, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.31318607926368713, + "epoch": 1.306878306878307, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.515625, + "kl": 0.001092436257749796, + "learning_rate": 8.98597945775948e-07, + "loss": 0.0178, + "num_tokens": 4762901.0, + "reward": 0.41874998807907104, + "reward_std": 0.17100021243095398, + "rewards/itbench_correctness/mean": 0.41874998807907104, + "rewards/itbench_correctness/std": 0.4915536642074585, + "step": 247, + "step_time": 379.86342859547585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 466.1875, + "completions/mean_terminated_length": 466.1875, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "entropy": 0.49336370825767517, + "epoch": 1.312169312169312, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2578125, + "kl": 0.0015457995468750596, + "learning_rate": 8.975975341011595e-07, + "loss": -0.0118, + "num_tokens": 4772808.0, + "reward": 0.5645833611488342, + "reward_std": 0.01928791031241417, + "rewards/itbench_correctness/mean": 0.5645833611488342, + "rewards/itbench_correctness/std": 0.17201152443885803, + "step": 248, + "step_time": 83.39602283388376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 502.6875, + "completions/mean_terminated_length": 502.6875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "entropy": 0.3978614807128906, + "epoch": 1.3174603174603174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0164794921875, + "kl": 0.001136748120188713, + "learning_rate": 8.965927743634389e-07, + "loss": 0.0, + "num_tokens": 4783827.0, + "reward": 0.5833333134651184, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5833333134651184, + "rewards/itbench_correctness/std": 0.4303314983844757, + "step": 249, + "step_time": 808.8291652789339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 488.375, + "completions/mean_terminated_length": 488.375, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.3910928964614868, + "epoch": 1.3227513227513228, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.0019690291956067085, + "learning_rate": 8.955836775506775e-07, + "loss": 0.0, + "num_tokens": 4795977.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 250, + "step_time": 1037.5402492322028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 415.9375, + "completions/mean_terminated_length": 415.9375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.40631103515625, + "epoch": 1.328042328042328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.0014389187563210726, + "learning_rate": 8.945702546981968e-07, + "loss": -0.0199, + "num_tokens": 4805024.0, + "reward": 0.546875, + "reward_std": 0.16521647572517395, + "rewards/itbench_correctness/mean": 0.546875, + "rewards/itbench_correctness/std": 0.24714809656143188, + "step": 251, + "step_time": 89.62413766887039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 614.9375, + "completions/mean_terminated_length": 614.9375, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "entropy": 0.5171257257461548, + "epoch": 1.3333333333333333, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.046875, + "kl": 0.0015879496932029724, + "learning_rate": 8.935525168886262e-07, + "loss": 0.0103, + "num_tokens": 4827879.0, + "reward": 0.359375, + "reward_std": 0.05866191163659096, + "rewards/itbench_correctness/mean": 0.359375, + "rewards/itbench_correctness/std": 0.3797157406806946, + "step": 252, + "step_time": 79.69811306335032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 657.75, + "completions/mean_terminated_length": 657.75, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 0.3618395924568176, + "epoch": 1.3386243386243386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.001461725914850831, + "learning_rate": 8.925304752517839e-07, + "loss": -0.0421, + "num_tokens": 4842899.0, + "reward": 0.40937501192092896, + "reward_std": 0.30845823884010315, + "rewards/itbench_correctness/mean": 0.40937501192092896, + "rewards/itbench_correctness/std": 0.3658055067062378, + "step": 253, + "step_time": 131.8951225792989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 725.875, + "completions/mean_terminated_length": 626.5, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "entropy": 0.5731014013290405, + "epoch": 1.343915343915344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1787109375, + "kl": 0.0015030583599582314, + "learning_rate": 8.91504140964553e-07, + "loss": 0.0, + "num_tokens": 4870481.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 254, + "step_time": 419.8384141791612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 756.4375, + "completions/mean_terminated_length": 488.875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.4759150743484497, + "epoch": 1.3492063492063493, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.953125, + "kl": 0.0011384043609723449, + "learning_rate": 8.904735252507609e-07, + "loss": -0.0058, + "num_tokens": 4889368.0, + "reward": 0.2916666865348816, + "reward_std": 0.1178511381149292, + "rewards/itbench_correctness/mean": 0.2916666865348816, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 255, + "step_time": 75.48513688519597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 621.25, + "completions/mean_terminated_length": 563.7142944335938, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.36378270387649536, + "epoch": 1.3544973544973544, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.171875, + "kl": 0.002011555014178157, + "learning_rate": 8.894386393810562e-07, + "loss": 0.014, + "num_tokens": 4904140.0, + "reward": 0.71875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 256, + "step_time": 448.9436140609905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 1007.375, + "completions/mean_terminated_length": 935.3333740234375, + "completions/min_length": 788.0, + "completions/min_terminated_length": 788.0, + "entropy": 0.4546469748020172, + "epoch": 1.3597883597883598, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6484375, + "kl": 0.0013795711565762758, + "learning_rate": 8.883994946727847e-07, + "loss": 0.0001, + "num_tokens": 4929690.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 257, + "step_time": 249.96061486005783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 705.5, + "completions/mean_terminated_length": 632.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5301204919815063, + "epoch": 1.3650793650793651, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4609375, + "kl": 0.0015691749285906553, + "learning_rate": 8.873561024898667e-07, + "loss": -0.0308, + "num_tokens": 4954970.0, + "reward": 0.75, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 258, + "step_time": 128.34479956980795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 485.875, + "completions/mean_terminated_length": 485.875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3889889419078827, + "epoch": 1.3703703703703702, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0859375, + "kl": 0.002027226844802499, + "learning_rate": 8.863084742426718e-07, + "loss": -0.0592, + "num_tokens": 4980176.0, + "reward": 0.75, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 259, + "step_time": 110.6389656001702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 611.4375, + "completions/mean_terminated_length": 611.4375, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 0.4971890151500702, + "epoch": 1.3756613756613756, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2421875, + "kl": 0.0010566740529611707, + "learning_rate": 8.852566213878946e-07, + "loss": -0.0071, + "num_tokens": 4992951.0, + "reward": 0.84375, + "reward_std": 0.22903135418891907, + "rewards/itbench_correctness/mean": 0.84375, + "rewards/itbench_correctness/std": 0.3520771861076355, + "step": 260, + "step_time": 502.41869831830263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 419.0, + "completions/mean_terminated_length": 419.0, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 0.4797135889530182, + "epoch": 1.380952380952381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0223388671875, + "kl": 0.0018066860502585769, + "learning_rate": 8.842005554284295e-07, + "loss": 0.0, + "num_tokens": 5002711.0, + "reward": 0.2083333283662796, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.2083333283662796, + "rewards/itbench_correctness/std": 0.21516574919223785, + "step": 261, + "step_time": 94.77580868080258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 758.8125, + "completions/mean_terminated_length": 599.7000122070312, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "entropy": 0.4796968996524811, + "epoch": 1.3862433862433863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.234375, + "kl": 0.0017543428111821413, + "learning_rate": 8.831402879132445e-07, + "loss": -0.0144, + "num_tokens": 5039356.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 262, + "step_time": 486.13853998761624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 641.75, + "completions/mean_terminated_length": 514.3333740234375, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "entropy": 0.49863654375076294, + "epoch": 1.3915343915343916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0019419684540480375, + "learning_rate": 8.820758304372555e-07, + "loss": 0.0274, + "num_tokens": 5056128.0, + "reward": 0.5, + "reward_std": 0.3535533845424652, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 263, + "step_time": 1132.2881942698732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 859.75, + "completions/mean_terminated_length": 785.0909423828125, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "entropy": 0.47223028540611267, + "epoch": 1.3968253968253967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0234375, + "kl": 0.0012077168794348836, + "learning_rate": 8.810071946411988e-07, + "loss": 0.0, + "num_tokens": 5077620.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 264, + "step_time": 98.81888623256236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 616.875, + "completions/mean_terminated_length": 589.7333374023438, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 0.40688955783843994, + "epoch": 1.402116402116402, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.125, + "kl": 0.0010585308773443103, + "learning_rate": 8.799343922115043e-07, + "loss": -0.0001, + "num_tokens": 5090890.0, + "reward": 0.6937500238418579, + "reward_std": 0.13999362289905548, + "rewards/itbench_correctness/mean": 0.6937500238418579, + "rewards/itbench_correctness/std": 0.3696281909942627, + "step": 265, + "step_time": 779.0254694251344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 599.3125, + "completions/mean_terminated_length": 599.3125, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 0.5172593593597412, + "epoch": 1.4074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.0015477510169148445, + "learning_rate": 8.788574348801674e-07, + "loss": -0.0177, + "num_tokens": 5103495.0, + "reward": 0.46875, + "reward_std": 0.1552036553621292, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.15478479862213135, + "step": 266, + "step_time": 430.513926978223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 705.0625, + "completions/mean_terminated_length": 513.7000122070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3616700768470764, + "epoch": 1.4126984126984126, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.140625, + "kl": 0.0013228055322542787, + "learning_rate": 8.777763344246208e-07, + "loss": 0.0035, + "num_tokens": 5126072.0, + "reward": 0.34375, + "reward_std": 0.1735912710428238, + "rewards/itbench_correctness/mean": 0.34375, + "rewards/itbench_correctness/std": 0.42695629596710205, + "step": 267, + "step_time": 154.13704107049853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 493.375, + "completions/mean_terminated_length": 493.375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.3830757439136505, + "epoch": 1.417989417989418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.0012491599190980196, + "learning_rate": 8.766911026676063e-07, + "loss": 0.0118, + "num_tokens": 5137798.0, + "reward": 0.796875, + "reward_std": 0.26196980476379395, + "rewards/itbench_correctness/mean": 0.796875, + "rewards/itbench_correctness/std": 0.27716949582099915, + "step": 268, + "step_time": 78.56690625380725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 639.6875, + "completions/mean_terminated_length": 639.6875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 0.41582804918289185, + "epoch": 1.4232804232804233, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0703125, + "kl": 0.0016272872453555465, + "learning_rate": 8.756017514770442e-07, + "loss": 0.0292, + "num_tokens": 5150993.0, + "reward": 0.78125, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.78125, + "rewards/itbench_correctness/std": 0.286865234375, + "step": 269, + "step_time": 87.11506285239011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 907.8125, + "completions/mean_terminated_length": 559.25, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "entropy": 0.4648537039756775, + "epoch": 1.4285714285714286, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.046875, + "kl": 0.0014328653924167156, + "learning_rate": 8.745082927659046e-07, + "loss": 0.0103, + "num_tokens": 5182310.0, + "reward": 0.3958333432674408, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.3958333432674408, + "rewards/itbench_correctness/std": 0.47482940554618835, + "step": 270, + "step_time": 148.08130174782127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 821.0, + "completions/mean_terminated_length": 663.1111450195312, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.38246041536331177, + "epoch": 1.433862433862434, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5703125, + "kl": 0.0012811021879315376, + "learning_rate": 8.734107384920769e-07, + "loss": 0.045, + "num_tokens": 5206270.0, + "reward": 0.4124999940395355, + "reward_std": 0.172688826918602, + "rewards/itbench_correctness/mean": 0.4124999940395355, + "rewards/itbench_correctness/std": 0.4869976043701172, + "step": 271, + "step_time": 145.7680284064263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 584.0, + "completions/mean_terminated_length": 482.4615478515625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3476027250289917, + "epoch": 1.439153439153439, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.002004146808758378, + "learning_rate": 8.723091006582388e-07, + "loss": 0.0194, + "num_tokens": 5224206.0, + "reward": 0.4375, + "reward_std": 0.38298875093460083, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 272, + "step_time": 309.5534623619169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 656.25, + "completions/mean_terminated_length": 370.22222900390625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.7039999961853027, + "epoch": 1.4444444444444444, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.421875, + "kl": 0.0013335734838619828, + "learning_rate": 8.712033913117249e-07, + "loss": 0.074, + "num_tokens": 5243858.0, + "reward": 0.015625, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.015625, + "rewards/itbench_correctness/std": 0.0625, + "step": 273, + "step_time": 103.5548415929079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 646.75, + "completions/mean_terminated_length": 646.75, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "entropy": 0.5102435350418091, + "epoch": 1.4497354497354498, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2265625, + "kl": 0.0015617223689332604, + "learning_rate": 8.700936225443958e-07, + "loss": 0.0166, + "num_tokens": 5259974.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 274, + "step_time": 187.742013909854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 469.75, + "completions/mean_terminated_length": 469.75, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.42575839161872864, + "epoch": 1.455026455026455, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.296875, + "kl": 0.0016291391802951694, + "learning_rate": 8.689798064925048e-07, + "loss": 0.012, + "num_tokens": 5270194.0, + "reward": 0.4375, + "reward_std": 0.0862581878900528, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.4669642150402069, + "step": 275, + "step_time": 89.66884011216462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 821.375, + "completions/mean_terminated_length": 807.86669921875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4723786413669586, + "epoch": 1.4603174603174602, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.001486663124524057, + "learning_rate": 8.678619553365658e-07, + "loss": -0.0841, + "num_tokens": 5292856.0, + "reward": 0.8402777910232544, + "reward_std": 0.21910008788108826, + "rewards/itbench_correctness/mean": 0.8402777910232544, + "rewards/itbench_correctness/std": 0.3417908549308777, + "step": 276, + "step_time": 421.1291719619185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 588.8125, + "completions/mean_terminated_length": 588.8125, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.43817004561424255, + "epoch": 1.4656084656084656, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.25, + "kl": 0.001539916731417179, + "learning_rate": 8.667400813012199e-07, + "loss": -0.002, + "num_tokens": 5312117.0, + "reward": 0.43541669845581055, + "reward_std": 0.1833198070526123, + "rewards/itbench_correctness/mean": 0.43541669845581055, + "rewards/itbench_correctness/std": 0.4274764358997345, + "step": 277, + "step_time": 134.6105333585292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 812.3125, + "completions/mean_terminated_length": 741.75, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "entropy": 0.4210202395915985, + "epoch": 1.470899470899471, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.0012785056605935097, + "learning_rate": 8.656141966551018e-07, + "loss": -0.0175, + "num_tokens": 5336450.0, + "reward": 0.125, + "reward_std": 0.2925041913986206, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.28867512941360474, + "step": 278, + "step_time": 588.067256687209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 479.125, + "completions/mean_terminated_length": 479.125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.4654317796230316, + "epoch": 1.4761904761904763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.001857173629105091, + "learning_rate": 8.644843137107057e-07, + "loss": 0.0, + "num_tokens": 5347140.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 279, + "step_time": 93.42977315280586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 902.1875, + "completions/mean_terminated_length": 807.4444580078125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4633183181285858, + "epoch": 1.4814814814814814, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0013472916325554252, + "learning_rate": 8.633504448242504e-07, + "loss": -0.0543, + "num_tokens": 5367903.0, + "reward": 0.3077380955219269, + "reward_std": 0.26785334944725037, + "rewards/itbench_correctness/mean": 0.3077380955219269, + "rewards/itbench_correctness/std": 0.3630719482898712, + "step": 280, + "step_time": 90.29017782397568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 777.25, + "completions/mean_terminated_length": 665.0909423828125, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.3731103241443634, + "epoch": 1.4867724867724867, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.09375, + "kl": 0.0013979077339172363, + "learning_rate": 8.622126023955445e-07, + "loss": 0.0109, + "num_tokens": 5393947.0, + "reward": 0.31041666865348816, + "reward_std": 0.18059369921684265, + "rewards/itbench_correctness/mean": 0.31041666865348816, + "rewards/itbench_correctness/std": 0.32879552245140076, + "step": 281, + "step_time": 111.45759059861302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 780.375, + "completions/mean_terminated_length": 536.75, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.6048374176025391, + "epoch": 1.492063492063492, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.53125, + "kl": 0.0014561447314918041, + "learning_rate": 8.610707988678503e-07, + "loss": 0.0, + "num_tokens": 5413105.0, + "reward": 0.609375, + "reward_std": 0.12387890368700027, + "rewards/itbench_correctness/mean": 0.609375, + "rewards/itbench_correctness/std": 0.4375000298023224, + "step": 282, + "step_time": 87.52544206380844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 567.25, + "completions/mean_terminated_length": 567.25, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.39841338992118835, + "epoch": 1.4973544973544972, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1328125, + "kl": 0.0016809444641694427, + "learning_rate": 8.599250467277483e-07, + "loss": -0.1189, + "num_tokens": 5431333.0, + "reward": 0.3125, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 283, + "step_time": 285.17205636110157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 790.9375, + "completions/mean_terminated_length": 651.1000366210938, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 0.3767680823802948, + "epoch": 1.5026455026455028, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.59375, + "kl": 0.001456994330510497, + "learning_rate": 8.587753585050004e-07, + "loss": -0.0216, + "num_tokens": 5450876.0, + "reward": 0.03125, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.03125, + "rewards/itbench_correctness/std": 0.125, + "step": 284, + "step_time": 7258.076673376374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 572.8125, + "completions/mean_terminated_length": 572.8125, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "entropy": 0.36661210656166077, + "epoch": 1.507936507936508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.001243554288521409, + "learning_rate": 8.576217467724127e-07, + "loss": -0.0147, + "num_tokens": 5463649.0, + "reward": 0.875, + "reward_std": 0.2177756428718567, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.22360680997371674, + "step": 285, + "step_time": 65.26539001893252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 674.0, + "completions/mean_terminated_length": 464.0, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.4094955623149872, + "epoch": 1.5132275132275133, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4375, + "kl": 0.0017368828412145376, + "learning_rate": 8.564642241456986e-07, + "loss": 0.0128, + "num_tokens": 5481657.0, + "reward": 0.1640625, + "reward_std": 0.06629125773906708, + "rewards/itbench_correctness/mean": 0.1640625, + "rewards/itbench_correctness/std": 0.19213032722473145, + "step": 286, + "step_time": 1010.0560459299013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 677.5625, + "completions/mean_terminated_length": 469.70001220703125, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "entropy": 0.30107924342155457, + "epoch": 1.5185185185185186, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1953125, + "kl": 0.0014942490961402655, + "learning_rate": 8.553028032833396e-07, + "loss": 0.0399, + "num_tokens": 5500474.0, + "reward": 0.34375, + "reward_std": 0.12938730418682098, + "rewards/itbench_correctness/mean": 0.34375, + "rewards/itbench_correctness/std": 0.3966001570224762, + "step": 287, + "step_time": 967.4249309562147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 861.8125, + "completions/mean_terminated_length": 851.0000610351562, + "completions/min_length": 747.0, + "completions/min_terminated_length": 747.0, + "entropy": 0.3689897656440735, + "epoch": 1.5238095238095237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.001182331470772624, + "learning_rate": 8.541374968864485e-07, + "loss": 0.0161, + "num_tokens": 5520511.0, + "reward": 0.6545138955116272, + "reward_std": 0.2626494765281677, + "rewards/itbench_correctness/mean": 0.6545138955116272, + "rewards/itbench_correctness/std": 0.284541517496109, + "step": 288, + "step_time": 416.6262904284522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 753.5625, + "completions/mean_terminated_length": 591.2999877929688, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "entropy": 0.6183959245681763, + "epoch": 1.529100529100529, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.0017281738109886646, + "learning_rate": 8.529683176986295e-07, + "loss": 0.0359, + "num_tokens": 5542376.0, + "reward": 0.5255681872367859, + "reward_std": 0.07393435388803482, + "rewards/itbench_correctness/mean": 0.5255681872367859, + "rewards/itbench_correctness/std": 0.48524191975593567, + "step": 289, + "step_time": 100.21258049272001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 957.125, + "completions/mean_terminated_length": 756.5, + "completions/min_length": 689.0, + "completions/min_terminated_length": 689.0, + "entropy": 0.5600104331970215, + "epoch": 1.5343915343915344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.224609375, + "kl": 0.002044468652456999, + "learning_rate": 8.517952785058384e-07, + "loss": 0.0001, + "num_tokens": 5578490.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 290, + "step_time": 204.50664361845702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 846.8125, + "completions/mean_terminated_length": 805.923095703125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.49125397205352783, + "epoch": 1.5396825396825395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.001689074793830514, + "learning_rate": 8.506183921362442e-07, + "loss": -0.1079, + "num_tokens": 5598255.0, + "reward": 0.7552083730697632, + "reward_std": 0.3175256550312042, + "rewards/itbench_correctness/mean": 0.7552083730697632, + "rewards/itbench_correctness/std": 0.3325946629047394, + "step": 291, + "step_time": 141.62990444898605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 697.3125, + "completions/mean_terminated_length": 443.22222900390625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.5621582865715027, + "epoch": 1.544973544973545, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.140625, + "kl": 0.0017327865352854133, + "learning_rate": 8.494376714600877e-07, + "loss": -0.0076, + "num_tokens": 5618796.0, + "reward": 0.21250000596046448, + "reward_std": 0.02314549870789051, + "rewards/itbench_correctness/mean": 0.21250000596046448, + "rewards/itbench_correctness/std": 0.22173558175563812, + "step": 292, + "step_time": 116.08490148931742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 701.75, + "completions/mean_terminated_length": 680.2667236328125, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "entropy": 0.36052724719047546, + "epoch": 1.5502645502645502, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3203125, + "kl": 0.0012737837387248874, + "learning_rate": 8.48253129389541e-07, + "loss": 0.0105, + "num_tokens": 5634952.0, + "reward": 0.10546875, + "reward_std": 0.06008155643939972, + "rewards/itbench_correctness/mean": 0.10546875, + "rewards/itbench_correctness/std": 0.1363947093486786, + "step": 293, + "step_time": 391.972909046337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 493.25, + "completions/mean_terminated_length": 493.25, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4176381230354309, + "epoch": 1.5555555555555556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.0017929230816662312, + "learning_rate": 8.470647788785664e-07, + "loss": -0.0618, + "num_tokens": 5646612.0, + "reward": 0.671875, + "reward_std": 0.2810920476913452, + "rewards/itbench_correctness/mean": 0.671875, + "rewards/itbench_correctness/std": 0.3502231538295746, + "step": 294, + "step_time": 417.62054439727217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 440.1875, + "completions/mean_terminated_length": 440.1875, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "entropy": 0.3566662073135376, + "epoch": 1.560846560846561, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.296875, + "kl": 0.0015034499811008573, + "learning_rate": 8.458726329227747e-07, + "loss": 0.0048, + "num_tokens": 5656511.0, + "reward": 0.7447916865348816, + "reward_std": 0.11666134744882584, + "rewards/itbench_correctness/mean": 0.7447916865348816, + "rewards/itbench_correctness/std": 0.17864912748336792, + "step": 295, + "step_time": 839.189504972659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 655.875, + "completions/mean_terminated_length": 655.875, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "entropy": 0.5641318559646606, + "epoch": 1.566137566137566, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1953125, + "kl": 0.002510908292606473, + "learning_rate": 8.446767045592829e-07, + "loss": 0.0185, + "num_tokens": 5680541.0, + "reward": 0.265625, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.265625, + "rewards/itbench_correctness/std": 0.28090256452560425, + "step": 296, + "step_time": 371.1978498548269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 673.5, + "completions/mean_terminated_length": 650.1333618164062, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.5404602885246277, + "epoch": 1.5714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.001535046030767262, + "learning_rate": 8.434770068665722e-07, + "loss": 0.0315, + "num_tokens": 5695821.0, + "reward": 0.5397727489471436, + "reward_std": 0.29765012860298157, + "rewards/itbench_correctness/mean": 0.5397727489471436, + "rewards/itbench_correctness/std": 0.45329374074935913, + "step": 297, + "step_time": 163.53786495421082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 764.0625, + "completions/mean_terminated_length": 677.4166870117188, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 0.48425358533859253, + "epoch": 1.5767195767195767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.001875088200904429, + "learning_rate": 8.422735529643443e-07, + "loss": -0.0003, + "num_tokens": 5711494.0, + "reward": 0.390625, + "reward_std": 0.0794283002614975, + "rewards/itbench_correctness/mean": 0.390625, + "rewards/itbench_correctness/std": 0.3492303192615509, + "step": 298, + "step_time": 102.2513699810952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 742.1875, + "completions/mean_terminated_length": 523.0, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "entropy": 0.36648422479629517, + "epoch": 1.5820105820105819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0281982421875, + "kl": 0.0010558166541159153, + "learning_rate": 8.410663560133783e-07, + "loss": 0.0, + "num_tokens": 5738233.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 299, + "step_time": 170.24059600010514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 844.5625, + "completions/mean_terminated_length": 705.0, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "entropy": 0.5233479142189026, + "epoch": 1.5873015873015874, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.001366324140690267, + "learning_rate": 8.398554292153865e-07, + "loss": 0.0093, + "num_tokens": 5760042.0, + "reward": 0.4791666865348816, + "reward_std": 0.16512766480445862, + "rewards/itbench_correctness/mean": 0.4791666865348816, + "rewards/itbench_correctness/std": 0.3019995093345642, + "step": 300, + "step_time": 285.300213762559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 753.6875, + "completions/mean_terminated_length": 543.4444580078125, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "entropy": 0.536031186580658, + "epoch": 1.5925925925925926, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5625, + "kl": 0.0015036487020552158, + "learning_rate": 8.386407858128706e-07, + "loss": 0.0005, + "num_tokens": 5783901.0, + "reward": 0.28437501192092896, + "reward_std": 0.07841908931732178, + "rewards/itbench_correctness/mean": 0.28437501192092896, + "rewards/itbench_correctness/std": 0.14226588606834412, + "step": 301, + "step_time": 148.27129491977394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 530.0625, + "completions/mean_terminated_length": 530.0625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.42825138568878174, + "epoch": 1.597883597883598, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.0016496152384206653, + "learning_rate": 8.374224390889759e-07, + "loss": -0.0793, + "num_tokens": 5802318.0, + "reward": 0.296875, + "reward_std": 0.13962560892105103, + "rewards/itbench_correctness/mean": 0.296875, + "rewards/itbench_correctness/std": 0.1434326171875, + "step": 302, + "step_time": 129.92639573384076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 1009.8125, + "completions/mean_terminated_length": 948.3333740234375, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "entropy": 0.43770501017570496, + "epoch": 1.6031746031746033, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.0010520732030272484, + "learning_rate": 8.362004023673472e-07, + "loss": 0.0025, + "num_tokens": 5825731.0, + "reward": 0.5, + "reward_std": 0.1356339007616043, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.47214052081108093, + "step": 303, + "step_time": 78.02016614936292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 747.8125, + "completions/mean_terminated_length": 582.1000366210938, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "entropy": 0.4011700749397278, + "epoch": 1.6084656084656084, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1875, + "kl": 0.0011779264314100146, + "learning_rate": 8.349746890119824e-07, + "loss": -0.0198, + "num_tokens": 5864376.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 304, + "step_time": 941.6569038927555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 496.3125, + "completions/mean_terminated_length": 496.3125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.40297192335128784, + "epoch": 1.6137566137566137, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.00194979936350137, + "learning_rate": 8.337453124270862e-07, + "loss": -0.0047, + "num_tokens": 5884285.0, + "reward": 0.5625, + "reward_std": 0.3471825420856476, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 305, + "step_time": 107.52555268164724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 965.625, + "completions/mean_terminated_length": 837.2000122070312, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "entropy": 0.41838186979293823, + "epoch": 1.619047619047619, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.0014495647046715021, + "learning_rate": 8.325122860569241e-07, + "loss": -0.0125, + "num_tokens": 5910599.0, + "reward": 0.5354166626930237, + "reward_std": 0.05260828882455826, + "rewards/itbench_correctness/mean": 0.5354166626930237, + "rewards/itbench_correctness/std": 0.4056031107902527, + "step": 306, + "step_time": 113.85000483132899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 630.75, + "completions/mean_terminated_length": 574.5714721679688, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 0.3202536702156067, + "epoch": 1.6243386243386242, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.078125, + "kl": 0.001761075691320002, + "learning_rate": 8.312756233856748e-07, + "loss": -0.0057, + "num_tokens": 5925579.0, + "reward": 0.1875, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.25, + "step": 307, + "step_time": 252.66727325879037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 831.3125, + "completions/mean_terminated_length": 681.4444580078125, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 0.4017743170261383, + "epoch": 1.6296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.0010056800674647093, + "learning_rate": 8.300353379372833e-07, + "loss": 0.0837, + "num_tokens": 5954544.0, + "reward": 0.5625, + "reward_std": 0.4082317352294922, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 308, + "step_time": 91.5687418980524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 438.6875, + "completions/mean_terminated_length": 399.66668701171875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.36244478821754456, + "epoch": 1.6349206349206349, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.0014679917367175221, + "learning_rate": 8.287914432753123e-07, + "loss": -0.1423, + "num_tokens": 5971995.0, + "reward": 0.78125, + "reward_std": 0.2706093192100525, + "rewards/itbench_correctness/mean": 0.78125, + "rewards/itbench_correctness/std": 0.29007503390312195, + "step": 309, + "step_time": 103.0185587760061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 951.5625, + "completions/mean_terminated_length": 830.8333740234375, + "completions/min_length": 641.0, + "completions/min_terminated_length": 641.0, + "entropy": 0.5422660112380981, + "epoch": 1.6402116402116402, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.00137800769880414, + "learning_rate": 8.275439530027947e-07, + "loss": -0.0189, + "num_tokens": 5994244.0, + "reward": 0.4236111044883728, + "reward_std": 0.3206467628479004, + "rewards/itbench_correctness/mean": 0.4236111044883728, + "rewards/itbench_correctness/std": 0.4552505910396576, + "step": 310, + "step_time": 408.57210523914546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 718.8125, + "completions/mean_terminated_length": 648.3846435546875, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.4062255322933197, + "epoch": 1.6455026455026456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.0012174441944807768, + "learning_rate": 8.262928807620843e-07, + "loss": 0.0105, + "num_tokens": 6010465.0, + "reward": 0.25, + "reward_std": 0.3745020925998688, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.40824830532073975, + "step": 311, + "step_time": 475.81261223275214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 547.4375, + "completions/mean_terminated_length": 547.4375, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.3781253695487976, + "epoch": 1.6507936507936507, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.0014938174281269312, + "learning_rate": 8.250382402347064e-07, + "loss": 0.0148, + "num_tokens": 6023008.0, + "reward": 0.7837010025978088, + "reward_std": 0.2962387502193451, + "rewards/itbench_correctness/mean": 0.7837010025978088, + "rewards/itbench_correctness/std": 0.3930458724498749, + "step": 312, + "step_time": 166.25692852959037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 492.6875, + "completions/mean_terminated_length": 492.6875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.3694024980068207, + "epoch": 1.656084656084656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.0012153934221714735, + "learning_rate": 8.237800451412094e-07, + "loss": -0.0061, + "num_tokens": 6034795.0, + "reward": 0.671875, + "reward_std": 0.25043365359306335, + "rewards/itbench_correctness/mean": 0.671875, + "rewards/itbench_correctness/std": 0.3353670835494995, + "step": 313, + "step_time": 146.2452635196969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 496.75, + "completions/mean_terminated_length": 496.75, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "entropy": 0.5234020948410034, + "epoch": 1.6613756613756614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.0013322219019755721, + "learning_rate": 8.225183092410127e-07, + "loss": -0.0344, + "num_tokens": 6045567.0, + "reward": 0.5625, + "reward_std": 0.3471825420856476, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 314, + "step_time": 998.2186722587794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 660.6875, + "completions/mean_terminated_length": 636.4666748046875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.44499102234840393, + "epoch": 1.6666666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0257568359375, + "kl": 0.001278597628697753, + "learning_rate": 8.212530463322582e-07, + "loss": 0.0, + "num_tokens": 6061834.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 315, + "step_time": 1148.7406483720988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 454.25, + "completions/mean_terminated_length": 454.25, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.41607043147087097, + "epoch": 1.671957671957672, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.078125, + "kl": 0.0012671623844653368, + "learning_rate": 8.199842702516582e-07, + "loss": 0.0057, + "num_tokens": 6072118.0, + "reward": 0.953125, + "reward_std": 0.13258251547813416, + "rewards/itbench_correctness/mean": 0.953125, + "rewards/itbench_correctness/std": 0.1875, + "step": 316, + "step_time": 864.3541559455916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 566.625, + "completions/mean_terminated_length": 566.625, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.4729759395122528, + "epoch": 1.6772486772486772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.001966112991794944, + "learning_rate": 8.187119948743449e-07, + "loss": 0.0121, + "num_tokens": 6086384.0, + "reward": 0.6979166269302368, + "reward_std": 0.3103903532028198, + "rewards/itbench_correctness/mean": 0.6979166269302368, + "rewards/itbench_correctness/std": 0.42259669303894043, + "step": 317, + "step_time": 80.52064239047468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 786.5, + "completions/mean_terminated_length": 549.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4755244851112366, + "epoch": 1.6825396825396826, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.494140625, + "kl": 0.0016143879620358348, + "learning_rate": 8.174362341137176e-07, + "loss": -0.0945, + "num_tokens": 6105360.0, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.25, + "step": 318, + "step_time": 496.4877818999812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 753.0, + "completions/mean_terminated_length": 753.0, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.2895086407661438, + "epoch": 1.687830687830688, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0012620992492884398, + "learning_rate": 8.16157001921292e-07, + "loss": -0.0084, + "num_tokens": 6123976.0, + "reward": 0.8125, + "reward_std": 0.1462520956993103, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.19364917278289795, + "step": 319, + "step_time": 96.52025901339948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 1001.3125, + "completions/mean_terminated_length": 903.0, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.39947569370269775, + "epoch": 1.693121693121693, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.59375, + "kl": 0.0011935190996155143, + "learning_rate": 8.148743122865463e-07, + "loss": -0.0069, + "num_tokens": 6166669.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 320, + "step_time": 7083.372859461233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 675.4375, + "completions/mean_terminated_length": 652.2000122070312, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "entropy": 0.37753307819366455, + "epoch": 1.6984126984126984, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0014044505078345537, + "learning_rate": 8.135881792367685e-07, + "loss": -0.0313, + "num_tokens": 6182420.0, + "reward": 0.5208333730697632, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5208333730697632, + "rewards/itbench_correctness/std": 0.48638883233070374, + "step": 321, + "step_time": 248.25969803985208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 685.5, + "completions/mean_terminated_length": 572.6666870117188, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "entropy": 0.6068562865257263, + "epoch": 1.7037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.0015854442026466131, + "learning_rate": 8.122986168369039e-07, + "loss": -0.0155, + "num_tokens": 6206140.0, + "reward": 0.234375, + "reward_std": 0.3006556034088135, + "rewards/itbench_correctness/mean": 0.234375, + "rewards/itbench_correctness/std": 0.3158157467842102, + "step": 322, + "step_time": 145.6991236684844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 787.6875, + "completions/mean_terminated_length": 551.375, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "entropy": 0.4164087772369385, + "epoch": 1.7089947089947088, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.59375, + "kl": 0.0013617142103612423, + "learning_rate": 8.110056391894003e-07, + "loss": 0.0, + "num_tokens": 6227303.0, + "reward": 0.5520833134651184, + "reward_std": 0.06200198456645012, + "rewards/itbench_correctness/mean": 0.5520833134651184, + "rewards/itbench_correctness/std": 0.4702983796596527, + "step": 323, + "step_time": 140.916482466273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 699.0, + "completions/mean_terminated_length": 699.0, + "completions/min_length": 592.0, + "completions/min_terminated_length": 592.0, + "entropy": 0.3690987229347229, + "epoch": 1.7142857142857144, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.0010593783808872104, + "learning_rate": 8.097092604340541e-07, + "loss": 0.0083, + "num_tokens": 6244119.0, + "reward": 0.84375, + "reward_std": 0.14777101576328278, + "rewards/itbench_correctness/mean": 0.84375, + "rewards/itbench_correctness/std": 0.16720746457576752, + "step": 324, + "step_time": 67.42365125380456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 972.25, + "completions/mean_terminated_length": 748.0, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "entropy": 0.5759835243225098, + "epoch": 1.7195767195767195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.001520626712590456, + "learning_rate": 8.084094947478554e-07, + "loss": 0.0001, + "num_tokens": 6288875.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 325, + "step_time": 162.16153999976814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 747.1875, + "completions/mean_terminated_length": 621.3636474609375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.40150564908981323, + "epoch": 1.7248677248677249, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.0012618422042578459, + "learning_rate": 8.071063563448339e-07, + "loss": 0.0, + "num_tokens": 6307974.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 1.0, + "rewards/itbench_correctness/std": 0.0, + "step": 326, + "step_time": 658.432337153703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 683.375, + "completions/mean_terminated_length": 683.375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4097311198711395, + "epoch": 1.7301587301587302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.0016633245395496488, + "learning_rate": 8.057998594759022e-07, + "loss": -0.0924, + "num_tokens": 6323180.0, + "reward": 0.45098039507865906, + "reward_std": 0.24446815252304077, + "rewards/itbench_correctness/mean": 0.45098039507865906, + "rewards/itbench_correctness/std": 0.37708234786987305, + "step": 327, + "step_time": 183.20953813474625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 992.0625, + "completions/mean_terminated_length": 938.8333740234375, + "completions/min_length": 773.0, + "completions/min_terminated_length": 773.0, + "entropy": 0.4838404953479767, + "epoch": 1.7354497354497354, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.0014802846126258373, + "learning_rate": 8.044900184287006e-07, + "loss": -0.0402, + "num_tokens": 6345661.0, + "reward": 0.7080707550048828, + "reward_std": 0.3341723680496216, + "rewards/itbench_correctness/mean": 0.7080707550048828, + "rewards/itbench_correctness/std": 0.33529266715049744, + "step": 328, + "step_time": 849.2058257460594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 852.4375, + "completions/mean_terminated_length": 827.9285888671875, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.4762812554836273, + "epoch": 1.7407407407407407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.0015186754753813148, + "learning_rate": 8.031768475274412e-07, + "loss": 0.0195, + "num_tokens": 6375004.0, + "reward": 0.4166666865348816, + "reward_std": 0.3177001476287842, + "rewards/itbench_correctness/mean": 0.4166666865348816, + "rewards/itbench_correctness/std": 0.42163705825805664, + "step": 329, + "step_time": 127.72529877442867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 461.375, + "completions/mean_terminated_length": 423.86669921875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.353291779756546, + "epoch": 1.746031746031746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.125, + "kl": 0.0018113987753167748, + "learning_rate": 8.018603611327504e-07, + "loss": -0.0122, + "num_tokens": 6384650.0, + "reward": 0.53125, + "reward_std": 0.0578637570142746, + "rewards/itbench_correctness/mean": 0.53125, + "rewards/itbench_correctness/std": 0.08539126068353653, + "step": 330, + "step_time": 139.3600283851847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 869.3125, + "completions/mean_terminated_length": 611.5, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.39111366868019104, + "epoch": 1.7513227513227512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.0013434779830276966, + "learning_rate": 8.005405736415125e-07, + "loss": -0.0235, + "num_tokens": 6410383.0, + "reward": 0.6458333730697632, + "reward_std": 0.37862008810043335, + "rewards/itbench_correctness/mean": 0.6458333730697632, + "rewards/itbench_correctness/std": 0.36704525351524353, + "step": 331, + "step_time": 760.4793853284791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 822.4375, + "completions/mean_terminated_length": 730.8181762695312, + "completions/min_length": 566.0, + "completions/min_terminated_length": 566.0, + "entropy": 0.2966790795326233, + "epoch": 1.7566137566137567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.0013310646172612906, + "learning_rate": 7.992174994867123e-07, + "loss": 0.0379, + "num_tokens": 6431670.0, + "reward": 0.4851190745830536, + "reward_std": 0.19627538323402405, + "rewards/itbench_correctness/mean": 0.4851190745830536, + "rewards/itbench_correctness/std": 0.2754608690738678, + "step": 332, + "step_time": 100.20586761180311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 540.0, + "completions/mean_terminated_length": 540.0, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.5, + "epoch": 1.7619047619047619, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.640625, + "kl": 0.0020073342602699995, + "learning_rate": 7.978911531372764e-07, + "loss": -0.007, + "num_tokens": 6452334.0, + "reward": 0.25, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 333, + "step_time": 130.50346516724676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 894.25, + "completions/mean_terminated_length": 727.4285888671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5636007785797119, + "epoch": 1.7671957671957672, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.0019210126483812928, + "learning_rate": 7.965615490979163e-07, + "loss": -0.0575, + "num_tokens": 6485010.0, + "reward": 0.3125, + "reward_std": 0.3924052119255066, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 334, + "step_time": 287.8023096676916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 860.6875, + "completions/mean_terminated_length": 697.375, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "entropy": 0.5019243359565735, + "epoch": 1.7724867724867726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3203125, + "kl": 0.0012930968077853322, + "learning_rate": 7.952287019089685e-07, + "loss": 0.001, + "num_tokens": 6506909.0, + "reward": 0.956250011920929, + "reward_std": 0.086344413459301, + "rewards/itbench_correctness/mean": 0.956250011920929, + "rewards/itbench_correctness/std": 0.1263262927532196, + "step": 335, + "step_time": 81.09772168658674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 642.625, + "completions/mean_terminated_length": 642.625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.38280490040779114, + "epoch": 1.7777777777777777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025146484375, + "kl": 0.0013457894092425704, + "learning_rate": 7.938926261462365e-07, + "loss": 0.0, + "num_tokens": 6521703.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 1.0, + "rewards/itbench_correctness/std": 0.0, + "step": 336, + "step_time": 95.01154231280088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 756.6875, + "completions/mean_terminated_length": 548.7777709960938, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "entropy": 0.306599497795105, + "epoch": 1.783068783068783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.0012715982738882303, + "learning_rate": 7.925533364208308e-07, + "loss": -0.0117, + "num_tokens": 6541690.0, + "reward": 0.25, + "reward_std": 0.4355512857437134, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 337, + "step_time": 142.65012488793582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 928.625, + "completions/mean_terminated_length": 769.6666870117188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3445954918861389, + "epoch": 1.7883597883597884, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.0012255455367267132, + "learning_rate": 7.912108473790091e-07, + "loss": -0.0833, + "num_tokens": 6563700.0, + "reward": 0.36250001192092896, + "reward_std": 0.22638463973999023, + "rewards/itbench_correctness/mean": 0.36250001192092896, + "rewards/itbench_correctness/std": 0.4856267273426056, + "step": 338, + "step_time": 146.54249787330627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 537.0625, + "completions/mean_terminated_length": 537.0625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.333294540643692, + "epoch": 1.7936507936507935, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.0017877722857519984, + "learning_rate": 7.898651737020166e-07, + "loss": -0.092, + "num_tokens": 6577333.0, + "reward": 0.875568151473999, + "reward_std": 0.17176605761051178, + "rewards/itbench_correctness/mean": 0.875568151473999, + "rewards/itbench_correctness/std": 0.24365714192390442, + "step": 339, + "step_time": 63.54152914788574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 494.625, + "completions/mean_terminated_length": 494.625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "entropy": 0.414455384016037, + "epoch": 1.798941798941799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.0016418452141806483, + "learning_rate": 7.88516330105925e-07, + "loss": 0.0023, + "num_tokens": 6588839.0, + "reward": 0.46875, + "reward_std": 0.23779192566871643, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.23935678601264954, + "step": 340, + "step_time": 421.9616943122819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 897.6875, + "completions/mean_terminated_length": 771.375, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.4611849784851074, + "epoch": 1.8042328042328042, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3984375, + "kl": 0.0015902061713859439, + "learning_rate": 7.871643313414718e-07, + "loss": 0.0001, + "num_tokens": 6620194.0, + "reward": 0.4583333432674408, + "reward_std": 0.07715166360139847, + "rewards/itbench_correctness/mean": 0.4583333432674408, + "rewards/itbench_correctness/std": 0.4849589467048645, + "step": 341, + "step_time": 85.41169494390488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 788.5625, + "completions/mean_terminated_length": 710.0833740234375, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "entropy": 0.5148609280586243, + "epoch": 1.8095238095238095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.0013109652791172266, + "learning_rate": 7.858091921938987e-07, + "loss": 0.0209, + "num_tokens": 6637355.0, + "reward": 0.49609375, + "reward_std": 0.31232208013534546, + "rewards/itbench_correctness/mean": 0.49609375, + "rewards/itbench_correctness/std": 0.42540958523750305, + "step": 342, + "step_time": 104.21269215922803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 768.9375, + "completions/mean_terminated_length": 513.875, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.3641388416290283, + "epoch": 1.8148148148148149, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1640625, + "kl": 0.0012282090028747916, + "learning_rate": 7.844509274827906e-07, + "loss": 0.0, + "num_tokens": 6656498.0, + "reward": 0.84375, + "reward_std": 0.08258593082427979, + "rewards/itbench_correctness/mean": 0.84375, + "rewards/itbench_correctness/std": 0.19690898060798645, + "step": 343, + "step_time": 250.37473237421364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 728.8125, + "completions/mean_terminated_length": 551.7000122070312, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.4253494441509247, + "epoch": 1.82010582010582, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.001717406790703535, + "learning_rate": 7.830895520619128e-07, + "loss": 0.0331, + "num_tokens": 6673527.0, + "reward": 0.3839285671710968, + "reward_std": 0.24145764112472534, + "rewards/itbench_correctness/mean": 0.3839285671710968, + "rewards/itbench_correctness/std": 0.41063666343688965, + "step": 344, + "step_time": 111.02232545148581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 734.6875, + "completions/mean_terminated_length": 509.6666564941406, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.43555933237075806, + "epoch": 1.8253968253968254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296875, + "kl": 0.001631794380955398, + "learning_rate": 7.817250808190483e-07, + "loss": 0.0, + "num_tokens": 6696034.0, + "reward": 0.4285714328289032, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.4285714328289032, + "rewards/itbench_correctness/std": 0.4426266849040985, + "step": 345, + "step_time": 991.8486757231876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 619.3125, + "completions/mean_terminated_length": 619.3125, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 0.4714905619621277, + "epoch": 1.8306878306878307, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0546875, + "kl": 0.001351330429315567, + "learning_rate": 7.803575286758363e-07, + "loss": 0.0057, + "num_tokens": 6714223.0, + "reward": 0.875, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.22360680997371674, + "step": 346, + "step_time": 114.43643134180456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 641.6875, + "completions/mean_terminated_length": 641.6875, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "entropy": 0.3397292196750641, + "epoch": 1.8359788359788358, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.15625, + "kl": 0.000967301893979311, + "learning_rate": 7.789869105876082e-07, + "loss": 0.0277, + "num_tokens": 6727946.0, + "reward": 0.9375, + "reward_std": 0.03857583552598953, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.08333335071802139, + "step": 347, + "step_time": 799.3260576492175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 940.0625, + "completions/mean_terminated_length": 688.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4978392422199249, + "epoch": 1.8412698412698414, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.00129983713850379, + "learning_rate": 7.776132415432232e-07, + "loss": -0.0185, + "num_tokens": 6748563.0, + "reward": 0.2109375, + "reward_std": 0.37981581687927246, + "rewards/itbench_correctness/mean": 0.2109375, + "rewards/itbench_correctness/std": 0.3783702850341797, + "step": 348, + "step_time": 79.58014123514295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 987.375, + "completions/mean_terminated_length": 877.5, + "completions/min_length": 806.0, + "completions/min_terminated_length": 806.0, + "entropy": 0.5266489386558533, + "epoch": 1.8465608465608465, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.001141308806836605, + "learning_rate": 7.762365365649067e-07, + "loss": 0.0426, + "num_tokens": 6769345.0, + "reward": 0.28125, + "reward_std": 0.32512497901916504, + "rewards/itbench_correctness/mean": 0.28125, + "rewards/itbench_correctness/std": 0.4366062581539154, + "step": 349, + "step_time": 74.42612945474684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 600.4375, + "completions/mean_terminated_length": 572.2000122070312, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.2631414532661438, + "epoch": 1.8518518518518519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.0013918590266257524, + "learning_rate": 7.74856810708083e-07, + "loss": -0.0166, + "num_tokens": 6783368.0, + "reward": 0.4817708432674408, + "reward_std": 0.2546592652797699, + "rewards/itbench_correctness/mean": 0.4817708432674408, + "rewards/itbench_correctness/std": 0.2613040506839752, + "step": 350, + "step_time": 84.01541598606855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 432.25, + "completions/mean_terminated_length": 432.25, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "entropy": 0.37478309869766235, + "epoch": 1.8571428571428572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.0021659955382347107, + "learning_rate": 7.734740790612136e-07, + "loss": -0.0002, + "num_tokens": 6793332.0, + "reward": 0.2847222089767456, + "reward_std": 0.1159602552652359, + "rewards/itbench_correctness/mean": 0.2847222089767456, + "rewards/itbench_correctness/std": 0.19016453623771667, + "step": 351, + "step_time": 52.17074024025351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 744.5625, + "completions/mean_terminated_length": 527.2222290039062, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.44052714109420776, + "epoch": 1.8624338624338623, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.625, + "kl": 0.0016061851056292653, + "learning_rate": 7.720883567456298e-07, + "loss": 0.0084, + "num_tokens": 6812589.0, + "reward": 0.78125, + "reward_std": 0.23543904721736908, + "rewards/itbench_correctness/mean": 0.78125, + "rewards/itbench_correctness/std": 0.39308255910873413, + "step": 352, + "step_time": 82.14024385716766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 459.25, + "completions/mean_terminated_length": 459.25, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "entropy": 0.5182362794876099, + "epoch": 1.8677248677248677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.0017120328266173601, + "learning_rate": 7.706996589153689e-07, + "loss": 0.0197, + "num_tokens": 6822457.0, + "reward": 0.53125, + "reward_std": 0.38816186785697937, + "rewards/itbench_correctness/mean": 0.53125, + "rewards/itbench_correctness/std": 0.4069705307483673, + "step": 353, + "step_time": 141.23254205007106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 991.625, + "completions/mean_terminated_length": 920.4000244140625, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.3872431516647339, + "epoch": 1.873015873015873, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2734375, + "kl": 0.0011720954207703471, + "learning_rate": 7.693080007570083e-07, + "loss": -0.0084, + "num_tokens": 6850571.0, + "reward": 0.03125, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.03125, + "rewards/itbench_correctness/std": 0.125, + "step": 354, + "step_time": 552.564743893221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 522.0, + "completions/mean_terminated_length": 522.0, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "entropy": 0.517241358757019, + "epoch": 1.8783068783068781, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.0022999588400125504, + "learning_rate": 7.679133974894982e-07, + "loss": -0.0017, + "num_tokens": 6861683.0, + "reward": 0.40625, + "reward_std": 0.2882373631000519, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.42149975895881653, + "step": 355, + "step_time": 615.0445311861113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 993.8125, + "completions/mean_terminated_length": 927.4000244140625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.4246273934841156, + "epoch": 1.8835978835978837, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.001227955799549818, + "learning_rate": 7.665158643639969e-07, + "loss": 0.022, + "num_tokens": 6892520.0, + "reward": 0.1294642835855484, + "reward_std": 0.2322283834218979, + "rewards/itbench_correctness/mean": 0.1294642835855484, + "rewards/itbench_correctness/std": 0.2531687021255493, + "step": 356, + "step_time": 94.74817245267332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 985.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 760.8125, + "completions/mean_terminated_length": 760.8125, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 0.316766619682312, + "epoch": 1.8888888888888888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.0009881765581667423, + "learning_rate": 7.651154166637024e-07, + "loss": 0.0684, + "num_tokens": 6910381.0, + "reward": 0.7250000238418579, + "reward_std": 0.28192007541656494, + "rewards/itbench_correctness/mean": 0.7250000238418579, + "rewards/itbench_correctness/std": 0.3696845769882202, + "step": 357, + "step_time": 73.87886378820986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 723.5, + "completions/mean_terminated_length": 680.5714721679688, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "entropy": 0.43676573038101196, + "epoch": 1.8941798941798942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.25, + "kl": 0.00232238182798028, + "learning_rate": 7.637120697036865e-07, + "loss": -0.0125, + "num_tokens": 6932685.0, + "reward": 0.9479166865348816, + "reward_std": 0.019287927076220512, + "rewards/itbench_correctness/mean": 0.9479166865348816, + "rewards/itbench_correctness/std": 0.05989960953593254, + "step": 358, + "step_time": 316.1507151676342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 980.3125, + "completions/mean_terminated_length": 849.25, + "completions/min_length": 761.0, + "completions/min_terminated_length": 761.0, + "entropy": 0.5059611201286316, + "epoch": 1.8994708994708995, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4375, + "kl": 0.0012296068016439676, + "learning_rate": 7.623058388307268e-07, + "loss": 0.0303, + "num_tokens": 6960266.0, + "reward": 0.015625, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.015625, + "rewards/itbench_correctness/std": 0.0625, + "step": 359, + "step_time": 104.70608714781702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 538.4375, + "completions/mean_terminated_length": 538.4375, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.3045850396156311, + "epoch": 1.9047619047619047, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.001858119503594935, + "learning_rate": 7.608967394231386e-07, + "loss": 0.0023, + "num_tokens": 6972577.0, + "reward": 0.8125, + "reward_std": 0.2982703447341919, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.3354102075099945, + "step": 360, + "step_time": 63.012714352458715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 982.0625, + "completions/mean_terminated_length": 353.0, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "entropy": 0.42563483119010925, + "epoch": 1.91005291005291, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.0017107086023315787, + "learning_rate": 7.594847868906076e-07, + "loss": -0.0493, + "num_tokens": 6998698.0, + "reward": 0.6770833730697632, + "reward_std": 0.2745841145515442, + "rewards/itbench_correctness/mean": 0.6770833730697632, + "rewards/itbench_correctness/std": 0.3303687572479248, + "step": 361, + "step_time": 104.44509523361921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 527.0625, + "completions/mean_terminated_length": 527.0625, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "entropy": 0.46863511204719543, + "epoch": 1.9153439153439153, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.390625, + "kl": 0.0021068877540528774, + "learning_rate": 7.5806999667402e-07, + "loss": 0.0109, + "num_tokens": 7018539.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 362, + "step_time": 93.05886326078326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 664.125, + "completions/mean_terminated_length": 664.125, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.41859591007232666, + "epoch": 1.9206349206349205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.001816941425204277, + "learning_rate": 7.566523842452956e-07, + "loss": 0.0, + "num_tokens": 7034125.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 363, + "step_time": 467.18558633420616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 868.125, + "completions/mean_terminated_length": 712.25, + "completions/min_length": 621.0, + "completions/min_terminated_length": 621.0, + "entropy": 0.49992799758911133, + "epoch": 1.925925925925926, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6953125, + "kl": 0.0011530888732522726, + "learning_rate": 7.552319651072163e-07, + "loss": 0.0, + "num_tokens": 7056823.0, + "reward": 0.5546875, + "reward_std": 0.09704047441482544, + "rewards/itbench_correctness/mean": 0.5546875, + "rewards/itbench_correctness/std": 0.3563961982727051, + "step": 364, + "step_time": 246.65094076655805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 850.6875, + "completions/mean_terminated_length": 677.375, + "completions/min_length": 563.0, + "completions/min_terminated_length": 563.0, + "entropy": 0.46785688400268555, + "epoch": 1.9312169312169312, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4921875, + "kl": 0.001800206839106977, + "learning_rate": 7.538087547932584e-07, + "loss": 0.0001, + "num_tokens": 7084386.0, + "reward": 0.6041666865348816, + "reward_std": 0.07386711239814758, + "rewards/itbench_correctness/mean": 0.6041666865348816, + "rewards/itbench_correctness/std": 0.42108768224716187, + "step": 365, + "step_time": 140.03905525244772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 627.0625, + "completions/mean_terminated_length": 600.6000366210938, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "entropy": 0.5358317494392395, + "epoch": 1.9365079365079365, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.328125, + "kl": 0.0013238092651590705, + "learning_rate": 7.523827688674219e-07, + "loss": 0.0181, + "num_tokens": 7100155.0, + "reward": 0.65625, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.65625, + "rewards/itbench_correctness/std": 0.3966001570224762, + "step": 366, + "step_time": 214.51375654805452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 813.6875, + "completions/mean_terminated_length": 650.1111450195312, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.7029725909233093, + "epoch": 1.9417989417989419, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01507568359375, + "kl": 0.0012179253390058875, + "learning_rate": 7.509540229240601e-07, + "loss": 0.0, + "num_tokens": 7147734.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 367, + "step_time": 142.35515129286796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 393.875, + "completions/mean_terminated_length": 393.875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "entropy": 0.35036495327949524, + "epoch": 1.947089947089947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01324462890625, + "kl": 0.0013395985588431358, + "learning_rate": 7.495225325877103e-07, + "loss": 0.0, + "num_tokens": 7157244.0, + "reward": 0.0833333358168602, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0833333358168602, + "rewards/itbench_correctness/std": 0.08606629818677902, + "step": 368, + "step_time": 50.80347699671984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 622.8125, + "completions/mean_terminated_length": 530.2307739257812, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 0.34520822763442993, + "epoch": 1.9523809523809523, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.002235216787084937, + "learning_rate": 7.480883135129211e-07, + "loss": -0.0501, + "num_tokens": 7173433.0, + "reward": 0.5, + "reward_std": 0.4629100561141968, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 369, + "step_time": 573.6945302598178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 703.75, + "completions/mean_terminated_length": 558.1818237304688, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 0.47744226455688477, + "epoch": 1.9576719576719577, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5078125, + "kl": 0.0014034686610102654, + "learning_rate": 7.466513813840824e-07, + "loss": 0.0179, + "num_tokens": 7199237.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 370, + "step_time": 871.0688090631738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 799.0, + "completions/mean_terminated_length": 574.0, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "entropy": 0.5306633114814758, + "epoch": 1.9629629629629628, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.0018356168875470757, + "learning_rate": 7.452117519152541e-07, + "loss": 0.0105, + "num_tokens": 7218325.0, + "reward": 0.21250000596046448, + "reward_std": 0.21977336704730988, + "rewards/itbench_correctness/mean": 0.21250000596046448, + "rewards/itbench_correctness/std": 0.239095538854599, + "step": 371, + "step_time": 154.47280544694513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 673.0625, + "completions/mean_terminated_length": 649.6666870117188, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "entropy": 0.3461788594722748, + "epoch": 1.9682539682539684, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.671875, + "kl": 0.0013453871943056583, + "learning_rate": 7.437694408499932e-07, + "loss": -0.0357, + "num_tokens": 7233958.0, + "reward": 0.8125, + "reward_std": 0.13908715546131134, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.2713136672973633, + "step": 372, + "step_time": 89.71556733455509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 590.9375, + "completions/mean_terminated_length": 590.9375, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.3553675413131714, + "epoch": 1.9735449735449735, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.98046875, + "kl": 0.0020165969617664814, + "learning_rate": 7.423244639611826e-07, + "loss": -0.0436, + "num_tokens": 7253629.0, + "reward": 0.125, + "reward_std": 0.10564428567886353, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.19364917278289795, + "step": 373, + "step_time": 113.8127332655713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 715.875, + "completions/mean_terminated_length": 407.75, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5308189392089844, + "epoch": 1.9788359788359788, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.0015742821851745248, + "learning_rate": 7.408768370508576e-07, + "loss": -0.0489, + "num_tokens": 7276483.0, + "reward": 0.4375, + "reward_std": 0.4082317352294922, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 374, + "step_time": 63.292789563536644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 612.8125, + "completions/mean_terminated_length": 612.8125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.40632331371307373, + "epoch": 1.9841269841269842, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.89453125, + "kl": 0.0012454271782189608, + "learning_rate": 7.394265759500347e-07, + "loss": -0.015, + "num_tokens": 7296752.0, + "reward": 0.40937501192092896, + "reward_std": 0.16952534019947052, + "rewards/itbench_correctness/mean": 0.40937501192092896, + "rewards/itbench_correctness/std": 0.3946385681629181, + "step": 375, + "step_time": 100.02406275831163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 618.0625, + "completions/mean_terminated_length": 302.3333435058594, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4271412789821625, + "epoch": 1.9894179894179893, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.390625, + "kl": 0.0022359276190400124, + "learning_rate": 7.379736965185368e-07, + "loss": -0.0336, + "num_tokens": 7315561.0, + "reward": 0.02083333395421505, + "reward_std": 0.03857583925127983, + "rewards/itbench_correctness/mean": 0.02083333395421505, + "rewards/itbench_correctness/std": 0.05692750960588455, + "step": 376, + "step_time": 119.51777216419578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 737.875, + "completions/mean_terminated_length": 451.75, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "entropy": 0.4445197284221649, + "epoch": 1.9947089947089947, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.234375, + "kl": 0.0014217497082427144, + "learning_rate": 7.365182146448204e-07, + "loss": 0.006, + "num_tokens": 7339351.0, + "reward": 0.46875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.4989572763442993, + "step": 377, + "step_time": 101.3526473660022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 957.8125, + "completions/mean_terminated_length": 872.7142944335938, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "entropy": 0.5074061751365662, + "epoch": 2.0, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4765625, + "kl": 0.0012597291497513652, + "learning_rate": 7.350601462458024e-07, + "loss": 0.0172, + "num_tokens": 7379628.0, + "reward": 0.40625, + "reward_std": 0.08210401982069016, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.43430978059768677, + "step": 378, + "step_time": 126.26444634236395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 574.6875, + "completions/mean_terminated_length": 574.6875, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "entropy": 0.4402392506599426, + "epoch": 2.005291005291005, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0262451171875, + "kl": 0.0012956882128491998, + "learning_rate": 7.335995072666847e-07, + "loss": 0.0, + "num_tokens": 7393239.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 379, + "step_time": 1022.2069364916533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 945.5625, + "completions/mean_terminated_length": 844.7142944335938, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.5710886120796204, + "epoch": 2.0105820105820107, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.0013581872917711735, + "learning_rate": 7.321363136807818e-07, + "loss": 0.0597, + "num_tokens": 7419288.0, + "reward": 0.49687498807907104, + "reward_std": 0.14920906722545624, + "rewards/itbench_correctness/mean": 0.49687498807907104, + "rewards/itbench_correctness/std": 0.4410097301006317, + "step": 380, + "step_time": 104.49046333320439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 816.75, + "completions/mean_terminated_length": 747.6666870117188, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "entropy": 0.38934803009033203, + "epoch": 2.015873015873016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.001262744888663292, + "learning_rate": 7.306705814893439e-07, + "loss": -0.0128, + "num_tokens": 7437700.0, + "reward": 0.4375, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.4281744360923767, + "step": 381, + "step_time": 86.5950373802334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 804.875, + "completions/mean_terminated_length": 790.2667236328125, + "completions/min_length": 570.0, + "completions/min_terminated_length": 570.0, + "entropy": 0.47212299704551697, + "epoch": 2.0211640211640214, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.484375, + "kl": 0.0021258334163576365, + "learning_rate": 7.292023267213835e-07, + "loss": -0.0265, + "num_tokens": 7457698.0, + "reward": 0.75, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 382, + "step_time": 89.82852033432573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 692.875, + "completions/mean_terminated_length": 670.800048828125, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "entropy": 0.26556017994880676, + "epoch": 2.0264550264550265, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.0009645685204304755, + "learning_rate": 7.277315654334996e-07, + "loss": -0.0005, + "num_tokens": 7474384.0, + "reward": 0.796875, + "reward_std": 0.11100947856903076, + "rewards/itbench_correctness/mean": 0.796875, + "rewards/itbench_correctness/std": 0.1359764039516449, + "step": 383, + "step_time": 838.6957097211853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 654.375, + "completions/mean_terminated_length": 601.5714721679688, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.39732569456100464, + "epoch": 2.0317460317460316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.0012180046178400517, + "learning_rate": 7.262583137097018e-07, + "loss": 0.0288, + "num_tokens": 7490990.0, + "reward": 0.6614583134651184, + "reward_std": 0.30096644163131714, + "rewards/itbench_correctness/mean": 0.6614583134651184, + "rewards/itbench_correctness/std": 0.2926076054573059, + "step": 384, + "step_time": 613.9163551460952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 685.25, + "completions/mean_terminated_length": 662.6666870117188, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.41152864694595337, + "epoch": 2.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.0014390156138688326, + "learning_rate": 7.247825876612352e-07, + "loss": -0.0331, + "num_tokens": 7505778.0, + "reward": 0.3324652910232544, + "reward_std": 0.2620442807674408, + "rewards/itbench_correctness/mean": 0.3324652910232544, + "rewards/itbench_correctness/std": 0.35357046127319336, + "step": 385, + "step_time": 686.7393183000386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 706.1875, + "completions/mean_terminated_length": 561.727294921875, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.4871227443218231, + "epoch": 2.0423280423280423, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.125, + "kl": 0.0018794104689732194, + "learning_rate": 7.233044034264033e-07, + "loss": 0.0135, + "num_tokens": 7523709.0, + "reward": 0.9829545617103577, + "reward_std": 0.023524951189756393, + "rewards/itbench_correctness/mean": 0.9829545617103577, + "rewards/itbench_correctness/std": 0.03664661943912506, + "step": 386, + "step_time": 79.07936265133321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 968.625, + "completions/mean_terminated_length": 913.25, + "completions/min_length": 798.0, + "completions/min_terminated_length": 798.0, + "entropy": 0.35927215218544006, + "epoch": 2.0476190476190474, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.001336527056992054, + "learning_rate": 7.21823777170392e-07, + "loss": -0.0067, + "num_tokens": 7549655.0, + "reward": 0.5, + "reward_std": 0.3729091286659241, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.38490018248558044, + "step": 387, + "step_time": 140.9922649441287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 678.0625, + "completions/mean_terminated_length": 470.5, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "entropy": 0.45128583908081055, + "epoch": 2.052910052910053, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.328125, + "kl": 0.0015049315989017487, + "learning_rate": 7.203407250850928e-07, + "loss": -0.0119, + "num_tokens": 7567664.0, + "reward": 0.859375, + "reward_std": 0.19408094882965088, + "rewards/itbench_correctness/mean": 0.859375, + "rewards/itbench_correctness/std": 0.30233466625213623, + "step": 388, + "step_time": 802.1390054896474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 659.5625, + "completions/mean_terminated_length": 635.2667236328125, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 0.36842605471611023, + "epoch": 2.058201058201058, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0078125, + "kl": 0.000988945015706122, + "learning_rate": 7.188552633889259e-07, + "loss": 0.0073, + "num_tokens": 7583641.0, + "reward": 0.40625, + "reward_std": 0.01767767034471035, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.420267790555954, + "step": 389, + "step_time": 102.60351053066552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 627.5, + "completions/mean_terminated_length": 536.0, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.4270916283130646, + "epoch": 2.0634920634920633, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.0013897939352318645, + "learning_rate": 7.173674083266623e-07, + "loss": -0.0168, + "num_tokens": 7597833.0, + "reward": 0.4437499940395355, + "reward_std": 0.08210402727127075, + "rewards/itbench_correctness/mean": 0.4437499940395355, + "rewards/itbench_correctness/std": 0.4718315303325653, + "step": 390, + "step_time": 941.484922320582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 509.375, + "completions/mean_terminated_length": 509.375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.4750920236110687, + "epoch": 2.068783068783069, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.98828125, + "kl": 0.001085254829376936, + "learning_rate": 7.158771761692464e-07, + "loss": -0.0065, + "num_tokens": 7608767.0, + "reward": 0.6937500238418579, + "reward_std": 0.1399936079978943, + "rewards/itbench_correctness/mean": 0.6937500238418579, + "rewards/itbench_correctness/std": 0.3696281909942627, + "step": 391, + "step_time": 484.2868151040748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 783.5, + "completions/mean_terminated_length": 543.0, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.5437141060829163, + "epoch": 2.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.0014096169034019113, + "learning_rate": 7.143845832136187e-07, + "loss": 0.0198, + "num_tokens": 7629503.0, + "reward": 0.390625, + "reward_std": 0.35405686497688293, + "rewards/itbench_correctness/mean": 0.390625, + "rewards/itbench_correctness/std": 0.40019527077674866, + "step": 392, + "step_time": 247.14675129018724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 411.0, + "completions/mean_terminated_length": 411.0, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "entropy": 0.4428223967552185, + "epoch": 2.0793650793650795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.002443905221298337, + "learning_rate": 7.128896457825363e-07, + "loss": -0.0199, + "num_tokens": 7638431.0, + "reward": 0.42500001192092896, + "reward_std": 0.1060660183429718, + "rewards/itbench_correctness/mean": 0.42500001192092896, + "rewards/itbench_correctness/std": 0.12382783740758896, + "step": 393, + "step_time": 81.88051935099065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 575.75, + "completions/mean_terminated_length": 575.75, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.33174121379852295, + "epoch": 2.0846560846560847, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.002629755064845085, + "learning_rate": 7.113923802243956e-07, + "loss": -0.0546, + "num_tokens": 7652163.0, + "reward": 0.3479166626930237, + "reward_std": 0.15140824019908905, + "rewards/itbench_correctness/mean": 0.3479166626930237, + "rewards/itbench_correctness/std": 0.14930394291877747, + "step": 394, + "step_time": 82.69239473901689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 812.25, + "completions/mean_terminated_length": 741.6666870117188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5195444822311401, + "epoch": 2.0899470899470898, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3203125, + "kl": 0.0016354058170691133, + "learning_rate": 7.098928029130528e-07, + "loss": -0.0397, + "num_tokens": 7684783.0, + "reward": 0.8125, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 395, + "step_time": 280.40570612065494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 763.375, + "completions/mean_terminated_length": 726.1428833007812, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.5344686508178711, + "epoch": 2.0952380952380953, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.0014837183989584446, + "learning_rate": 7.083909302476452e-07, + "loss": 0.0739, + "num_tokens": 7704229.0, + "reward": 0.2749999761581421, + "reward_std": 0.17728103697299957, + "rewards/itbench_correctness/mean": 0.2749999761581421, + "rewards/itbench_correctness/std": 0.3732738196849823, + "step": 396, + "step_time": 85.86852881591767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.0, + "completions/max_terminated_length": 741.0, + "completions/mean_length": 552.5625, + "completions/mean_terminated_length": 552.5625, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "entropy": 0.46148625016212463, + "epoch": 2.1005291005291005, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.984375, + "kl": 0.0017779659247025847, + "learning_rate": 7.068867786524115e-07, + "loss": 0.0021, + "num_tokens": 7725262.0, + "reward": 0.6354166269302368, + "reward_std": 0.36623916029930115, + "rewards/itbench_correctness/mean": 0.6354166269302368, + "rewards/itbench_correctness/std": 0.41373974084854126, + "step": 397, + "step_time": 108.55369200650603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 638.5, + "completions/mean_terminated_length": 463.2727355957031, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "entropy": 0.46045419573783875, + "epoch": 2.105820105820106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.0019145008409395814, + "learning_rate": 7.053803645765127e-07, + "loss": -0.0524, + "num_tokens": 7743718.0, + "reward": 0.3229166567325592, + "reward_std": 0.18293291330337524, + "rewards/itbench_correctness/mean": 0.3229166567325592, + "rewards/itbench_correctness/std": 0.25783106684684753, + "step": 398, + "step_time": 124.76364956516773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 718.875, + "completions/mean_terminated_length": 481.5555725097656, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.3672404885292053, + "epoch": 2.111111111111111, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3125, + "kl": 0.001165916328318417, + "learning_rate": 7.038717044938518e-07, + "loss": 0.0019, + "num_tokens": 7766796.0, + "reward": 0.4196428656578064, + "reward_std": 0.11090338230133057, + "rewards/itbench_correctness/mean": 0.4196428656578064, + "rewards/itbench_correctness/std": 0.45912888646125793, + "step": 399, + "step_time": 92.30510796047747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 742.4375, + "completions/mean_terminated_length": 702.2142944335938, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "entropy": 0.4337065517902374, + "epoch": 2.1164021164021163, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.25, + "kl": 0.0014574953820556402, + "learning_rate": 7.023608149028936e-07, + "loss": -0.0175, + "num_tokens": 7783843.0, + "reward": 0.8125, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 400, + "step_time": 163.5410966835916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 410.75, + "completions/mean_terminated_length": 410.75, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "entropy": 0.41387704014778137, + "epoch": 2.121693121693122, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.91796875, + "kl": 0.001556264702230692, + "learning_rate": 7.008477123264847e-07, + "loss": -0.0122, + "num_tokens": 7792695.0, + "reward": 0.625, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.4281744360923767, + "step": 401, + "step_time": 68.41044199559838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 720.375, + "completions/mean_terminated_length": 538.2000122070312, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.3942390978336334, + "epoch": 2.126984126984127, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5234375, + "kl": 0.0016359214205294847, + "learning_rate": 6.993324133116725e-07, + "loss": 0.026, + "num_tokens": 7820917.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 402, + "step_time": 80.19952200446278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 813.4375, + "completions/mean_terminated_length": 649.6666870117188, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "entropy": 0.4278140664100647, + "epoch": 2.132275132275132, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.125, + "kl": 0.0014102754648774862, + "learning_rate": 6.978149344295241e-07, + "loss": 0.0034, + "num_tokens": 7839988.0, + "reward": 0.8035714626312256, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.8035714626312256, + "rewards/itbench_correctness/std": 0.1907735913991928, + "step": 403, + "step_time": 899.5258836848661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 411.5625, + "completions/mean_terminated_length": 411.5625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.4130599796772003, + "epoch": 2.1375661375661377, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7421875, + "kl": 0.0012595870066434145, + "learning_rate": 6.962952922749457e-07, + "loss": -0.051, + "num_tokens": 7853909.0, + "reward": 0.39008620381355286, + "reward_std": 0.2060610055923462, + "rewards/itbench_correctness/mean": 0.39008620381355286, + "rewards/itbench_correctness/std": 0.4915003180503845, + "step": 404, + "step_time": 101.09288472961634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 735.25, + "completions/mean_terminated_length": 604.0, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.5277116894721985, + "epoch": 2.142857142857143, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.609375, + "kl": 0.0016202793922275305, + "learning_rate": 6.947735034665001e-07, + "loss": 0.0106, + "num_tokens": 7879449.0, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.25, + "step": 405, + "step_time": 199.16461731493473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 484.25, + "completions/mean_terminated_length": 484.25, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "entropy": 0.35105833411216736, + "epoch": 2.148148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.0018317453796043992, + "learning_rate": 6.932495846462261e-07, + "loss": 0.0256, + "num_tokens": 7889989.0, + "reward": 0.5866477489471436, + "reward_std": 0.1253555417060852, + "rewards/itbench_correctness/mean": 0.5866477489471436, + "rewards/itbench_correctness/std": 0.4179156720638275, + "step": 406, + "step_time": 137.3452343745157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 487.5, + "completions/mean_terminated_length": 487.5, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.38564103841781616, + "epoch": 2.1534391534391535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.0013662968995049596, + "learning_rate": 6.917235524794558e-07, + "loss": 0.009, + "num_tokens": 7901029.0, + "reward": 0.890625, + "reward_std": 0.22707363963127136, + "rewards/itbench_correctness/mean": 0.890625, + "rewards/itbench_correctness/std": 0.22302372753620148, + "step": 407, + "step_time": 68.24169243406504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 953.25, + "completions/mean_terminated_length": 862.2857666015625, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "entropy": 0.36926305294036865, + "epoch": 2.1587301587301586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.0011503396090120077, + "learning_rate": 6.901954236546324e-07, + "loss": -0.0107, + "num_tokens": 7924657.0, + "reward": 0.5640318393707275, + "reward_std": 0.34312185645103455, + "rewards/itbench_correctness/mean": 0.5640318393707275, + "rewards/itbench_correctness/std": 0.37077954411506653, + "step": 408, + "step_time": 846.7672138344496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 768.3125, + "completions/mean_terminated_length": 569.4444580078125, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.41389408707618713, + "epoch": 2.164021164021164, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1875, + "kl": 0.0032037936616688967, + "learning_rate": 6.886652148831279e-07, + "loss": 0.0154, + "num_tokens": 7947958.0, + "reward": 0.1041666716337204, + "reward_std": 0.19795581698417664, + "rewards/itbench_correctness/mean": 0.1041666716337204, + "rewards/itbench_correctness/std": 0.291070818901062, + "step": 409, + "step_time": 180.43269913457334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 513.1875, + "completions/mean_terminated_length": 513.1875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.378029465675354, + "epoch": 2.1693121693121693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.458984375, + "kl": 0.0019971805159002542, + "learning_rate": 6.871329428990601e-07, + "loss": -0.0628, + "num_tokens": 7959761.0, + "reward": 0.71875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 410, + "step_time": 105.00821590330452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 519.75, + "completions/mean_terminated_length": 519.75, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.4290524423122406, + "epoch": 2.1746031746031744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037841796875, + "kl": 0.0021021170541644096, + "learning_rate": 6.855986244591103e-07, + "loss": 0.0, + "num_tokens": 7976613.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 411, + "step_time": 872.8376494199038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 883.0625, + "completions/mean_terminated_length": 773.4444580078125, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "entropy": 0.5888597965240479, + "epoch": 2.17989417989418, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.7109375, + "kl": 0.002118014730513096, + "learning_rate": 6.840622763423391e-07, + "loss": 0.0001, + "num_tokens": 8008030.0, + "reward": 0.25, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 412, + "step_time": 87.85907210037112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 798.0625, + "completions/mean_terminated_length": 572.125, + "completions/min_length": 490.0, + "completions/min_terminated_length": 490.0, + "entropy": 0.3759104013442993, + "epoch": 2.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.0015699933283030987, + "learning_rate": 6.825239153500029e-07, + "loss": 0.0035, + "num_tokens": 8027279.0, + "reward": 0.4518229365348816, + "reward_std": 0.22612185776233673, + "rewards/itbench_correctness/mean": 0.4518229365348816, + "rewards/itbench_correctness/std": 0.41147592663764954, + "step": 413, + "step_time": 98.12107760738581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 754.125, + "completions/mean_terminated_length": 484.25, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.5648930668830872, + "epoch": 2.1904761904761907, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5234375, + "kl": 0.0014875817578285933, + "learning_rate": 6.809835583053715e-07, + "loss": 0.0, + "num_tokens": 8047225.0, + "reward": 0.6000000238418579, + "reward_std": 0.13887301087379456, + "rewards/itbench_correctness/mean": 0.6000000238418579, + "rewards/itbench_correctness/std": 0.4546060562133789, + "step": 414, + "step_time": 132.7436649715528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 362.0625, + "completions/mean_terminated_length": 362.0625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.4474365711212158, + "epoch": 2.195767195767196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0546875, + "kl": 0.0019536821637302637, + "learning_rate": 6.794412220535425e-07, + "loss": 0.0011, + "num_tokens": 8055514.0, + "reward": 0.21875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.2561737895011902, + "step": 415, + "step_time": 80.36343740858138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 892.25, + "completions/mean_terminated_length": 760.5, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "entropy": 0.542448878288269, + "epoch": 2.201058201058201, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4921875, + "kl": 0.001819648896344006, + "learning_rate": 6.778969234612583e-07, + "loss": 0.0001, + "num_tokens": 8082102.0, + "reward": 0.640625, + "reward_std": 0.0867956355214119, + "rewards/itbench_correctness/mean": 0.640625, + "rewards/itbench_correctness/std": 0.3896446228027344, + "step": 416, + "step_time": 243.75771763175726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 1010.1875, + "completions/mean_terminated_length": 913.5, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "entropy": 0.6018684506416321, + "epoch": 2.2063492063492065, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0012090131640434265, + "learning_rate": 6.763506794167206e-07, + "loss": 0.0067, + "num_tokens": 8108297.0, + "reward": 0.46875, + "reward_std": 0.2346404492855072, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.4905354380607605, + "step": 417, + "step_time": 88.06844450253993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 861.75, + "completions/mean_terminated_length": 764.4000244140625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.5616478323936462, + "epoch": 2.2116402116402116, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.375, + "kl": 0.0016445523360744119, + "learning_rate": 6.748025068294067e-07, + "loss": -0.0042, + "num_tokens": 8138485.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 418, + "step_time": 170.92238603066653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 637.4375, + "completions/mean_terminated_length": 611.6666870117188, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.5302480459213257, + "epoch": 2.2169312169312168, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3359375, + "kl": 0.0013207794399932027, + "learning_rate": 6.732524226298841e-07, + "loss": -0.0056, + "num_tokens": 8171140.0, + "reward": 0.21875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.2561737895011902, + "step": 419, + "step_time": 438.8370144786313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 541.875, + "completions/mean_terminated_length": 541.875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.44844290614128113, + "epoch": 2.2222222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.001750895637087524, + "learning_rate": 6.717004437696249e-07, + "loss": -0.0712, + "num_tokens": 8183074.0, + "reward": 0.796875, + "reward_std": 0.1886717677116394, + "rewards/itbench_correctness/mean": 0.796875, + "rewards/itbench_correctness/std": 0.25634312629699707, + "step": 420, + "step_time": 80.88844703137875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 709.4375, + "completions/mean_terminated_length": 520.7000122070312, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "entropy": 0.28755176067352295, + "epoch": 2.2275132275132274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.001124854083172977, + "learning_rate": 6.701465872208216e-07, + "loss": 0.0004, + "num_tokens": 8200969.0, + "reward": 0.59375, + "reward_std": 0.36201947927474976, + "rewards/itbench_correctness/mean": 0.59375, + "rewards/itbench_correctness/std": 0.3598804175853729, + "step": 421, + "step_time": 135.98304109089077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 456.5625, + "completions/mean_terminated_length": 456.5625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.37234771251678467, + "epoch": 2.2328042328042326, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7578125, + "kl": 0.001798869576305151, + "learning_rate": 6.685908699762001e-07, + "loss": -0.0517, + "num_tokens": 8211418.0, + "reward": 0.3270833492279053, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.3270833492279053, + "rewards/itbench_correctness/std": 0.24227891862392426, + "step": 422, + "step_time": 66.6123378733173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 658.6875, + "completions/mean_terminated_length": 658.6875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.30059778690338135, + "epoch": 2.238095238095238, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.34375, + "kl": 0.00184349634218961, + "learning_rate": 6.670333090488356e-07, + "loss": -0.0076, + "num_tokens": 8227349.0, + "reward": 0.65625, + "reward_std": 0.0578637570142746, + "rewards/itbench_correctness/mean": 0.65625, + "rewards/itbench_correctness/std": 0.08539126068353653, + "step": 423, + "step_time": 148.7072524903342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 991.8125, + "completions/mean_terminated_length": 766.5, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.25408029556274414, + "epoch": 2.2433862433862433, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.296875, + "kl": 0.001193932956084609, + "learning_rate": 6.654739214719641e-07, + "loss": -0.0169, + "num_tokens": 8252114.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 424, + "step_time": 569.5789128560573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 811.375, + "completions/mean_terminated_length": 598.75, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.4436912536621094, + "epoch": 2.248677248677249, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.0011822007363662124, + "learning_rate": 6.639127242987987e-07, + "loss": -0.0078, + "num_tokens": 8271904.0, + "reward": 0.65625, + "reward_std": 0.3243582546710968, + "rewards/itbench_correctness/mean": 0.65625, + "rewards/itbench_correctness/std": 0.3966001570224762, + "step": 425, + "step_time": 137.0775876250118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 793.625, + "completions/mean_terminated_length": 688.9091186523438, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "entropy": 0.44857457280158997, + "epoch": 2.253968253968254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.0012536750873550773, + "learning_rate": 6.623497346023417e-07, + "loss": 0.0162, + "num_tokens": 8290466.0, + "reward": 0.2544642686843872, + "reward_std": 0.20485526323318481, + "rewards/itbench_correctness/mean": 0.2544642686843872, + "rewards/itbench_correctness/std": 0.2508065104484558, + "step": 426, + "step_time": 381.0265443623066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 668.5625, + "completions/mean_terminated_length": 668.5625, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "entropy": 0.4487239420413971, + "epoch": 2.259259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.0014325481606647372, + "learning_rate": 6.607849694751977e-07, + "loss": -0.0007, + "num_tokens": 8311259.0, + "reward": 0.800000011920929, + "reward_std": 0.09974324703216553, + "rewards/itbench_correctness/mean": 0.800000011920929, + "rewards/itbench_correctness/std": 0.10954451560974121, + "step": 427, + "step_time": 78.88445997610688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 581.875, + "completions/mean_terminated_length": 581.875, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.36090224981307983, + "epoch": 2.2645502645502646, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.25, + "kl": 0.0013826474314555526, + "learning_rate": 6.592184460293877e-07, + "loss": 0.0109, + "num_tokens": 8325825.0, + "reward": 0.4765625, + "reward_std": 0.3056884706020355, + "rewards/itbench_correctness/mean": 0.4765625, + "rewards/itbench_correctness/std": 0.4062500298023224, + "step": 428, + "step_time": 880.099565721117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 642.125, + "completions/mean_terminated_length": 587.5714721679688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3862176239490509, + "epoch": 2.2698412698412698, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.21875, + "kl": 0.0022476972080767155, + "learning_rate": 6.576501813961608e-07, + "loss": -0.0962, + "num_tokens": 8351099.0, + "reward": 0.75, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 429, + "step_time": 397.64244225714356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 775.5, + "completions/mean_terminated_length": 718.1538696289062, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 0.5390070676803589, + "epoch": 2.2751322751322753, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.0013746700715273619, + "learning_rate": 6.560801927258079e-07, + "loss": 0.0052, + "num_tokens": 8370627.0, + "reward": 0.6875, + "reward_std": 0.44403791427612305, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 430, + "step_time": 87.13009965512902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 880.875, + "completions/mean_terminated_length": 737.75, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "entropy": 0.5494536757469177, + "epoch": 2.2804232804232805, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.546875, + "kl": 0.0012544175842776895, + "learning_rate": 6.545084971874736e-07, + "loss": 0.0, + "num_tokens": 8397801.0, + "reward": 0.11249999701976776, + "reward_std": 0.09449111670255661, + "rewards/itbench_correctness/mean": 0.11249999701976776, + "rewards/itbench_correctness/std": 0.12974333763122559, + "step": 431, + "step_time": 98.64099729061127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 733.5625, + "completions/mean_terminated_length": 559.2999877929688, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.42532163858413696, + "epoch": 2.2857142857142856, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0014970600605010986, + "learning_rate": 6.529351119689687e-07, + "loss": 0.0263, + "num_tokens": 8413058.0, + "reward": 0.4921875, + "reward_std": 0.10436524450778961, + "rewards/itbench_correctness/mean": 0.4921875, + "rewards/itbench_correctness/std": 0.47096699476242065, + "step": 432, + "step_time": 579.7829250898212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 803.1875, + "completions/mean_terminated_length": 729.5833740234375, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 0.3336705267429352, + "epoch": 2.291005291005291, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.0011378336930647492, + "learning_rate": 6.513600542765816e-07, + "loss": -0.0063, + "num_tokens": 8433925.0, + "reward": 0.78125, + "reward_std": 0.13837619125843048, + "rewards/itbench_correctness/mean": 0.78125, + "rewards/itbench_correctness/std": 0.19924625754356384, + "step": 433, + "step_time": 182.92570608016104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 910.25, + "completions/mean_terminated_length": 796.5, + "completions/min_length": 618.0, + "completions/min_terminated_length": 618.0, + "entropy": 0.3559461832046509, + "epoch": 2.2962962962962963, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4140625, + "kl": 0.0012446820037439466, + "learning_rate": 6.497833413348909e-07, + "loss": -0.0338, + "num_tokens": 8462945.0, + "reward": 0.5625, + "reward_std": 0.03857583925127983, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.454911470413208, + "step": 434, + "step_time": 330.48511962778866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 839.0625, + "completions/mean_terminated_length": 654.125, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.464804470539093, + "epoch": 2.3015873015873014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.0014192892704159021, + "learning_rate": 6.482049903865768e-07, + "loss": 0.008, + "num_tokens": 8487602.0, + "reward": 0.8541666865348816, + "reward_std": 0.3027648329734802, + "rewards/itbench_correctness/mean": 0.8541666865348816, + "rewards/itbench_correctness/std": 0.2973649799823761, + "step": 435, + "step_time": 126.97447157558054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 963.5, + "completions/mean_terminated_length": 916.4444580078125, + "completions/min_length": 747.0, + "completions/min_terminated_length": 747.0, + "entropy": 0.39647120237350464, + "epoch": 2.306878306878307, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.0012867730110883713, + "learning_rate": 6.466250186922324e-07, + "loss": 0.0023, + "num_tokens": 8509778.0, + "reward": 0.697578489780426, + "reward_std": 0.291044145822525, + "rewards/itbench_correctness/mean": 0.697578489780426, + "rewards/itbench_correctness/std": 0.3142254650592804, + "step": 436, + "step_time": 449.84700517356396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 460.5625, + "completions/mean_terminated_length": 460.5625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 0.4451078772544861, + "epoch": 2.312169312169312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02685546875, + "kl": 0.0015331042231991887, + "learning_rate": 6.450434435301751e-07, + "loss": 0.0, + "num_tokens": 8519963.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.3333333432674408, + "rewards/itbench_correctness/std": 0.17213259637355804, + "step": 437, + "step_time": 796.3606786699966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 611.5, + "completions/mean_terminated_length": 424.0, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.4153720438480377, + "epoch": 2.317460317460317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9453125, + "kl": 0.0015486880438402295, + "learning_rate": 6.43460282196257e-07, + "loss": 0.008, + "num_tokens": 8541179.0, + "reward": 0.2395833432674408, + "reward_std": 0.0883883535861969, + "rewards/itbench_correctness/mean": 0.2395833432674408, + "rewards/itbench_correctness/std": 0.27533650398254395, + "step": 438, + "step_time": 145.90149160753936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 577.0625, + "completions/mean_terminated_length": 577.0625, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "entropy": 0.5198743343353271, + "epoch": 2.322751322751323, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1171875, + "kl": 0.0017036347417160869, + "learning_rate": 6.418755520036774e-07, + "loss": 0.0056, + "num_tokens": 8558452.0, + "reward": 0.765625, + "reward_std": 0.09300297498703003, + "rewards/itbench_correctness/mean": 0.765625, + "rewards/itbench_correctness/std": 0.2733854353427887, + "step": 439, + "step_time": 154.49192036502063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 362.25, + "completions/mean_terminated_length": 362.25, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.4306418299674988, + "epoch": 2.328042328042328, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.94140625, + "kl": 0.0017033821204677224, + "learning_rate": 6.402892702827916e-07, + "loss": -0.0083, + "num_tokens": 8566496.0, + "reward": 0.1953125, + "reward_std": 0.08956430107355118, + "rewards/itbench_correctness/mean": 0.1953125, + "rewards/itbench_correctness/std": 0.2359323352575302, + "step": 440, + "step_time": 84.76937860064209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 722.875, + "completions/mean_terminated_length": 622.5, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.40670931339263916, + "epoch": 2.3333333333333335, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.0013241568813100457, + "learning_rate": 6.387014543809223e-07, + "loss": 0.0764, + "num_tokens": 8586822.0, + "reward": 0.4375, + "reward_std": 0.3339453935623169, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.35420751571655273, + "step": 441, + "step_time": 146.5532330982387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.48046875, + "epoch": 2.3386243386243386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0238037109375, + "kl": 0.0011454581981524825, + "learning_rate": 6.371121216621697e-07, + "loss": 0.0, + "num_tokens": 8615486.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 442, + "step_time": 117.81919787544757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 531.1875, + "completions/mean_terminated_length": 531.1875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "entropy": 0.3595717251300812, + "epoch": 2.3439153439153437, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0625, + "kl": 0.001713753561489284, + "learning_rate": 6.355212895072222e-07, + "loss": -0.0025, + "num_tokens": 8627873.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 443, + "step_time": 1035.1622464098036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 728.5, + "completions/mean_terminated_length": 594.1818237304688, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.3816060423851013, + "epoch": 2.3492063492063493, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.0015201023779809475, + "learning_rate": 6.339289753131648e-07, + "loss": 0.0442, + "num_tokens": 8645001.0, + "reward": 0.109375, + "reward_std": 0.30935919284820557, + "rewards/itbench_correctness/mean": 0.109375, + "rewards/itbench_correctness/std": 0.30233466625213623, + "step": 444, + "step_time": 933.9612167160958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 916.4375, + "completions/mean_terminated_length": 808.875, + "completions/min_length": 648.0, + "completions/min_terminated_length": 648.0, + "entropy": 0.49976131319999695, + "epoch": 2.3544973544973544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0014402419328689575, + "learning_rate": 6.323351964932908e-07, + "loss": 0.0092, + "num_tokens": 8666528.0, + "reward": 0.44999998807907104, + "reward_std": 0.09669842571020126, + "rewards/itbench_correctness/mean": 0.44999998807907104, + "rewards/itbench_correctness/std": 0.41733282804489136, + "step": 445, + "step_time": 141.34655232075602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 649.5, + "completions/mean_terminated_length": 358.22222900390625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4064665138721466, + "epoch": 2.35978835978836, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.28125, + "kl": 0.0020702362526208162, + "learning_rate": 6.307399704769098e-07, + "loss": -0.1482, + "num_tokens": 8692528.0, + "reward": 0.2447916716337204, + "reward_std": 0.1262161284685135, + "rewards/itbench_correctness/mean": 0.2447916716337204, + "rewards/itbench_correctness/std": 0.16020458936691284, + "step": 446, + "step_time": 118.47124487534165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 942.625, + "completions/mean_terminated_length": 861.25, + "completions/min_length": 709.0, + "completions/min_terminated_length": 709.0, + "entropy": 0.50921630859375, + "epoch": 2.365079365079365, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.0014145068125799298, + "learning_rate": 6.291433147091583e-07, + "loss": 0.0104, + "num_tokens": 8714466.0, + "reward": 0.4854166805744171, + "reward_std": 0.3144327402114868, + "rewards/itbench_correctness/mean": 0.4854166805744171, + "rewards/itbench_correctness/std": 0.393459290266037, + "step": 447, + "step_time": 369.4135863818228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 899.375, + "completions/mean_terminated_length": 691.6666870117188, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.5092425346374512, + "epoch": 2.3703703703703702, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.484375, + "kl": 0.00447537936270237, + "learning_rate": 6.275452466508075e-07, + "loss": -0.0622, + "num_tokens": 8750192.0, + "reward": 0.09375, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.09375, + "rewards/itbench_correctness/std": 0.20155644416809082, + "step": 448, + "step_time": 469.61162946000695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 550.5, + "completions/mean_terminated_length": 550.5, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "entropy": 0.4396003484725952, + "epoch": 2.375661375661376, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.0023843871895223856, + "learning_rate": 6.259457837780741e-07, + "loss": 0.0034, + "num_tokens": 8762320.0, + "reward": 0.75, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 449, + "step_time": 616.3559736898169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 805.8125, + "completions/mean_terminated_length": 587.625, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 0.6304196119308472, + "epoch": 2.380952380952381, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.0017924520652741194, + "learning_rate": 6.243449435824276e-07, + "loss": -0.0157, + "num_tokens": 8783789.0, + "reward": 0.5416666865348816, + "reward_std": 0.2629520893096924, + "rewards/itbench_correctness/mean": 0.5416666865348816, + "rewards/itbench_correctness/std": 0.43885374069213867, + "step": 450, + "step_time": 88.87441652361304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 751.1875, + "completions/mean_terminated_length": 539.0, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.4606040418148041, + "epoch": 2.386243386243386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.0016598474467173219, + "learning_rate": 6.227427435703995e-07, + "loss": -0.0021, + "num_tokens": 8807136.0, + "reward": 0.5546875, + "reward_std": 0.2294243574142456, + "rewards/itbench_correctness/mean": 0.5546875, + "rewards/itbench_correctness/std": 0.3080184757709503, + "step": 451, + "step_time": 986.2314578304067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 817.625, + "completions/mean_terminated_length": 770.0, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 0.5087907314300537, + "epoch": 2.3915343915343916, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.359375, + "kl": 0.001573887187987566, + "learning_rate": 6.211392012633931e-07, + "loss": -0.0123, + "num_tokens": 8831826.0, + "reward": 0.59375, + "reward_std": 0.18600594997406006, + "rewards/itbench_correctness/mean": 0.59375, + "rewards/itbench_correctness/std": 0.4905354380607605, + "step": 452, + "step_time": 369.9421289321035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 602.0, + "completions/mean_terminated_length": 602.0, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 0.49501660466194153, + "epoch": 2.3968253968253967, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.0018204387743026018, + "learning_rate": 6.1953433419749e-07, + "loss": 0.0146, + "num_tokens": 8844602.0, + "reward": 0.4765625, + "reward_std": 0.10795740783214569, + "rewards/itbench_correctness/mean": 0.4765625, + "rewards/itbench_correctness/std": 0.2781464755535126, + "step": 453, + "step_time": 103.37166160158813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 449.25, + "completions/mean_terminated_length": 449.25, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "entropy": 0.5052865743637085, + "epoch": 2.402116402116402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.0016118157655000687, + "learning_rate": 6.17928159923259e-07, + "loss": 0.0095, + "num_tokens": 8854590.0, + "reward": 0.71875, + "reward_std": 0.35564959049224854, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.44604745507240295, + "step": 454, + "step_time": 980.6566639961675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 527.625, + "completions/mean_terminated_length": 527.625, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 0.48140251636505127, + "epoch": 2.4074074074074074, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0078125, + "kl": 0.0013566534034907818, + "learning_rate": 6.163206960055652e-07, + "loss": -0.0056, + "num_tokens": 8868488.0, + "reward": 0.21875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.2561737895011902, + "step": 455, + "step_time": 94.81741558108479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 838.3125, + "completions/mean_terminated_length": 652.625, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "entropy": 0.37217625975608826, + "epoch": 2.4126984126984126, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3515625, + "kl": 0.0014726277440786362, + "learning_rate": 6.147119600233758e-07, + "loss": -0.0124, + "num_tokens": 8892141.0, + "reward": 0.5520833134651184, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.5520833134651184, + "rewards/itbench_correctness/std": 0.4781087338924408, + "step": 456, + "step_time": 774.9578263629228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 805.3125, + "completions/mean_terminated_length": 732.4166870117188, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.4619324803352356, + "epoch": 2.417989417989418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.0020939442329108715, + "learning_rate": 6.131019695695702e-07, + "loss": 0.0001, + "num_tokens": 8917394.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 457, + "step_time": 160.94475755654275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 415.625, + "completions/mean_terminated_length": 415.625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.4980451166629791, + "epoch": 2.4232804232804233, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2421875, + "kl": 0.002369649475440383, + "learning_rate": 6.114907422507459e-07, + "loss": 0.0046, + "num_tokens": 8926548.0, + "reward": 0.625, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.3535533845424652, + "step": 458, + "step_time": 130.5550601184368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 761.1875, + "completions/mean_terminated_length": 641.727294921875, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "entropy": 0.3757287263870239, + "epoch": 2.4285714285714284, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4765625, + "kl": 0.0018944862531498075, + "learning_rate": 6.098782956870265e-07, + "loss": 0.0028, + "num_tokens": 8944471.0, + "reward": 0.1875, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 459, + "step_time": 351.48624353297055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 730.5, + "completions/mean_terminated_length": 502.22222900390625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.5448322892189026, + "epoch": 2.433862433862434, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.0037839075084775686, + "learning_rate": 6.082646475118699e-07, + "loss": 0.0032, + "num_tokens": 8962455.0, + "reward": 0.10880681872367859, + "reward_std": 0.1294855922460556, + "rewards/itbench_correctness/mean": 0.10880681872367859, + "rewards/itbench_correctness/std": 0.12925373017787933, + "step": 460, + "step_time": 137.66368599049747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 897.5, + "completions/mean_terminated_length": 799.1111450195312, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.41671308875083923, + "epoch": 2.439153439153439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.0014645819319412112, + "learning_rate": 6.066498153718734e-07, + "loss": 0.0001, + "num_tokens": 8993351.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 461, + "step_time": 331.1069351742044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 638.3125, + "completions/mean_terminated_length": 638.3125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.333692342042923, + "epoch": 2.4444444444444446, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9453125, + "kl": 0.0018866208847612143, + "learning_rate": 6.05033816926583e-07, + "loss": -0.0577, + "num_tokens": 9008684.0, + "reward": 0.0520833358168602, + "reward_std": 0.043129097670316696, + "rewards/itbench_correctness/mean": 0.0520833358168602, + "rewards/itbench_correctness/std": 0.07978560030460358, + "step": 462, + "step_time": 83.101976220496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 986.3125, + "completions/mean_terminated_length": 873.25, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.42785629630088806, + "epoch": 2.4497354497354498, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3828125, + "kl": 0.0011906343279406428, + "learning_rate": 6.034166698482983e-07, + "loss": 0.0241, + "num_tokens": 9031329.0, + "reward": 0.1875, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 463, + "step_time": 6071.444487111643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 587.5, + "completions/mean_terminated_length": 442.0, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.5753191709518433, + "epoch": 2.455026455026455, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.484375, + "kl": 0.0014363499358296394, + "learning_rate": 6.017983918218811e-07, + "loss": -0.0264, + "num_tokens": 9064257.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 464, + "step_time": 156.23662452865392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 489.0, + "completions/mean_terminated_length": 453.3333435058594, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.5644171833992004, + "epoch": 2.4603174603174605, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.09375, + "kl": 0.0021656756289303303, + "learning_rate": 6.001790005445606e-07, + "loss": 0.0027, + "num_tokens": 9093129.0, + "reward": 0.359375, + "reward_std": 0.19408094882965088, + "rewards/itbench_correctness/mean": 0.359375, + "rewards/itbench_correctness/std": 0.4561501145362854, + "step": 465, + "step_time": 114.18916879687458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 454.1875, + "completions/mean_terminated_length": 454.1875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.45355716347694397, + "epoch": 2.4656084656084656, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7421875, + "kl": 0.0023789138067513704, + "learning_rate": 5.985585137257401e-07, + "loss": -0.0632, + "num_tokens": 9103764.0, + "reward": 0.71875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 466, + "step_time": 111.49137642700225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 459.8125, + "completions/mean_terminated_length": 459.8125, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 0.3784151077270508, + "epoch": 2.4708994708994707, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0017901118844747543, + "learning_rate": 5.969369490868042e-07, + "loss": 0.0151, + "num_tokens": 9114825.0, + "reward": 0.6927083730697632, + "reward_std": 0.1530819982290268, + "rewards/itbench_correctness/mean": 0.6927083730697632, + "rewards/itbench_correctness/std": 0.26652559638023376, + "step": 467, + "step_time": 45.78145207092166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 631.375, + "completions/mean_terminated_length": 631.375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.46565037965774536, + "epoch": 2.4761904761904763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1806640625, + "kl": 0.0021383543498814106, + "learning_rate": 5.953143243609234e-07, + "loss": 0.0001, + "num_tokens": 9128071.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 1.0, + "rewards/itbench_correctness/std": 0.0, + "step": 468, + "step_time": 75.04701119381934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 431.6875, + "completions/mean_terminated_length": 431.6875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5073114037513733, + "epoch": 2.4814814814814814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.0021981364116072655, + "learning_rate": 5.936906572928624e-07, + "loss": -0.0696, + "num_tokens": 9143002.0, + "reward": 0.871874988079071, + "reward_std": 0.20840224623680115, + "rewards/itbench_correctness/mean": 0.871874988079071, + "rewards/itbench_correctness/std": 0.2529616057872772, + "step": 469, + "step_time": 78.34151318110526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 646.0625, + "completions/mean_terminated_length": 419.3000183105469, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.5819870233535767, + "epoch": 2.4867724867724865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.002163654426112771, + "learning_rate": 5.920659656387836e-07, + "loss": 0.072, + "num_tokens": 9169283.0, + "reward": 0.28125, + "reward_std": 0.2651650309562683, + "rewards/itbench_correctness/mean": 0.28125, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 470, + "step_time": 197.0865554632619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 707.25, + "completions/mean_terminated_length": 563.2727661132812, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 0.41003888845443726, + "epoch": 2.492063492063492, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.001396682346239686, + "learning_rate": 5.90440267166055e-07, + "loss": -0.0062, + "num_tokens": 9187327.0, + "reward": 0.875, + "reward_std": 0.2619796097278595, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.273861289024353, + "step": 471, + "step_time": 86.19406038243324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 556.1875, + "completions/mean_terminated_length": 556.1875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.3613889217376709, + "epoch": 2.497354497354497, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0625, + "kl": 0.001245600637048483, + "learning_rate": 5.888135796530544e-07, + "loss": 0.0086, + "num_tokens": 9200090.0, + "reward": 0.5729166865348816, + "reward_std": 0.0294627845287323, + "rewards/itbench_correctness/mean": 0.5729166865348816, + "rewards/itbench_correctness/std": 0.4429227113723755, + "step": 472, + "step_time": 72.5158723751083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 477.25, + "completions/mean_terminated_length": 477.25, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.45049765706062317, + "epoch": 2.502645502645503, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.15625, + "kl": 0.001709087984636426, + "learning_rate": 5.871859208889758e-07, + "loss": -0.0137, + "num_tokens": 9210406.0, + "reward": 0.640625, + "reward_std": 0.11451567709445953, + "rewards/itbench_correctness/mean": 0.640625, + "rewards/itbench_correctness/std": 0.40278977155685425, + "step": 473, + "step_time": 132.74163577985018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 835.9375, + "completions/mean_terminated_length": 773.25, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.521570086479187, + "epoch": 2.507936507936508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.0017986115999519825, + "learning_rate": 5.855573086736349e-07, + "loss": -0.1102, + "num_tokens": 9243021.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 474, + "step_time": 366.73758555483073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 830.5, + "completions/mean_terminated_length": 742.5454711914062, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "entropy": 0.2576760947704315, + "epoch": 2.5132275132275135, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.0009246127447113395, + "learning_rate": 5.839277608172738e-07, + "loss": 0.0089, + "num_tokens": 9265781.0, + "reward": 0.42500001192092896, + "reward_std": 0.026726115494966507, + "rewards/itbench_correctness/mean": 0.42500001192092896, + "rewards/itbench_correctness/std": 0.44045430421829224, + "step": 475, + "step_time": 242.08502481784672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 789.1875, + "completions/mean_terminated_length": 648.2999877929688, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.3066444993019104, + "epoch": 2.5185185185185186, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.296875, + "kl": 0.0016043871873989701, + "learning_rate": 5.82297295140367e-07, + "loss": 0.0033, + "num_tokens": 9285040.0, + "reward": 0.75, + "reward_std": 0.15430335700511932, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.3333333432674408, + "step": 476, + "step_time": 477.0931530073285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 922.125, + "completions/mean_terminated_length": 820.25, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.37738919258117676, + "epoch": 2.5238095238095237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.0015391347697004676, + "learning_rate": 5.806659294734255e-07, + "loss": -0.0201, + "num_tokens": 9310586.0, + "reward": 0.6581439971923828, + "reward_std": 0.30061405897140503, + "rewards/itbench_correctness/mean": 0.6581439971923828, + "rewards/itbench_correctness/std": 0.3035711646080017, + "step": 477, + "step_time": 111.43491127341986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 737.75, + "completions/mean_terminated_length": 718.6666870117188, + "completions/min_length": 599.0, + "completions/min_terminated_length": 599.0, + "entropy": 0.21823111176490784, + "epoch": 2.5291005291005293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.0010663650464266539, + "learning_rate": 5.790336816568032e-07, + "loss": -0.0021, + "num_tokens": 9327982.0, + "reward": 0.4713541865348816, + "reward_std": 0.4539119601249695, + "rewards/itbench_correctness/mean": 0.4713541865348816, + "rewards/itbench_correctness/std": 0.4463635981082916, + "step": 478, + "step_time": 79.33199557475746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 493.125, + "completions/mean_terminated_length": 493.125, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.559695839881897, + "epoch": 2.5343915343915344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.0020555509254336357, + "learning_rate": 5.774005695405007e-07, + "loss": 0.0, + "num_tokens": 9340472.0, + "reward": 0.6666666269302368, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.6666666269302368, + "rewards/itbench_correctness/std": 0.17213258147239685, + "step": 479, + "step_time": 60.24873013421893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 519.625, + "completions/mean_terminated_length": 519.625, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "entropy": 0.49651190638542175, + "epoch": 2.5396825396825395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03466796875, + "kl": 0.0015541493194177747, + "learning_rate": 5.757666109839702e-07, + "loss": 0.0, + "num_tokens": 9370882.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 480, + "step_time": 1150.014605092816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 575.5, + "completions/mean_terminated_length": 575.5, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.4587315320968628, + "epoch": 2.544973544973545, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3515625, + "kl": 0.0013555011246353388, + "learning_rate": 5.741318238559209e-07, + "loss": -0.0073, + "num_tokens": 9383698.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 481, + "step_time": 187.38160399720073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 647.4375, + "completions/mean_terminated_length": 647.4375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.3753257989883423, + "epoch": 2.5502645502645502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037841796875, + "kl": 0.0015313706826418638, + "learning_rate": 5.724962260341229e-07, + "loss": 0.0, + "num_tokens": 9398977.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 482, + "step_time": 765.347533389926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 583.375, + "completions/mean_terminated_length": 583.375, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.5416755676269531, + "epoch": 2.5555555555555554, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.15625, + "kl": 0.0014298075111582875, + "learning_rate": 5.708598354052121e-07, + "loss": 0.0256, + "num_tokens": 9415967.0, + "reward": 0.8541666865348816, + "reward_std": 0.049801189452409744, + "rewards/itbench_correctness/mean": 0.8541666865348816, + "rewards/itbench_correctness/std": 0.16527193784713745, + "step": 483, + "step_time": 99.65433174744248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 448.9375, + "completions/mean_terminated_length": 448.9375, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "entropy": 0.4521787464618683, + "epoch": 2.560846560846561, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.15625, + "kl": 0.0014567336766049266, + "learning_rate": 5.692226698644937e-07, + "loss": -0.0088, + "num_tokens": 9425990.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 484, + "step_time": 159.2589992955327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 837.875, + "completions/mean_terminated_length": 775.8333740234375, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.38430553674697876, + "epoch": 2.566137566137566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033935546875, + "kl": 0.0012451084330677986, + "learning_rate": 5.675847473157485e-07, + "loss": 0.0, + "num_tokens": 9444988.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 485, + "step_time": 265.1822805535048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 574.375, + "completions/mean_terminated_length": 574.375, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 0.4108814001083374, + "epoch": 2.571428571428571, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.0019665309228003025, + "learning_rate": 5.659460856710345e-07, + "loss": -0.008, + "num_tokens": 9458426.0, + "reward": 0.8883928656578064, + "reward_std": 0.16245228052139282, + "rewards/itbench_correctness/mean": 0.8883928656578064, + "rewards/itbench_correctness/std": 0.1985812783241272, + "step": 486, + "step_time": 129.54102603532374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 775.9375, + "completions/mean_terminated_length": 583.0, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.32863470911979675, + "epoch": 2.5767195767195767, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.125, + "kl": 0.0015423308359459043, + "learning_rate": 5.643067028504931e-07, + "loss": -0.0219, + "num_tokens": 9485297.0, + "reward": 0.375, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.4281744360923767, + "step": 487, + "step_time": 71.69435486476868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 509.4375, + "completions/mean_terminated_length": 509.4375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3533308804035187, + "epoch": 2.582010582010582, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.0020819108467549086, + "learning_rate": 5.626666167821521e-07, + "loss": 0.0039, + "num_tokens": 9497008.0, + "reward": 0.78125, + "reward_std": 0.3471629321575165, + "rewards/itbench_correctness/mean": 0.78125, + "rewards/itbench_correctness/std": 0.4069705307483673, + "step": 488, + "step_time": 89.01397905871272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 567.0625, + "completions/mean_terminated_length": 567.0625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.5466769337654114, + "epoch": 2.5873015873015874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035400390625, + "kl": 0.0019230879843235016, + "learning_rate": 5.6102584540173e-07, + "loss": 0.0, + "num_tokens": 9519441.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 489, + "step_time": 71.94888481497765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 768.8125, + "completions/mean_terminated_length": 768.8125, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.2939598262310028, + "epoch": 2.5925925925925926, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.203125, + "kl": 0.0013061386998742819, + "learning_rate": 5.5938440665244e-07, + "loss": 0.0038, + "num_tokens": 9537934.0, + "reward": 0.84375, + "reward_std": 0.0578637570142746, + "rewards/itbench_correctness/mean": 0.84375, + "rewards/itbench_correctness/std": 0.17969882488250732, + "step": 490, + "step_time": 106.79387213569134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 944.9375, + "completions/mean_terminated_length": 771.0, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "entropy": 0.2941993474960327, + "epoch": 2.597883597883598, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.0013492838479578495, + "learning_rate": 5.577423184847931e-07, + "loss": 0.0, + "num_tokens": 9561381.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 491, + "step_time": 823.8002629633993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 816.3125, + "completions/mean_terminated_length": 786.6428833007812, + "completions/min_length": 570.0, + "completions/min_terminated_length": 570.0, + "entropy": 0.4140571057796478, + "epoch": 2.6031746031746033, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.001417625928297639, + "learning_rate": 5.560995988564023e-07, + "loss": 0.0223, + "num_tokens": 9581962.0, + "reward": 0.109375, + "reward_std": 0.2414703369140625, + "rewards/itbench_correctness/mean": 0.109375, + "rewards/itbench_correctness/std": 0.2576940953731537, + "step": 492, + "step_time": 97.65742574445903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 701.125, + "completions/mean_terminated_length": 450.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4649670124053955, + "epoch": 2.6084656084656084, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1953125, + "kl": 0.0015433584339916706, + "learning_rate": 5.544562657317863e-07, + "loss": -0.014, + "num_tokens": 9616972.0, + "reward": 0.3125, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.35939764976501465, + "step": 493, + "step_time": 147.00880005117506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 735.75, + "completions/mean_terminated_length": 694.5714721679688, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "entropy": 0.4131838381290436, + "epoch": 2.613756613756614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.001086304779164493, + "learning_rate": 5.528123370821729e-07, + "loss": 0.0469, + "num_tokens": 9632520.0, + "reward": 0.6505681872367859, + "reward_std": 0.23257695138454437, + "rewards/itbench_correctness/mean": 0.6505681872367859, + "rewards/itbench_correctness/std": 0.45813921093940735, + "step": 494, + "step_time": 73.31144659873098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 2.0, + "completions/mean_length": 768.3125, + "completions/mean_terminated_length": 1.25, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.640364408493042, + "epoch": 2.619047619047619, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.46875, + "kl": 0.0018688439158722758, + "learning_rate": 5.511678308853025e-07, + "loss": -0.1102, + "num_tokens": 9660781.0, + "reward": 0.0520833358168602, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.0520833358168602, + "rewards/itbench_correctness/std": 0.13220004737377167, + "step": 495, + "step_time": 85.44449219666421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 750.5, + "completions/mean_terminated_length": 659.3333740234375, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "entropy": 0.46902066469192505, + "epoch": 2.624338624338624, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.265625, + "kl": 0.0013028380926698446, + "learning_rate": 5.495227651252315e-07, + "loss": 0.0103, + "num_tokens": 9677765.0, + "reward": 0.59375, + "reward_std": 0.1735912710428238, + "rewards/itbench_correctness/mean": 0.59375, + "rewards/itbench_correctness/std": 0.48196646571159363, + "step": 496, + "step_time": 781.4977411162108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 390.75, + "completions/mean_terminated_length": 390.75, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4273832440376282, + "epoch": 2.6296296296296298, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.0021745527628809214, + "learning_rate": 5.478771577921351e-07, + "loss": -0.0118, + "num_tokens": 9686945.0, + "reward": 0.627480149269104, + "reward_std": 0.2220388650894165, + "rewards/itbench_correctness/mean": 0.627480149269104, + "rewards/itbench_correctness/std": 0.3782695233821869, + "step": 497, + "step_time": 134.51860492676497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 764.5, + "completions/mean_terminated_length": 505.0, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "entropy": 0.3531720042228699, + "epoch": 2.634920634920635, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2421875, + "kl": 0.0014397338964045048, + "learning_rate": 5.462310268821117e-07, + "loss": 0.0163, + "num_tokens": 9713425.0, + "reward": 0.265625, + "reward_std": 0.1724265068769455, + "rewards/itbench_correctness/mean": 0.265625, + "rewards/itbench_correctness/std": 0.3616048991680145, + "step": 498, + "step_time": 152.6961117470637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 576.0625, + "completions/mean_terminated_length": 576.0625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.28642725944519043, + "epoch": 2.64021164021164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0037036272697150707, + "learning_rate": 5.445843903969854e-07, + "loss": -0.0221, + "num_tokens": 9727450.0, + "reward": 0.2291666716337204, + "reward_std": 0.2048145830631256, + "rewards/itbench_correctness/mean": 0.2291666716337204, + "rewards/itbench_correctness/std": 0.22669117152690887, + "step": 499, + "step_time": 78.76068393606693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 794.4375, + "completions/mean_terminated_length": 690.0909423828125, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "entropy": 0.3776256740093231, + "epoch": 2.6455026455026456, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.40625, + "kl": 0.0019214467611163855, + "learning_rate": 5.429372663441085e-07, + "loss": 0.0057, + "num_tokens": 9745281.0, + "reward": 0.40625, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.20155644416809082, + "step": 500, + "step_time": 542.0401397850364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 570.9375, + "completions/mean_terminated_length": 570.9375, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "entropy": 0.3800766170024872, + "epoch": 2.6507936507936507, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.0011877365177497268, + "learning_rate": 5.412896727361662e-07, + "loss": 0.0347, + "num_tokens": 9757936.0, + "reward": 0.8323863744735718, + "reward_std": 0.1946326196193695, + "rewards/itbench_correctness/mean": 0.8323863744735718, + "rewards/itbench_correctness/std": 0.28190067410469055, + "step": 501, + "step_time": 66.27912161499262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 658.25, + "completions/mean_terminated_length": 492.0, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.30991265177726746, + "epoch": 2.656084656084656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.001489053014665842, + "learning_rate": 5.396416275909779e-07, + "loss": 0.0148, + "num_tokens": 9773348.0, + "reward": 0.21875, + "reward_std": 0.3061639666557312, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 502, + "step_time": 1172.296461245045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 537.6875, + "completions/mean_terminated_length": 537.6875, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "entropy": 0.3291874825954437, + "epoch": 2.6613756613756614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.0015724250115454197, + "learning_rate": 5.379931489313015e-07, + "loss": 0.0265, + "num_tokens": 9786871.0, + "reward": 0.45625001192092896, + "reward_std": 0.05625351518392563, + "rewards/itbench_correctness/mean": 0.45625001192092896, + "rewards/itbench_correctness/std": 0.34699106216430664, + "step": 503, + "step_time": 66.85712667554617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 372.875, + "completions/mean_terminated_length": 372.875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4425075352191925, + "epoch": 2.6666666666666665, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.0019720701966434717, + "learning_rate": 5.363442547846355e-07, + "loss": -0.043, + "num_tokens": 9800949.0, + "reward": 0.3671875, + "reward_std": 0.3643017113208771, + "rewards/itbench_correctness/mean": 0.3671875, + "rewards/itbench_correctness/std": 0.4119788408279419, + "step": 504, + "step_time": 72.04895468428731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 784.4375, + "completions/mean_terminated_length": 704.5833740234375, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "entropy": 0.47422516345977783, + "epoch": 2.671957671957672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.001217540237121284, + "learning_rate": 5.34694963183022e-07, + "loss": -0.0013, + "num_tokens": 9817452.0, + "reward": 0.8229166269302368, + "reward_std": 0.2745841145515442, + "rewards/itbench_correctness/mean": 0.8229166269302368, + "rewards/itbench_correctness/std": 0.3303687572479248, + "step": 505, + "step_time": 82.16243282984942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 512.4375, + "completions/mean_terminated_length": 512.4375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5346993803977966, + "epoch": 2.677248677248677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7890625, + "kl": 0.0038210356142371893, + "learning_rate": 5.330452921628497e-07, + "loss": -0.1759, + "num_tokens": 9828603.0, + "reward": 0.5416666865348816, + "reward_std": 0.21967849135398865, + "rewards/itbench_correctness/mean": 0.5416666865348816, + "rewards/itbench_correctness/std": 0.24152295291423798, + "step": 506, + "step_time": 130.50164964888245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 873.9375, + "completions/mean_terminated_length": 757.2222290039062, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "entropy": 0.446256160736084, + "epoch": 2.682539682539683, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.0013827559305354953, + "learning_rate": 5.313952597646567e-07, + "loss": -0.0479, + "num_tokens": 9851170.0, + "reward": 0.1875, + "reward_std": 0.4082317352294922, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 507, + "step_time": 549.6535976743326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 426.5, + "completions/mean_terminated_length": 426.5, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.46189919114112854, + "epoch": 2.687830687830688, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.0022915503941476345, + "learning_rate": 5.297448840329328e-07, + "loss": -0.0217, + "num_tokens": 9860442.0, + "reward": 0.2698863744735718, + "reward_std": 0.15896323323249817, + "rewards/itbench_correctness/mean": 0.2698863744735718, + "rewards/itbench_correctness/std": 0.2320520281791687, + "step": 508, + "step_time": 67.97736590728164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 781.4375, + "completions/mean_terminated_length": 746.7857666015625, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "entropy": 0.2853715121746063, + "epoch": 2.693121693121693, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.203125, + "kl": 0.0011138498084619641, + "learning_rate": 5.280941830159227e-07, + "loss": 0.0167, + "num_tokens": 9880081.0, + "reward": 0.34375, + "reward_std": 0.18600594997406006, + "rewards/itbench_correctness/mean": 0.34375, + "rewards/itbench_correctness/std": 0.4366062581539154, + "step": 509, + "step_time": 240.96920191589743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 581.5625, + "completions/mean_terminated_length": 581.5625, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "entropy": 0.5089736580848694, + "epoch": 2.6984126984126986, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0546875, + "kl": 0.001449383096769452, + "learning_rate": 5.264431747654283e-07, + "loss": 0.0141, + "num_tokens": 9910954.0, + "reward": 0.34375, + "reward_std": 0.18600594997406006, + "rewards/itbench_correctness/mean": 0.34375, + "rewards/itbench_correctness/std": 0.4366062581539154, + "step": 510, + "step_time": 138.8211078811437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 600.4375, + "completions/mean_terminated_length": 600.4375, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "entropy": 0.3697304129600525, + "epoch": 2.7037037037037037, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1015625, + "kl": 0.001372107770293951, + "learning_rate": 5.247918773366111e-07, + "loss": 0.0158, + "num_tokens": 9925225.0, + "reward": 0.875, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 511, + "step_time": 78.60712255910039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 443.125, + "completions/mean_terminated_length": 443.125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 0.36784201860427856, + "epoch": 2.708994708994709, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.001885003293864429, + "learning_rate": 5.231403087877955e-07, + "loss": -0.0034, + "num_tokens": 9936491.0, + "reward": 0.5052083730697632, + "reward_std": 0.29658451676368713, + "rewards/itbench_correctness/mean": 0.5052083730697632, + "rewards/itbench_correctness/std": 0.4892064332962036, + "step": 512, + "step_time": 1101.7817776547745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 506.5, + "completions/mean_terminated_length": 506.5, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 0.5034551024436951, + "epoch": 2.7142857142857144, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.296875, + "kl": 0.0027517788112163544, + "learning_rate": 5.214884871802703e-07, + "loss": -0.0104, + "num_tokens": 9958027.0, + "reward": 0.5333333015441895, + "reward_std": 0.24348656833171844, + "rewards/itbench_correctness/mean": 0.5333333015441895, + "rewards/itbench_correctness/std": 0.3538151979446411, + "step": 513, + "step_time": 115.8853734144941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 503.375, + "completions/mean_terminated_length": 468.66668701171875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3277874290943146, + "epoch": 2.7195767195767195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.005129899829626083, + "learning_rate": 5.198364305780921e-07, + "loss": -0.1331, + "num_tokens": 9970817.0, + "reward": 0.4270833432674408, + "reward_std": 0.3061639964580536, + "rewards/itbench_correctness/mean": 0.4270833432674408, + "rewards/itbench_correctness/std": 0.32185083627700806, + "step": 514, + "step_time": 81.30817873775959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 523.0, + "completions/mean_terminated_length": 523.0, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.41108986735343933, + "epoch": 2.7248677248677247, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.34375, + "kl": 0.0014420569641515613, + "learning_rate": 5.181841570478872e-07, + "loss": 0.0416, + "num_tokens": 9982881.0, + "reward": 0.9943181872367859, + "reward_std": 0.016070598736405373, + "rewards/itbench_correctness/mean": 0.9943181872367859, + "rewards/itbench_correctness/std": 0.02272726595401764, + "step": 515, + "step_time": 76.78445727284998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 978.5, + "completions/mean_terminated_length": 878.4000244140625, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "entropy": 0.33316299319267273, + "epoch": 2.7301587301587302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.0013756535481661558, + "learning_rate": 5.165316846586541e-07, + "loss": -0.0065, + "num_tokens": 10008745.0, + "reward": 0.2053571343421936, + "reward_std": 0.28105252981185913, + "rewards/itbench_correctness/mean": 0.2053571343421936, + "rewards/itbench_correctness/std": 0.32667672634124756, + "step": 516, + "step_time": 816.1864080894738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 624.875, + "completions/mean_terminated_length": 624.875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5921184420585632, + "epoch": 2.7354497354497354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.0018502407474443316, + "learning_rate": 5.148790314815662e-07, + "loss": -0.0101, + "num_tokens": 10025191.0, + "reward": 0.5062500238418579, + "reward_std": 0.25888073444366455, + "rewards/itbench_correctness/mean": 0.5062500238418579, + "rewards/itbench_correctness/std": 0.4753507673740387, + "step": 517, + "step_time": 72.83533152658492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 823.0625, + "completions/mean_terminated_length": 622.125, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.5637481808662415, + "epoch": 2.7407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.0017002089880406857, + "learning_rate": 5.132262155897738e-07, + "loss": -0.1006, + "num_tokens": 10054440.0, + "reward": 0.4895833134651184, + "reward_std": 0.3517908453941345, + "rewards/itbench_correctness/mean": 0.4895833134651184, + "rewards/itbench_correctness/std": 0.5072392821311951, + "step": 518, + "step_time": 96.34473600052297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 994.25, + "completions/mean_terminated_length": 786.0, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.3198390603065491, + "epoch": 2.746031746031746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.0014571960782632232, + "learning_rate": 5.115732550582069e-07, + "loss": 0.0001, + "num_tokens": 10079516.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 519, + "step_time": 8666.9965882916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 863.5, + "completions/mean_terminated_length": 703.0, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 0.5211349129676819, + "epoch": 2.751322751322751, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3515625, + "kl": 0.0015487850178033113, + "learning_rate": 5.099201679633768e-07, + "loss": 0.0001, + "num_tokens": 10101236.0, + "reward": 0.46875, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.185404971241951, + "step": 520, + "step_time": 312.3351803580299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 511.4375, + "completions/mean_terminated_length": 511.4375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4203837215900421, + "epoch": 2.7566137566137567, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.56640625, + "kl": 0.0017837980994954705, + "learning_rate": 5.082669723831793e-07, + "loss": -0.0992, + "num_tokens": 10119339.0, + "reward": 0.37708336114883423, + "reward_std": 0.0176776684820652, + "rewards/itbench_correctness/mean": 0.37708336114883423, + "rewards/itbench_correctness/std": 0.30005404353141785, + "step": 521, + "step_time": 109.74898790102452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 974.6875, + "completions/mean_terminated_length": 866.2000122070312, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.3508816957473755, + "epoch": 2.761904761904762, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.0012084582122042775, + "learning_rate": 5.066136863966962e-07, + "loss": 0.0043, + "num_tokens": 10142334.0, + "reward": 0.46875, + "reward_std": 0.28270021080970764, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.4035433530807495, + "step": 522, + "step_time": 106.33708533085883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 875.5625, + "completions/mean_terminated_length": 760.1111450195312, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "entropy": 0.5824826955795288, + "epoch": 2.7671957671957674, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.484375, + "kl": 0.0019700033590197563, + "learning_rate": 5.049603280839982e-07, + "loss": -0.0028, + "num_tokens": 10170263.0, + "reward": 0.015625, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.015625, + "rewards/itbench_correctness/std": 0.0625, + "step": 523, + "step_time": 73.45079297944903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 543.125, + "completions/mean_terminated_length": 543.125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.5413118600845337, + "epoch": 2.7724867724867726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.46875, + "kl": 0.00216845516115427, + "learning_rate": 5.033069155259471e-07, + "loss": -0.0028, + "num_tokens": 10185017.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 524, + "step_time": 89.7467988235876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 734.1875, + "completions/mean_terminated_length": 692.7857666015625, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.6020260453224182, + "epoch": 2.7777777777777777, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.0015728548169136047, + "learning_rate": 5.016534668039976e-07, + "loss": -0.0002, + "num_tokens": 10209820.0, + "reward": 0.5416666865348816, + "reward_std": 0.235702246427536, + "rewards/itbench_correctness/mean": 0.5416666865348816, + "rewards/itbench_correctness/std": 0.5, + "step": 525, + "step_time": 208.850717083551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 1010.375, + "completions/mean_terminated_length": 951.3333740234375, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 0.522578239440918, + "epoch": 2.7830687830687832, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.484375, + "kl": 0.00148448022082448, + "learning_rate": 5e-07, + "loss": 0.0001, + "num_tokens": 10238170.0, + "reward": 0.171875, + "reward_std": 0.188242569565773, + "rewards/itbench_correctness/mean": 0.171875, + "rewards/itbench_correctness/std": 0.3125, + "step": 526, + "step_time": 942.4442430688068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 841.5, + "completions/mean_terminated_length": 659.0, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.45632797479629517, + "epoch": 2.7883597883597884, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.001308864215388894, + "learning_rate": 4.983465331960023e-07, + "loss": -0.0154, + "num_tokens": 10258074.0, + "reward": 0.5531250238418579, + "reward_std": 0.24483326077461243, + "rewards/itbench_correctness/mean": 0.5531250238418579, + "rewards/itbench_correctness/std": 0.421295166015625, + "step": 527, + "step_time": 177.91924435272813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 325.75, + "completions/mean_terminated_length": 325.75, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.33614733815193176, + "epoch": 2.7936507936507935, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.00386231392621994, + "learning_rate": 4.96693084474053e-07, + "loss": -0.0765, + "num_tokens": 10269622.0, + "reward": 0.5625, + "reward_std": 0.3471825420856476, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 528, + "step_time": 59.13615032006055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 629.375, + "completions/mean_terminated_length": 450.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.46077457070350647, + "epoch": 2.798941798941799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.0026442273519933224, + "learning_rate": 4.950396719160018e-07, + "loss": -0.0582, + "num_tokens": 10287964.0, + "reward": 0.3645833432674408, + "reward_std": 0.3061639964580536, + "rewards/itbench_correctness/mean": 0.3645833432674408, + "rewards/itbench_correctness/std": 0.3507597744464874, + "step": 529, + "step_time": 269.48390776105225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 713.5, + "completions/mean_terminated_length": 527.2000122070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6839523315429688, + "epoch": 2.804232804232804, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.65234375, + "kl": 0.0018830286571756005, + "learning_rate": 4.933863136033039e-07, + "loss": -0.1144, + "num_tokens": 10317900.0, + "reward": 0.3072916567325592, + "reward_std": 0.19150808453559875, + "rewards/itbench_correctness/mean": 0.3072916567325592, + "rewards/itbench_correctness/std": 0.4113198518753052, + "step": 530, + "step_time": 151.5169429546222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 717.125, + "completions/mean_terminated_length": 410.25, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "entropy": 0.5047934651374817, + "epoch": 2.8095238095238093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0322265625, + "kl": 0.0017540534026920795, + "learning_rate": 4.917330276168208e-07, + "loss": 0.0, + "num_tokens": 10342214.0, + "reward": 0.699999988079071, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.699999988079071, + "rewards/itbench_correctness/std": 0.3098386824131012, + "step": 531, + "step_time": 203.65224741771817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 629.5625, + "completions/mean_terminated_length": 629.5625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.3589794635772705, + "epoch": 2.814814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.0014826505212113261, + "learning_rate": 4.900798320366232e-07, + "loss": -0.017, + "num_tokens": 10355927.0, + "reward": 0.7332720756530762, + "reward_std": 0.21437928080558777, + "rewards/itbench_correctness/mean": 0.7332720756530762, + "rewards/itbench_correctness/std": 0.37859466671943665, + "step": 532, + "step_time": 348.23073250520974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.392578125, + "epoch": 2.82010582010582, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.0014328202232718468, + "learning_rate": 4.88426744941793e-07, + "loss": 0.0001, + "num_tokens": 10384407.0, + "reward": 0.8541666269302368, + "reward_std": 0.2482243776321411, + "rewards/itbench_correctness/mean": 0.8541666269302368, + "rewards/itbench_correctness/std": 0.26440009474754333, + "step": 533, + "step_time": 118.49520284496248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 498.25, + "completions/mean_terminated_length": 498.25, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "entropy": 0.38534870743751526, + "epoch": 2.825396825396825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.002330408664420247, + "learning_rate": 4.86773784410226e-07, + "loss": 0.0162, + "num_tokens": 10399339.0, + "reward": 0.46875, + "reward_std": 0.2609178125858307, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.4312717914581299, + "step": 534, + "step_time": 567.9226626912132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 921.625, + "completions/mean_terminated_length": 790.0000610351562, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.2951308786869049, + "epoch": 2.8306878306878307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.0014564162120223045, + "learning_rate": 4.851209685184338e-07, + "loss": 0.0001, + "num_tokens": 10424093.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 535, + "step_time": 216.71312026213855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 733.0, + "completions/mean_terminated_length": 733.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.45839017629623413, + "epoch": 2.835978835978836, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0234375, + "kl": 0.0023340799380093813, + "learning_rate": 4.834683153413459e-07, + "loss": -0.1065, + "num_tokens": 10440501.0, + "reward": 0.856249988079071, + "reward_std": 0.17614421248435974, + "rewards/itbench_correctness/mean": 0.856249988079071, + "rewards/itbench_correctness/std": 0.2827690541744232, + "step": 536, + "step_time": 74.85195223800838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 563.875, + "completions/mean_terminated_length": 563.875, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "entropy": 0.48237642645835876, + "epoch": 2.8412698412698414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01373291015625, + "kl": 0.001148638199083507, + "learning_rate": 4.818158429521129e-07, + "loss": 0.0, + "num_tokens": 10452811.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 1.0, + "rewards/itbench_correctness/std": 0.0, + "step": 537, + "step_time": 238.9646631795913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 613.0, + "completions/mean_terminated_length": 613.0, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "entropy": 0.48939642310142517, + "epoch": 2.8465608465608465, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.25, + "kl": 0.0019839375745505095, + "learning_rate": 4.801635694219079e-07, + "loss": 0.0307, + "num_tokens": 10466963.0, + "reward": 0.484375, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.484375, + "rewards/itbench_correctness/std": 0.503891110420227, + "step": 538, + "step_time": 778.2056272830814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 868.4375, + "completions/mean_terminated_length": 747.4444580078125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.27981287240982056, + "epoch": 2.851851851851852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.62890625, + "kl": 0.0014373651938512921, + "learning_rate": 4.785115128197298e-07, + "loss": -0.1155, + "num_tokens": 10487634.0, + "reward": 0.3897058963775635, + "reward_std": 0.16693422198295593, + "rewards/itbench_correctness/mean": 0.3897058963775635, + "rewards/itbench_correctness/std": 0.46261632442474365, + "step": 539, + "step_time": 163.97635082527995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 695.375, + "completions/mean_terminated_length": 546.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5637246370315552, + "epoch": 2.857142857142857, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.734375, + "kl": 0.0018984549678862095, + "learning_rate": 4.768596912122045e-07, + "loss": -0.1365, + "num_tokens": 10528456.0, + "reward": 0.375, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.5, + "step": 540, + "step_time": 148.45136263035238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 971.5, + "completions/mean_terminated_length": 856.0, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.4776119291782379, + "epoch": 2.8624338624338623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.0011911564506590366, + "learning_rate": 4.752081226633888e-07, + "loss": 0.0568, + "num_tokens": 10571880.0, + "reward": 0.5, + "reward_std": 0.3535533845424652, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 541, + "step_time": 150.04778977762908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 988.6875, + "completions/mean_terminated_length": 882.75, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "entropy": 0.5259498357772827, + "epoch": 2.867724867724868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.001533015980385244, + "learning_rate": 4.7355682523457173e-07, + "loss": 0.0001, + "num_tokens": 10605523.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 542, + "step_time": 117.57136417739093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 550.3125, + "completions/mean_terminated_length": 550.3125, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 0.523339033126831, + "epoch": 2.873015873015873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.002573953941464424, + "learning_rate": 4.719058169840772e-07, + "loss": 0.0001, + "num_tokens": 10628208.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 543, + "step_time": 99.59436613786966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 829.5, + "completions/mean_terminated_length": 635.0, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.3833634853363037, + "epoch": 2.878306878306878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.0010798868024721742, + "learning_rate": 4.702551159670672e-07, + "loss": -0.0133, + "num_tokens": 10649920.0, + "reward": 0.4388020932674408, + "reward_std": 0.24691906571388245, + "rewards/itbench_correctness/mean": 0.4388020932674408, + "rewards/itbench_correctness/std": 0.3513753414154053, + "step": 544, + "step_time": 112.22929359227419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 786.3125, + "completions/mean_terminated_length": 643.7000122070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5061600804328918, + "epoch": 2.8835978835978837, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.56640625, + "kl": 0.001771087758243084, + "learning_rate": 4.686047402353433e-07, + "loss": -0.1004, + "num_tokens": 10672789.0, + "reward": 0.3177083432674408, + "reward_std": 0.129746213555336, + "rewards/itbench_correctness/mean": 0.3177083432674408, + "rewards/itbench_correctness/std": 0.37294963002204895, + "step": 545, + "step_time": 111.85205744486302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 618.625, + "completions/mean_terminated_length": 618.625, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.2796524465084076, + "epoch": 2.888888888888889, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.203125, + "kl": 0.0013817870058119297, + "learning_rate": 4.669547078371503e-07, + "loss": -0.0018, + "num_tokens": 10688151.0, + "reward": 0.6770833730697632, + "reward_std": 0.08258593082427979, + "rewards/itbench_correctness/mean": 0.6770833730697632, + "rewards/itbench_correctness/std": 0.3520771563053131, + "step": 546, + "step_time": 128.49444034136832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 718.5, + "completions/mean_terminated_length": 698.1333618164062, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.5622825622558594, + "epoch": 2.894179894179894, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3203125, + "kl": 0.0019919220358133316, + "learning_rate": 4.6530503681697796e-07, + "loss": 0.0318, + "num_tokens": 10714559.0, + "reward": 0.17499999701976776, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.17499999701976776, + "rewards/itbench_correctness/std": 0.19832633435726166, + "step": 547, + "step_time": 282.7451619775966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 453.4375, + "completions/mean_terminated_length": 453.4375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.4057891070842743, + "epoch": 2.8994708994708995, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.140625, + "kl": 0.0013302209554240108, + "learning_rate": 4.6365574521536446e-07, + "loss": 0.0067, + "num_tokens": 10725190.0, + "reward": 0.234375, + "reward_std": 0.12387890368700027, + "rewards/itbench_correctness/mean": 0.234375, + "rewards/itbench_correctness/std": 0.29536348581314087, + "step": 548, + "step_time": 440.6484692748636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 755.5625, + "completions/mean_terminated_length": 594.5, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "entropy": 0.5479361414909363, + "epoch": 2.9047619047619047, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.002523271832615137, + "learning_rate": 4.620068510686984e-07, + "loss": 0.0625, + "num_tokens": 10743759.0, + "reward": 0.4166666865348816, + "reward_std": 0.3667176067829132, + "rewards/itbench_correctness/mean": 0.4166666865348816, + "rewards/itbench_correctness/std": 0.38005849719047546, + "step": 549, + "step_time": 73.56386850681156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 692.0, + "completions/mean_terminated_length": 433.77777099609375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.3309248685836792, + "epoch": 2.91005291005291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.001961898524314165, + "learning_rate": 4.60358372409022e-07, + "loss": 0.0117, + "num_tokens": 10761847.0, + "reward": 0.5104166865348816, + "reward_std": 0.2609047293663025, + "rewards/itbench_correctness/mean": 0.5104166865348816, + "rewards/itbench_correctness/std": 0.27533650398254395, + "step": 550, + "step_time": 100.34413592051715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 539.6875, + "completions/mean_terminated_length": 539.6875, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "entropy": 0.4928778111934662, + "epoch": 2.9153439153439153, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.002205328783020377, + "learning_rate": 4.5871032726383385e-07, + "loss": -0.008, + "num_tokens": 10785042.0, + "reward": 0.3125, + "reward_std": 0.44403791427612305, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 551, + "step_time": 102.3933826405555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 950.9375, + "completions/mean_terminated_length": 894.1111450195312, + "completions/min_length": 639.0, + "completions/min_terminated_length": 639.0, + "entropy": 0.3764705955982208, + "epoch": 2.9206349206349205, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5859375, + "kl": 0.0018417153041809797, + "learning_rate": 4.5706273365589144e-07, + "loss": -0.0023, + "num_tokens": 10806801.0, + "reward": 0.2447916567325592, + "reward_std": 0.20343953371047974, + "rewards/itbench_correctness/mean": 0.2447916567325592, + "rewards/itbench_correctness/std": 0.37573233246803284, + "step": 552, + "step_time": 254.7854423839599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 713.875, + "completions/mean_terminated_length": 713.875, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "entropy": 0.4454561471939087, + "epoch": 2.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.0015814114594832063, + "learning_rate": 4.554156096030148e-07, + "loss": 0.0134, + "num_tokens": 10823391.0, + "reward": 0.859375, + "reward_std": 0.17926117777824402, + "rewards/itbench_correctness/mean": 0.859375, + "rewards/itbench_correctness/std": 0.17405499517917633, + "step": 553, + "step_time": 135.78249835129827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 1012.5625, + "completions/mean_terminated_length": 978.25, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "entropy": 0.30615395307540894, + "epoch": 2.931216931216931, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6328125, + "kl": 0.001216786215081811, + "learning_rate": 4.5376897311788825e-07, + "loss": -0.002, + "num_tokens": 10847512.0, + "reward": 0.1041666716337204, + "reward_std": 0.12400396913290024, + "rewards/itbench_correctness/mean": 0.1041666716337204, + "rewards/itbench_correctness/std": 0.2006932497024536, + "step": 554, + "step_time": 7353.796993748285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 883.5625, + "completions/mean_terminated_length": 743.125, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "entropy": 0.6835962533950806, + "epoch": 2.9365079365079367, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5625, + "kl": 0.001729651470668614, + "learning_rate": 4.521228422078649e-07, + "loss": 0.0001, + "num_tokens": 10879377.0, + "reward": 0.1484375, + "reward_std": 0.17971175909042358, + "rewards/itbench_correctness/mean": 0.1484375, + "rewards/itbench_correctness/std": 0.2894634008407593, + "step": 555, + "step_time": 211.0395448282361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 859.25, + "completions/mean_terminated_length": 731.1111450195312, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.4469013810157776, + "epoch": 2.941798941798942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.001122955116443336, + "learning_rate": 4.5047723487476864e-07, + "loss": 0.0002, + "num_tokens": 10909653.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 556, + "step_time": 91.0195178175345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 452.875, + "completions/mean_terminated_length": 452.875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.47474467754364014, + "epoch": 2.947089947089947, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.0027064280584454536, + "learning_rate": 4.488321691146975e-07, + "loss": -0.0516, + "num_tokens": 10919539.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 557, + "step_time": 96.5944811757654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 443.125, + "completions/mean_terminated_length": 443.125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.4874471127986908, + "epoch": 2.9523809523809526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0014341843780130148, + "learning_rate": 4.4718766291782723e-07, + "loss": 0.0231, + "num_tokens": 10928989.0, + "reward": 0.1875, + "reward_std": 0.4082317352294922, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 558, + "step_time": 76.11392251215875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 683.625, + "completions/mean_terminated_length": 528.9091186523438, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "entropy": 0.38032546639442444, + "epoch": 2.9576719576719577, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4375, + "kl": 0.0016363576287403703, + "learning_rate": 4.4554373426821367e-07, + "loss": -0.0025, + "num_tokens": 10944967.0, + "reward": 0.4375, + "reward_std": 0.2077372521162033, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.291070818901062, + "step": 559, + "step_time": 134.31548726093024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 681.75, + "completions/mean_terminated_length": 681.75, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.3021635413169861, + "epoch": 2.962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.0013805434573441744, + "learning_rate": 4.439004011435979e-07, + "loss": 0.0081, + "num_tokens": 10961347.0, + "reward": 0.5416666865348816, + "reward_std": 0.21535253524780273, + "rewards/itbench_correctness/mean": 0.5416666865348816, + "rewards/itbench_correctness/std": 0.4238273799419403, + "step": 560, + "step_time": 93.65936294849962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 731.3125, + "completions/mean_terminated_length": 633.75, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.43756943941116333, + "epoch": 2.9682539682539684, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.0011807921109721065, + "learning_rate": 4.4225768151520694e-07, + "loss": 0.0115, + "num_tokens": 10977016.0, + "reward": 0.5282738208770752, + "reward_std": 0.09272660315036774, + "rewards/itbench_correctness/mean": 0.5282738208770752, + "rewards/itbench_correctness/std": 0.390090674161911, + "step": 561, + "step_time": 83.44914623722434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 700.75, + "completions/mean_terminated_length": 700.75, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "entropy": 0.54513019323349, + "epoch": 2.9735449735449735, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.98828125, + "kl": 0.0026250071823596954, + "learning_rate": 4.406155933475599e-07, + "loss": 0.0199, + "num_tokens": 11001308.0, + "reward": 0.9479166269302368, + "reward_std": 0.043129097670316696, + "rewards/itbench_correctness/mean": 0.9479166269302368, + "rewards/itbench_correctness/std": 0.07978560030460358, + "step": 562, + "step_time": 114.21156205888838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 708.8125, + "completions/mean_terminated_length": 636.0769653320312, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.2962701618671417, + "epoch": 2.9788359788359786, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.001674781320616603, + "learning_rate": 4.389741545982699e-07, + "loss": -0.0206, + "num_tokens": 11017705.0, + "reward": 0.2916666865348816, + "reward_std": 0.4096291959285736, + "rewards/itbench_correctness/mean": 0.2916666865348816, + "rewards/itbench_correctness/std": 0.4013864994049072, + "step": 563, + "step_time": 173.21209927741438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 483.6875, + "completions/mean_terminated_length": 483.6875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "entropy": 0.6161002516746521, + "epoch": 2.984126984126984, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3203125, + "kl": 0.0027410590555518866, + "learning_rate": 4.3733338321784777e-07, + "loss": 0.0192, + "num_tokens": 11029732.0, + "reward": 0.40625, + "reward_std": 0.08258593827486038, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.43448033928871155, + "step": 564, + "step_time": 86.51283952593803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 822.0625, + "completions/mean_terminated_length": 665.0, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.3941306173801422, + "epoch": 2.9894179894179893, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.28125, + "kl": 0.0017499164678156376, + "learning_rate": 4.3569329714950703e-07, + "loss": -0.0189, + "num_tokens": 11051933.0, + "reward": 0.21875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.20155644416809082, + "step": 565, + "step_time": 424.9380031451583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 931.5, + "completions/mean_terminated_length": 839.0, + "completions/min_length": 676.0, + "completions/min_terminated_length": 676.0, + "entropy": 0.5625335574150085, + "epoch": 2.9947089947089944, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4453125, + "kl": 0.0013327021151781082, + "learning_rate": 4.340539143289655e-07, + "loss": 0.0, + "num_tokens": 11079021.0, + "reward": 0.11249999701976776, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.11249999701976776, + "rewards/itbench_correctness/std": 0.24186775088310242, + "step": 566, + "step_time": 103.31370590813458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 806.8125, + "completions/mean_terminated_length": 756.6923217773438, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "entropy": 0.5825393199920654, + "epoch": 3.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.0027419670950621367, + "learning_rate": 4.324152526842517e-07, + "loss": 0.0188, + "num_tokens": 11105082.0, + "reward": 0.3854166865348816, + "reward_std": 0.46477773785591125, + "rewards/itbench_correctness/mean": 0.3854166865348816, + "rewards/itbench_correctness/std": 0.4702983796596527, + "step": 567, + "step_time": 128.40436456073076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 836.625, + "completions/mean_terminated_length": 649.25, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "entropy": 0.3920513987541199, + "epoch": 3.005291005291005, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3359375, + "kl": 0.0014088655589148402, + "learning_rate": 4.307773301355062e-07, + "loss": 0.0, + "num_tokens": 11133620.0, + "reward": 0.34166669845581055, + "reward_std": 0.1725163757801056, + "rewards/itbench_correctness/mean": 0.34166669845581055, + "rewards/itbench_correctness/std": 0.3432955741882324, + "step": 568, + "step_time": 109.38008708879352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 684.5625, + "completions/mean_terminated_length": 345.125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.5375696420669556, + "epoch": 3.0105820105820107, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.0021559942979365587, + "learning_rate": 4.2914016459478786e-07, + "loss": 0.0188, + "num_tokens": 11152141.0, + "reward": 0.4140625, + "reward_std": 0.24306795001029968, + "rewards/itbench_correctness/mean": 0.4140625, + "rewards/itbench_correctness/std": 0.2446032166481018, + "step": 569, + "step_time": 244.70517920982093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 738.1875, + "completions/mean_terminated_length": 608.2727661132812, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.5039370059967041, + "epoch": 3.015873015873016, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.25, + "kl": 0.0019608521834015846, + "learning_rate": 4.275037739658771e-07, + "loss": 0.0055, + "num_tokens": 11174272.0, + "reward": 0.0989583358168602, + "reward_std": 0.03100099228322506, + "rewards/itbench_correctness/mean": 0.0989583358168602, + "rewards/itbench_correctness/std": 0.11063265055418015, + "step": 570, + "step_time": 694.2117186943069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 720.5625, + "completions/mean_terminated_length": 538.5, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.5023853182792664, + "epoch": 3.0211640211640214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026611328125, + "kl": 0.0014148685149848461, + "learning_rate": 4.258681761440789e-07, + "loss": 0.0, + "num_tokens": 11210273.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 571, + "step_time": 249.2116071432829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 720.6875, + "completions/mean_terminated_length": 484.77777099609375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 0.3996184170246124, + "epoch": 3.0264550264550265, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.40625, + "kl": 0.0012311309110373259, + "learning_rate": 4.2423338901602983e-07, + "loss": 0.0147, + "num_tokens": 11229236.0, + "reward": 0.7708333730697632, + "reward_std": 0.19795581698417664, + "rewards/itbench_correctness/mean": 0.7708333730697632, + "rewards/itbench_correctness/std": 0.35939764976501465, + "step": 572, + "step_time": 247.85055056307465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 520.125, + "completions/mean_terminated_length": 520.125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.35183849930763245, + "epoch": 3.0317460317460316, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.03125, + "kl": 0.002790507161989808, + "learning_rate": 4.225994304594993e-07, + "loss": -0.0321, + "num_tokens": 11242278.0, + "reward": 0.1875, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.25, + "step": 573, + "step_time": 426.5937115754932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 835.625, + "completions/mean_terminated_length": 792.1538696289062, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.48107704520225525, + "epoch": 3.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.0021311482414603233, + "learning_rate": 4.2096631834319687e-07, + "loss": -0.0252, + "num_tokens": 11264776.0, + "reward": 0.265625, + "reward_std": 0.2204262614250183, + "rewards/itbench_correctness/mean": 0.265625, + "rewards/itbench_correctness/std": 0.2183031290769577, + "step": 574, + "step_time": 419.11753554455936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 638.6875, + "completions/mean_terminated_length": 638.6875, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "entropy": 0.375770628452301, + "epoch": 3.0423280423280423, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.40625, + "kl": 0.0015504560433328152, + "learning_rate": 4.193340705265745e-07, + "loss": 0.0162, + "num_tokens": 11285235.0, + "reward": 0.47968751192092896, + "reward_std": 0.17499202489852905, + "rewards/itbench_correctness/mean": 0.47968751192092896, + "rewards/itbench_correctness/std": 0.4592764973640442, + "step": 575, + "step_time": 152.97607036307454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 607.8125, + "completions/mean_terminated_length": 607.8125, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "entropy": 0.39650386571884155, + "epoch": 3.0476190476190474, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.0011127422330901027, + "learning_rate": 4.1770270485963294e-07, + "loss": -0.0156, + "num_tokens": 11298640.0, + "reward": 0.6644607782363892, + "reward_std": 0.04214790090918541, + "rewards/itbench_correctness/mean": 0.6644607782363892, + "rewards/itbench_correctness/std": 0.2791382670402527, + "step": 576, + "step_time": 166.16624604724348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 698.3125, + "completions/mean_terminated_length": 698.3125, + "completions/min_length": 625.0, + "completions/min_terminated_length": 625.0, + "entropy": 0.3837823271751404, + "epoch": 3.052910052910053, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.140625, + "kl": 0.0011828916613012552, + "learning_rate": 4.1607223918272614e-07, + "loss": -0.002, + "num_tokens": 11313829.0, + "reward": 0.625, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.5, + "step": 577, + "step_time": 81.87011110130697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 517.3125, + "completions/mean_terminated_length": 517.3125, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.3595505654811859, + "epoch": 3.058201058201058, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.125, + "kl": 0.001632296247407794, + "learning_rate": 4.1444269132636494e-07, + "loss": 0.0011, + "num_tokens": 11325674.0, + "reward": 0.96875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.96875, + "rewards/itbench_correctness/std": 0.125, + "step": 578, + "step_time": 853.527063309215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 753.4375, + "completions/mean_terminated_length": 630.45458984375, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "entropy": 0.5415180325508118, + "epoch": 3.0634920634920633, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.390625, + "kl": 0.016372833400964737, + "learning_rate": 4.1281407911102424e-07, + "loss": 0.0314, + "num_tokens": 11359377.0, + "reward": 0.0234375, + "reward_std": 0.03234682232141495, + "rewards/itbench_correctness/mean": 0.0234375, + "rewards/itbench_correctness/std": 0.050389111042022705, + "step": 579, + "step_time": 93.2713974667713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 592.8125, + "completions/mean_terminated_length": 531.2142944335938, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.4048497676849365, + "epoch": 3.068783068783069, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.0021097399294376373, + "learning_rate": 4.1118642034694565e-07, + "loss": 0.0827, + "num_tokens": 11371702.0, + "reward": 0.34375, + "reward_std": 0.21564549207687378, + "rewards/itbench_correctness/mean": 0.34375, + "rewards/itbench_correctness/std": 0.46135368943214417, + "step": 580, + "step_time": 78.31074696686119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 500.0, + "completions/mean_terminated_length": 500.0, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.4180000126361847, + "epoch": 3.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.0022473859135061502, + "learning_rate": 4.095597328339452e-07, + "loss": 0.003, + "num_tokens": 11395878.0, + "reward": 0.75, + "reward_std": 0.4355512857437134, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 581, + "step_time": 106.9057395812124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 953.8125, + "completions/mean_terminated_length": 462.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5346962809562683, + "epoch": 3.0793650793650795, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.0019432251574471593, + "learning_rate": 4.079340343612164e-07, + "loss": -0.0605, + "num_tokens": 11434283.0, + "reward": 0.1145833358168602, + "reward_std": 0.17747542262077332, + "rewards/itbench_correctness/mean": 0.1145833358168602, + "rewards/itbench_correctness/std": 0.17969882488250732, + "step": 582, + "step_time": 153.69186680205166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 930.5, + "completions/mean_terminated_length": 810.2857666015625, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "entropy": 0.42557764053344727, + "epoch": 3.0846560846560847, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.001963236602023244, + "learning_rate": 4.0630934270713755e-07, + "loss": 0.015, + "num_tokens": 11456923.0, + "reward": 0.20416668057441711, + "reward_std": 0.2299290895462036, + "rewards/itbench_correctness/mean": 0.20416668057441711, + "rewards/itbench_correctness/std": 0.2864534258842468, + "step": 583, + "step_time": 133.70225734543055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 510.6875, + "completions/mean_terminated_length": 510.6875, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 0.5052013397216797, + "epoch": 3.0899470899470898, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0240478515625, + "kl": 0.001320964889600873, + "learning_rate": 4.046856756390766e-07, + "loss": 0.0, + "num_tokens": 11468238.0, + "reward": 0.5833333134651184, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5833333134651184, + "rewards/itbench_correctness/std": 0.4303314983844757, + "step": 584, + "step_time": 56.649503622204065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 657.5625, + "completions/mean_terminated_length": 657.5625, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "entropy": 0.5353103280067444, + "epoch": 3.0952380952380953, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6328125, + "kl": 0.0017962680431082845, + "learning_rate": 4.030630509131959e-07, + "loss": -0.0069, + "num_tokens": 11496463.0, + "reward": 0.875, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 585, + "step_time": 126.4090378023684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.63671875, + "epoch": 3.1005291005291005, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.0013208300806581974, + "learning_rate": 4.0144148627425986e-07, + "loss": 0.0001, + "num_tokens": 11522143.0, + "reward": 0.1875, + "reward_std": 0.27381423115730286, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.273861289024353, + "step": 586, + "step_time": 592.935981715098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 712.0625, + "completions/mean_terminated_length": 400.125, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.252786785364151, + "epoch": 3.105820105820106, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.85546875, + "kl": 0.001402621390298009, + "learning_rate": 3.998209994554394e-07, + "loss": -0.0316, + "num_tokens": 11543664.0, + "reward": 0.375, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.5, + "step": 587, + "step_time": 146.66224777232856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 716.3125, + "completions/mean_terminated_length": 695.800048828125, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "entropy": 0.5612075924873352, + "epoch": 3.111111111111111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022705078125, + "kl": 0.0013308442430570722, + "learning_rate": 3.9820160817811887e-07, + "loss": 0.0, + "num_tokens": 11582781.0, + "reward": 0.05000000074505806, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.05000000074505806, + "rewards/itbench_correctness/std": 0.05163978040218353, + "step": 588, + "step_time": 122.08641688153148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 505.375, + "completions/mean_terminated_length": 505.375, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "entropy": 0.31263911724090576, + "epoch": 3.1164021164021163, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.40625, + "kl": 0.0014801534125581384, + "learning_rate": 3.965833301517016e-07, + "loss": 0.0609, + "num_tokens": 11594227.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 589, + "step_time": 936.1484231920913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 464.875, + "completions/mean_terminated_length": 464.875, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 0.46248990297317505, + "epoch": 3.121693121693122, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.0018392398487776518, + "learning_rate": 3.9496618307341713e-07, + "loss": 0.0127, + "num_tokens": 11610305.0, + "reward": 0.24289773404598236, + "reward_std": 0.14991973340511322, + "rewards/itbench_correctness/mean": 0.24289773404598236, + "rewards/itbench_correctness/std": 0.15188638865947723, + "step": 590, + "step_time": 92.19220882095397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 608.3125, + "completions/mean_terminated_length": 608.3125, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 0.5589232444763184, + "epoch": 3.126984126984127, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4609375, + "kl": 0.0023250363301485777, + "learning_rate": 3.9335018462812664e-07, + "loss": 0.0092, + "num_tokens": 11631278.0, + "reward": 0.3999999761581421, + "reward_std": 0.16903084516525269, + "rewards/itbench_correctness/mean": 0.3999999761581421, + "rewards/itbench_correctness/std": 0.47328636050224304, + "step": 591, + "step_time": 87.29508406948298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 611.375, + "completions/mean_terminated_length": 611.375, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.5332242846488953, + "epoch": 3.132275132275132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.0023294584825634956, + "learning_rate": 3.9173535248813017e-07, + "loss": 0.0001, + "num_tokens": 11654876.0, + "reward": 0.0833333358168602, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0833333358168602, + "rewards/itbench_correctness/std": 0.08606629818677902, + "step": 592, + "step_time": 78.46097278501838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 482.6875, + "completions/mean_terminated_length": 482.6875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4537096917629242, + "epoch": 3.1375661375661377, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.416015625, + "kl": 0.0019261433044448495, + "learning_rate": 3.901217043129734e-07, + "loss": -0.0825, + "num_tokens": 11666455.0, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.25, + "step": 593, + "step_time": 112.39422312192619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 745.5, + "completions/mean_terminated_length": 618.9091186523438, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.4077800214290619, + "epoch": 3.142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.0012227533152326941, + "learning_rate": 3.885092577492542e-07, + "loss": 0.0307, + "num_tokens": 11693759.0, + "reward": 0.5625, + "reward_std": 0.49022960662841797, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 594, + "step_time": 92.09622034989297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 869.4375, + "completions/mean_terminated_length": 847.357177734375, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "entropy": 0.33354899287223816, + "epoch": 3.148148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.0010007465025410056, + "learning_rate": 3.8689803043042996e-07, + "loss": -0.0018, + "num_tokens": 11713342.0, + "reward": 0.5208333730697632, + "reward_std": 0.4459637701511383, + "rewards/itbench_correctness/mean": 0.5208333730697632, + "rewards/itbench_correctness/std": 0.438325971364975, + "step": 595, + "step_time": 515.5870687887073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 442.25, + "completions/mean_terminated_length": 442.25, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.4250989258289337, + "epoch": 3.1534391534391535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.001579253701493144, + "learning_rate": 3.8528803997662423e-07, + "loss": -0.0061, + "num_tokens": 11723002.0, + "reward": 0.48750001192092896, + "reward_std": 0.1636853665113449, + "rewards/itbench_correctness/mean": 0.48750001192092896, + "rewards/itbench_correctness/std": 0.16683325171470642, + "step": 596, + "step_time": 60.05524417478591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 720.0, + "completions/mean_terminated_length": 416.0, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.3097222149372101, + "epoch": 3.1587301587301586, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1484375, + "kl": 0.0024181241169571877, + "learning_rate": 3.8367930399443486e-07, + "loss": -0.0056, + "num_tokens": 11740490.0, + "reward": 0.2395833432674408, + "reward_std": 0.0883883386850357, + "rewards/itbench_correctness/mean": 0.2395833432674408, + "rewards/itbench_correctness/std": 0.27533650398254395, + "step": 597, + "step_time": 7237.127478616312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 860.0, + "completions/mean_terminated_length": 805.3333740234375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.43023255467414856, + "epoch": 3.164021164021164, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3671875, + "kl": 0.001607713638804853, + "learning_rate": 3.8207184007674085e-07, + "loss": 0.001, + "num_tokens": 11764610.0, + "reward": 0.7083333730697632, + "reward_std": 0.19416078925132751, + "rewards/itbench_correctness/mean": 0.7083333730697632, + "rewards/itbench_correctness/std": 0.40138646960258484, + "step": 598, + "step_time": 144.42800151277333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 423.9375, + "completions/mean_terminated_length": 423.9375, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "entropy": 0.35382574796676636, + "epoch": 3.1693121693121693, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.003284444333985448, + "learning_rate": 3.8046566580250995e-07, + "loss": 0.0187, + "num_tokens": 11774265.0, + "reward": 0.6006944179534912, + "reward_std": 0.2023771107196808, + "rewards/itbench_correctness/mean": 0.6006944179534912, + "rewards/itbench_correctness/std": 0.25677546858787537, + "step": 599, + "step_time": 91.41469971835613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 745.9375, + "completions/mean_terminated_length": 467.875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.4826141595840454, + "epoch": 3.1746031746031744, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0022031215485185385, + "learning_rate": 3.788607987366069e-07, + "loss": -0.0035, + "num_tokens": 11791280.0, + "reward": 0.6674107313156128, + "reward_std": 0.29377472400665283, + "rewards/itbench_correctness/mean": 0.6674107313156128, + "rewards/itbench_correctness/std": 0.40818971395492554, + "step": 600, + "step_time": 727.5520837632939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 637.4375, + "completions/mean_terminated_length": 637.4375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 0.6149622797966003, + "epoch": 3.17989417989418, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1953125, + "kl": 0.0012068739160895348, + "learning_rate": 3.772572564296004e-07, + "loss": 0.0218, + "num_tokens": 11804407.0, + "reward": 0.8541666865348816, + "reward_std": 0.022271769121289253, + "rewards/itbench_correctness/mean": 0.8541666865348816, + "rewards/itbench_correctness/std": 0.15365907549858093, + "step": 601, + "step_time": 197.79692050255835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 553.5, + "completions/mean_terminated_length": 553.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4065040647983551, + "epoch": 3.185185185185185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.0014250976964831352, + "learning_rate": 3.7565505641757266e-07, + "loss": -0.106, + "num_tokens": 11817495.0, + "reward": 0.5593750476837158, + "reward_std": 0.18626472353935242, + "rewards/itbench_correctness/mean": 0.5593750476837158, + "rewards/itbench_correctness/std": 0.26154589653015137, + "step": 602, + "step_time": 91.9841024801135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 829.5, + "completions/mean_terminated_length": 816.5333862304688, + "completions/min_length": 658.0, + "completions/min_terminated_length": 658.0, + "entropy": 0.44605183601379395, + "epoch": 3.1904761904761907, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.296875, + "kl": 0.0014674547128379345, + "learning_rate": 3.74054216221926e-07, + "loss": -0.0149, + "num_tokens": 11837095.0, + "reward": 0.1666666716337204, + "reward_std": 0.2182178944349289, + "rewards/itbench_correctness/mean": 0.1666666716337204, + "rewards/itbench_correctness/std": 0.3442651927471161, + "step": 603, + "step_time": 129.12120711896569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 551.9375, + "completions/mean_terminated_length": 551.9375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "entropy": 0.24821650981903076, + "epoch": 3.195767195767196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.46875, + "kl": 0.0011176143307238817, + "learning_rate": 3.724547533491924e-07, + "loss": -0.0278, + "num_tokens": 11850606.0, + "reward": 0.2187500149011612, + "reward_std": 0.0883883535861969, + "rewards/itbench_correctness/mean": 0.2187500149011612, + "rewards/itbench_correctness/std": 0.2561737895011902, + "step": 604, + "step_time": 94.76566615886986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 537.6875, + "completions/mean_terminated_length": 537.6875, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 0.4314773976802826, + "epoch": 3.201058201058201, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.0012784149730578065, + "learning_rate": 3.708566852908418e-07, + "loss": 0.0069, + "num_tokens": 11862497.0, + "reward": 0.21875, + "reward_std": 0.2609178125858307, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.3204091787338257, + "step": 605, + "step_time": 816.5052793165669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 623.4375, + "completions/mean_terminated_length": 623.4375, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.3528822064399719, + "epoch": 3.2063492063492065, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.001327100908383727, + "learning_rate": 3.692600295230901e-07, + "loss": 0.0131, + "num_tokens": 11878112.0, + "reward": 0.6583333611488342, + "reward_std": 0.11426578462123871, + "rewards/itbench_correctness/mean": 0.6583333611488342, + "rewards/itbench_correctness/std": 0.22377237677574158, + "step": 606, + "step_time": 69.29318222776055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 989.8125, + "completions/mean_terminated_length": 887.25, + "completions/min_length": 777.0, + "completions/min_terminated_length": 777.0, + "entropy": 0.6627517938613892, + "epoch": 3.2116402116402116, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.609375, + "kl": 0.0013202429981902242, + "learning_rate": 3.6766480350670925e-07, + "loss": 0.0134, + "num_tokens": 11904909.0, + "reward": 0.3958333432674408, + "reward_std": 0.19795581698417664, + "rewards/itbench_correctness/mean": 0.3958333432674408, + "rewards/itbench_correctness/std": 0.4901813864707947, + "step": 607, + "step_time": 82.23766458127648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 1011.5625, + "completions/mean_terminated_length": 957.6666870117188, + "completions/min_length": 915.0, + "completions/min_terminated_length": 915.0, + "entropy": 0.29854804277420044, + "epoch": 3.2169312169312168, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.00097948859911412, + "learning_rate": 3.660710246868352e-07, + "loss": -0.0092, + "num_tokens": 11932430.0, + "reward": 0.6073908805847168, + "reward_std": 0.2576354146003723, + "rewards/itbench_correctness/mean": 0.6073908805847168, + "rewards/itbench_correctness/std": 0.26400262117385864, + "step": 608, + "step_time": 116.86947522684932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 644.625, + "completions/mean_terminated_length": 619.3333740234375, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "entropy": 0.5987977385520935, + "epoch": 3.2222222222222223, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.001607781508937478, + "learning_rate": 3.6447871049277796e-07, + "loss": 0.0406, + "num_tokens": 11946784.0, + "reward": 0.6499999761581421, + "reward_std": 0.3926178812980652, + "rewards/itbench_correctness/mean": 0.6499999761581421, + "rewards/itbench_correctness/std": 0.43204939365386963, + "step": 609, + "step_time": 80.47608442325145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 678.625, + "completions/mean_terminated_length": 678.625, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.6159513592720032, + "epoch": 3.2275132275132274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.001993054524064064, + "learning_rate": 3.6288787833783016e-07, + "loss": 0.0001, + "num_tokens": 11972402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 610, + "step_time": 71.49992215260863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 725.8125, + "completions/mean_terminated_length": 683.2142944335938, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "entropy": 0.410574346780777, + "epoch": 3.2328042328042326, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.001559928059577942, + "learning_rate": 3.612985456190778e-07, + "loss": 0.0063, + "num_tokens": 11988567.0, + "reward": 0.3571428656578064, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.3571428656578064, + "rewards/itbench_correctness/std": 0.39382997155189514, + "step": 611, + "step_time": 165.54018260445446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 925.25, + "completions/mean_terminated_length": 826.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5879492163658142, + "epoch": 3.238095238095238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.0012924325419589877, + "learning_rate": 3.597107297172084e-07, + "loss": -0.0471, + "num_tokens": 12017675.0, + "reward": 0.265625, + "reward_std": 0.3114553987979889, + "rewards/itbench_correctness/mean": 0.265625, + "rewards/itbench_correctness/std": 0.4422362744808197, + "step": 612, + "step_time": 94.39502456784248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 862.375, + "completions/mean_terminated_length": 700.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.3385998010635376, + "epoch": 3.2433862433862433, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.001366258948110044, + "learning_rate": 3.5812444799632247e-07, + "loss": -0.0171, + "num_tokens": 12043681.0, + "reward": 0.5, + "reward_std": 0.5175491571426392, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 613, + "step_time": 669.0384511752054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 880.625, + "completions/mean_terminated_length": 832.8333740234375, + "completions/min_length": 580.0, + "completions/min_terminated_length": 580.0, + "entropy": 0.5132718086242676, + "epoch": 3.248677248677249, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6015625, + "kl": 0.0020796628668904305, + "learning_rate": 3.565397178037429e-07, + "loss": 0.0037, + "num_tokens": 12073499.0, + "reward": 0.25, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 614, + "step_time": 113.26465024612844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 495.875, + "completions/mean_terminated_length": 495.875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "entropy": 0.45777666568756104, + "epoch": 3.253968253968254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.0019308277405798435, + "learning_rate": 3.5495655646982503e-07, + "loss": -0.0023, + "num_tokens": 12093601.0, + "reward": 0.5227272510528564, + "reward_std": 0.32154878973960876, + "rewards/itbench_correctness/mean": 0.5227272510528564, + "rewards/itbench_correctness/std": 0.41261112689971924, + "step": 615, + "step_time": 105.98786111921072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 480.8125, + "completions/mean_terminated_length": 480.8125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.29949304461479187, + "epoch": 3.259259259259259, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.001420930726453662, + "learning_rate": 3.533749813077677e-07, + "loss": -0.0137, + "num_tokens": 12104318.0, + "reward": 0.43359375, + "reward_std": 0.2698231339454651, + "rewards/itbench_correctness/mean": 0.43359375, + "rewards/itbench_correctness/std": 0.35901251435279846, + "step": 616, + "step_time": 130.85422169603407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 700.875, + "completions/mean_terminated_length": 507.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5279115438461304, + "epoch": 3.2645502645502646, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0030069290660321712, + "learning_rate": 3.517950096134232e-07, + "loss": -0.0693, + "num_tokens": 12120292.0, + "reward": 0.5572916269302368, + "reward_std": 0.408902645111084, + "rewards/itbench_correctness/mean": 0.5572916269302368, + "rewards/itbench_correctness/std": 0.4146828055381775, + "step": 617, + "step_time": 76.69993899855763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 958.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 681.9375, + "completions/mean_terminated_length": 681.9375, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.3578040599822998, + "epoch": 3.2698412698412698, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.001377190463244915, + "learning_rate": 3.502166586651092e-07, + "loss": 0.0161, + "num_tokens": 12136035.0, + "reward": 0.78125, + "reward_std": 0.2896047830581665, + "rewards/itbench_correctness/mean": 0.78125, + "rewards/itbench_correctness/std": 0.28321075439453125, + "step": 618, + "step_time": 78.15438072942197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 502.5625, + "completions/mean_terminated_length": 502.5625, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "entropy": 0.39796045422554016, + "epoch": 3.2751322751322753, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2734375, + "kl": 0.0014053636696189642, + "learning_rate": 3.4863994572341843e-07, + "loss": -0.0057, + "num_tokens": 12146932.0, + "reward": 0.875, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 619, + "step_time": 814.3009329754859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 677.75, + "completions/mean_terminated_length": 408.4444580078125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.4455920457839966, + "epoch": 3.2804232804232805, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.0013953611487522721, + "learning_rate": 3.470648880310313e-07, + "loss": 0.0023, + "num_tokens": 12164472.0, + "reward": 0.39375001192092896, + "reward_std": 0.23028594255447388, + "rewards/itbench_correctness/mean": 0.39375001192092896, + "rewards/itbench_correctness/std": 0.2535580098628998, + "step": 620, + "step_time": 774.6395965730771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.318359375, + "epoch": 3.2857142857142856, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.609375, + "kl": 0.0011473577469587326, + "learning_rate": 3.454915028125263e-07, + "loss": 0.0, + "num_tokens": 12192136.0, + "reward": 0.2708333432674408, + "reward_std": 0.19795581698417664, + "rewards/itbench_correctness/mean": 0.2708333432674408, + "rewards/itbench_correctness/std": 0.3890872597694397, + "step": 621, + "step_time": 147.70375349000096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 504.4375, + "completions/mean_terminated_length": 504.4375, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.4182876944541931, + "epoch": 3.291005291005291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.002045721746981144, + "learning_rate": 3.43919807274192e-07, + "loss": -0.0186, + "num_tokens": 12203111.0, + "reward": 0.47727274894714355, + "reward_std": 0.29765012860298157, + "rewards/itbench_correctness/mean": 0.47727274894714355, + "rewards/itbench_correctness/std": 0.40613409876823425, + "step": 622, + "step_time": 49.89000040013343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 515.8125, + "completions/mean_terminated_length": 515.8125, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "entropy": 0.4788561761379242, + "epoch": 3.2962962962962963, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1875, + "kl": 0.0019279435509815812, + "learning_rate": 3.4234981860383927e-07, + "loss": 0.014, + "num_tokens": 12215068.0, + "reward": 0.375, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.5, + "step": 623, + "step_time": 93.9447353342548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 831.125, + "completions/mean_terminated_length": 681.1111450195312, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "entropy": 0.4283351004123688, + "epoch": 3.3015873015873014, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.234375, + "kl": 0.0011250257957726717, + "learning_rate": 3.407815539706124e-07, + "loss": 0.003, + "num_tokens": 12235334.0, + "reward": 0.5703125, + "reward_std": 0.05964459478855133, + "rewards/itbench_correctness/mean": 0.5703125, + "rewards/itbench_correctness/std": 0.4511992335319519, + "step": 624, + "step_time": 102.94410282652825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 969.0, + "completions/mean_length": 864.875, + "completions/mean_terminated_length": 741.1111450195312, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.3584333062171936, + "epoch": 3.306878306878307, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.0012643268564715981, + "learning_rate": 3.3921503052480236e-07, + "loss": 0.0147, + "num_tokens": 12256380.0, + "reward": 0.5694444179534912, + "reward_std": 0.30291885137557983, + "rewards/itbench_correctness/mean": 0.5694444179534912, + "rewards/itbench_correctness/std": 0.3557291328907013, + "step": 625, + "step_time": 484.2877219989896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 779.9375, + "completions/mean_terminated_length": 779.9375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.41028928756713867, + "epoch": 3.312169312169312, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.484375, + "kl": 0.0014163429150357842, + "learning_rate": 3.3765026539765827e-07, + "loss": -0.0162, + "num_tokens": 12274915.0, + "reward": 0.30000001192092896, + "reward_std": 0.2507132589817047, + "rewards/itbench_correctness/mean": 0.30000001192092896, + "rewards/itbench_correctness/std": 0.46188023686408997, + "step": 626, + "step_time": 97.47150356322527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 437.3125, + "completions/mean_terminated_length": 437.3125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.541946530342102, + "epoch": 3.317460317460317, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0028036432340741158, + "learning_rate": 3.360872757012011e-07, + "loss": 0.0022, + "num_tokens": 12284448.0, + "reward": 0.3958333432674408, + "reward_std": 0.10767625272274017, + "rewards/itbench_correctness/mean": 0.3958333432674408, + "rewards/itbench_correctness/std": 0.3657817840576172, + "step": 627, + "step_time": 93.49457087833434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 665.375, + "completions/mean_terminated_length": 545.8333740234375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.3877512812614441, + "epoch": 3.322751322751323, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0014740098267793655, + "learning_rate": 3.345260785280358e-07, + "loss": 0.0046, + "num_tokens": 12301662.0, + "reward": 0.609375, + "reward_std": 0.11860001087188721, + "rewards/itbench_correctness/mean": 0.609375, + "rewards/itbench_correctness/std": 0.19654129445552826, + "step": 628, + "step_time": 155.64716604631394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 985.625, + "completions/mean_terminated_length": 921.6666870117188, + "completions/min_length": 835.0, + "completions/min_terminated_length": 835.0, + "entropy": 0.3896005153656006, + "epoch": 3.328042328042328, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4375, + "kl": 0.0012615715386345983, + "learning_rate": 3.329666909511645e-07, + "loss": 0.0, + "num_tokens": 12328376.0, + "reward": 0.07500000298023224, + "reward_std": 0.09161254018545151, + "rewards/itbench_correctness/mean": 0.07500000298023224, + "rewards/itbench_correctness/std": 0.14719600975513458, + "step": 629, + "step_time": 188.39432869665325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 747.5, + "completions/mean_terminated_length": 621.8181762695312, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.32107022404670715, + "epoch": 3.3333333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.0018245604587718844, + "learning_rate": 3.314091300237999e-07, + "loss": 0.0001, + "num_tokens": 12347128.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 630, + "step_time": 1010.9434453165159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 508.3125, + "completions/mean_terminated_length": 508.3125, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 0.38755688071250916, + "epoch": 3.3386243386243386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.0018675376195460558, + "learning_rate": 3.2985341277917846e-07, + "loss": -0.0092, + "num_tokens": 12363957.0, + "reward": 0.640625, + "reward_std": 0.1446593999862671, + "rewards/itbench_correctness/mean": 0.640625, + "rewards/itbench_correctness/std": 0.29181545972824097, + "step": 631, + "step_time": 96.36415668576956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 822.125, + "completions/mean_terminated_length": 822.125, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "entropy": 0.29314276576042175, + "epoch": 3.3439153439153437, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.002633322263136506, + "learning_rate": 3.282995562303753e-07, + "loss": 0.025, + "num_tokens": 12384663.0, + "reward": 0.6741071343421936, + "reward_std": 0.29771745204925537, + "rewards/itbench_correctness/mean": 0.6741071343421936, + "rewards/itbench_correctness/std": 0.3393692374229431, + "step": 632, + "step_time": 81.67576451133937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 741.4375, + "completions/mean_terminated_length": 521.6666870117188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.49633312225341797, + "epoch": 3.3492063492063493, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.0015913519309833646, + "learning_rate": 3.2674757737011606e-07, + "loss": 0.0059, + "num_tokens": 12407326.0, + "reward": 0.4937500059604645, + "reward_std": 0.01767767034471035, + "rewards/itbench_correctness/mean": 0.4937500059604645, + "rewards/itbench_correctness/std": 0.510514497756958, + "step": 633, + "step_time": 96.86188104748726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 776.0625, + "completions/mean_terminated_length": 740.6428833007812, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "entropy": 0.6081984639167786, + "epoch": 3.3544973544973544, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.0018657725304365158, + "learning_rate": 3.2519749317059327e-07, + "loss": 0.0253, + "num_tokens": 12442815.0, + "reward": 0.4375, + "reward_std": 0.49022960662841797, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 634, + "step_time": 115.92930174898356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 700.625, + "completions/mean_terminated_length": 654.4285888671875, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.3297056257724762, + "epoch": 3.35978835978836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.0020810659043490887, + "learning_rate": 3.236493205832794e-07, + "loss": 0.0001, + "num_tokens": 12459393.0, + "reward": 0.0833333358168602, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0833333358168602, + "rewards/itbench_correctness/std": 0.08606629818677902, + "step": 635, + "step_time": 450.1781229842454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 707.625, + "completions/mean_terminated_length": 563.8181762695312, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.48048049211502075, + "epoch": 3.365079365079365, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0546875, + "kl": 0.001259908196516335, + "learning_rate": 3.221030765387417e-07, + "loss": -0.0114, + "num_tokens": 12495003.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 636, + "step_time": 161.46651719231158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 671.0, + "completions/mean_terminated_length": 318.0, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.42324888706207275, + "epoch": 3.3703703703703702, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.03125, + "kl": 0.0016913211438804865, + "learning_rate": 3.205587779464576e-07, + "loss": 0.008, + "num_tokens": 12512171.0, + "reward": 0.71875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 637, + "step_time": 103.04572070110589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 563.5, + "completions/mean_terminated_length": 563.5, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.41703638434410095, + "epoch": 3.375661375661376, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0859375, + "kl": 0.0013338474091142416, + "learning_rate": 3.190164416946285e-07, + "loss": -0.022, + "num_tokens": 12524003.0, + "reward": 0.9166666269302368, + "reward_std": 0.044543541967868805, + "rewards/itbench_correctness/mean": 0.9166666269302368, + "rewards/itbench_correctness/std": 0.10540926456451416, + "step": 638, + "step_time": 89.99323462788016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 590.5625, + "completions/mean_terminated_length": 590.5625, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.3149539530277252, + "epoch": 3.380952380952381, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.28125, + "kl": 0.0011156360851600766, + "learning_rate": 3.174760846499972e-07, + "loss": 0.0079, + "num_tokens": 12539556.0, + "reward": 0.40625, + "reward_std": 0.01767767034471035, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.420267790555954, + "step": 639, + "step_time": 1139.211639557965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 881.3125, + "completions/mean_terminated_length": 567.4000244140625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.3472094237804413, + "epoch": 3.386243386243386, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.34375, + "kl": 0.0015863279113546014, + "learning_rate": 3.15937723657661e-07, + "loss": 0.0695, + "num_tokens": 12568353.0, + "reward": 0.34375, + "reward_std": 0.16925080120563507, + "rewards/itbench_correctness/mean": 0.34375, + "rewards/itbench_correctness/std": 0.4236907958984375, + "step": 640, + "step_time": 180.37901693582535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 970.4375, + "completions/mean_terminated_length": 881.1666870117188, + "completions/min_length": 710.0, + "completions/min_terminated_length": 710.0, + "entropy": 0.6265215277671814, + "epoch": 3.3915343915343916, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0014786267420277, + "learning_rate": 3.1440137554088953e-07, + "loss": 0.0438, + "num_tokens": 12603128.0, + "reward": 0.19999998807907104, + "reward_std": 0.3343248665332794, + "rewards/itbench_correctness/mean": 0.19999998807907104, + "rewards/itbench_correctness/std": 0.3326660096645355, + "step": 641, + "step_time": 174.49532955139875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 603.75, + "completions/mean_terminated_length": 603.75, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4306418299674988, + "epoch": 3.3968253968253967, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.002129571046680212, + "learning_rate": 3.1286705710093984e-07, + "loss": -0.0944, + "num_tokens": 12617060.0, + "reward": 0.737500011920929, + "reward_std": 0.25792384147644043, + "rewards/itbench_correctness/mean": 0.737500011920929, + "rewards/itbench_correctness/std": 0.26884526014328003, + "step": 642, + "step_time": 78.10744374617934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 557.0, + "completions/mean_terminated_length": 557.0, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "entropy": 0.35727110505104065, + "epoch": 3.402116402116402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.0019330759532749653, + "learning_rate": 3.113347851168721e-07, + "loss": 0.0, + "num_tokens": 12629716.0, + "reward": 0.75, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 643, + "step_time": 797.8743101553991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 699.75, + "completions/mean_terminated_length": 552.3636474609375, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "entropy": 0.3372633159160614, + "epoch": 3.4074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.00145020114723593, + "learning_rate": 3.0980457634536774e-07, + "loss": 0.0122, + "num_tokens": 12654240.0, + "reward": 0.46875, + "reward_std": 0.3471629321575165, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.3859512209892273, + "step": 644, + "step_time": 142.23401138465852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 412.625, + "completions/mean_terminated_length": 412.625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.43380793929100037, + "epoch": 3.4126984126984126, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.003082259325310588, + "learning_rate": 3.082764475205442e-07, + "loss": -0.0126, + "num_tokens": 12663858.0, + "reward": 0.828125, + "reward_std": 0.3143535256385803, + "rewards/itbench_correctness/mean": 0.828125, + "rewards/itbench_correctness/std": 0.3502231538295746, + "step": 645, + "step_time": 1102.395908644423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 635.9375, + "completions/mean_terminated_length": 635.9375, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.4497297406196594, + "epoch": 3.417989417989418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.0015347811859101057, + "learning_rate": 3.06750415353774e-07, + "loss": 0.0033, + "num_tokens": 12680857.0, + "reward": 0.5078125, + "reward_std": 0.19887377321720123, + "rewards/itbench_correctness/mean": 0.5078125, + "rewards/itbench_correctness/std": 0.4642843008041382, + "step": 646, + "step_time": 96.87032896187156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 726.5625, + "completions/mean_terminated_length": 657.923095703125, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 0.3275699019432068, + "epoch": 3.4232804232804233, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.001753173884935677, + "learning_rate": 3.052264965335e-07, + "loss": -0.0118, + "num_tokens": 12701762.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 647, + "step_time": 256.6041612662375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 966.4375, + "completions/mean_terminated_length": 870.5, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "entropy": 0.5587531328201294, + "epoch": 3.4285714285714284, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3984375, + "kl": 0.0016219299286603928, + "learning_rate": 3.037047077250543e-07, + "loss": 0.0127, + "num_tokens": 12725865.0, + "reward": 0.3958333432674408, + "reward_std": 0.0862581878900528, + "rewards/itbench_correctness/mean": 0.3958333432674408, + "rewards/itbench_correctness/std": 0.4254627227783203, + "step": 648, + "step_time": 119.4891459485516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 857.0, + "completions/mean_terminated_length": 727.1111450195312, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "entropy": 0.3640606701374054, + "epoch": 3.433862433862434, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5234375, + "kl": 0.001420785440132022, + "learning_rate": 3.02185065570476e-07, + "loss": -0.0097, + "num_tokens": 12745521.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 649, + "step_time": 151.27136832941324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 518.3125, + "completions/mean_terminated_length": 518.3125, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "entropy": 0.3357048034667969, + "epoch": 3.439153439153439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0135498046875, + "kl": 0.0010722068836912513, + "learning_rate": 3.006675866883275e-07, + "loss": 0.0, + "num_tokens": 12758590.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 650, + "step_time": 951.2108103726059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 753.5, + "completions/mean_terminated_length": 735.4666748046875, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.5069674849510193, + "epoch": 3.4444444444444446, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3984375, + "kl": 0.0015233854064717889, + "learning_rate": 2.9915228767351535e-07, + "loss": 0.0302, + "num_tokens": 12775142.0, + "reward": 0.6875, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.3095695972442627, + "step": 651, + "step_time": 164.39702508877963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 945.8125, + "completions/mean_terminated_length": 711.25, + "completions/min_length": 618.0, + "completions/min_terminated_length": 618.0, + "entropy": 0.6090002059936523, + "epoch": 3.4497354497354498, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8359375, + "kl": 0.0016115251928567886, + "learning_rate": 2.9763918509710647e-07, + "loss": 0.0001, + "num_tokens": 12806947.0, + "reward": 0.5625, + "reward_std": 0.09449111670255661, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.4699290990829468, + "step": 652, + "step_time": 204.87098419014364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 806.375, + "completions/mean_terminated_length": 707.45458984375, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.5605332255363464, + "epoch": 3.455026455026455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.001556203467771411, + "learning_rate": 2.961282955061483e-07, + "loss": -0.0919, + "num_tokens": 12831673.0, + "reward": 0.8125, + "reward_std": 0.4082317352294922, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 653, + "step_time": 89.19978978857398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 883.1875, + "completions/mean_terminated_length": 819.1818237304688, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "entropy": 0.33967873454093933, + "epoch": 3.4603174603174605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0015498069114983082, + "learning_rate": 2.9461963542348733e-07, + "loss": 0.0158, + "num_tokens": 12860092.0, + "reward": 0.6000000238418579, + "reward_std": 0.28192007541656494, + "rewards/itbench_correctness/mean": 0.6000000238418579, + "rewards/itbench_correctness/std": 0.4242640733718872, + "step": 654, + "step_time": 85.3845539437607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 841.75, + "completions/mean_terminated_length": 829.6000366210938, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "entropy": 0.4942084848880768, + "epoch": 3.4656084656084656, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4765625, + "kl": 0.0017516858642920852, + "learning_rate": 2.931132213475884e-07, + "loss": 0.0093, + "num_tokens": 12887824.0, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.25, + "step": 655, + "step_time": 330.32038860116154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 744.625, + "completions/mean_terminated_length": 704.7142944335938, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.4163169264793396, + "epoch": 3.4708994708994707, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.0016559087671339512, + "learning_rate": 2.916090697523549e-07, + "loss": 0.0156, + "num_tokens": 12904234.0, + "reward": 0.875, + "reward_std": 0.25583362579345703, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.26457512378692627, + "step": 656, + "step_time": 138.49934119079262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 745.0625, + "completions/mean_terminated_length": 618.2727661132812, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4966026246547699, + "epoch": 3.4761904761904763, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0017499123932793736, + "learning_rate": 2.901071970869472e-07, + "loss": -0.0134, + "num_tokens": 12932827.0, + "reward": 0.515625, + "reward_std": 0.32311493158340454, + "rewards/itbench_correctness/mean": 0.515625, + "rewards/itbench_correctness/std": 0.436970591545105, + "step": 657, + "step_time": 79.10722716152668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 691.5, + "completions/mean_terminated_length": 691.5, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "entropy": 0.37310194969177246, + "epoch": 3.4814814814814814, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3203125, + "kl": 0.0011259306920692325, + "learning_rate": 2.8860761977560433e-07, + "loss": -0.0094, + "num_tokens": 12948403.0, + "reward": 0.8828125, + "reward_std": 0.07790146768093109, + "rewards/itbench_correctness/mean": 0.8828125, + "rewards/itbench_correctness/std": 0.16117246448993683, + "step": 658, + "step_time": 615.1303538642824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 474.0, + "completions/mean_terminated_length": 474.0, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.5063291192054749, + "epoch": 3.4867724867724865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.0021006267052143812, + "learning_rate": 2.8711035421746363e-07, + "loss": 0.004, + "num_tokens": 12967043.0, + "reward": 0.2265625, + "reward_std": 0.24306795001029968, + "rewards/itbench_correctness/mean": 0.2265625, + "rewards/itbench_correctness/std": 0.2784583568572998, + "step": 659, + "step_time": 88.2505495576188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 750.375, + "completions/mean_terminated_length": 537.5555419921875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.5357321500778198, + "epoch": 3.492063492063492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.0018287766724824905, + "learning_rate": 2.856154167863814e-07, + "loss": -0.2463, + "num_tokens": 12990473.0, + "reward": 0.4114583134651184, + "reward_std": 0.1860596239566803, + "rewards/itbench_correctness/mean": 0.4114583134651184, + "rewards/itbench_correctness/std": 0.3502231538295746, + "step": 660, + "step_time": 113.05205366853625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 691.875, + "completions/mean_terminated_length": 615.2307739257812, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.5694670081138611, + "epoch": 3.497354497354497, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.0017705087084323168, + "learning_rate": 2.841228238307536e-07, + "loss": -0.0069, + "num_tokens": 13014367.0, + "reward": 0.4375, + "reward_std": 0.3532657027244568, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.38188132643699646, + "step": 661, + "step_time": 138.94573136605322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 796.875, + "completions/mean_terminated_length": 764.4285888671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.40658822655677795, + "epoch": 3.502645502645503, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.001931068836711347, + "learning_rate": 2.8263259167333774e-07, + "loss": 0.0253, + "num_tokens": 13039493.0, + "reward": 0.10625000298023224, + "reward_std": 0.14168164134025574, + "rewards/itbench_correctness/mean": 0.10625000298023224, + "rewards/itbench_correctness/std": 0.1722267121076584, + "step": 662, + "step_time": 135.8809123178944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 669.3125, + "completions/mean_terminated_length": 669.3125, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "entropy": 0.4213278591632843, + "epoch": 3.507936507936508, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.625, + "kl": 0.001925744814798236, + "learning_rate": 2.811447366110741e-07, + "loss": -0.0154, + "num_tokens": 13055098.0, + "reward": 0.4001736044883728, + "reward_std": 0.14328064024448395, + "rewards/itbench_correctness/mean": 0.4001736044883728, + "rewards/itbench_correctness/std": 0.45731422305107117, + "step": 663, + "step_time": 1172.1156589342281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 654.1875, + "completions/mean_terminated_length": 366.5555725097656, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4555268883705139, + "epoch": 3.5132275132275135, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.0016494187293574214, + "learning_rate": 2.7965927491490704e-07, + "loss": -0.1445, + "num_tokens": 13078749.0, + "reward": 0.890625, + "reward_std": 0.2414703369140625, + "rewards/itbench_correctness/mean": 0.890625, + "rewards/itbench_correctness/std": 0.2576940953731537, + "step": 664, + "step_time": 790.0815438805148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 565.625, + "completions/mean_terminated_length": 565.625, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.43668508529663086, + "epoch": 3.5185185185185186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041259765625, + "kl": 0.0019728399347513914, + "learning_rate": 2.7817622282960813e-07, + "loss": 0.0, + "num_tokens": 13091575.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 1.0, + "rewards/itbench_correctness/std": 0.0, + "step": 665, + "step_time": 94.42669316660613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 603.8125, + "completions/mean_terminated_length": 603.8125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "entropy": 0.4007866680622101, + "epoch": 3.5238095238095237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.001383933937177062, + "learning_rate": 2.7669559657359673e-07, + "loss": -0.0116, + "num_tokens": 13105124.0, + "reward": 0.78125, + "reward_std": 0.3471629321575165, + "rewards/itbench_correctness/mean": 0.78125, + "rewards/itbench_correctness/std": 0.4069705307483673, + "step": 666, + "step_time": 72.2391463033855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 718.625, + "completions/mean_terminated_length": 616.8333740234375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4369455575942993, + "epoch": 3.5291005291005293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.0016333801904693246, + "learning_rate": 2.7521741233876493e-07, + "loss": -0.1036, + "num_tokens": 13134246.0, + "reward": 0.42500001192092896, + "reward_std": 0.3273707628250122, + "rewards/itbench_correctness/mean": 0.42500001192092896, + "rewards/itbench_correctness/std": 0.44347113370895386, + "step": 667, + "step_time": 192.8937590336427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 604.125, + "completions/mean_terminated_length": 507.23077392578125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.3542313277721405, + "epoch": 3.5343915343915344, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.0024427901953458786, + "learning_rate": 2.737416862902981e-07, + "loss": -0.0391, + "num_tokens": 13147840.0, + "reward": 0.2569444477558136, + "reward_std": 0.12661024928092957, + "rewards/itbench_correctness/mean": 0.2569444477558136, + "rewards/itbench_correctness/std": 0.23537467420101166, + "step": 668, + "step_time": 92.6653502555564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 689.0, + "completions/mean_terminated_length": 354.0, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "entropy": 0.42670536041259766, + "epoch": 3.5396825396825395, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6640625, + "kl": 0.0015613737050443888, + "learning_rate": 2.722684345665003e-07, + "loss": 0.0, + "num_tokens": 13163432.0, + "reward": 0.5, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.3651483952999115, + "step": 669, + "step_time": 71.54559296742082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 584.25, + "completions/mean_terminated_length": 584.25, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.45186135172843933, + "epoch": 3.544973544973545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.0028362891171127558, + "learning_rate": 2.707976732786166e-07, + "loss": -0.0174, + "num_tokens": 13176020.0, + "reward": 0.3971354365348816, + "reward_std": 0.3280077576637268, + "rewards/itbench_correctness/mean": 0.3971354365348816, + "rewards/itbench_correctness/std": 0.4046509563922882, + "step": 670, + "step_time": 115.24186983983964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 513.3125, + "completions/mean_terminated_length": 513.3125, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.4909290075302124, + "epoch": 3.5502645502645502, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0017170688370242715, + "learning_rate": 2.6932941851065615e-07, + "loss": -0.0309, + "num_tokens": 13189441.0, + "reward": 0.4270833432674408, + "reward_std": 0.36084234714508057, + "rewards/itbench_correctness/mean": 0.4270833432674408, + "rewards/itbench_correctness/std": 0.40583136677742004, + "step": 671, + "step_time": 68.62113481201231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 781.125, + "completions/mean_terminated_length": 592.2222290039062, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.35333654284477234, + "epoch": 3.5555555555555554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0257568359375, + "kl": 0.0011492978082969785, + "learning_rate": 2.6786368631921834e-07, + "loss": 0.0, + "num_tokens": 13207635.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 672, + "step_time": 7667.425364185125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 563.0625, + "completions/mean_terminated_length": 532.3333740234375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5292485356330872, + "epoch": 3.560846560846561, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2109375, + "kl": 0.0018245880492031574, + "learning_rate": 2.664004927333151e-07, + "loss": 0.0172, + "num_tokens": 13229564.0, + "reward": 0.34166666865348816, + "reward_std": 0.13930098712444305, + "rewards/itbench_correctness/mean": 0.34166666865348816, + "rewards/itbench_correctness/std": 0.4009248614311218, + "step": 673, + "step_time": 94.37730458006263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 1013.6875, + "completions/mean_terminated_length": 969.0, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "entropy": 0.3610580265522003, + "epoch": 3.566137566137566, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.609375, + "kl": 0.0011879053199663758, + "learning_rate": 2.6493985375419775e-07, + "loss": 0.0, + "num_tokens": 13256847.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 674, + "step_time": 230.72378408256918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 416.375, + "completions/mean_terminated_length": 416.375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5355749130249023, + "epoch": 3.571428571428571, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.65625, + "kl": 0.0020900224335491657, + "learning_rate": 2.6348178535517965e-07, + "loss": -0.1134, + "num_tokens": 13268133.0, + "reward": 0.5729166269302368, + "reward_std": 0.20013636350631714, + "rewards/itbench_correctness/mean": 0.5729166269302368, + "rewards/itbench_correctness/std": 0.28361913561820984, + "step": 675, + "step_time": 56.21729406807572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 439.875, + "completions/mean_terminated_length": 439.875, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "entropy": 0.4046604037284851, + "epoch": 3.5767195767195767, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9609375, + "kl": 0.0022217335645109415, + "learning_rate": 2.620263034814632e-07, + "loss": -0.0015, + "num_tokens": 13278187.0, + "reward": 0.6875, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 676, + "step_time": 1194.6907618306577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 816.25, + "completions/mean_terminated_length": 608.5, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "entropy": 0.45329248905181885, + "epoch": 3.582010582010582, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.09375, + "kl": 0.0015461933799088001, + "learning_rate": 2.605734240499652e-07, + "loss": -0.0181, + "num_tokens": 13301351.0, + "reward": 0.4479166567325592, + "reward_std": 0.06200198084115982, + "rewards/itbench_correctness/mean": 0.4479166567325592, + "rewards/itbench_correctness/std": 0.4702983796596527, + "step": 677, + "step_time": 130.93884664587677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 537.1875, + "completions/mean_terminated_length": 537.1875, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "entropy": 0.4207097291946411, + "epoch": 3.5873015873015874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.001912532257847488, + "learning_rate": 2.591231629491423e-07, + "loss": 0.0, + "num_tokens": 13314330.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 678, + "step_time": 95.27887518052012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 665.8125, + "completions/mean_terminated_length": 665.8125, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "entropy": 0.36646953225135803, + "epoch": 3.5925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.002286773407831788, + "learning_rate": 2.5767553603881764e-07, + "loss": -0.0025, + "num_tokens": 13329519.0, + "reward": 0.625, + "reward_std": 0.3104073107242584, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.3979112207889557, + "step": 679, + "step_time": 807.2301516216248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 451.3125, + "completions/mean_terminated_length": 451.3125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.3833264112472534, + "epoch": 3.597883597883598, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.0012348750606179237, + "learning_rate": 2.5623055915000686e-07, + "loss": 0.01, + "num_tokens": 13339652.0, + "reward": 0.6193181872367859, + "reward_std": 0.13179811835289001, + "rewards/itbench_correctness/mean": 0.6193181872367859, + "rewards/itbench_correctness/std": 0.4134864807128906, + "step": 680, + "step_time": 135.08529091719538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 646.125, + "completions/mean_terminated_length": 558.923095703125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "entropy": 0.4178757965564728, + "epoch": 3.6031746031746033, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.40625, + "kl": 0.0014911647886037827, + "learning_rate": 2.547882480847461e-07, + "loss": 0.0, + "num_tokens": 13353918.0, + "reward": 0.38749998807907104, + "reward_std": 0.1157275065779686, + "rewards/itbench_correctness/mean": 0.38749998807907104, + "rewards/itbench_correctness/std": 0.4303099811077118, + "step": 681, + "step_time": 99.35422214772552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 769.75, + "completions/mean_terminated_length": 572.0, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 0.7534914016723633, + "epoch": 3.6084656084656084, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.0021665135864168406, + "learning_rate": 2.533486186159175e-07, + "loss": 0.0043, + "num_tokens": 13375346.0, + "reward": 0.390625, + "reward_std": 0.27564918994903564, + "rewards/itbench_correctness/mean": 0.390625, + "rewards/itbench_correctness/std": 0.4913311004638672, + "step": 682, + "step_time": 92.11163073871285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 768.75, + "completions/mean_terminated_length": 513.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3460162580013275, + "epoch": 3.613756613756614, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0234375, + "kl": 0.0016589618753641844, + "learning_rate": 2.5191168648707884e-07, + "loss": -0.0447, + "num_tokens": 13402526.0, + "reward": 0.11328125, + "reward_std": 0.08341467380523682, + "rewards/itbench_correctness/mean": 0.11328125, + "rewards/itbench_correctness/std": 0.16332921385765076, + "step": 683, + "step_time": 159.6508161853999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 546.1875, + "completions/mean_terminated_length": 546.1875, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "entropy": 0.44124042987823486, + "epoch": 3.619047619047619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0013375874841585755, + "learning_rate": 2.5047746741228977e-07, + "loss": -0.0346, + "num_tokens": 13415305.0, + "reward": 0.5416666865348816, + "reward_std": 0.21535253524780273, + "rewards/itbench_correctness/mean": 0.5416666865348816, + "rewards/itbench_correctness/std": 0.4238273799419403, + "step": 684, + "step_time": 587.8082643058151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 800.875, + "completions/mean_terminated_length": 786.0000610351562, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.34712034463882446, + "epoch": 3.624338624338624, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0234375, + "kl": 0.0008109880145639181, + "learning_rate": 2.490459770759398e-07, + "loss": -0.008, + "num_tokens": 13434959.0, + "reward": 0.296875, + "reward_std": 0.0646936446428299, + "rewards/itbench_correctness/mean": 0.296875, + "rewards/itbench_correctness/std": 0.31909704208374023, + "step": 685, + "step_time": 89.69222616031766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 470.125, + "completions/mean_terminated_length": 470.125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.45094388723373413, + "epoch": 3.6296296296296298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0274658203125, + "kl": 0.0016529399435967207, + "learning_rate": 2.476172311325783e-07, + "loss": 0.0, + "num_tokens": 13445337.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 686, + "step_time": 106.66353179235011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 804.6875, + "completions/mean_terminated_length": 673.1000366210938, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.4324660301208496, + "epoch": 3.634920634920635, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2109375, + "kl": 0.0018160316394641995, + "learning_rate": 2.4619124520674145e-07, + "loss": 0.0029, + "num_tokens": 13466644.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 687, + "step_time": 257.70799226593226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 765.375, + "completions/mean_terminated_length": 610.2000122070312, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.535685122013092, + "epoch": 3.64021164021164, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.515625, + "kl": 0.0018447366310283542, + "learning_rate": 2.447680348927837e-07, + "loss": 0.0164, + "num_tokens": 13488698.0, + "reward": 0.1875, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.25, + "step": 688, + "step_time": 100.56179421767592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 680.0625, + "completions/mean_terminated_length": 473.70001220703125, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.36467236280441284, + "epoch": 3.6455026455026456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.0013507335679605603, + "learning_rate": 2.4334761575470434e-07, + "loss": 0.0176, + "num_tokens": 13505539.0, + "reward": 0.518750011920929, + "reward_std": 0.2103695124387741, + "rewards/itbench_correctness/mean": 0.518750011920929, + "rewards/itbench_correctness/std": 0.38929542899131775, + "step": 689, + "step_time": 76.81764477398247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 653.4375, + "completions/mean_terminated_length": 628.7333374023438, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.4254423677921295, + "epoch": 3.6507936507936507, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.28125, + "kl": 0.0013152090832591057, + "learning_rate": 2.419300033259798e-07, + "loss": 0.0775, + "num_tokens": 13524146.0, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.25, + "step": 690, + "step_time": 80.11625996977091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 765.9375, + "completions/mean_terminated_length": 565.2222290039062, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.3042023777961731, + "epoch": 3.656084656084656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.002678963588550687, + "learning_rate": 2.4051521310939254e-07, + "loss": 0.0039, + "num_tokens": 13544377.0, + "reward": 0.375, + "reward_std": 0.49871626496315, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.5, + "step": 691, + "step_time": 937.4359912928194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 990.8125, + "completions/mean_terminated_length": 847.0, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "entropy": 0.4723396301269531, + "epoch": 3.6613756613756614, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.001547610154375434, + "learning_rate": 2.3910326057686124e-07, + "loss": 0.0235, + "num_tokens": 13570094.0, + "reward": 0.2622767686843872, + "reward_std": 0.17326994240283966, + "rewards/itbench_correctness/mean": 0.2622767686843872, + "rewards/itbench_correctness/std": 0.22186197340488434, + "step": 692, + "step_time": 241.44549081102014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 634.1875, + "completions/mean_terminated_length": 457.0, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.39735883474349976, + "epoch": 3.6666666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0194091796875, + "kl": 0.0012317921500653028, + "learning_rate": 2.3769416116927333e-07, + "loss": 0.0, + "num_tokens": 13591193.0, + "reward": 0.3333333432674408, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.3333333432674408, + "rewards/itbench_correctness/std": 0.3442651927471161, + "step": 693, + "step_time": 156.18573713861406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 772.6875, + "completions/mean_terminated_length": 521.375, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.41672733426094055, + "epoch": 3.671957671957672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.0018979725427925587, + "learning_rate": 2.362879302963135e-07, + "loss": 0.0086, + "num_tokens": 13612116.0, + "reward": 0.53125, + "reward_std": 0.29986464977264404, + "rewards/itbench_correctness/mean": 0.53125, + "rewards/itbench_correctness/std": 0.4366062581539154, + "step": 694, + "step_time": 268.16689282283187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 817.3125, + "completions/mean_terminated_length": 610.625, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "entropy": 0.5040911436080933, + "epoch": 3.677248677248677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.0017554876394569874, + "learning_rate": 2.3488458333629773e-07, + "loss": 0.0111, + "num_tokens": 13636593.0, + "reward": 0.768750011920929, + "reward_std": 0.3954527974128723, + "rewards/itbench_correctness/mean": 0.768750011920929, + "rewards/itbench_correctness/std": 0.39355337619781494, + "step": 695, + "step_time": 83.74904467258602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 973.5, + "completions/mean_terminated_length": 822.0, + "completions/min_length": 554.0, + "completions/min_terminated_length": 554.0, + "entropy": 0.44170519709587097, + "epoch": 3.682539682539683, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3515625, + "kl": 0.0014584577875211835, + "learning_rate": 2.3348413563600323e-07, + "loss": 0.0101, + "num_tokens": 13659681.0, + "reward": 0.6041666865348816, + "reward_std": 0.19287918508052826, + "rewards/itbench_correctness/mean": 0.6041666865348816, + "rewards/itbench_correctness/std": 0.48638883233070374, + "step": 696, + "step_time": 80.2720007058233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 911.125, + "completions/mean_terminated_length": 766.0000610351562, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.4477980434894562, + "epoch": 3.687830687830688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.0018148425733670592, + "learning_rate": 2.3208660251050156e-07, + "loss": 0.0001, + "num_tokens": 13701451.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 697, + "step_time": 590.3981730565429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 921.625, + "completions/mean_terminated_length": 478.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.5208191871643066, + "epoch": 3.693121693121693, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.003547599073499441, + "learning_rate": 2.306919992429917e-07, + "loss": -0.0535, + "num_tokens": 13724541.0, + "reward": 0.48284316062927246, + "reward_std": 0.2941243052482605, + "rewards/itbench_correctness/mean": 0.48284316062927246, + "rewards/itbench_correctness/std": 0.4697091579437256, + "step": 698, + "step_time": 156.73526183422655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 654.875, + "completions/mean_terminated_length": 654.875, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 0.3054018020629883, + "epoch": 3.6984126984126986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0012958102161064744, + "learning_rate": 2.2930034108463097e-07, + "loss": -0.0132, + "num_tokens": 13740411.0, + "reward": 0.6156250238418579, + "reward_std": 0.1601249873638153, + "rewards/itbench_correctness/mean": 0.6156250238418579, + "rewards/itbench_correctness/std": 0.16301201283931732, + "step": 699, + "step_time": 99.30126603785902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 779.1875, + "completions/mean_terminated_length": 534.375, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.6083259582519531, + "epoch": 3.7037037037037037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020751953125, + "kl": 0.0015235234750434756, + "learning_rate": 2.2791164325437046e-07, + "loss": 0.0, + "num_tokens": 13769414.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 700, + "step_time": 121.30535170529038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 895.125, + "completions/mean_terminated_length": 766.25, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.33514872193336487, + "epoch": 3.708994708994709, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.0013990112347528338, + "learning_rate": 2.2652592093878665e-07, + "loss": 0.036, + "num_tokens": 13794072.0, + "reward": 0.125, + "reward_std": 0.1746530830860138, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.17743022739887238, + "step": 701, + "step_time": 441.7110221767798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 561.3125, + "completions/mean_terminated_length": 561.3125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.5237724184989929, + "epoch": 3.7142857142857144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0223388671875, + "kl": 0.0015220106579363346, + "learning_rate": 2.2514318929191706e-07, + "loss": 0.0, + "num_tokens": 13810757.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 702, + "step_time": 111.1009431509301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 699.9375, + "completions/mean_terminated_length": 375.875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.4943298399448395, + "epoch": 3.7195767195767195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.001631591934710741, + "learning_rate": 2.237634634350934e-07, + "loss": 0.0044, + "num_tokens": 13831820.0, + "reward": 0.6041666865348816, + "reward_std": 0.49329501390457153, + "rewards/itbench_correctness/mean": 0.6041666865348816, + "rewards/itbench_correctness/std": 0.4901813864707947, + "step": 703, + "step_time": 122.44539823755622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 715.6875, + "completions/mean_terminated_length": 530.7000122070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4499170482158661, + "epoch": 3.7248677248677247, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0019055476877838373, + "learning_rate": 2.223867584567766e-07, + "loss": -0.0356, + "num_tokens": 13848519.0, + "reward": 0.6761363744735718, + "reward_std": 0.2798159420490265, + "rewards/itbench_correctness/mean": 0.6761363744735718, + "rewards/itbench_correctness/std": 0.4717574715614319, + "step": 704, + "step_time": 72.04201124608517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 997.9375, + "completions/mean_terminated_length": 815.5, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.5070458054542542, + "epoch": 3.7301587301587302, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4453125, + "kl": 0.0012196371098980308, + "learning_rate": 2.21013089412392e-07, + "loss": 0.0007, + "num_tokens": 13874222.0, + "reward": 0.27085813879966736, + "reward_std": 0.10523707419633865, + "rewards/itbench_correctness/mean": 0.27085813879966736, + "rewards/itbench_correctness/std": 0.3145284056663513, + "step": 705, + "step_time": 71.73409328702837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 560.5, + "completions/mean_terminated_length": 560.5, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "entropy": 0.4085637927055359, + "epoch": 3.7354497354497354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.001750650000758469, + "learning_rate": 2.1964247132416368e-07, + "loss": 0.0159, + "num_tokens": 13886822.0, + "reward": 0.578125, + "reward_std": 0.13797250390052795, + "rewards/itbench_correctness/mean": 0.578125, + "rewards/itbench_correctness/std": 0.3949551582336426, + "step": 706, + "step_time": 854.83407723438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 760.5, + "completions/mean_terminated_length": 497.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4549638330936432, + "epoch": 3.7407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.004151123110204935, + "learning_rate": 2.1827491918095177e-07, + "loss": -0.0855, + "num_tokens": 13909070.0, + "reward": 0.4791666567325592, + "reward_std": 0.221320241689682, + "rewards/itbench_correctness/mean": 0.4791666567325592, + "rewards/itbench_correctness/std": 0.47871360182762146, + "step": 707, + "step_time": 122.79243450798094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 957.75, + "completions/mean_terminated_length": 847.3333740234375, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.48238056898117065, + "epoch": 3.746031746031746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4375, + "kl": 0.001507254783064127, + "learning_rate": 2.1691044793808733e-07, + "loss": 0.0161, + "num_tokens": 13939002.0, + "reward": 0.515625, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.515625, + "rewards/itbench_correctness/std": 0.503891110420227, + "step": 708, + "step_time": 193.1200410258025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 613.0625, + "completions/mean_terminated_length": 613.0625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.39963299036026, + "epoch": 3.751322751322751, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.0013470432022586465, + "learning_rate": 2.1554907251720945e-07, + "loss": -0.0121, + "num_tokens": 13952611.0, + "reward": 0.6875, + "reward_std": 0.33614614605903625, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.4425306022167206, + "step": 709, + "step_time": 139.45972900651395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 533.5, + "completions/mean_terminated_length": 533.5, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.5510777831077576, + "epoch": 3.7566137566137567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0260009765625, + "kl": 0.0015925114275887609, + "learning_rate": 2.1419080780610122e-07, + "loss": 0.0, + "num_tokens": 13963859.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 710, + "step_time": 515.4728924324736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 950.6875, + "completions/mean_terminated_length": 877.375, + "completions/min_length": 796.0, + "completions/min_terminated_length": 796.0, + "entropy": 0.3702583611011505, + "epoch": 3.761904761904762, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.201171875, + "kl": 0.001159166102297604, + "learning_rate": 2.128356686585282e-07, + "loss": 0.0, + "num_tokens": 13988342.0, + "reward": 0.875, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.12909944355487823, + "step": 711, + "step_time": 94.62389472685754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 636.3125, + "completions/mean_terminated_length": 546.84619140625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5091837644577026, + "epoch": 3.7671957671957674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.0018951293313875794, + "learning_rate": 2.1148366989407497e-07, + "loss": -0.0344, + "num_tokens": 14007139.0, + "reward": 0.1041666716337204, + "reward_std": 0.1178511306643486, + "rewards/itbench_correctness/mean": 0.1041666716337204, + "rewards/itbench_correctness/std": 0.13437096774578094, + "step": 712, + "step_time": 456.887752013281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 657.875, + "completions/mean_terminated_length": 573.3846435546875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.40737220644950867, + "epoch": 3.7724867724867726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3984375, + "kl": 0.0015902683371677995, + "learning_rate": 2.101348262979833e-07, + "loss": -0.0082, + "num_tokens": 14027257.0, + "reward": 0.4742647111415863, + "reward_std": 0.0482964813709259, + "rewards/itbench_correctness/mean": 0.4742647111415863, + "rewards/itbench_correctness/std": 0.4942431151866913, + "step": 713, + "step_time": 178.64222278352827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 685.0625, + "completions/mean_terminated_length": 346.125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.6481160521507263, + "epoch": 3.7777777777777777, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.1875, + "kl": 0.0014102908316999674, + "learning_rate": 2.0878915262099096e-07, + "loss": 0.0, + "num_tokens": 14044666.0, + "reward": 0.046875, + "reward_std": 0.09300297498703003, + "rewards/itbench_correctness/mean": 0.046875, + "rewards/itbench_correctness/std": 0.1359764039516449, + "step": 714, + "step_time": 107.09331988729537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 779.3125, + "completions/mean_terminated_length": 668.0909423828125, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.49530836939811707, + "epoch": 3.7830687830687832, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.0014117928221821785, + "learning_rate": 2.0744666357916925e-07, + "loss": 0.0542, + "num_tokens": 14060735.0, + "reward": 0.4479166865348816, + "reward_std": 0.20154890418052673, + "rewards/itbench_correctness/mean": 0.4479166865348816, + "rewards/itbench_correctness/std": 0.4550386667251587, + "step": 715, + "step_time": 420.523594789207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 562.0625, + "completions/mean_terminated_length": 531.2667236328125, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "entropy": 0.50172358751297, + "epoch": 3.7883597883597884, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2421875, + "kl": 0.0018034385284408927, + "learning_rate": 2.0610737385376348e-07, + "loss": 0.0156, + "num_tokens": 14075944.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 716, + "step_time": 1006.8671864075586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 761.0625, + "completions/mean_terminated_length": 556.5555419921875, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.41783690452575684, + "epoch": 3.7936507936507935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.0019152258755639195, + "learning_rate": 2.0477129809103145e-07, + "loss": 0.0001, + "num_tokens": 14098977.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 717, + "step_time": 450.0135377245024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 786.25, + "completions/mean_terminated_length": 643.6000366210938, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 0.521462619304657, + "epoch": 3.798941798941799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0230712890625, + "kl": 0.001451818854548037, + "learning_rate": 2.0343845090208367e-07, + "loss": 0.0, + "num_tokens": 14115053.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 718, + "step_time": 349.72742245160043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 635.5625, + "completions/mean_terminated_length": 635.5625, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "entropy": 0.4814632833003998, + "epoch": 3.804232804232804, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0014020799426361918, + "learning_rate": 2.0210884686272367e-07, + "loss": 0.0339, + "num_tokens": 14129190.0, + "reward": 0.7466298937797546, + "reward_std": 0.3058916926383972, + "rewards/itbench_correctness/mean": 0.7466298937797546, + "rewards/itbench_correctness/std": 0.38068902492523193, + "step": 719, + "step_time": 175.0832874653861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 479.125, + "completions/mean_terminated_length": 479.125, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "entropy": 0.44873467087745667, + "epoch": 3.8095238095238093, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.21875, + "kl": 0.0020029842853546143, + "learning_rate": 2.0078250051328782e-07, + "loss": 0.0077, + "num_tokens": 14139808.0, + "reward": 0.648809552192688, + "reward_std": 0.17577169835567474, + "rewards/itbench_correctness/mean": 0.648809552192688, + "rewards/itbench_correctness/std": 0.28511843085289, + "step": 720, + "step_time": 102.81194345280528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 414.3125, + "completions/mean_terminated_length": 414.3125, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 0.35721829533576965, + "epoch": 3.814814814814815, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9375, + "kl": 0.0016476555028930306, + "learning_rate": 1.9945942635848745e-07, + "loss": -0.0019, + "num_tokens": 14150685.0, + "reward": 0.453125, + "reward_std": 0.13258251547813416, + "rewards/itbench_correctness/mean": 0.453125, + "rewards/itbench_correctness/std": 0.5018196105957031, + "step": 721, + "step_time": 877.2679685084149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 784.125, + "completions/mean_terminated_length": 544.25, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.45911046862602234, + "epoch": 3.82010582010582, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.328125, + "kl": 0.0014677182771265507, + "learning_rate": 1.981396388672496e-07, + "loss": 0.0, + "num_tokens": 14172399.0, + "reward": 0.2109375, + "reward_std": 0.06629125773906708, + "rewards/itbench_correctness/mean": 0.2109375, + "rewards/itbench_correctness/std": 0.2359323352575302, + "step": 722, + "step_time": 208.89351680781692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 920.625, + "completions/mean_terminated_length": 817.25, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.262864887714386, + "epoch": 3.825396825396825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0010848907986655831, + "learning_rate": 1.9682315247255892e-07, + "loss": 0.0043, + "num_tokens": 14196345.0, + "reward": 0.3854166865348816, + "reward_std": 0.297717809677124, + "rewards/itbench_correctness/mean": 0.3854166865348816, + "rewards/itbench_correctness/std": 0.3145764470100403, + "step": 723, + "step_time": 847.0749486461282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 442.375, + "completions/mean_terminated_length": 442.375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 0.47018933296203613, + "epoch": 3.8306878306878307, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9609375, + "kl": 0.0014332140563055873, + "learning_rate": 1.9550998157129944e-07, + "loss": -0.0054, + "num_tokens": 14206399.0, + "reward": 0.921875, + "reward_std": 0.13258251547813416, + "rewards/itbench_correctness/mean": 0.921875, + "rewards/itbench_correctness/std": 0.1983000785112381, + "step": 724, + "step_time": 91.33387219905853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 687.5625, + "completions/mean_terminated_length": 425.8888854980469, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "entropy": 0.3752386271953583, + "epoch": 3.835978835978836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.0017074119532480836, + "learning_rate": 1.942001405240979e-07, + "loss": 0.0057, + "num_tokens": 14222120.0, + "reward": 0.2232142835855484, + "reward_std": 0.18483898043632507, + "rewards/itbench_correctness/mean": 0.2232142835855484, + "rewards/itbench_correctness/std": 0.21329134702682495, + "step": 725, + "step_time": 93.2281863456592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 709.0, + "completions/mean_terminated_length": 604.0, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 0.5275035500526428, + "epoch": 3.8412698412698414, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.28125, + "kl": 0.002380735008046031, + "learning_rate": 1.9289364365516607e-07, + "loss": 0.0403, + "num_tokens": 14258400.0, + "reward": 0.453125, + "reward_std": 0.13258251547813416, + "rewards/itbench_correctness/mean": 0.453125, + "rewards/itbench_correctness/std": 0.5018196105957031, + "step": 726, + "step_time": 588.1754347216338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 655.8125, + "completions/mean_terminated_length": 631.2667236328125, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.5062422752380371, + "epoch": 3.8465608465608465, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.001622261363081634, + "learning_rate": 1.915905052521445e-07, + "loss": 0.0287, + "num_tokens": 14277157.0, + "reward": 0.5, + "reward_std": 0.2177756428718567, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.3872983455657959, + "step": 727, + "step_time": 455.7577704479918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 502.9375, + "completions/mean_terminated_length": 502.9375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.3757922351360321, + "epoch": 3.851851851851852, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1875, + "kl": 0.0013325168984010816, + "learning_rate": 1.9029073956594604e-07, + "loss": -0.0056, + "num_tokens": 14288780.0, + "reward": 0.90625, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.90625, + "rewards/itbench_correctness/std": 0.20155644416809082, + "step": 728, + "step_time": 82.46759236324579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 541.125, + "completions/mean_terminated_length": 541.125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.3862323760986328, + "epoch": 3.857142857142857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033935546875, + "kl": 0.0012876316905021667, + "learning_rate": 1.8899436081059972e-07, + "loss": 0.0, + "num_tokens": 14300838.0, + "reward": 0.75, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 729, + "step_time": 74.73872442170978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 920.1875, + "completions/mean_terminated_length": 786.7142944335938, + "completions/min_length": 736.0, + "completions/min_terminated_length": 736.0, + "entropy": 0.22712762653827667, + "epoch": 3.8624338624338623, + "frac_reward_zero_std": 0.5, + "grad_norm": 10.0, + "kl": 0.0014261262258514762, + "learning_rate": 1.877013831630961e-07, + "loss": -0.022, + "num_tokens": 14323721.0, + "reward": 0.1458333432674408, + "reward_std": 0.0589255690574646, + "rewards/itbench_correctness/mean": 0.1458333432674408, + "rewards/itbench_correctness/std": 0.17078252136707306, + "step": 730, + "step_time": 134.0663373246789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 542.5, + "completions/mean_terminated_length": 542.5, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "entropy": 0.26359447836875916, + "epoch": 3.867724867724868, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.265625, + "kl": 0.0024193329736590385, + "learning_rate": 1.8641182076323148e-07, + "loss": -0.0109, + "num_tokens": 14337273.0, + "reward": 0.4166666865348816, + "reward_std": 0.08908706903457642, + "rewards/itbench_correctness/mean": 0.4166666865348816, + "rewards/itbench_correctness/std": 0.14907118678092957, + "step": 731, + "step_time": 69.43494361732155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 759.0625, + "completions/mean_terminated_length": 494.125, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.3925895392894745, + "epoch": 3.873015873015873, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5078125, + "kl": 0.0015624676598235965, + "learning_rate": 1.8512568771345378e-07, + "loss": 0.0, + "num_tokens": 14358434.0, + "reward": 0.9166666865348816, + "reward_std": 0.17817416787147522, + "rewards/itbench_correctness/mean": 0.9166666865348816, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 732, + "step_time": 105.75646356213838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 645.6875, + "completions/mean_terminated_length": 620.4666748046875, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 0.6411770582199097, + "epoch": 3.878306878306878, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.0023670424707233906, + "learning_rate": 1.8384299807870805e-07, + "loss": -0.0062, + "num_tokens": 14375989.0, + "reward": 0.4270833134651184, + "reward_std": 0.019287927076220512, + "rewards/itbench_correctness/mean": 0.4270833134651184, + "rewards/itbench_correctness/std": 0.44187626242637634, + "step": 733, + "step_time": 85.2570496154949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 710.0625, + "completions/mean_terminated_length": 396.125, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "entropy": 0.4816477298736572, + "epoch": 3.8835978835978837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0220947265625, + "kl": 0.0012989024398848414, + "learning_rate": 1.8256376588628235e-07, + "loss": 0.0, + "num_tokens": 14392414.0, + "reward": 0.6785714626312256, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.6785714626312256, + "rewards/itbench_correctness/std": 0.18442778289318085, + "step": 734, + "step_time": 1033.4855123637244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 602.5625, + "completions/mean_terminated_length": 411.0, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.44476714730262756, + "epoch": 3.888888888888889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.0017304952489212155, + "learning_rate": 1.812880051256551e-07, + "loss": -0.0347, + "num_tokens": 14405887.0, + "reward": 0.3031249940395355, + "reward_std": 0.3322408199310303, + "rewards/itbench_correctness/mean": 0.3031249940395355, + "rewards/itbench_correctness/std": 0.3288711905479431, + "step": 735, + "step_time": 81.48759328760207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 607.625, + "completions/mean_terminated_length": 548.1428833007812, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.3587739169597626, + "epoch": 3.894179894179894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.0017037625657394528, + "learning_rate": 1.8001572974834168e-07, + "loss": 0.0, + "num_tokens": 14424505.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 736, + "step_time": 320.6608420452103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 610.8125, + "completions/mean_terminated_length": 583.2667236328125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4322111904621124, + "epoch": 3.8994708994708995, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.498046875, + "kl": 0.0016470147529616952, + "learning_rate": 1.787469536677419e-07, + "loss": -0.1173, + "num_tokens": 14461894.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 737, + "step_time": 313.65273729898036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 715.9375, + "completions/mean_terminated_length": 476.3333435058594, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 0.7011785507202148, + "epoch": 3.9047619047619047, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.001664389856159687, + "learning_rate": 1.7748169075898727e-07, + "loss": 0.0427, + "num_tokens": 14482421.0, + "reward": 0.2447916716337204, + "reward_std": 0.235783189535141, + "rewards/itbench_correctness/mean": 0.2447916716337204, + "rewards/itbench_correctness/std": 0.3475692868232727, + "step": 738, + "step_time": 84.38055996689945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 416.9375, + "completions/mean_terminated_length": 416.9375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "entropy": 0.4149302840232849, + "epoch": 3.91005291005291, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.09375, + "kl": 0.0016000322066247463, + "learning_rate": 1.762199548587906e-07, + "loss": -0.0046, + "num_tokens": 14491636.0, + "reward": 0.546875, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.546875, + "rewards/itbench_correctness/std": 0.07739239931106567, + "step": 739, + "step_time": 123.19425270985812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 759.5, + "completions/mean_terminated_length": 639.2727661132812, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 0.35286372900009155, + "epoch": 3.9153439153439153, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.84375, + "kl": 0.0021646912209689617, + "learning_rate": 1.7496175976529337e-07, + "loss": -0.0035, + "num_tokens": 14509100.0, + "reward": 0.75, + "reward_std": 0.1259881556034088, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.31031644344329834, + "step": 740, + "step_time": 139.75384074263275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 614.625, + "completions/mean_terminated_length": 614.625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5694529414176941, + "epoch": 3.9206349206349205, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0546875, + "kl": 0.0038674459792673588, + "learning_rate": 1.7370711923791564e-07, + "loss": 0.0097, + "num_tokens": 14532734.0, + "reward": 0.265625, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.265625, + "rewards/itbench_correctness/std": 0.28090256452560425, + "step": 741, + "step_time": 372.90891189686954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 923.75, + "completions/mean_terminated_length": 623.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5044655203819275, + "epoch": 3.925925925925926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.0016132418531924486, + "learning_rate": 1.7245604699720535e-07, + "loss": -0.069, + "num_tokens": 14559434.0, + "reward": 0.18854166567325592, + "reward_std": 0.15936720371246338, + "rewards/itbench_correctness/mean": 0.18854166567325592, + "rewards/itbench_correctness/std": 0.15572041273117065, + "step": 742, + "step_time": 162.81722828093916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 870.875, + "completions/mean_terminated_length": 819.8333740234375, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.4018946588039398, + "epoch": 3.931216931216931, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.0014265145873650908, + "learning_rate": 1.7120855672468776e-07, + "loss": -0.0312, + "num_tokens": 14580296.0, + "reward": 0.5625, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 743, + "step_time": 541.2288927352056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 771.0625, + "completions/mean_terminated_length": 574.3333129882812, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.5732349753379822, + "epoch": 3.9365079365079367, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4296875, + "kl": 0.0019290586933493614, + "learning_rate": 1.6996466206271675e-07, + "loss": -0.0139, + "num_tokens": 14620177.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 744, + "step_time": 638.7111993785948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 472.5, + "completions/mean_terminated_length": 472.5, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "entropy": 0.40634921193122864, + "epoch": 3.941798941798942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1875, + "kl": 0.0012795755174010992, + "learning_rate": 1.6872437661432516e-07, + "loss": 0.0089, + "num_tokens": 14630313.0, + "reward": 0.734375, + "reward_std": 0.15738674998283386, + "rewards/itbench_correctness/mean": 0.734375, + "rewards/itbench_correctness/std": 0.34856685996055603, + "step": 745, + "step_time": 94.9421982690692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 483.375, + "completions/mean_terminated_length": 483.375, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "entropy": 0.39513835310935974, + "epoch": 3.947089947089947, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.125, + "kl": 0.004012894816696644, + "learning_rate": 1.674877139430758e-07, + "loss": -0.015, + "num_tokens": 14646847.0, + "reward": 0.4375, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 746, + "step_time": 83.17668206058443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 630.6875, + "completions/mean_terminated_length": 630.6875, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "entropy": 0.4281042516231537, + "epoch": 3.9523809523809526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.001534756040200591, + "learning_rate": 1.6625468757291377e-07, + "loss": -0.012, + "num_tokens": 14660506.0, + "reward": 0.640625, + "reward_std": 0.27564918994903564, + "rewards/itbench_correctness/mean": 0.640625, + "rewards/itbench_correctness/std": 0.341183602809906, + "step": 747, + "step_time": 418.7796147307381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 585.625, + "completions/mean_terminated_length": 439.5, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 0.35346850752830505, + "epoch": 3.9576719576719577, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.0018646334065124393, + "learning_rate": 1.6502531098801753e-07, + "loss": -0.0214, + "num_tokens": 14676340.0, + "reward": 0.7291666269302368, + "reward_std": 0.3471825420856476, + "rewards/itbench_correctness/mean": 0.7291666269302368, + "rewards/itbench_correctness/std": 0.3542075455188751, + "step": 748, + "step_time": 124.75608675274998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 735.3125, + "completions/mean_terminated_length": 694.0714721679688, + "completions/min_length": 547.0, + "completions/min_terminated_length": 547.0, + "entropy": 0.5439863801002502, + "epoch": 3.962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.0013193426420912147, + "learning_rate": 1.6379959763265266e-07, + "loss": 0.028, + "num_tokens": 14694473.0, + "reward": 0.375, + "reward_std": 0.249358132481575, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.273861289024353, + "step": 749, + "step_time": 466.49796204734594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 735.25, + "completions/mean_terminated_length": 562.0, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "entropy": 0.4950697124004364, + "epoch": 3.9682539682539684, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0012579681351780891, + "learning_rate": 1.62577560911024e-07, + "loss": -0.0039, + "num_tokens": 14711301.0, + "reward": 0.65625, + "reward_std": 0.30173346400260925, + "rewards/itbench_correctness/mean": 0.65625, + "rewards/itbench_correctness/std": 0.375, + "step": 750, + "step_time": 988.6653963262215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 411.375, + "completions/mean_terminated_length": 411.375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.47645092010498047, + "epoch": 3.9735449735449735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.002736019669100642, + "learning_rate": 1.6135921418712955e-07, + "loss": 0.0, + "num_tokens": 14720819.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 751, + "step_time": 143.75118728913367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 756.6875, + "completions/mean_terminated_length": 548.7777709960938, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 0.5418353080749512, + "epoch": 3.9788359788359786, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.375, + "kl": 0.0015679231146350503, + "learning_rate": 1.601445707846135e-07, + "loss": 0.0288, + "num_tokens": 14743110.0, + "reward": 0.3697916865348816, + "reward_std": 0.014731383882462978, + "rewards/itbench_correctness/mean": 0.3697916865348816, + "rewards/itbench_correctness/std": 0.3824491500854492, + "step": 752, + "step_time": 114.25977344904095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 878.625, + "completions/mean_terminated_length": 812.5454711914062, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "entropy": 0.29022619128227234, + "epoch": 3.984126984126984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041748046875, + "kl": 0.001140591804869473, + "learning_rate": 1.5893364398662174e-07, + "loss": 0.0, + "num_tokens": 14769432.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 753, + "step_time": 683.4805310554802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 809.3125, + "completions/mean_terminated_length": 680.5, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.2903698980808258, + "epoch": 3.9894179894179893, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.0009075679117813706, + "learning_rate": 1.5772644703565564e-07, + "loss": -0.0137, + "num_tokens": 14788877.0, + "reward": 0.7447916865348816, + "reward_std": 0.1927933692932129, + "rewards/itbench_correctness/mean": 0.7447916865348816, + "rewards/itbench_correctness/std": 0.2643453776836395, + "step": 754, + "step_time": 105.73342135362327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 823.0, + "completions/mean_terminated_length": 666.6666870117188, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "entropy": 0.4835965931415558, + "epoch": 3.9947089947089944, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.453125, + "kl": 0.001044503878802061, + "learning_rate": 1.565229931334277e-07, + "loss": 0.0002, + "num_tokens": 14808949.0, + "reward": 0.596875011920929, + "reward_std": 0.041052017360925674, + "rewards/itbench_correctness/mean": 0.596875011920929, + "rewards/itbench_correctness/std": 0.4201066493988037, + "step": 755, + "step_time": 158.0458857798949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 702.8125, + "completions/mean_terminated_length": 656.9285888671875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.26465097069740295, + "epoch": 4.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.001511210692115128, + "learning_rate": 1.553232954407171e-07, + "loss": 0.0, + "num_tokens": 14833258.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 756, + "step_time": 233.77977779414505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 684.9375, + "completions/mean_terminated_length": 684.9375, + "completions/min_length": 562.0, + "completions/min_terminated_length": 562.0, + "entropy": 0.49639564752578735, + "epoch": 4.005291005291006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.0017367677064612508, + "learning_rate": 1.5412736707722534e-07, + "loss": -0.0237, + "num_tokens": 14860073.0, + "reward": 0.38749998807907104, + "reward_std": 0.42026326060295105, + "rewards/itbench_correctness/mean": 0.38749998807907104, + "rewards/itbench_correctness/std": 0.49244290590286255, + "step": 757, + "step_time": 126.27328568976372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 696.6875, + "completions/mean_terminated_length": 442.1111145019531, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5597918629646301, + "epoch": 4.01058201058201, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.0017616557888686657, + "learning_rate": 1.529352211214337e-07, + "loss": -0.0553, + "num_tokens": 14877180.0, + "reward": 0.5062500238418579, + "reward_std": 0.23287571966648102, + "rewards/itbench_correctness/mean": 0.5062500238418579, + "rewards/itbench_correctness/std": 0.33042481541633606, + "step": 758, + "step_time": 145.3167848372832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 701.1875, + "completions/mean_terminated_length": 378.375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "entropy": 0.3793564438819885, + "epoch": 4.015873015873016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.0020792728755623102, + "learning_rate": 1.517468706104589e-07, + "loss": -0.0105, + "num_tokens": 14894423.0, + "reward": 0.75, + "reward_std": 0.1315174549818039, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.2357022613286972, + "step": 759, + "step_time": 143.815074888058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 579.625, + "completions/mean_terminated_length": 579.625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4658184051513672, + "epoch": 4.021164021164021, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.0024365338031202555, + "learning_rate": 1.5056232853991208e-07, + "loss": -0.0313, + "num_tokens": 14908353.0, + "reward": 0.7057291865348816, + "reward_std": 0.188043013215065, + "rewards/itbench_correctness/mean": 0.7057291865348816, + "rewards/itbench_correctness/std": 0.325060099363327, + "step": 760, + "step_time": 169.37305662687868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 617.875, + "completions/mean_terminated_length": 617.875, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "entropy": 0.2735181152820587, + "epoch": 4.026455026455026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.0019248781027272344, + "learning_rate": 1.493816078637557e-07, + "loss": 0.0042, + "num_tokens": 14922327.0, + "reward": 0.5, + "reward_std": 0.4629100561141968, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 761, + "step_time": 152.33051012922078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 825.9375, + "completions/mean_terminated_length": 707.1000366210938, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.5787363052368164, + "epoch": 4.031746031746032, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.828125, + "kl": 0.0014200946316123009, + "learning_rate": 1.4820472149416153e-07, + "loss": 0.022, + "num_tokens": 14957710.0, + "reward": 0.09375, + "reward_std": 0.03788072243332863, + "rewards/itbench_correctness/mean": 0.09375, + "rewards/itbench_correctness/std": 0.10978876054286957, + "step": 762, + "step_time": 102.25276782084256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 949.5625, + "completions/mean_terminated_length": 875.125, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.3391035199165344, + "epoch": 4.037037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.0015424852026626468, + "learning_rate": 1.470316823013707e-07, + "loss": 0.0162, + "num_tokens": 14984159.0, + "reward": 0.4062500298023224, + "reward_std": 0.2351749688386917, + "rewards/itbench_correctness/mean": 0.4062500298023224, + "rewards/itbench_correctness/std": 0.2916666865348816, + "step": 763, + "step_time": 115.0978871025145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 812.3125, + "completions/mean_terminated_length": 782.0714721679688, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "entropy": 0.5022697448730469, + "epoch": 4.042328042328043, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.21875, + "kl": 0.0012976002180948853, + "learning_rate": 1.4586250311355132e-07, + "loss": -0.0125, + "num_tokens": 15003988.0, + "reward": 0.15625, + "reward_std": 0.18600594997406006, + "rewards/itbench_correctness/mean": 0.15625, + "rewards/itbench_correctness/std": 0.3010398745536804, + "step": 764, + "step_time": 219.78546244930476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 883.4375, + "completions/mean_terminated_length": 742.875, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 0.4867350459098816, + "epoch": 4.0476190476190474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.00221477122977376, + "learning_rate": 1.4469719671666043e-07, + "loss": -0.0099, + "num_tokens": 15032371.0, + "reward": 0.7265625, + "reward_std": 0.3056884706020355, + "rewards/itbench_correctness/mean": 0.7265625, + "rewards/itbench_correctness/std": 0.3329750895500183, + "step": 765, + "step_time": 196.3956578373909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 634.0, + "completions/mean_terminated_length": 634.0, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "entropy": 0.44164037704467773, + "epoch": 4.052910052910053, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.078125, + "kl": 0.0014499113894999027, + "learning_rate": 1.435357758543015e-07, + "loss": -0.0066, + "num_tokens": 15046147.0, + "reward": 0.8051470518112183, + "reward_std": 0.252979040145874, + "rewards/itbench_correctness/mean": 0.8051470518112183, + "rewards/itbench_correctness/std": 0.399953156709671, + "step": 766, + "step_time": 170.79877135157585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 731.25, + "completions/mean_terminated_length": 438.5, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 0.2762393057346344, + "epoch": 4.058201058201059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033447265625, + "kl": 0.0011394057655707002, + "learning_rate": 1.4237825322758735e-07, + "loss": 0.0, + "num_tokens": 15063599.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 767, + "step_time": 863.5087349172682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 764.25, + "completions/mean_terminated_length": 608.4000244140625, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.3663722574710846, + "epoch": 4.063492063492063, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1953125, + "kl": 0.0017420370131731033, + "learning_rate": 1.412246414949997e-07, + "loss": -0.0017, + "num_tokens": 15080899.0, + "reward": 0.234375, + "reward_std": 0.1953546553850174, + "rewards/itbench_correctness/mean": 0.234375, + "rewards/itbench_correctness/std": 0.36032232642173767, + "step": 768, + "step_time": 277.51467712502927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 764.5625, + "completions/mean_terminated_length": 704.6923217773438, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 0.34791138768196106, + "epoch": 4.068783068783069, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0010995686752721667, + "learning_rate": 1.400749532722516e-07, + "loss": 0.0227, + "num_tokens": 15098204.0, + "reward": 0.46875, + "reward_std": 0.353828489780426, + "rewards/itbench_correctness/mean": 0.46875, + "rewards/itbench_correctness/std": 0.4181916415691376, + "step": 769, + "step_time": 572.5676989480853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 687.0625, + "completions/mean_terminated_length": 350.125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.5006822347640991, + "epoch": 4.074074074074074, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5703125, + "kl": 0.0013974824687466025, + "learning_rate": 1.389292011321498e-07, + "loss": 0.0, + "num_tokens": 15122029.0, + "reward": 0.6875, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.3095695972442627, + "step": 770, + "step_time": 206.3093525590375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 523.125, + "completions/mean_terminated_length": 523.125, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.4262843430042267, + "epoch": 4.079365079365079, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0012783248675987124, + "learning_rate": 1.3778739760445552e-07, + "loss": -0.0522, + "num_tokens": 15132895.0, + "reward": 0.31534090638160706, + "reward_std": 0.3429919481277466, + "rewards/itbench_correctness/mean": 0.31534090638160706, + "rewards/itbench_correctness/std": 0.34952497482299805, + "step": 771, + "step_time": 136.4253909336403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 935.0, + "completions/mean_terminated_length": 905.3333740234375, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "entropy": 0.498395711183548, + "epoch": 4.084656084656085, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.001486019929870963, + "learning_rate": 1.3664955517574967e-07, + "loss": 0.0119, + "num_tokens": 15163503.0, + "reward": 0.4791666865348816, + "reward_std": 0.4529353678226471, + "rewards/itbench_correctness/mean": 0.4791666865348816, + "rewards/itbench_correctness/std": 0.4549115002155304, + "step": 772, + "step_time": 149.56670145317912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 700.25, + "completions/mean_terminated_length": 506.0, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "entropy": 0.4455551505088806, + "epoch": 4.08994708994709, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.40625, + "kl": 0.001568189705722034, + "learning_rate": 1.3551568628929432e-07, + "loss": 0.0036, + "num_tokens": 15185307.0, + "reward": 0.125, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.273861289024353, + "step": 773, + "step_time": 821.3875108454376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 763.9375, + "completions/mean_terminated_length": 763.9375, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "entropy": 0.44768059253692627, + "epoch": 4.095238095238095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0322265625, + "kl": 0.0012937538558617234, + "learning_rate": 1.3438580334489818e-07, + "loss": 0.0, + "num_tokens": 15207562.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 774, + "step_time": 537.4289036728442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 642.8125, + "completions/mean_terminated_length": 617.4000244140625, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "entropy": 0.4698103964328766, + "epoch": 4.1005291005291005, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0859375, + "kl": 0.0019668727181851864, + "learning_rate": 1.3325991869878012e-07, + "loss": -0.0025, + "num_tokens": 15221975.0, + "reward": 0.15625, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.15625, + "rewards/itbench_correctness/std": 0.23935678601264954, + "step": 775, + "step_time": 191.42062663193792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 593.375, + "completions/mean_terminated_length": 593.375, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.434800922870636, + "epoch": 4.105820105820106, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3359375, + "kl": 0.0018295013578608632, + "learning_rate": 1.321380446634342e-07, + "loss": 0.0433, + "num_tokens": 15235517.0, + "reward": 0.75, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.30276504158973694, + "step": 776, + "step_time": 495.94140707794577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 683.5625, + "completions/mean_terminated_length": 418.77777099609375, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.5295785069465637, + "epoch": 4.111111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.002165441866964102, + "learning_rate": 1.3102019350749527e-07, + "loss": 0.0164, + "num_tokens": 15252526.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 777, + "step_time": 161.9596445625648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 830.625, + "completions/mean_terminated_length": 637.25, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "entropy": 0.4839729070663452, + "epoch": 4.116402116402116, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4921875, + "kl": 0.0012055831030011177, + "learning_rate": 1.299063774556042e-07, + "loss": 0.0, + "num_tokens": 15273064.0, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.25, + "step": 778, + "step_time": 104.92770658805966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 831.1875, + "completions/mean_terminated_length": 681.2222290039062, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "entropy": 0.42589667439460754, + "epoch": 4.121693121693122, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.203125, + "kl": 0.001626756857149303, + "learning_rate": 1.287966086882751e-07, + "loss": 0.0145, + "num_tokens": 15292003.0, + "reward": 0.8660714626312256, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.8660714626312256, + "rewards/itbench_correctness/std": 0.24169892072677612, + "step": 779, + "step_time": 1019.1612946912646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 657.125, + "completions/mean_terminated_length": 657.125, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "entropy": 0.366749107837677, + "epoch": 4.1269841269841265, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0078125, + "kl": 0.0010745770996436477, + "learning_rate": 1.2769089934176126e-07, + "loss": 0.0175, + "num_tokens": 15308181.0, + "reward": 0.9187500476837158, + "reward_std": 0.0258774496614933, + "rewards/itbench_correctness/mean": 0.9187500476837158, + "rewards/itbench_correctness/std": 0.09105858951807022, + "step": 780, + "step_time": 172.30875083897263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 626.6875, + "completions/mean_terminated_length": 600.2000122070312, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "entropy": 0.3909444510936737, + "epoch": 4.132275132275132, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0703125, + "kl": 0.0016821923200041056, + "learning_rate": 1.2658926150792322e-07, + "loss": 0.0125, + "num_tokens": 15333040.0, + "reward": 0.640625, + "reward_std": 0.0867956355214119, + "rewards/itbench_correctness/mean": 0.640625, + "rewards/itbench_correctness/std": 0.3896446228027344, + "step": 781, + "step_time": 372.3970377044752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 615.75, + "completions/mean_terminated_length": 588.5333862304688, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 0.43524158000946045, + "epoch": 4.137566137566138, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5390625, + "kl": 0.0014195973053574562, + "learning_rate": 1.2549170723409547e-07, + "loss": 0.0509, + "num_tokens": 15354348.0, + "reward": 0.04375000298023224, + "reward_std": 0.0176776684820652, + "rewards/itbench_correctness/mean": 0.04375000298023224, + "rewards/itbench_correctness/std": 0.05123475566506386, + "step": 782, + "step_time": 1192.5468442188576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 385.75, + "completions/mean_terminated_length": 385.75, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.3966299295425415, + "epoch": 4.142857142857143, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.0015992495464161038, + "learning_rate": 1.243982485229559e-07, + "loss": -0.0488, + "num_tokens": 15367896.0, + "reward": 0.6400861740112305, + "reward_std": 0.4733222424983978, + "rewards/itbench_correctness/mean": 0.6400861740112305, + "rewards/itbench_correctness/std": 0.4832458794116974, + "step": 783, + "step_time": 120.91243299655616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 658.6875, + "completions/mean_terminated_length": 658.6875, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 0.28237974643707275, + "epoch": 4.148148148148148, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.375, + "kl": 0.0011543561704456806, + "learning_rate": 1.2330889733239368e-07, + "loss": -0.006, + "num_tokens": 15385411.0, + "reward": 0.359375, + "reward_std": 0.04419417306780815, + "rewards/itbench_correctness/mean": 0.359375, + "rewards/itbench_correctness/std": 0.3760402202606201, + "step": 784, + "step_time": 991.2049660263583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 839.375, + "completions/mean_terminated_length": 654.75, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "entropy": 0.3037974536418915, + "epoch": 4.1534391534391535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032958984375, + "kl": 0.0011951366905122995, + "learning_rate": 1.222236655753791e-07, + "loss": 0.0, + "num_tokens": 15404521.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 785, + "step_time": 491.68448298610747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 748.6875, + "completions/mean_terminated_length": 685.1538696289062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.28850486874580383, + "epoch": 4.158730158730159, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.0016689749900251627, + "learning_rate": 1.2114256511983274e-07, + "loss": -0.1494, + "num_tokens": 15422436.0, + "reward": 0.5833333730697632, + "reward_std": 0.3327338695526123, + "rewards/itbench_correctness/mean": 0.5833333730697632, + "rewards/itbench_correctness/std": 0.3648312985897064, + "step": 786, + "step_time": 87.55616814736277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 728.3125, + "completions/mean_terminated_length": 498.3333435058594, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.549214780330658, + "epoch": 4.164021164021164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.0020776954479515553, + "learning_rate": 1.2006560778849579e-07, + "loss": -0.0303, + "num_tokens": 15441321.0, + "reward": 0.4000000059604645, + "reward_std": 0.302165687084198, + "rewards/itbench_correctness/mean": 0.4000000059604645, + "rewards/itbench_correctness/std": 0.4898979663848877, + "step": 787, + "step_time": 132.10281661339104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 955.3125, + "completions/mean_terminated_length": 474.5, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 0.3663722574710846, + "epoch": 4.169312169312169, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.0013061100617051125, + "learning_rate": 1.1899280535880119e-07, + "loss": 0.0, + "num_tokens": 15465854.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 788, + "step_time": 161.56111018918455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 747.875, + "completions/mean_terminated_length": 655.8333740234375, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "entropy": 0.26207587122917175, + "epoch": 4.174603174603175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.0017630105139687657, + "learning_rate": 1.1792416956274443e-07, + "loss": 0.0001, + "num_tokens": 15485668.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 789, + "step_time": 249.84946880768985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 826.875, + "completions/mean_terminated_length": 673.5555419921875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.5635676383972168, + "epoch": 4.1798941798941796, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.0018597254529595375, + "learning_rate": 1.1685971208675538e-07, + "loss": -0.0061, + "num_tokens": 15514546.0, + "reward": 0.40625, + "reward_std": 0.3471629321575165, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.4366062581539154, + "step": 790, + "step_time": 128.95786687266082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 678.5, + "completions/mean_terminated_length": 678.5, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 0.5630066394805908, + "epoch": 4.185185185185185, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2265625, + "kl": 0.0021489164792001247, + "learning_rate": 1.1579944457157059e-07, + "loss": -0.0285, + "num_tokens": 15538698.0, + "reward": 0.34375, + "reward_std": 0.22903135418891907, + "rewards/itbench_correctness/mean": 0.34375, + "rewards/itbench_correctness/std": 0.4732423722743988, + "step": 791, + "step_time": 140.31763851176947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 693.5625, + "completions/mean_terminated_length": 646.357177734375, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "entropy": 0.48157158493995667, + "epoch": 4.190476190476191, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.002031001029536128, + "learning_rate": 1.1474337861210543e-07, + "loss": 0.0427, + "num_tokens": 15560955.0, + "reward": 0.375, + "reward_std": 0.4355512857437134, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.5, + "step": 792, + "step_time": 343.5579600026831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 522.625, + "completions/mean_terminated_length": 522.625, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "entropy": 0.5089691281318665, + "epoch": 4.195767195767195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.00155110121704638, + "learning_rate": 1.1369152575732821e-07, + "loss": -0.0072, + "num_tokens": 15572493.0, + "reward": 0.7232142686843872, + "reward_std": 0.3042290210723877, + "rewards/itbench_correctness/mean": 0.7232142686843872, + "rewards/itbench_correctness/std": 0.4347764849662781, + "step": 793, + "step_time": 130.16915812157094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 901.4375, + "completions/mean_terminated_length": 845.727294921875, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "entropy": 0.4392983317375183, + "epoch": 4.201058201058201, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.0012185334926471114, + "learning_rate": 1.1264389751013325e-07, + "loss": 0.0073, + "num_tokens": 15592028.0, + "reward": 0.7326388955116272, + "reward_std": 0.3273431062698364, + "rewards/itbench_correctness/mean": 0.7326388955116272, + "rewards/itbench_correctness/std": 0.33346834778785706, + "step": 794, + "step_time": 521.8481605676934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 563.3125, + "completions/mean_terminated_length": 563.3125, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "entropy": 0.39232221245765686, + "epoch": 4.2063492063492065, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.0014798998599871993, + "learning_rate": 1.1160050532721527e-07, + "loss": 0.0313, + "num_tokens": 15605457.0, + "reward": 0.8125, + "reward_std": 0.3657589256763458, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.35939764976501465, + "step": 795, + "step_time": 94.33376376517117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 926.625, + "completions/mean_terminated_length": 882.3636474609375, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "entropy": 0.40145689249038696, + "epoch": 4.211640211640212, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3671875, + "kl": 0.0010374147677794099, + "learning_rate": 1.1056136061894384e-07, + "loss": -0.0077, + "num_tokens": 15626107.0, + "reward": 0.9632353186607361, + "reward_std": 0.051545556634664536, + "rewards/itbench_correctness/mean": 0.9632353186607361, + "rewards/itbench_correctness/std": 0.08000864833593369, + "step": 796, + "step_time": 205.19225138891488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 572.9375, + "completions/mean_terminated_length": 572.9375, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.45729246735572815, + "epoch": 4.216931216931217, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.375, + "kl": 0.0012878067791461945, + "learning_rate": 1.095264747492391e-07, + "loss": 0.0172, + "num_tokens": 15639266.0, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.25, + "step": 797, + "step_time": 114.62835809681565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 729.6875, + "completions/mean_terminated_length": 710.0667114257812, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "entropy": 0.3563169240951538, + "epoch": 4.222222222222222, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.171875, + "kl": 0.0015947025967761874, + "learning_rate": 1.0849585903544706e-07, + "loss": -0.0083, + "num_tokens": 15655869.0, + "reward": 0.4285714328289032, + "reward_std": 0.19342948496341705, + "rewards/itbench_correctness/mean": 0.4285714328289032, + "rewards/itbench_correctness/std": 0.27437829971313477, + "step": 798, + "step_time": 143.3468729155138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 741.5, + "completions/mean_terminated_length": 572.0, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.5232636332511902, + "epoch": 4.227513227513228, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.28125, + "kl": 0.002720025833696127, + "learning_rate": 1.0746952474821613e-07, + "loss": -0.0039, + "num_tokens": 15678173.0, + "reward": 0.3645833432674408, + "reward_std": 0.01928791031241417, + "rewards/itbench_correctness/mean": 0.3645833432674408, + "rewards/itbench_correctness/std": 0.3774610757827759, + "step": 799, + "step_time": 129.29889920540154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 640.9375, + "completions/mean_terminated_length": 411.1000061035156, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.37913212180137634, + "epoch": 4.232804232804233, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.015625, + "kl": 0.0011215686099603772, + "learning_rate": 1.0644748311137375e-07, + "loss": 0.0096, + "num_tokens": 15702932.0, + "reward": 0.21875, + "reward_std": 0.0883883461356163, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.2561737895011902, + "step": 800, + "step_time": 164.42878744658083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 705.5, + "completions/mean_terminated_length": 684.2667236328125, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "entropy": 0.3798724412918091, + "epoch": 4.238095238095238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.0012407073518261313, + "learning_rate": 1.0542974530180327e-07, + "loss": -0.0115, + "num_tokens": 15719212.0, + "reward": 0.905024528503418, + "reward_std": 0.16057346761226654, + "rewards/itbench_correctness/mean": 0.905024528503418, + "rewards/itbench_correctness/std": 0.17238253355026245, + "step": 801, + "step_time": 167.65833072923124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 873.875, + "completions/mean_terminated_length": 783.7999877929688, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.5355457067489624, + "epoch": 4.243386243386244, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4296875, + "kl": 0.001486996770836413, + "learning_rate": 1.0441632244932235e-07, + "loss": 0.0187, + "num_tokens": 15744658.0, + "reward": 0.2916666865348816, + "reward_std": 0.18722420930862427, + "rewards/itbench_correctness/mean": 0.2916666865348816, + "rewards/itbench_correctness/std": 0.395187109708786, + "step": 802, + "step_time": 429.37837726902217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.466796875, + "epoch": 4.248677248677248, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.0013433246640488505, + "learning_rate": 1.0340722563656107e-07, + "loss": 0.0001, + "num_tokens": 15774058.0, + "reward": 0.5989583730697632, + "reward_std": 0.2046467512845993, + "rewards/itbench_correctness/mean": 0.5989583730697632, + "rewards/itbench_correctness/std": 0.3842606544494629, + "step": 803, + "step_time": 261.3252537054941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 631.0625, + "completions/mean_terminated_length": 631.0625, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "entropy": 0.4817272424697876, + "epoch": 4.253968253968254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.0022742159198969603, + "learning_rate": 1.0240246589884045e-07, + "loss": 0.0008, + "num_tokens": 15797947.0, + "reward": 0.40833333134651184, + "reward_std": 0.37586042284965515, + "rewards/itbench_correctness/mean": 0.40833333134651184, + "rewards/itbench_correctness/std": 0.392994225025177, + "step": 804, + "step_time": 447.9019009033218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 759.8125, + "completions/mean_terminated_length": 759.8125, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "entropy": 0.5580323934555054, + "epoch": 4.2592592592592595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.002108614193275571, + "learning_rate": 1.0140205422405212e-07, + "loss": 0.0001, + "num_tokens": 15830432.0, + "reward": 0.05000000074505806, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.05000000074505806, + "rewards/itbench_correctness/std": 0.05163978040218353, + "step": 805, + "step_time": 109.95679971016943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 816.0, + "completions/mean_terminated_length": 608.0, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "entropy": 0.44117647409439087, + "epoch": 4.264550264550264, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.0013343130704015493, + "learning_rate": 1.0040600155253764e-07, + "loss": -0.0138, + "num_tokens": 15850416.0, + "reward": 0.9583333730697632, + "reward_std": 0.11785111576318741, + "rewards/itbench_correctness/mean": 0.9583333730697632, + "rewards/itbench_correctness/std": 0.1666666567325592, + "step": 806, + "step_time": 525.2538241520524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 573.125, + "completions/mean_terminated_length": 573.125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.3297709822654724, + "epoch": 4.26984126984127, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.125, + "kl": 0.0016436699079349637, + "learning_rate": 9.941431877696954e-08, + "loss": -0.0137, + "num_tokens": 15863410.0, + "reward": 0.828125, + "reward_std": 0.22097086906433105, + "rewards/itbench_correctness/mean": 0.828125, + "rewards/itbench_correctness/std": 0.3502231538295746, + "step": 807, + "step_time": 648.7502203145996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 924.8125, + "completions/mean_terminated_length": 825.625, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 0.27573156356811523, + "epoch": 4.275132275132275, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.0010158391669392586, + "learning_rate": 9.842701674223187e-08, + "loss": 0.0033, + "num_tokens": 15890943.0, + "reward": 0.7604166865348816, + "reward_std": 0.12147815525531769, + "rewards/itbench_correctness/mean": 0.7604166865348816, + "rewards/itbench_correctness/std": 0.2979482412338257, + "step": 808, + "step_time": 365.2914238469675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 530.375, + "completions/mean_terminated_length": 530.375, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "entropy": 0.41480085253715515, + "epoch": 4.28042328042328, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0703125, + "kl": 0.0014939939137548208, + "learning_rate": 9.744410624530147e-08, + "loss": 0.009, + "num_tokens": 15902429.0, + "reward": 0.6875, + "reward_std": 0.03857584670186043, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.28463754057884216, + "step": 809, + "step_time": 69.47060746885836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 745.8125, + "completions/mean_terminated_length": 467.625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4344255328178406, + "epoch": 4.285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.0032775227446109056, + "learning_rate": 9.646559803512993e-08, + "loss": -0.0804, + "num_tokens": 15922018.0, + "reward": 0.6015625, + "reward_std": 0.24306795001029968, + "rewards/itbench_correctness/mean": 0.6015625, + "rewards/itbench_correctness/std": 0.3824775218963623, + "step": 810, + "step_time": 242.85561118088663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 685.3125, + "completions/mean_terminated_length": 685.3125, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "entropy": 0.3764705955982208, + "epoch": 4.291005291005291, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.359375, + "kl": 0.001442611450329423, + "learning_rate": 9.549150281252632e-08, + "loss": -0.0122, + "num_tokens": 15938535.0, + "reward": 0.8571428656578064, + "reward_std": 0.11921755969524384, + "rewards/itbench_correctness/mean": 0.8571428656578064, + "rewards/itbench_correctness/std": 0.21977105736732483, + "step": 811, + "step_time": 366.22836083732545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 713.875, + "completions/mean_terminated_length": 713.875, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "entropy": 0.36981263756752014, + "epoch": 4.296296296296296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.0017484528943896294, + "learning_rate": 9.452183123003999e-08, + "loss": 0.0069, + "num_tokens": 15954229.0, + "reward": 0.6102941036224365, + "reward_std": 0.3698710799217224, + "rewards/itbench_correctness/mean": 0.6102941036224365, + "rewards/itbench_correctness/std": 0.455075204372406, + "step": 812, + "step_time": 313.4723123824224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 814.5, + "completions/mean_terminated_length": 766.1538696289062, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.586863100528717, + "epoch": 4.301587301587301, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.0016130534932017326, + "learning_rate": 9.355659389184394e-08, + "loss": -0.0401, + "num_tokens": 15987341.0, + "reward": 0.22499999403953552, + "reward_std": 0.28192007541656494, + "rewards/itbench_correctness/mean": 0.22499999403953552, + "rewards/itbench_correctness/std": 0.3872983455657959, + "step": 813, + "step_time": 213.50641488097608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 672.9375, + "completions/mean_terminated_length": 321.875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.3982539176940918, + "epoch": 4.306878306878307, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.0019140745280310512, + "learning_rate": 9.259580135361927e-08, + "loss": 0.0014, + "num_tokens": 16004804.0, + "reward": 0.53125, + "reward_std": 0.23289713263511658, + "rewards/itbench_correctness/mean": 0.53125, + "rewards/itbench_correctness/std": 0.3326033651828766, + "step": 814, + "step_time": 924.7196069033816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 823.25, + "completions/mean_terminated_length": 667.1111450195312, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.590343177318573, + "epoch": 4.3121693121693125, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.484375, + "kl": 0.0014230167726054788, + "learning_rate": 9.163946412243895e-08, + "loss": -0.0126, + "num_tokens": 16059928.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 815, + "step_time": 286.4796100119129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 703.25, + "completions/mean_terminated_length": 681.86669921875, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "entropy": 0.27444010972976685, + "epoch": 4.317460317460317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.0018459860002622008, + "learning_rate": 9.068759265665382e-08, + "loss": 0.0001, + "num_tokens": 16078292.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 816, + "step_time": 321.2267580414191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 578.3125, + "completions/mean_terminated_length": 578.3125, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.4253755509853363, + "epoch": 4.322751322751323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.001751541974954307, + "learning_rate": 8.974019736577775e-08, + "loss": 0.0, + "num_tokens": 16090817.0, + "reward": 0.8333333730697632, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.8333333730697632, + "rewards/itbench_correctness/std": 0.17213258147239685, + "step": 817, + "step_time": 207.34377425536513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 811.9375, + "completions/mean_terminated_length": 684.7000122070312, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "entropy": 0.330074667930603, + "epoch": 4.328042328042328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.00094667385565117, + "learning_rate": 8.879728861037383e-08, + "loss": 0.0025, + "num_tokens": 16109496.0, + "reward": 0.8690475821495056, + "reward_std": 0.13734711706638336, + "rewards/itbench_correctness/mean": 0.8690475821495056, + "rewards/itbench_correctness/std": 0.15356682240962982, + "step": 818, + "step_time": 157.14436247292906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 744.25, + "completions/mean_terminated_length": 651.0, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "entropy": 0.3815922141075134, + "epoch": 4.333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.0016716917743906379, + "learning_rate": 8.785887670194136e-08, + "loss": -0.0363, + "num_tokens": 16129908.0, + "reward": 0.7552083730697632, + "reward_std": 0.12075783312320709, + "rewards/itbench_correctness/mean": 0.7552083730697632, + "rewards/itbench_correctness/std": 0.16796371340751648, + "step": 819, + "step_time": 186.5824106996879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 825.875, + "completions/mean_terminated_length": 671.7777709960938, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.5860450863838196, + "epoch": 4.338624338624339, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.0019729058258235455, + "learning_rate": 8.692497190280224e-08, + "loss": -0.0117, + "num_tokens": 16162090.0, + "reward": 0.171875, + "reward_std": 0.3820367455482483, + "rewards/itbench_correctness/mean": 0.171875, + "rewards/itbench_correctness/std": 0.37325987219810486, + "step": 820, + "step_time": 353.5038594137877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 909.6875, + "completions/mean_terminated_length": 795.375, + "completions/min_length": 625.0, + "completions/min_terminated_length": 625.0, + "entropy": 0.6024046540260315, + "epoch": 4.343915343915344, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.001822230638936162, + "learning_rate": 8.599558442598998e-08, + "loss": 0.035, + "num_tokens": 16191157.0, + "reward": 0.8125, + "reward_std": 0.3458075523376465, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.3403429687023163, + "step": 821, + "step_time": 505.8641969123855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 400.875, + "completions/mean_terminated_length": 400.875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.46897411346435547, + "epoch": 4.349206349206349, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.515625, + "kl": 0.002614083932712674, + "learning_rate": 8.507072443513702e-08, + "loss": -0.0603, + "num_tokens": 16206291.0, + "reward": 0.171875, + "reward_std": 0.07281029224395752, + "rewards/itbench_correctness/mean": 0.171875, + "rewards/itbench_correctness/std": 0.20348526537418365, + "step": 822, + "step_time": 195.5613472936675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 786.8125, + "completions/mean_terminated_length": 752.9285888671875, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "entropy": 0.4295813739299774, + "epoch": 4.354497354497354, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.0021104791667312384, + "learning_rate": 8.415040204436425e-08, + "loss": 0.0155, + "num_tokens": 16229032.0, + "reward": 0.8999999761581421, + "reward_std": 0.09258200973272324, + "rewards/itbench_correctness/mean": 0.8999999761581421, + "rewards/itbench_correctness/std": 0.1632993221282959, + "step": 823, + "step_time": 379.2500517424196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 892.75, + "completions/mean_terminated_length": 499.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5667880177497864, + "epoch": 4.35978835978836, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.0018997631268575788, + "learning_rate": 8.32346273181696e-08, + "loss": -0.0691, + "num_tokens": 16254500.0, + "reward": 0.125, + "reward_std": 0.1674824357032776, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.17320507764816284, + "step": 824, + "step_time": 547.5418346459046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 736.0625, + "completions/mean_terminated_length": 512.1111450195312, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.41572555899620056, + "epoch": 4.365079365079365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.0017400278011336923, + "learning_rate": 8.232341027131883e-08, + "loss": 0.0001, + "num_tokens": 16277821.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 825, + "step_time": 1151.8339996775612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 460.5, + "completions/mean_terminated_length": 460.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.41910967230796814, + "epoch": 4.37037037037037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.0028239088132977486, + "learning_rate": 8.141676086873573e-08, + "loss": -0.0546, + "num_tokens": 16288837.0, + "reward": 0.375, + "reward_std": 0.19918900728225708, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.20412415266036987, + "step": 826, + "step_time": 428.2450467739254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 784.5, + "completions/mean_terminated_length": 598.2222290039062, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "entropy": 0.4869343638420105, + "epoch": 4.375661375661376, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.515625, + "kl": 0.001279774820432067, + "learning_rate": 8.051468902539271e-08, + "loss": 0.0081, + "num_tokens": 16312141.0, + "reward": 0.75, + "reward_std": 0.2182178944349289, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.394405335187912, + "step": 827, + "step_time": 134.5602146498859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 912.3125, + "completions/mean_terminated_length": 825.4444580078125, + "completions/min_length": 675.0, + "completions/min_terminated_length": 675.0, + "entropy": 0.5195587873458862, + "epoch": 4.380952380952381, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.002217257162556052, + "learning_rate": 7.961720460620319e-08, + "loss": 0.0265, + "num_tokens": 16336338.0, + "reward": 0.5885416269302368, + "reward_std": 0.38563019037246704, + "rewards/itbench_correctness/mean": 0.5885416269302368, + "rewards/itbench_correctness/std": 0.4767450988292694, + "step": 828, + "step_time": 96.03808457683772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 416.9375, + "completions/mean_terminated_length": 416.9375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 0.3237895369529724, + "epoch": 4.386243386243386, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.109375, + "kl": 0.001336383749730885, + "learning_rate": 7.872431742591267e-08, + "loss": -0.009, + "num_tokens": 16345745.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.25, + "step": 829, + "step_time": 79.61548331100494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 697.0625, + "completions/mean_terminated_length": 588.0833740234375, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "entropy": 0.39594727754592896, + "epoch": 4.391534391534392, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2890625, + "kl": 0.00119964184705168, + "learning_rate": 7.783603724899257e-08, + "loss": 0.0261, + "num_tokens": 16370170.0, + "reward": 0.25, + "reward_std": 0.26726123690605164, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.44721361994743347, + "step": 830, + "step_time": 94.27793037891388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 753.0625, + "completions/mean_terminated_length": 735.0000610351562, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 0.33463358879089355, + "epoch": 4.396825396825397, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.25, + "kl": 0.0012139711761847138, + "learning_rate": 7.695237378953224e-08, + "loss": -0.0273, + "num_tokens": 16387731.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 831, + "step_time": 357.8091874551028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 880.25, + "completions/mean_terminated_length": 794.0, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "entropy": 0.48168134689331055, + "epoch": 4.402116402116402, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.0016746832989156246, + "learning_rate": 7.607333671113408e-08, + "loss": 0.0418, + "num_tokens": 16408031.0, + "reward": 0.375, + "reward_std": 0.20927216112613678, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.4249182939529419, + "step": 832, + "step_time": 128.6728245029226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 810.9375, + "completions/mean_terminated_length": 780.5000610351562, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 0.3551445007324219, + "epoch": 4.407407407407407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.0015545395435765386, + "learning_rate": 7.519893562680663e-08, + "loss": 0.0, + "num_tokens": 16426366.0, + "reward": 0.0833333358168602, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0833333358168602, + "rewards/itbench_correctness/std": 0.08606629818677902, + "step": 833, + "step_time": 679.0917965397239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 399.3125, + "completions/mean_terminated_length": 399.3125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.3430896997451782, + "epoch": 4.412698412698413, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0546875, + "kl": 0.0014030004385858774, + "learning_rate": 7.432918009885996e-08, + "loss": 0.0148, + "num_tokens": 16436691.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 834, + "step_time": 1008.2919538905844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 666.6875, + "completions/mean_terminated_length": 642.86669921875, + "completions/min_length": 567.0, + "completions/min_terminated_length": 567.0, + "entropy": 0.4679853618144989, + "epoch": 4.417989417989418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.001401036512106657, + "learning_rate": 7.346407963880136e-08, + "loss": 0.0403, + "num_tokens": 16456902.0, + "reward": 0.5125000476837158, + "reward_std": 0.1552647352218628, + "rewards/itbench_correctness/mean": 0.5125000476837158, + "rewards/itbench_correctness/std": 0.38100746273994446, + "step": 835, + "step_time": 458.4377150340006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 636.4375, + "completions/mean_terminated_length": 507.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.5530786514282227, + "epoch": 4.423280423280423, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.0014363399241119623, + "learning_rate": 7.260364370723043e-08, + "loss": -0.003, + "num_tokens": 16482653.0, + "reward": 0.3046875, + "reward_std": 0.19887377321720123, + "rewards/itbench_correctness/mean": 0.3046875, + "rewards/itbench_correctness/std": 0.3060798943042755, + "step": 836, + "step_time": 170.9447146616876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 629.3125, + "completions/mean_terminated_length": 572.9285888671875, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.45764225721359253, + "epoch": 4.428571428571429, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.0012081885943189263, + "learning_rate": 7.17478817137373e-08, + "loss": 0.0681, + "num_tokens": 16500194.0, + "reward": 0.84375, + "reward_std": 0.2893187999725342, + "rewards/itbench_correctness/mean": 0.84375, + "rewards/itbench_correctness/std": 0.286865234375, + "step": 837, + "step_time": 777.2624794654548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 558.375, + "completions/mean_terminated_length": 558.375, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "entropy": 0.40116408467292786, + "epoch": 4.4338624338624335, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.002113268943503499, + "learning_rate": 7.089680301679751e-08, + "loss": -0.0104, + "num_tokens": 16512536.0, + "reward": 0.90625, + "reward_std": 0.2346404492855072, + "rewards/itbench_correctness/mean": 0.90625, + "rewards/itbench_correctness/std": 0.2561737895011902, + "step": 838, + "step_time": 86.38017075136304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 955.75, + "completions/mean_terminated_length": 660.0, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "entropy": 0.31388962268829346, + "epoch": 4.439153439153439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037109375, + "kl": 0.001120888045988977, + "learning_rate": 7.005041692367153e-08, + "loss": 0.0, + "num_tokens": 16542196.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 839, + "step_time": 386.5617507044226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 622.1875, + "completions/mean_terminated_length": 622.1875, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 0.40020090341567993, + "epoch": 4.444444444444445, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1875, + "kl": 0.0012403958244249225, + "learning_rate": 6.92087326903022e-08, + "loss": 0.0021, + "num_tokens": 16556511.0, + "reward": 0.4296875, + "reward_std": 0.17499202489852905, + "rewards/itbench_correctness/mean": 0.4296875, + "rewards/itbench_correctness/std": 0.5040848851203918, + "step": 840, + "step_time": 135.079236516729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 765.9375, + "completions/mean_terminated_length": 611.1000366210938, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "entropy": 0.5144022703170776, + "epoch": 4.449735449735449, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.765625, + "kl": 0.0017959108809009194, + "learning_rate": 6.837175952121304e-08, + "loss": 0.0412, + "num_tokens": 16577070.0, + "reward": 0.2395833432674408, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.2395833432674408, + "rewards/itbench_correctness/std": 0.19214914739131927, + "step": 841, + "step_time": 980.2466245274991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 794.0, + "completions/mean_terminated_length": 656.0, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "entropy": 0.6423173546791077, + "epoch": 4.455026455026455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.001680102082900703, + "learning_rate": 6.753950656940905e-08, + "loss": 0.0577, + "num_tokens": 16599126.0, + "reward": 0.25, + "reward_std": 0.1462520956993103, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.24152295291423798, + "step": 842, + "step_time": 89.48983163572848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 873.75, + "completions/mean_terminated_length": 756.888916015625, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "entropy": 0.4738197326660156, + "epoch": 4.4603174603174605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.0010829598177224398, + "learning_rate": 6.671198293627479e-08, + "loss": 0.0027, + "num_tokens": 16620106.0, + "reward": 0.4750000238418579, + "reward_std": 0.19992218911647797, + "rewards/itbench_correctness/mean": 0.4750000238418579, + "rewards/itbench_correctness/std": 0.4358898997306824, + "step": 843, + "step_time": 68.92372180242091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 608.5, + "completions/mean_terminated_length": 608.5, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4272801876068115, + "epoch": 4.465608465608465, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.55078125, + "kl": 0.0015739505179226398, + "learning_rate": 6.588919767147638e-08, + "loss": -0.0799, + "num_tokens": 16633634.0, + "reward": 0.8125, + "reward_std": 0.1298656165599823, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.2626432776451111, + "step": 844, + "step_time": 799.5955674275756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 709.375, + "completions/mean_terminated_length": 636.7692260742188, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "entropy": 0.38625550270080566, + "epoch": 4.470899470899471, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0078125, + "kl": 0.0017984964651986957, + "learning_rate": 6.507115977286143e-08, + "loss": 0.0032, + "num_tokens": 16650712.0, + "reward": 0.421875, + "reward_std": 0.09300297498703003, + "rewards/itbench_correctness/mean": 0.421875, + "rewards/itbench_correctness/std": 0.4538607597351074, + "step": 845, + "step_time": 264.149626750499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 482.4375, + "completions/mean_terminated_length": 482.4375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.4415079653263092, + "epoch": 4.476190476190476, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.001339221140369773, + "learning_rate": 6.42578781863613e-08, + "loss": 0.0102, + "num_tokens": 16661415.0, + "reward": 0.484375, + "reward_std": 0.16849708557128906, + "rewards/itbench_correctness/mean": 0.484375, + "rewards/itbench_correctness/std": 0.17001838982105255, + "step": 846, + "step_time": 347.23770444560796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 730.5625, + "completions/mean_terminated_length": 554.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.533835232257843, + "epoch": 4.481481481481482, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5390625, + "kl": 0.0018466972978785634, + "learning_rate": 6.34493618058935e-08, + "loss": -0.0754, + "num_tokens": 16700320.0, + "reward": 0.03125, + "reward_std": 0.043129101395606995, + "rewards/itbench_correctness/mean": 0.03125, + "rewards/itbench_correctness/std": 0.06718548387289047, + "step": 847, + "step_time": 213.22028856538236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 455.5, + "completions/mean_terminated_length": 455.5, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.40175631642341614, + "epoch": 4.4867724867724865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0014747100649401546, + "learning_rate": 6.26456194732633e-08, + "loss": -0.0161, + "num_tokens": 16710784.0, + "reward": 0.4895833432674408, + "reward_std": 0.21374498307704926, + "rewards/itbench_correctness/mean": 0.4895833432674408, + "rewards/itbench_correctness/std": 0.20983901619911194, + "step": 848, + "step_time": 55.42429989017546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 442.25, + "completions/mean_terminated_length": 442.25, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.35048049688339233, + "epoch": 4.492063492063492, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.015625, + "kl": 0.003505163826048374, + "learning_rate": 6.184665997806831e-08, + "loss": 0.0185, + "num_tokens": 16721036.0, + "reward": 0.4943181872367859, + "reward_std": 0.016070598736405373, + "rewards/itbench_correctness/mean": 0.4943181872367859, + "rewards/itbench_correctness/std": 0.5110015273094177, + "step": 849, + "step_time": 1021.3690116815269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 910.4375, + "completions/mean_terminated_length": 660.6000366210938, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "entropy": 0.4964646100997925, + "epoch": 4.497354497354498, + "frac_reward_zero_std": 0.5, + "grad_norm": 8.875, + "kl": 0.0017396226758137345, + "learning_rate": 6.105249205760127e-08, + "loss": 0.0387, + "num_tokens": 16749483.0, + "reward": 0.4375, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.48733973503112793, + "step": 850, + "step_time": 576.2763332147151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 593.75, + "completions/mean_terminated_length": 593.75, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3183158040046692, + "epoch": 4.502645502645502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0289306640625, + "kl": 0.001586488215252757, + "learning_rate": 6.026312439675551e-08, + "loss": 0.0, + "num_tokens": 16763647.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 851, + "step_time": 732.8523011729121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 584.375, + "completions/mean_terminated_length": 555.0667114257812, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.5373262166976929, + "epoch": 4.507936507936508, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1953125, + "kl": 0.0018168711103498936, + "learning_rate": 5.9478565627929244e-08, + "loss": 0.0319, + "num_tokens": 16794397.0, + "reward": 0.0729166716337204, + "reward_std": 0.0294627845287323, + "rewards/itbench_correctness/mean": 0.0729166716337204, + "rewards/itbench_correctness/std": 0.08539126068353653, + "step": 852, + "step_time": 84.26783776376396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 935.625, + "completions/mean_terminated_length": 741.2000122070312, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "entropy": 0.5600534677505493, + "epoch": 4.5132275132275135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034912109375, + "kl": 0.0015273833414539695, + "learning_rate": 5.869882433093154e-08, + "loss": 0.0001, + "num_tokens": 16832975.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 853, + "step_time": 153.46809213608503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 617.25, + "completions/mean_terminated_length": 481.66668701171875, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "entropy": 0.37748077511787415, + "epoch": 4.518518518518518, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.90625, + "kl": 0.0018203725339844823, + "learning_rate": 5.7923909032888295e-08, + "loss": -0.0292, + "num_tokens": 16855027.0, + "reward": 0.2395833432674408, + "reward_std": 0.13684004545211792, + "rewards/itbench_correctness/mean": 0.2395833432674408, + "rewards/itbench_correctness/std": 0.31012991070747375, + "step": 854, + "step_time": 189.47853012941778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 370.75, + "completions/mean_terminated_length": 370.75, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.34794336557388306, + "epoch": 4.523809523809524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.002147156512364745, + "learning_rate": 5.7153828208148846e-08, + "loss": -0.0062, + "num_tokens": 16863943.0, + "reward": 0.53125, + "reward_std": 0.405046284198761, + "rewards/itbench_correctness/mean": 0.53125, + "rewards/itbench_correctness/std": 0.4905354380607605, + "step": 855, + "step_time": 1156.329422229901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 743.25, + "completions/mean_terminated_length": 615.6363525390625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.44399595260620117, + "epoch": 4.529100529100529, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.0015257069608196616, + "learning_rate": 5.638859027819409e-08, + "loss": -0.0599, + "num_tokens": 16885899.0, + "reward": 0.3812499940395355, + "reward_std": 0.3087776303291321, + "rewards/itbench_correctness/mean": 0.3812499940395355, + "rewards/itbench_correctness/std": 0.46219584345817566, + "step": 856, + "step_time": 461.6809099484235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 481.375, + "completions/mean_terminated_length": 481.375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.4964944124221802, + "epoch": 4.534391534391535, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.203125, + "kl": 0.0020852303132414818, + "learning_rate": 5.562820361154313e-08, + "loss": -0.0059, + "num_tokens": 16896417.0, + "reward": 0.359375, + "reward_std": 0.031000997871160507, + "rewards/itbench_correctness/mean": 0.359375, + "rewards/itbench_correctness/std": 0.3735698163509369, + "step": 857, + "step_time": 93.96854640357196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 994.9375, + "completions/mean_terminated_length": 869.0, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.4140963554382324, + "epoch": 4.5396825396825395, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4296875, + "kl": 0.001920070848427713, + "learning_rate": 5.48726765236629e-08, + "loss": 0.0154, + "num_tokens": 16921448.0, + "reward": 0.2678571343421936, + "reward_std": 0.04959750175476074, + "rewards/itbench_correctness/mean": 0.2678571343421936, + "rewards/itbench_correctness/std": 0.12975645065307617, + "step": 858, + "step_time": 6701.7141972742975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 783.1875, + "completions/mean_terminated_length": 595.888916015625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.43923071026802063, + "epoch": 4.544973544973545, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.140625, + "kl": 0.001265035243704915, + "learning_rate": 5.412201727687643e-08, + "loss": -0.0124, + "num_tokens": 16938763.0, + "reward": 0.8556547164916992, + "reward_std": 0.053405825048685074, + "rewards/itbench_correctness/mean": 0.8556547164916992, + "rewards/itbench_correctness/std": 0.07298243790864944, + "step": 859, + "step_time": 1027.743779040873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 712.5625, + "completions/mean_terminated_length": 401.125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "entropy": 0.5192527174949646, + "epoch": 4.550264550264551, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.0014884049305692315, + "learning_rate": 5.337623408027292e-08, + "loss": -0.0094, + "num_tokens": 16957572.0, + "reward": 0.6171875, + "reward_std": 0.37874263525009155, + "rewards/itbench_correctness/mean": 0.6171875, + "rewards/itbench_correctness/std": 0.3991364538669586, + "step": 860, + "step_time": 262.18378533329815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 984.5625, + "completions/mean_terminated_length": 708.5, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "entropy": 0.5484669804573059, + "epoch": 4.555555555555555, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.0018317234935238957, + "learning_rate": 5.263533508961826e-08, + "loss": 0.0379, + "num_tokens": 16981197.0, + "reward": 0.265625, + "reward_std": 0.39435434341430664, + "rewards/itbench_correctness/mean": 0.265625, + "rewards/itbench_correctness/std": 0.4096280336380005, + "step": 861, + "step_time": 77.7364522125572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 457.0625, + "completions/mean_terminated_length": 457.0625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.345685750246048, + "epoch": 4.560846560846561, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.002134555485099554, + "learning_rate": 5.1899328407264855e-08, + "loss": -0.087, + "num_tokens": 16993086.0, + "reward": 0.3812499940395355, + "reward_std": 0.318040132522583, + "rewards/itbench_correctness/mean": 0.3812499940395355, + "rewards/itbench_correctness/std": 0.4069705307483673, + "step": 862, + "step_time": 827.9984636185691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 547.8125, + "completions/mean_terminated_length": 547.8125, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "entropy": 0.46731317043304443, + "epoch": 4.5661375661375665, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.234375, + "kl": 0.0013622003607451916, + "learning_rate": 5.116822208206395e-08, + "loss": -0.0166, + "num_tokens": 17005059.0, + "reward": 0.5588235259056091, + "reward_std": 0.16637806594371796, + "rewards/itbench_correctness/mean": 0.5588235259056091, + "rewards/itbench_correctness/std": 0.5092002749443054, + "step": 863, + "step_time": 190.58081929571927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 952.5, + "completions/mean_terminated_length": 452.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.2813648283481598, + "epoch": 4.571428571428571, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.421875, + "kl": 0.001091470243409276, + "learning_rate": 5.044202410927706e-08, + "loss": -0.0038, + "num_tokens": 17038027.0, + "reward": 0.048076923936605453, + "reward_std": 0.13598206639289856, + "rewards/itbench_correctness/mean": 0.048076923936605453, + "rewards/itbench_correctness/std": 0.192307710647583, + "step": 864, + "step_time": 116.58771913684905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 526.375, + "completions/mean_terminated_length": 526.375, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "entropy": 0.391355961561203, + "epoch": 4.576719576719577, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.03125, + "kl": 0.001298400922678411, + "learning_rate": 4.972074243048896e-08, + "loss": -0.0148, + "num_tokens": 17049241.0, + "reward": 0.71875, + "reward_std": 0.18196186423301697, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.3823356628417969, + "step": 865, + "step_time": 239.0914654675871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 552.75, + "completions/mean_terminated_length": 552.75, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "entropy": 0.510176420211792, + "epoch": 4.582010582010582, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3125, + "kl": 0.0015060057630762458, + "learning_rate": 4.9004384933520547e-08, + "loss": -0.0102, + "num_tokens": 17061309.0, + "reward": 0.8125, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 866, + "step_time": 120.828508451581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 870.375, + "completions/mean_terminated_length": 750.888916015625, + "completions/min_length": 648.0, + "completions/min_terminated_length": 648.0, + "entropy": 0.25850924849510193, + "epoch": 4.587301587301587, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.001840650918893516, + "learning_rate": 4.829295945234257e-08, + "loss": 0.01, + "num_tokens": 17083635.0, + "reward": 0.6875, + "reward_std": 0.12400396168231964, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.28463754057884216, + "step": 867, + "step_time": 78.40537928510457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 641.875, + "completions/mean_terminated_length": 641.875, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.5577409863471985, + "epoch": 4.592592592592593, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5, + "kl": 0.0019760173745453358, + "learning_rate": 4.758647376699032e-08, + "loss": -0.0254, + "num_tokens": 17104393.0, + "reward": 0.625, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.5, + "step": 868, + "step_time": 90.65404016617686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 561.0, + "completions/mean_terminated_length": 561.0, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.35650622844696045, + "epoch": 4.597883597883598, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.40625, + "kl": 0.0018372276099398732, + "learning_rate": 4.6884935603477724e-08, + "loss": -0.0127, + "num_tokens": 17119761.0, + "reward": 0.3125, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.3095695972442627, + "step": 869, + "step_time": 567.832233437337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 735.75, + "completions/mean_terminated_length": 562.7999877929688, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4512402415275574, + "epoch": 4.603174603174603, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3125, + "kl": 0.0015587429516017437, + "learning_rate": 4.6188352633713956e-08, + "loss": -0.0225, + "num_tokens": 17156341.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 870, + "step_time": 147.64252108428627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 688.875, + "completions/mean_terminated_length": 666.5333862304688, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.4325893521308899, + "epoch": 4.608465608465608, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.171875, + "kl": 0.0013199535897001624, + "learning_rate": 4.549673247541874e-08, + "loss": -0.0139, + "num_tokens": 17174155.0, + "reward": 0.40625, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.4552929699420929, + "step": 871, + "step_time": 94.71705105807632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 609.0625, + "completions/mean_terminated_length": 420.4545593261719, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.36613649129867554, + "epoch": 4.613756613756614, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.71484375, + "kl": 0.001597036374732852, + "learning_rate": 4.48100826920394e-08, + "loss": -0.1975, + "num_tokens": 17194796.0, + "reward": 0.0625, + "reward_std": 0.03857583925127983, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.0833333358168602, + "step": 872, + "step_time": 804.2430580342188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1024.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 0.0, + "entropy": 0.296875, + "epoch": 4.619047619047619, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4765625, + "kl": 0.001162711065262556, + "learning_rate": 4.412841079266777e-08, + "loss": 0.0, + "num_tokens": 17221364.0, + "reward": 0.2708333432674408, + "reward_std": 0.19795581698417664, + "rewards/itbench_correctness/mean": 0.2708333432674408, + "rewards/itbench_correctness/std": 0.3890872597694397, + "step": 873, + "step_time": 150.55902750603855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 616.125, + "completions/mean_terminated_length": 616.125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.5778048038482666, + "epoch": 4.624338624338624, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.09375, + "kl": 0.002010586205869913, + "learning_rate": 4.3451724231958645e-08, + "loss": 0.0149, + "num_tokens": 17239318.0, + "reward": 0.625, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.5, + "step": 874, + "step_time": 94.01774641126394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 537.5, + "completions/mean_terminated_length": 537.5, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.26232558488845825, + "epoch": 4.62962962962963, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.109375, + "kl": 0.0017294755671173334, + "learning_rate": 4.2780030410047796e-08, + "loss": -0.0713, + "num_tokens": 17252630.0, + "reward": 0.375, + "reward_std": 0.08908706903457642, + "rewards/itbench_correctness/mean": 0.375, + "rewards/itbench_correctness/std": 0.17743021249771118, + "step": 875, + "step_time": 71.65096860099584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 792.8125, + "completions/mean_terminated_length": 715.75, + "completions/min_length": 616.0, + "completions/min_terminated_length": 616.0, + "entropy": 0.47930628061294556, + "epoch": 4.634920634920634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.0011707143858075142, + "learning_rate": 4.2113336672471245e-08, + "loss": 0.0018, + "num_tokens": 17273627.0, + "reward": 0.84375, + "reward_std": 0.2088201940059662, + "rewards/itbench_correctness/mean": 0.84375, + "rewards/itbench_correctness/std": 0.24757154285907745, + "step": 876, + "step_time": 72.07463994249701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 585.375, + "completions/mean_terminated_length": 585.375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.5500747561454773, + "epoch": 4.64021164021164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.0019222473492845893, + "learning_rate": 4.145165031008507e-08, + "loss": 0.0357, + "num_tokens": 17291033.0, + "reward": 0.800000011920929, + "reward_std": 0.2777460217475891, + "rewards/itbench_correctness/mean": 0.800000011920929, + "rewards/itbench_correctness/std": 0.3265986442565918, + "step": 877, + "step_time": 86.38187370076776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 519.375, + "completions/mean_terminated_length": 402.923095703125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.379302054643631, + "epoch": 4.645502645502646, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.546875, + "kl": 0.00152775296010077, + "learning_rate": 4.0794978558985e-08, + "loss": 0.0501, + "num_tokens": 17305767.0, + "reward": 0.2395833432674408, + "reward_std": 0.10386862605810165, + "rewards/itbench_correctness/mean": 0.2395833432674408, + "rewards/itbench_correctness/std": 0.2852468192577362, + "step": 878, + "step_time": 181.77928131632507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 672.125, + "completions/mean_terminated_length": 554.8333740234375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.5118095874786377, + "epoch": 4.650793650793651, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.65625, + "kl": 0.0015323893167078495, + "learning_rate": 4.0143328600428294e-08, + "loss": 0.0114, + "num_tokens": 17320513.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 879, + "step_time": 106.95045015309006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 721.0625, + "completions/mean_terminated_length": 539.2999877929688, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.3689000606536865, + "epoch": 4.656084656084656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.0019987330306321383, + "learning_rate": 3.949670756075446e-08, + "loss": 0.0123, + "num_tokens": 17337194.0, + "reward": 0.5625, + "reward_std": 0.3098883032798767, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.4466957151889801, + "step": 880, + "step_time": 1010.3906643372029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 663.1875, + "completions/mean_terminated_length": 302.375, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5669588446617126, + "epoch": 4.661375661375661, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.40625, + "kl": 0.0028189942240715027, + "learning_rate": 3.8855122511307626e-08, + "loss": 0.0001, + "num_tokens": 17354189.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 881, + "step_time": 113.85438270866871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 442.6875, + "completions/mean_terminated_length": 442.6875, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "entropy": 0.4585627615451813, + "epoch": 4.666666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.0014935840154066682, + "learning_rate": 3.821858046835913e-08, + "loss": 0.0196, + "num_tokens": 17363480.0, + "reward": 0.5653409361839294, + "reward_std": 0.1779802441596985, + "rewards/itbench_correctness/mean": 0.5653409361839294, + "rewards/itbench_correctness/std": 0.3175182640552521, + "step": 882, + "step_time": 61.071511584334075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 825.4375, + "completions/mean_terminated_length": 779.6154174804688, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "entropy": 0.32225334644317627, + "epoch": 4.671957671957672, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4453125, + "kl": 0.0013399338349699974, + "learning_rate": 3.75870883930306e-08, + "loss": 0.0614, + "num_tokens": 17384615.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 883, + "step_time": 223.48245067708194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 802.8125, + "completions/mean_terminated_length": 581.625, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 0.5904242992401123, + "epoch": 4.677248677248677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.0015930193476378918, + "learning_rate": 3.6960653191218324e-08, + "loss": -0.0112, + "num_tokens": 17405748.0, + "reward": 0.8020833134651184, + "reward_std": 0.25074294209480286, + "rewards/itbench_correctness/mean": 0.8020833134651184, + "rewards/itbench_correctness/std": 0.32185086607933044, + "step": 884, + "step_time": 80.26837155316025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 697.0, + "completions/mean_terminated_length": 548.3636474609375, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.42180773615837097, + "epoch": 4.682539682539683, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.0014244532212615013, + "learning_rate": 3.63392817135173e-08, + "loss": 0.0199, + "num_tokens": 17421772.0, + "reward": 0.3482142686843872, + "reward_std": 0.14384004473686218, + "rewards/itbench_correctness/mean": 0.3482142686843872, + "rewards/itbench_correctness/std": 0.2632541060447693, + "step": 885, + "step_time": 135.67724260222167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1010.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 652.875, + "completions/mean_terminated_length": 652.875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3706682026386261, + "epoch": 4.6878306878306875, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.001711677061393857, + "learning_rate": 3.572298075514652e-08, + "loss": -0.118, + "num_tokens": 17441162.0, + "reward": 0.5, + "reward_std": 0.3535533845424652, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 886, + "step_time": 319.866651549004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 697.0, + "completions/mean_terminated_length": 675.2000122070312, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "entropy": 0.4418938159942627, + "epoch": 4.693121693121693, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.0013350816443562508, + "learning_rate": 3.5111757055874326e-08, + "loss": 0.0371, + "num_tokens": 17455858.0, + "reward": 0.9236111640930176, + "reward_std": 0.21606040000915527, + "rewards/itbench_correctness/mean": 0.9236111640930176, + "rewards/itbench_correctness/std": 0.20971761643886566, + "step": 887, + "step_time": 798.9065483696759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 778.75, + "completions/mean_terminated_length": 722.1538696289062, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "entropy": 0.4057784974575043, + "epoch": 4.698412698412699, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.0015928231878206134, + "learning_rate": 3.450561729994533e-08, + "loss": 0.0253, + "num_tokens": 17493110.0, + "reward": 0.5625, + "reward_std": 0.5260357856750488, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 888, + "step_time": 112.42098965961486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 633.0, + "completions/mean_terminated_length": 633.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.32543444633483887, + "epoch": 4.703703703703704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.0019525340758264065, + "learning_rate": 3.390456811600673e-08, + "loss": -0.145, + "num_tokens": 17514366.0, + "reward": 0.4375, + "reward_std": 0.23927490413188934, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.30276504158973694, + "step": 889, + "step_time": 95.3800596492365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 672.375, + "completions/mean_terminated_length": 672.375, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "entropy": 0.5919315814971924, + "epoch": 4.708994708994709, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.002438145689666271, + "learning_rate": 3.330861607703611e-08, + "loss": 0.0001, + "num_tokens": 17529404.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 890, + "step_time": 194.32226402964443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 910.875, + "completions/mean_terminated_length": 797.75, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.6191848516464233, + "epoch": 4.714285714285714, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.002367243403568864, + "learning_rate": 3.271776770026963e-08, + "loss": 0.0057, + "num_tokens": 17554098.0, + "reward": 0.1015625, + "reward_std": 0.21758441627025604, + "rewards/itbench_correctness/mean": 0.1015625, + "rewards/itbench_correctness/std": 0.2550275921821594, + "step": 891, + "step_time": 93.79412244167179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 419.75, + "completions/mean_terminated_length": 419.75, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.38832637667655945, + "epoch": 4.71957671957672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0022835221607238054, + "learning_rate": 3.213202944713023e-08, + "loss": 0.0096, + "num_tokens": 17563774.0, + "reward": 0.4837239682674408, + "reward_std": 0.10496115684509277, + "rewards/itbench_correctness/mean": 0.4837239682674408, + "rewards/itbench_correctness/std": 0.12295603007078171, + "step": 892, + "step_time": 44.90997119899839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 433.8125, + "completions/mean_terminated_length": 433.8125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.4886903762817383, + "epoch": 4.724867724867725, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.0014157530386000872, + "learning_rate": 3.155140772315773e-08, + "loss": 0.0198, + "num_tokens": 17573195.0, + "reward": 0.4437499940395355, + "reward_std": 0.11511446535587311, + "rewards/itbench_correctness/mean": 0.4437499940395355, + "rewards/itbench_correctness/std": 0.1263263076543808, + "step": 893, + "step_time": 62.78798679355532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 990.5, + "completions/mean_terminated_length": 890.0, + "completions/min_length": 783.0, + "completions/min_terminated_length": 783.0, + "entropy": 0.43210500478744507, + "epoch": 4.73015873015873, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.0013504911912605166, + "learning_rate": 3.097590887793827e-08, + "loss": 0.0194, + "num_tokens": 17598411.0, + "reward": 0.6432291865348816, + "reward_std": 0.2726758122444153, + "rewards/itbench_correctness/mean": 0.6432291865348816, + "rewards/itbench_correctness/std": 0.4446098804473877, + "step": 894, + "step_time": 102.70882797706872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 671.25, + "completions/mean_terminated_length": 553.6666870117188, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.3515828549861908, + "epoch": 4.735449735449736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.0012724708067253232, + "learning_rate": 3.040553920503502e-08, + "loss": 0.0152, + "num_tokens": 17614191.0, + "reward": 0.3125, + "reward_std": 0.2335786372423172, + "rewards/itbench_correctness/mean": 0.3125, + "rewards/itbench_correctness/std": 0.23471811413764954, + "step": 895, + "step_time": 131.4134486299008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 748.6875, + "completions/mean_terminated_length": 473.375, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.44878536462783813, + "epoch": 4.7407407407407405, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0016055708983913064, + "learning_rate": 2.9840304941919416e-08, + "loss": -0.0042, + "num_tokens": 17634162.0, + "reward": 0.625, + "reward_std": 0.31586384773254395, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.40138646960258484, + "step": 896, + "step_time": 121.16828937549144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 698.125, + "completions/mean_terminated_length": 651.5714721679688, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.4755595326423645, + "epoch": 4.746031746031746, + "frac_reward_zero_std": 0.5, + "grad_norm": 6.84375, + "kl": 0.0011872922768816352, + "learning_rate": 2.9280212269902628e-08, + "loss": 0.0145, + "num_tokens": 17650676.0, + "reward": 0.8125, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.8125, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 897, + "step_time": 88.3342649359256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 678.5, + "completions/mean_terminated_length": 333.0, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.38319823145866394, + "epoch": 4.751322751322752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.0016045935917645693, + "learning_rate": 2.872526731406849e-08, + "loss": -0.0215, + "num_tokens": 17670076.0, + "reward": 0.5572916865348816, + "reward_std": 0.398481547832489, + "rewards/itbench_correctness/mean": 0.5572916865348816, + "rewards/itbench_correctness/std": 0.38934746384620667, + "step": 898, + "step_time": 123.173085459508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 656.1875, + "completions/mean_terminated_length": 571.3077392578125, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "entropy": 0.55167156457901, + "epoch": 4.756613756613756, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.0021755106281489134, + "learning_rate": 2.8175476143206145e-08, + "loss": 0.0149, + "num_tokens": 17684807.0, + "reward": 0.778124988079071, + "reward_std": 0.3087267279624939, + "rewards/itbench_correctness/mean": 0.778124988079071, + "rewards/itbench_correctness/std": 0.3087441027164459, + "step": 899, + "step_time": 81.62243656814098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 475.625, + "completions/mean_terminated_length": 475.625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3910643756389618, + "epoch": 4.761904761904762, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4921875, + "kl": 0.0022366566117852926, + "learning_rate": 2.7630844769743756e-08, + "loss": -0.0554, + "num_tokens": 17695441.0, + "reward": 0.6812499761581421, + "reward_std": 0.0752970278263092, + "rewards/itbench_correctness/mean": 0.6812499761581421, + "rewards/itbench_correctness/std": 0.3449033796787262, + "step": 900, + "step_time": 96.98080993723124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 706.625, + "completions/mean_terminated_length": 661.2857666015625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5575800538063049, + "epoch": 4.767195767195767, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.0022240527905523777, + "learning_rate": 2.7091379149682682e-08, + "loss": -0.0212, + "num_tokens": 17720731.0, + "reward": 0.484375, + "reward_std": 0.46940183639526367, + "rewards/itbench_correctness/mean": 0.484375, + "rewards/itbench_correctness/std": 0.4696519374847412, + "step": 901, + "step_time": 119.61820879764855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 772.5625, + "completions/mean_terminated_length": 577.0, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "entropy": 0.5074023008346558, + "epoch": 4.772486772486772, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0625, + "kl": 0.0013676214730367064, + "learning_rate": 2.655708518253258e-08, + "loss": -0.0031, + "num_tokens": 17737724.0, + "reward": 0.5364583134651184, + "reward_std": 0.3415539562702179, + "rewards/itbench_correctness/mean": 0.5364583134651184, + "rewards/itbench_correctness/std": 0.4584280252456665, + "step": 902, + "step_time": 71.03902994468808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 556.0625, + "completions/mean_terminated_length": 556.0625, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "entropy": 0.467573344707489, + "epoch": 4.777777777777778, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4375, + "kl": 0.0027739950455725193, + "learning_rate": 2.6027968711246627e-08, + "loss": -0.0162, + "num_tokens": 17756685.0, + "reward": 0.6875, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.4787135720252991, + "step": 903, + "step_time": 73.05204797629267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 432.375, + "completions/mean_terminated_length": 432.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.33304423093795776, + "epoch": 4.783068783068783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040771484375, + "kl": 0.0020940680988132954, + "learning_rate": 2.550403552215785e-08, + "loss": 0.0, + "num_tokens": 17766963.0, + "reward": 0.75, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 904, + "step_time": 75.88487505353987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 543.375, + "completions/mean_terminated_length": 543.375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 0.36438924074172974, + "epoch": 4.788359788359788, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0625, + "kl": 0.0024348387960344553, + "learning_rate": 2.4985291344915673e-08, + "loss": 0.0049, + "num_tokens": 17780065.0, + "reward": 0.22499999403953552, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.22499999403953552, + "rewards/itbench_correctness/std": 0.29552215337753296, + "step": 905, + "step_time": 420.1394767453894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 911.4375, + "completions/mean_terminated_length": 766.7142944335938, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "entropy": 0.6890214681625366, + "epoch": 4.7936507936507935, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.65625, + "kl": 0.00153902149759233, + "learning_rate": 2.4471741852423233e-08, + "loss": 0.0055, + "num_tokens": 17825752.0, + "reward": 0.03125, + "reward_std": 0.0578637570142746, + "rewards/itbench_correctness/mean": 0.03125, + "rewards/itbench_correctness/std": 0.08539126068353653, + "step": 906, + "step_time": 132.87724316772074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 648.3125, + "completions/mean_terminated_length": 648.3125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4966740608215332, + "epoch": 4.798941798941799, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4765625, + "kl": 0.0030572270043194294, + "learning_rate": 2.396339266077557e-08, + "loss": -0.1071, + "num_tokens": 17839621.0, + "reward": 0.9375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.25, + "step": 907, + "step_time": 503.1076301559806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 917.625, + "completions/mean_terminated_length": 598.5, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.3029560148715973, + "epoch": 4.804232804232804, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0234375, + "kl": 0.0009918678551912308, + "learning_rate": 2.3460249329197823e-08, + "loss": -0.0133, + "num_tokens": 17868991.0, + "reward": 0.3541666865348816, + "reward_std": 0.058925561606884, + "rewards/itbench_correctness/mean": 0.3541666865348816, + "rewards/itbench_correctness/std": 0.37453675270080566, + "step": 908, + "step_time": 243.62568031344563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 767.5625, + "completions/mean_terminated_length": 750.4667358398438, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "entropy": 0.2605651021003723, + "epoch": 4.809523809523809, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.001077577588148415, + "learning_rate": 2.2962317359985107e-08, + "loss": -0.0185, + "num_tokens": 17888648.0, + "reward": 0.3333333432674408, + "reward_std": 0.2766174077987671, + "rewards/itbench_correctness/mean": 0.3333333432674408, + "rewards/itbench_correctness/std": 0.3162277638912201, + "step": 909, + "step_time": 128.46637521497905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 711.375, + "completions/mean_terminated_length": 690.5333862304688, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.4948163628578186, + "epoch": 4.814814814814815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.002276236657053232, + "learning_rate": 2.2469602198441573e-08, + "loss": 0.0748, + "num_tokens": 17906702.0, + "reward": 0.5833333134651184, + "reward_std": 0.19287919998168945, + "rewards/itbench_correctness/mean": 0.5833333134651184, + "rewards/itbench_correctness/std": 0.28706690669059753, + "step": 910, + "step_time": 451.41445366758853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 479.5, + "completions/mean_terminated_length": 479.5, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "entropy": 0.31699687242507935, + "epoch": 4.8201058201058204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.001171560725197196, + "learning_rate": 2.1982109232821176e-08, + "loss": -0.0032, + "num_tokens": 17917550.0, + "reward": 0.9453125, + "reward_std": 0.08679073303937912, + "rewards/itbench_correctness/mean": 0.9453125, + "rewards/itbench_correctness/std": 0.10174263268709183, + "step": 911, + "step_time": 838.8527243016288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 508.5625, + "completions/mean_terminated_length": 508.5625, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5309081673622131, + "epoch": 4.825396825396825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.0025923862121999264, + "learning_rate": 2.1499843794269058e-08, + "loss": -0.1172, + "num_tokens": 17931191.0, + "reward": 0.5625, + "reward_std": 0.4082317352294922, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 912, + "step_time": 216.96005523204803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 969.0, + "completions/mean_length": 761.25, + "completions/mean_terminated_length": 673.6666870117188, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "entropy": 0.47027915716171265, + "epoch": 4.830687830687831, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.0013219509273767471, + "learning_rate": 2.1022811156762576e-08, + "loss": 0.0432, + "num_tokens": 17947115.0, + "reward": 0.25, + "reward_std": 0.30284827947616577, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.3259601294994354, + "step": 913, + "step_time": 100.75746689084917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 631.75, + "completions/mean_terminated_length": 396.3999938964844, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3593193590641022, + "epoch": 4.835978835978836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.002075113356113434, + "learning_rate": 2.055101653705449e-08, + "loss": -0.0584, + "num_tokens": 17963415.0, + "reward": 0.671875, + "reward_std": 0.4432469606399536, + "rewards/itbench_correctness/mean": 0.671875, + "rewards/itbench_correctness/std": 0.4718646705150604, + "step": 914, + "step_time": 378.09901642706245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 865.3125, + "completions/mean_terminated_length": 741.888916015625, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "entropy": 0.40216684341430664, + "epoch": 4.841269841269841, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2265625, + "kl": 0.0013008101377636194, + "learning_rate": 2.008446509461498e-08, + "loss": -0.0024, + "num_tokens": 17983796.0, + "reward": 0.8459821343421936, + "reward_std": 0.10143714398145676, + "rewards/itbench_correctness/mean": 0.8459821343421936, + "rewards/itbench_correctness/std": 0.21097390353679657, + "step": 915, + "step_time": 85.97270075790584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 609.9375, + "completions/mean_terminated_length": 287.8888854980469, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5607131719589233, + "epoch": 4.8465608465608465, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3125, + "kl": 0.0020648783538490534, + "learning_rate": 1.9623161931575926e-08, + "loss": -0.0784, + "num_tokens": 18000307.0, + "reward": 0.171875, + "reward_std": 0.1367267221212387, + "rewards/itbench_correctness/mean": 0.171875, + "rewards/itbench_correctness/std": 0.2576940953731537, + "step": 916, + "step_time": 217.30383673589677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 639.0625, + "completions/mean_terminated_length": 639.0625, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.4193643033504486, + "epoch": 4.851851851851852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.0019990454893559217, + "learning_rate": 1.9167112092674796e-08, + "loss": 0.0, + "num_tokens": 18025980.0, + "reward": 0.05000000074505806, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.05000000074505806, + "rewards/itbench_correctness/std": 0.05163978040218353, + "step": 917, + "step_time": 113.52076997049153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 583.625, + "completions/mean_terminated_length": 583.625, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.24159349501132965, + "epoch": 4.857142857142857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0012394418008625507, + "learning_rate": 1.8716320565199618e-08, + "loss": 0.0073, + "num_tokens": 18040222.0, + "reward": 0.4027777910232544, + "reward_std": 0.11368955671787262, + "rewards/itbench_correctness/mean": 0.4027777910232544, + "rewards/itbench_correctness/std": 0.17033012211322784, + "step": 918, + "step_time": 70.6566086569801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 733.75, + "completions/mean_terminated_length": 559.6000366210938, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.5805792212486267, + "epoch": 4.862433862433862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0245361328125, + "kl": 0.0014999855775386095, + "learning_rate": 1.82707922789343e-08, + "loss": 0.0, + "num_tokens": 18068698.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 919, + "step_time": 638.5085713258013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 725.3125, + "completions/mean_terminated_length": 725.3125, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "entropy": 0.6038776636123657, + "epoch": 4.867724867724868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.0026757661253213882, + "learning_rate": 1.7830532106104746e-08, + "loss": 0.0001, + "num_tokens": 18099127.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 920, + "step_time": 105.40813992917538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 492.125, + "completions/mean_terminated_length": 492.125, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "entropy": 0.4124968349933624, + "epoch": 4.8730158730158735, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.0012788712047040462, + "learning_rate": 1.7395544861325718e-08, + "loss": -0.0021, + "num_tokens": 18109953.0, + "reward": 0.20928031206130981, + "reward_std": 0.10235221683979034, + "rewards/itbench_correctness/mean": 0.20928031206130981, + "rewards/itbench_correctness/std": 0.1258237361907959, + "step": 921, + "step_time": 53.33936434518546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 424.125, + "completions/mean_terminated_length": 424.125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.38903623819351196, + "epoch": 4.878306878306878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.0015015568351373076, + "learning_rate": 1.6965835301547936e-08, + "loss": -0.0055, + "num_tokens": 18119051.0, + "reward": 0.5625, + "reward_std": 0.2177756428718567, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.25, + "step": 922, + "step_time": 53.09215545654297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 631.75, + "completions/mean_terminated_length": 631.75, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "entropy": 0.42105263471603394, + "epoch": 4.883597883597884, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4765625, + "kl": 0.0011097542010247707, + "learning_rate": 1.654140812600646e-08, + "loss": 0.0024, + "num_tokens": 18132919.0, + "reward": 0.8697916865348816, + "reward_std": 0.014731383882462978, + "rewards/itbench_correctness/mean": 0.8697916865348816, + "rewards/itbench_correctness/std": 0.1359764039516449, + "step": 923, + "step_time": 800.9228875609115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 391.125, + "completions/mean_terminated_length": 391.125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.3630552887916565, + "epoch": 4.888888888888889, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.96875, + "kl": 0.0026786988601088524, + "learning_rate": 1.612226797616878e-08, + "loss": -0.0025, + "num_tokens": 18141865.0, + "reward": 0.71875, + "reward_std": 0.0578637570142746, + "rewards/itbench_correctness/mean": 0.71875, + "rewards/itbench_correctness/std": 0.23935678601264954, + "step": 924, + "step_time": 811.8248612135649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 992.0, + "completions/mean_terminated_length": 921.6000366210938, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.44556450843811035, + "epoch": 4.894179894179894, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.796875, + "kl": 0.0014187112683430314, + "learning_rate": 1.570841943568446e-08, + "loss": -0.0001, + "num_tokens": 18173001.0, + "reward": 0.30000001192092896, + "reward_std": 0.32691311836242676, + "rewards/itbench_correctness/mean": 0.30000001192092896, + "rewards/itbench_correctness/std": 0.37372004985809326, + "step": 925, + "step_time": 140.93405285663903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 780.3125, + "completions/mean_terminated_length": 590.7777709960938, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "entropy": 0.5049259066581726, + "epoch": 4.8994708994708995, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.0018744791159406304, + "learning_rate": 1.5299867030334813e-08, + "loss": 0.0123, + "num_tokens": 18191094.0, + "reward": 0.6193181872367859, + "reward_std": 0.19284729659557343, + "rewards/itbench_correctness/mean": 0.6193181872367859, + "rewards/itbench_correctness/std": 0.4520004689693451, + "step": 926, + "step_time": 850.4311718912795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 744.375, + "completions/mean_terminated_length": 576.6000366210938, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "entropy": 0.30495381355285645, + "epoch": 4.904761904761905, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.0011643210891634226, + "learning_rate": 1.4896615227983466e-08, + "loss": 0.0384, + "num_tokens": 18211444.0, + "reward": 0.1875, + "reward_std": 0.2700308561325073, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.32702362537384033, + "step": 927, + "step_time": 924.7460502795875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 743.0625, + "completions/mean_terminated_length": 524.5555419921875, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "entropy": 0.2853057384490967, + "epoch": 4.91005291005291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026123046875, + "kl": 0.0013434689026325941, + "learning_rate": 1.4498668438527595e-08, + "loss": 0.0, + "num_tokens": 18229109.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 928, + "step_time": 146.66280045732856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 827.8125, + "completions/mean_terminated_length": 675.2222290039062, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.5242733359336853, + "epoch": 4.915343915343915, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.0013374233385547996, + "learning_rate": 1.4106031013849496e-08, + "loss": -0.0194, + "num_tokens": 18248834.0, + "reward": 0.40312498807907104, + "reward_std": 0.18545761704444885, + "rewards/itbench_correctness/mean": 0.40312498807907104, + "rewards/itbench_correctness/std": 0.21328286826610565, + "step": 929, + "step_time": 373.5206086365506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 554.125, + "completions/mean_terminated_length": 554.125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5053011775016785, + "epoch": 4.920634920634921, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.002205699449405074, + "learning_rate": 1.3718707247769134e-08, + "loss": -0.0714, + "num_tokens": 18262628.0, + "reward": 0.4375, + "reward_std": 0.22226819396018982, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.32221025228500366, + "step": 930, + "step_time": 99.51419737841934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 370.1875, + "completions/mean_terminated_length": 370.1875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.35117340087890625, + "epoch": 4.925925925925926, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.40625, + "kl": 0.00277713593095541, + "learning_rate": 1.3336701375997127e-08, + "loss": -0.0628, + "num_tokens": 18275143.0, + "reward": 0.6875, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.3095695972442627, + "step": 931, + "step_time": 64.9049273962155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 748.5, + "completions/mean_terminated_length": 473.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "entropy": 0.3607214391231537, + "epoch": 4.931216931216931, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0014514221111312509, + "learning_rate": 1.2960017576088444e-08, + "loss": -0.0523, + "num_tokens": 18299639.0, + "reward": 0.4375, + "reward_std": 0.290380597114563, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.45896419882774353, + "step": 932, + "step_time": 117.33534361980855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 515.25, + "completions/mean_terminated_length": 515.25, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "entropy": 0.3842794895172119, + "epoch": 4.936507936507937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.0029691443778574467, + "learning_rate": 1.2588659967396997e-08, + "loss": -0.0036, + "num_tokens": 18311107.0, + "reward": 0.8636363744735718, + "reward_std": 0.09819302707910538, + "rewards/itbench_correctness/mean": 0.8636363744735718, + "rewards/itbench_correctness/std": 0.1603485643863678, + "step": 933, + "step_time": 64.65582219231874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 692.875, + "completions/mean_terminated_length": 692.875, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "entropy": 0.3896806836128235, + "epoch": 4.941798941798941, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.001258630771189928, + "learning_rate": 1.2222632611029848e-08, + "loss": 0.0216, + "num_tokens": 18326641.0, + "reward": 0.545036792755127, + "reward_std": 0.15080596506595612, + "rewards/itbench_correctness/mean": 0.545036792755127, + "rewards/itbench_correctness/std": 0.3676183223724365, + "step": 934, + "step_time": 169.38152172323316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 851.6875, + "completions/mean_terminated_length": 717.6666870117188, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5119248628616333, + "epoch": 4.947089947089947, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.265625, + "kl": 0.0015083615435287356, + "learning_rate": 1.1861939509803686e-08, + "loss": 0.0377, + "num_tokens": 18350692.0, + "reward": 0.0390625, + "reward_std": 0.05725783854722977, + "rewards/itbench_correctness/mean": 0.0390625, + "rewards/itbench_correctness/std": 0.08801929652690887, + "step": 935, + "step_time": 103.84523029625416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 659.0625, + "completions/mean_terminated_length": 659.0625, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "entropy": 0.2715979218482971, + "epoch": 4.9523809523809526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.000976603594608605, + "learning_rate": 1.1506584608200364e-08, + "loss": -0.0178, + "num_tokens": 18366501.0, + "reward": 0.6177083253860474, + "reward_std": 0.17081069946289062, + "rewards/itbench_correctness/mean": 0.6177083253860474, + "rewards/itbench_correctness/std": 0.21085968613624573, + "step": 936, + "step_time": 92.50707028061152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 699.0625, + "completions/mean_terminated_length": 504.1000061035156, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4091193675994873, + "epoch": 4.957671957671957, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.21875, + "kl": 0.002065515611320734, + "learning_rate": 1.115657179232421e-08, + "loss": -0.065, + "num_tokens": 18387390.0, + "reward": 0.1875, + "reward_std": 0.10681165754795074, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.14751020073890686, + "step": 937, + "step_time": 594.838815539144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 817.4375, + "completions/mean_terminated_length": 787.9285888671875, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.4061472713947296, + "epoch": 4.962962962962963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.0015314513584598899, + "learning_rate": 1.0811904889859335e-08, + "loss": 0.0224, + "num_tokens": 18405933.0, + "reward": 0.854687511920929, + "reward_std": 0.22741259634494781, + "rewards/itbench_correctness/mean": 0.854687511920929, + "rewards/itbench_correctness/std": 0.24986976385116577, + "step": 938, + "step_time": 84.86788581125438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 857.3125, + "completions/mean_terminated_length": 801.75, + "completions/min_length": 680.0, + "completions/min_terminated_length": 680.0, + "entropy": 0.5902165174484253, + "epoch": 4.968253968253968, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5078125, + "kl": 0.0016059105983003974, + "learning_rate": 1.0472587670027678e-08, + "loss": 0.0006, + "num_tokens": 18446842.0, + "reward": 0.1875, + "reward_std": 0.2587745785713196, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 939, + "step_time": 161.18546231649816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 533.8125, + "completions/mean_terminated_length": 533.8125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.5657417178153992, + "epoch": 4.973544973544973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01708984375, + "kl": 0.0015139211900532246, + "learning_rate": 1.0138623843548078e-08, + "loss": 0.0, + "num_tokens": 18468391.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.0, + "rewards/itbench_correctness/std": 0.0, + "step": 940, + "step_time": 92.19830699265003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 734.0, + "completions/mean_terminated_length": 444.0, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "entropy": 0.3760218024253845, + "epoch": 4.978835978835979, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2109375, + "kl": 0.001119913998991251, + "learning_rate": 9.810017062595321e-09, + "loss": 0.0, + "num_tokens": 18486999.0, + "reward": 0.734375, + "reward_std": 0.19408094882965088, + "rewards/itbench_correctness/mean": 0.734375, + "rewards/itbench_correctness/std": 0.3815402090549469, + "step": 941, + "step_time": 845.4121110225096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 763.5625, + "completions/mean_terminated_length": 503.125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.5107637047767639, + "epoch": 4.984126984126984, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.002076751785352826, + "learning_rate": 9.486770920760667e-09, + "loss": -0.0707, + "num_tokens": 18506976.0, + "reward": 0.53125, + "reward_std": 0.41746097803115845, + "rewards/itbench_correctness/mean": 0.53125, + "rewards/itbench_correctness/std": 0.4989572763442993, + "step": 942, + "step_time": 94.76647205464542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 720.125, + "completions/mean_terminated_length": 582.0, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.5610136985778809, + "epoch": 4.98941798941799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.0018183528445661068, + "learning_rate": 9.168888953011989e-09, + "loss": 0.0444, + "num_tokens": 18531946.0, + "reward": 0.1979166716337204, + "reward_std": 0.2609178125858307, + "rewards/itbench_correctness/mean": 0.1979166716337204, + "rewards/itbench_correctness/std": 0.3232860863208771, + "step": 943, + "step_time": 101.69731870479882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 479.875, + "completions/mean_terminated_length": 479.875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "entropy": 0.4459494650363922, + "epoch": 4.994708994708994, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1796875, + "kl": 0.0016703108558431268, + "learning_rate": 8.856374635655695e-09, + "loss": -0.0015, + "num_tokens": 18542184.0, + "reward": 0.4583333432674408, + "reward_std": 0.1178511306643486, + "rewards/itbench_correctness/mean": 0.4583333432674408, + "rewards/itbench_correctness/std": 0.5, + "step": 944, + "step_time": 465.4684395249933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 718.6875, + "completions/mean_terminated_length": 481.22222900390625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.5816158056259155, + "epoch": 5.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.0013618582161143422, + "learning_rate": 8.54923138629815e-09, + "loss": 0.013, + "num_tokens": 18560147.0, + "reward": 0.40625, + "reward_std": 0.19776971638202667, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.306526243686676, + "step": 945, + "step_time": 143.76468984037638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 575.8125, + "completions/mean_terminated_length": 545.933349609375, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.4619559347629547, + "epoch": 5.005291005291006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.0013607589062303305, + "learning_rate": 8.247462563808816e-09, + "loss": 0.0691, + "num_tokens": 18571720.0, + "reward": 0.8020833134651184, + "reward_std": 0.4064691960811615, + "rewards/itbench_correctness/mean": 0.8020833134651184, + "rewards/itbench_correctness/std": 0.40008679032325745, + "step": 946, + "step_time": 82.60051180887967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 862.5625, + "completions/mean_terminated_length": 701.125, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 0.5077893137931824, + "epoch": 5.01058201058201, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.421875, + "kl": 0.0012988558737561107, + "learning_rate": 7.951071468283166e-09, + "loss": 0.0, + "num_tokens": 18591313.0, + "reward": 0.796875, + "reward_std": 0.13258251547813416, + "rewards/itbench_correctness/mean": 0.796875, + "rewards/itbench_correctness/std": 0.27716949582099915, + "step": 947, + "step_time": 1016.9221306946129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 920.5625, + "completions/mean_terminated_length": 787.5714721679688, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 0.4605879485607147, + "epoch": 5.015873015873016, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5390625, + "kl": 0.001742081018164754, + "learning_rate": 7.660061341006718e-09, + "loss": 0.0001, + "num_tokens": 18623898.0, + "reward": 0.25, + "reward_std": 0.15430334210395813, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.3333333432674408, + "step": 948, + "step_time": 110.10245905164629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 639.3125, + "completions/mean_terminated_length": 511.0833435058594, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4192003011703491, + "epoch": 5.021164021164021, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.443359375, + "kl": 0.0027288675773888826, + "learning_rate": 7.374435364419673e-09, + "loss": -0.0643, + "num_tokens": 18638975.0, + "reward": 0.17500001192092896, + "reward_std": 0.0707106739282608, + "rewards/itbench_correctness/mean": 0.17500001192092896, + "rewards/itbench_correctness/std": 0.20493902266025543, + "step": 949, + "step_time": 263.15858253091574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 859.5625, + "completions/mean_terminated_length": 821.6154174804688, + "completions/min_length": 680.0, + "completions/min_terminated_length": 680.0, + "entropy": 0.3839162290096283, + "epoch": 5.026455026455026, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.000997141352854669, + "learning_rate": 7.09419666208183e-09, + "loss": -0.0236, + "num_tokens": 18657616.0, + "reward": 0.6669219732284546, + "reward_std": 0.24255049228668213, + "rewards/itbench_correctness/mean": 0.6669219732284546, + "rewards/itbench_correctness/std": 0.4117698669433594, + "step": 950, + "step_time": 535.0438889786601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 944.6875, + "completions/mean_terminated_length": 883.0, + "completions/min_length": 833.0, + "completions/min_terminated_length": 833.0, + "entropy": 0.39801523089408875, + "epoch": 5.031746031746032, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0703125, + "kl": 0.001400322187691927, + "learning_rate": 6.819348298638839e-09, + "loss": 0.0019, + "num_tokens": 18678923.0, + "reward": 0.328125, + "reward_std": 0.0646936446428299, + "rewards/itbench_correctness/mean": 0.328125, + "rewards/itbench_correctness/std": 0.3502231538295746, + "step": 951, + "step_time": 90.80604922864586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 787.0625, + "completions/mean_terminated_length": 753.2142944335938, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "entropy": 0.36845865845680237, + "epoch": 5.037037037037037, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3515625, + "kl": 0.0017281656619161367, + "learning_rate": 6.549893279788277e-09, + "loss": 0.007, + "num_tokens": 18701348.0, + "reward": 0.9375, + "reward_std": 0.1157275140285492, + "rewards/itbench_correctness/mean": 0.9375, + "rewards/itbench_correctness/std": 0.17078252136707306, + "step": 952, + "step_time": 321.3709614155814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 685.25, + "completions/mean_terminated_length": 685.25, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "entropy": 0.4961692690849304, + "epoch": 5.042328042328043, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.265625, + "kl": 0.0015087542124092579, + "learning_rate": 6.2858345522471265e-09, + "loss": 0.0125, + "num_tokens": 18722424.0, + "reward": 0.34687501192092896, + "reward_std": 0.0646936446428299, + "rewards/itbench_correctness/mean": 0.34687501192092896, + "rewards/itbench_correctness/std": 0.2698571979999542, + "step": 953, + "step_time": 409.55402624513954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 433.375, + "completions/mean_terminated_length": 433.375, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.5030285716056824, + "epoch": 5.0476190476190474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.0032849370036274195, + "learning_rate": 6.0271750037193534e-09, + "loss": 0.0001, + "num_tokens": 18731814.0, + "reward": 0.25, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.25, + "rewards/itbench_correctness/std": 0.25819888710975647, + "step": 954, + "step_time": 96.037104123272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 762.25, + "completions/mean_terminated_length": 701.84619140625, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "entropy": 0.3961954712867737, + "epoch": 5.052910052910053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0015258269850164652, + "learning_rate": 5.773917462864264e-09, + "loss": 0.0162, + "num_tokens": 18749154.0, + "reward": 0.5750000476837158, + "reward_std": 0.24238379299640656, + "rewards/itbench_correctness/mean": 0.5750000476837158, + "rewards/itbench_correctness/std": 0.4028027057647705, + "step": 955, + "step_time": 67.30359940230846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 587.3125, + "completions/mean_terminated_length": 558.2000122070312, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3524529039859772, + "epoch": 5.058201058201059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.002124128630384803, + "learning_rate": 5.526064699265753e-09, + "loss": 0.0, + "num_tokens": 18767295.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 956, + "step_time": 319.00149345304817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 828.9375, + "completions/mean_terminated_length": 633.875, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "entropy": 0.35708361864089966, + "epoch": 5.063492063492063, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.0012050803052261472, + "learning_rate": 5.283619423401997e-09, + "loss": -0.0121, + "num_tokens": 18791526.0, + "reward": 0.1875, + "reward_std": 0.4082317352294922, + "rewards/itbench_correctness/mean": 0.1875, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 957, + "step_time": 935.1128796143457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 1020.0, + "completions/mean_terminated_length": 960.0, + "completions/min_length": 960.0, + "completions/min_terminated_length": 960.0, + "entropy": 0.31568628549575806, + "epoch": 5.068783068783069, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.00110468955244869, + "learning_rate": 5.046584286615696e-09, + "loss": 0.0017, + "num_tokens": 18820270.0, + "reward": 0.3437500298023224, + "reward_std": 0.23224487900733948, + "rewards/itbench_correctness/mean": 0.3437500298023224, + "rewards/itbench_correctness/std": 0.30712980031967163, + "step": 958, + "step_time": 187.4145915368572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 732.625, + "completions/mean_terminated_length": 732.625, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "entropy": 0.3330489695072174, + "epoch": 5.074074074074074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.0017922122497111559, + "learning_rate": 4.8149618810850444e-09, + "loss": 0.0026, + "num_tokens": 18846408.0, + "reward": 0.5416666865348816, + "reward_std": 0.3205420970916748, + "rewards/itbench_correctness/mean": 0.5416666865348816, + "rewards/itbench_correctness/std": 0.40138646960258484, + "step": 959, + "step_time": 115.98179497290403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 747.9375, + "completions/mean_terminated_length": 582.2999877929688, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.4786496162414551, + "epoch": 5.079365079365079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0233154296875, + "kl": 0.0014122002758085728, + "learning_rate": 4.588754739795586e-09, + "loss": 0.0, + "num_tokens": 18866879.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 960, + "step_time": 447.1240088623017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 872.0625, + "completions/mean_terminated_length": 780.9000244140625, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "entropy": 0.36006593704223633, + "epoch": 5.084656084656085, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0078125, + "kl": 0.0014327183598652482, + "learning_rate": 4.367965336512403e-09, + "loss": 0.0033, + "num_tokens": 18892360.0, + "reward": 0.5625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.5625, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 961, + "step_time": 565.5308811077848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 447.9375, + "completions/mean_terminated_length": 447.9375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "entropy": 0.4866750240325928, + "epoch": 5.08994708994709, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.0028053594287484884, + "learning_rate": 4.152596085753024e-09, + "loss": -0.0055, + "num_tokens": 18904239.0, + "reward": 0.2922794222831726, + "reward_std": 0.34673309326171875, + "rewards/itbench_correctness/mean": 0.2922794222831726, + "rewards/itbench_correctness/std": 0.35596761107444763, + "step": 962, + "step_time": 62.929508111439645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 579.75, + "completions/mean_terminated_length": 431.66668701171875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.3777490258216858, + "epoch": 5.095238095238095, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.03125, + "kl": 0.0031987950205802917, + "learning_rate": 3.9426493427611175e-09, + "loss": 0.0033, + "num_tokens": 18924067.0, + "reward": 0.1171875, + "reward_std": 0.1269381046295166, + "rewards/itbench_correctness/mean": 0.1171875, + "rewards/itbench_correctness/std": 0.2114865630865097, + "step": 963, + "step_time": 148.80320667196065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 530.5625, + "completions/mean_terminated_length": 530.5625, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "entropy": 0.5051242709159851, + "epoch": 5.1005291005291005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.0027365966234356165, + "learning_rate": 3.7381274034805066e-09, + "loss": -0.0151, + "num_tokens": 18945556.0, + "reward": 0.5562499761581421, + "reward_std": 0.3939805328845978, + "rewards/itbench_correctness/mean": 0.5562499761581421, + "rewards/itbench_correctness/std": 0.4657878875732422, + "step": 964, + "step_time": 113.64905078150332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 575.75, + "completions/mean_terminated_length": 545.86669921875, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.35084670782089233, + "epoch": 5.105820105820106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.0013689551269635558, + "learning_rate": 3.53903250453047e-09, + "loss": -0.0007, + "num_tokens": 18957760.0, + "reward": 0.6875, + "reward_std": 0.3613206446170807, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.36830443143844604, + "step": 965, + "step_time": 143.10534042678773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 667.4375, + "completions/mean_terminated_length": 453.5, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.4554733633995056, + "epoch": 5.111111111111111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.001402169349603355, + "learning_rate": 3.3453668231809283e-09, + "loss": 0.0134, + "num_tokens": 18976887.0, + "reward": 0.7916666865348816, + "reward_std": 0.3177001476287842, + "rewards/itbench_correctness/mean": 0.7916666865348816, + "rewards/itbench_correctness/std": 0.4013864994049072, + "step": 966, + "step_time": 106.49516909942031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 808.4375, + "completions/mean_terminated_length": 640.7777709960938, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 0.4626207947731018, + "epoch": 5.116402116402116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03515625, + "kl": 0.0013422613264992833, + "learning_rate": 3.1571324773286278e-09, + "loss": 0.0, + "num_tokens": 19000374.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 967, + "step_time": 106.7643784377724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 628.375, + "completions/mean_terminated_length": 448.54547119140625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "entropy": 0.4742391109466553, + "epoch": 5.121693121693122, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.001217088894918561, + "learning_rate": 2.9743315254743828e-09, + "loss": 0.0213, + "num_tokens": 19014292.0, + "reward": 0.5, + "reward_std": 0.3745020925998688, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.40824830532073975, + "step": 968, + "step_time": 76.93132317159325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 427.0625, + "completions/mean_terminated_length": 427.0625, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "entropy": 0.4425581693649292, + "epoch": 5.1269841269841265, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1015625, + "kl": 0.0016664480790495872, + "learning_rate": 2.7969659666999267e-09, + "loss": -0.0014, + "num_tokens": 19023773.0, + "reward": 0.6875, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.6875, + "rewards/itbench_correctness/std": 0.370809942483902, + "step": 969, + "step_time": 200.08597892336547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 686.0, + "completions/mean_terminated_length": 483.20001220703125, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.3600583076477051, + "epoch": 5.132275132275132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.0012360253604128957, + "learning_rate": 2.6250377406467627e-09, + "loss": -0.0297, + "num_tokens": 19050061.0, + "reward": 0.515625, + "reward_std": 0.2414703369140625, + "rewards/itbench_correctness/mean": 0.515625, + "rewards/itbench_correctness/std": 0.4784414768218994, + "step": 970, + "step_time": 540.8324056314304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 750.4375, + "completions/mean_terminated_length": 537.6666870117188, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "entropy": 0.4610643684864044, + "epoch": 5.137566137566138, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1171875, + "kl": 0.0013813948025926948, + "learning_rate": 2.458548727494292e-09, + "loss": 0.0043, + "num_tokens": 19075564.0, + "reward": 0.9921875, + "reward_std": 0.022097086533904076, + "rewards/itbench_correctness/mean": 0.9921875, + "rewards/itbench_correctness/std": 0.03125, + "step": 971, + "step_time": 217.15286646224558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 657.375, + "completions/mean_terminated_length": 657.375, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "entropy": 0.40768206119537354, + "epoch": 5.142857142857143, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.640625, + "kl": 0.0014334998559206724, + "learning_rate": 2.2975007479397733e-09, + "loss": 0.0096, + "num_tokens": 19090834.0, + "reward": 0.75, + "reward_std": 0.13363061845302582, + "rewards/itbench_correctness/mean": 0.75, + "rewards/itbench_correctness/std": 0.3162277936935425, + "step": 972, + "step_time": 432.4331463770941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 740.25, + "completions/mean_terminated_length": 699.7142944335938, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "entropy": 0.5214454531669617, + "epoch": 5.148148148148148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.0014667396899312735, + "learning_rate": 2.14189556317812e-09, + "loss": -0.0047, + "num_tokens": 19107054.0, + "reward": 0.8541666865348816, + "reward_std": 0.290380597114563, + "rewards/itbench_correctness/mean": 0.8541666865348816, + "rewards/itbench_correctness/std": 0.3435921370983124, + "step": 973, + "step_time": 205.37443487346172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 665.625, + "completions/mean_terminated_length": 665.625, + "completions/min_length": 555.0, + "completions/min_terminated_length": 555.0, + "entropy": 0.39962440729141235, + "epoch": 5.1534391534391535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.001995999598875642, + "learning_rate": 1.9917348748826334e-09, + "loss": -0.0209, + "num_tokens": 19121896.0, + "reward": 0.4375, + "reward_std": 0.13969546556472778, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.2922613024711609, + "step": 974, + "step_time": 112.39083941001445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 617.75, + "completions/mean_terminated_length": 617.75, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "entropy": 0.2751922309398651, + "epoch": 5.158730158730159, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.0011612976668402553, + "learning_rate": 1.8470203251865768e-09, + "loss": -0.0003, + "num_tokens": 19137156.0, + "reward": 0.578125, + "reward_std": 0.3319548964500427, + "rewards/itbench_correctness/mean": 0.578125, + "rewards/itbench_correctness/std": 0.3842606544494629, + "step": 975, + "step_time": 101.55415380187333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 901.0, + "completions/mean_terminated_length": 778.0, + "completions/min_length": 627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.4217536151409149, + "epoch": 5.164021164021164, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4375, + "kl": 0.0018101928289979696, + "learning_rate": 1.7077534966650765e-09, + "loss": 0.0001, + "num_tokens": 19166732.0, + "reward": 0.40416669845581055, + "reward_std": 0.2077372521162033, + "rewards/itbench_correctness/mean": 0.40416669845581055, + "rewards/itbench_correctness/std": 0.423368364572525, + "step": 976, + "step_time": 115.02839307207614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 664.25, + "completions/mean_terminated_length": 304.5, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 0.5600301027297974, + "epoch": 5.169312169312169, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.09375, + "kl": 0.002377049997448921, + "learning_rate": 1.5739359123178585e-09, + "loss": -0.0112, + "num_tokens": 19183648.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 977, + "step_time": 76.0952754272148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 735.0, + "completions/mean_terminated_length": 638.6666870117188, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.38639456033706665, + "epoch": 5.174603174603175, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.0020921749528497458, + "learning_rate": 1.4455690355525963e-09, + "loss": 0.0233, + "num_tokens": 19205040.0, + "reward": 0.5208333730697632, + "reward_std": 0.347861647605896, + "rewards/itbench_correctness/mean": 0.5208333730697632, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 978, + "step_time": 102.6859831251204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 640.625, + "completions/mean_terminated_length": 615.0667114257812, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.3012682795524597, + "epoch": 5.1798941798941796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.0016695463564246893, + "learning_rate": 1.3226542701689214e-09, + "loss": 0.0201, + "num_tokens": 19219874.0, + "reward": 0.40625, + "reward_std": 0.24511480331420898, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.23935678601264954, + "step": 979, + "step_time": 449.7205182630569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 882.25, + "completions/mean_terminated_length": 740.5, + "completions/min_length": 671.0, + "completions/min_terminated_length": 671.0, + "entropy": 0.5644658803939819, + "epoch": 5.185185185185185, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5234375, + "kl": 0.0016784444451332092, + "learning_rate": 1.2051929603428823e-09, + "loss": 0.0001, + "num_tokens": 19257654.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 980, + "step_time": 202.28774461336434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 706.5625, + "completions/mean_terminated_length": 633.3077392578125, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "entropy": 0.5095090866088867, + "epoch": 5.190476190476191, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.53125, + "kl": 0.0016119469655677676, + "learning_rate": 1.0931863906127325e-09, + "loss": -0.0168, + "num_tokens": 19297487.0, + "reward": 0.625, + "reward_std": 0.2314550280570984, + "rewards/itbench_correctness/mean": 0.625, + "rewards/itbench_correctness/std": 0.5, + "step": 981, + "step_time": 162.3620089488104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 797.4375, + "completions/mean_terminated_length": 721.9166870117188, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "entropy": 0.4715103209018707, + "epoch": 5.195767195767195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0011810938594862819, + "learning_rate": 9.866357858642205e-10, + "loss": 0.0159, + "num_tokens": 19314862.0, + "reward": 0.8062499761581421, + "reward_std": 0.2764522433280945, + "rewards/itbench_correctness/mean": 0.8062499761581421, + "rewards/itbench_correctness/std": 0.40078049898147583, + "step": 982, + "step_time": 87.88412514608353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 870.625, + "completions/mean_terminated_length": 751.3333129882812, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.45254844427108765, + "epoch": 5.201058201058201, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.001551373046822846, + "learning_rate": 8.855423113177662e-10, + "loss": 0.0001, + "num_tokens": 19335648.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 1.0, + "rewards/itbench_correctness/std": 0.0, + "step": 983, + "step_time": 624.4646268095821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 983.9375, + "completions/mean_terminated_length": 895.7999877929688, + "completions/min_length": 815.0, + "completions/min_terminated_length": 815.0, + "entropy": 0.5325541496276855, + "epoch": 5.2063492063492065, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4921875, + "kl": 0.0016566679114475846, + "learning_rate": 7.899070725153611e-10, + "loss": -0.0178, + "num_tokens": 19373543.0, + "reward": 0.4375, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.4375, + "rewards/itbench_correctness/std": 0.5123475790023804, + "step": 984, + "step_time": 265.6073463913053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 461.125, + "completions/mean_terminated_length": 461.125, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.4120357930660248, + "epoch": 5.211640211640212, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0234375, + "kl": 0.002073045587167144, + "learning_rate": 6.997311153086882e-10, + "loss": 0.0054, + "num_tokens": 19384705.0, + "reward": 0.28125, + "reward_std": 0.0294627845287323, + "rewards/itbench_correctness/mean": 0.28125, + "rewards/itbench_correctness/std": 0.145535409450531, + "step": 985, + "step_time": 46.38171513937414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 612.9375, + "completions/mean_terminated_length": 612.9375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.5286020040512085, + "epoch": 5.216931216931217, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0022879249881953, + "learning_rate": 6.150154258476314e-10, + "loss": -0.0764, + "num_tokens": 19399360.0, + "reward": 0.7708333730697632, + "reward_std": 0.25392836332321167, + "rewards/itbench_correctness/mean": 0.7708333730697632, + "rewards/itbench_correctness/std": 0.26440009474754333, + "step": 986, + "step_time": 232.74916400574148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 668.0, + "completions/mean_terminated_length": 668.0, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.5179640650749207, + "epoch": 5.222222222222222, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.28125, + "kl": 0.0015953207621350884, + "learning_rate": 5.35760930569229e-10, + "loss": 0.0063, + "num_tokens": 19417072.0, + "reward": 0.0625, + "reward_std": 0.1767766922712326, + "rewards/itbench_correctness/mean": 0.0625, + "rewards/itbench_correctness/std": 0.25, + "step": 987, + "step_time": 90.40588045120239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 559.8125, + "completions/mean_terminated_length": 559.8125, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "entropy": 0.5394663214683533, + "epoch": 5.227513227513228, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.140625, + "kl": 0.0017291223630309105, + "learning_rate": 4.619684961881254e-10, + "loss": -0.0123, + "num_tokens": 19447357.0, + "reward": 0.40625, + "reward_std": 0.1293872892856598, + "rewards/itbench_correctness/mean": 0.40625, + "rewards/itbench_correctness/std": 0.4552929699420929, + "step": 988, + "step_time": 91.7427905248478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 719.4375, + "completions/mean_terminated_length": 617.9166870117188, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.3725132346153259, + "epoch": 5.232804232804233, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0019212639890611172, + "learning_rate": 3.9363892968641287e-10, + "loss": 0.0219, + "num_tokens": 19463452.0, + "reward": 0.21875, + "reward_std": 0.1978391408920288, + "rewards/itbench_correctness/mean": 0.21875, + "rewards/itbench_correctness/std": 0.22219711542129517, + "step": 989, + "step_time": 508.81171389855444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 719.0, + "completions/mean_terminated_length": 675.4285888671875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.2767733037471771, + "epoch": 5.238095238095238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.0014671377139165998, + "learning_rate": 3.3077297830541585e-10, + "loss": 0.0246, + "num_tokens": 19480260.0, + "reward": 0.5416666865348816, + "reward_std": 0.37473249435424805, + "rewards/itbench_correctness/mean": 0.5416666865348816, + "rewards/itbench_correctness/std": 0.5, + "step": 990, + "step_time": 126.7575543159619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 516.25, + "completions/mean_terminated_length": 516.25, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.46295398473739624, + "epoch": 5.243386243386244, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0078125, + "kl": 0.0014694234123453498, + "learning_rate": 2.733713295369755e-10, + "loss": -0.0213, + "num_tokens": 19491424.0, + "reward": 0.875, + "reward_std": 0.18898223340511322, + "rewards/itbench_correctness/mean": 0.875, + "rewards/itbench_correctness/std": 0.28867512941360474, + "step": 991, + "step_time": 357.908637705259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 658.0625, + "completions/mean_terminated_length": 658.0625, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.6351979970932007, + "epoch": 5.248677248677248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0013824838679283857, + "learning_rate": 2.2143461111645556e-10, + "loss": 0.0117, + "num_tokens": 19507433.0, + "reward": 0.6458333730697632, + "reward_std": 0.2946278154850006, + "rewards/itbench_correctness/mean": 0.6458333730697632, + "rewards/itbench_correctness/std": 0.40311288833618164, + "step": 992, + "step_time": 522.8660918865353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 678.25, + "completions/mean_terminated_length": 655.2000122070312, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.4423147737979889, + "epoch": 5.253968253968254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024169921875, + "kl": 0.0016678622923791409, + "learning_rate": 1.7496339101535918e-10, + "loss": 0.0, + "num_tokens": 19545629.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/itbench_correctness/mean": 0.5, + "rewards/itbench_correctness/std": 0.5163977742195129, + "step": 993, + "step_time": 334.44221889507025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 698.9375, + "completions/mean_terminated_length": 677.2667236328125, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "entropy": 0.39774659276008606, + "epoch": 5.2592592592592595, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2265625, + "kl": 0.0019350027432665229, + "learning_rate": 1.3395817743561132e-10, + "loss": 0.0034, + "num_tokens": 19562180.0, + "reward": 0.1041666716337204, + "reward_std": 0.03857583925127983, + "rewards/itbench_correctness/mean": 0.1041666716337204, + "rewards/itbench_correctness/std": 0.11979921907186508, + "step": 994, + "step_time": 805.2065543290228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 563.8125, + "completions/mean_terminated_length": 533.1333618164062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 0.4930717349052429, + "epoch": 5.264550264550264, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.0037524921353906393, + "learning_rate": 9.841941880361914e-11, + "loss": -0.0262, + "num_tokens": 19574545.0, + "reward": 0.5104166269302368, + "reward_std": 0.32622629404067993, + "rewards/itbench_correctness/mean": 0.5104166269302368, + "rewards/itbench_correctness/std": 0.3812578022480011, + "step": 995, + "step_time": 363.2403373187408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 771.625, + "completions/mean_terminated_length": 575.3333129882812, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "entropy": 0.5106106996536255, + "epoch": 5.26984126984127, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.0013155718334019184, + "learning_rate": 6.834750376549791e-11, + "loss": -0.011, + "num_tokens": 19592107.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/itbench_correctness/mean": 0.125, + "rewards/itbench_correctness/std": 0.3415650427341461, + "step": 996, + "step_time": 159.3905362924561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 999.8125, + "completions/mean_terminated_length": 895.0, + "completions/min_length": 774.0, + "completions/min_terminated_length": 774.0, + "entropy": 0.6281177997589111, + "epoch": 5.275132275132275, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.001502976636402309, + "learning_rate": 4.3742761183018783e-11, + "loss": 0.0133, + "num_tokens": 19617248.0, + "reward": 0.21250000596046448, + "reward_std": 0.1787744164466858, + "rewards/itbench_correctness/mean": 0.21250000596046448, + "rewards/itbench_correctness/std": 0.20124614238739014, + "step": 997, + "step_time": 138.13474278803915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 651.625, + "completions/mean_terminated_length": 651.625, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "entropy": 0.40821024775505066, + "epoch": 5.28042328042328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.0013937059557065368, + "learning_rate": 2.4605460129556442e-11, + "loss": -0.0343, + "num_tokens": 19632762.0, + "reward": 0.8848214149475098, + "reward_std": 0.04293148219585419, + "rewards/itbench_correctness/mean": 0.8848214149475098, + "rewards/itbench_correctness/std": 0.10898028314113617, + "step": 998, + "step_time": 133.00229213759303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 707.75, + "completions/mean_terminated_length": 391.5, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "entropy": 0.6329918503761292, + "epoch": 5.285714285714286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.0015219503547996283, + "learning_rate": 1.0935809887702152e-11, + "loss": 0.0067, + "num_tokens": 19651310.0, + "reward": 0.5859375, + "reward_std": 0.2041938304901123, + "rewards/itbench_correctness/mean": 0.5859375, + "rewards/itbench_correctness/std": 0.4557931423187256, + "step": 999, + "step_time": 188.53237317036837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 665.75, + "completions/mean_terminated_length": 546.3333740234375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "entropy": 0.42959070205688477, + "epoch": 5.291005291005291, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.001996886683627963, + "learning_rate": 2.7339599464326622e-12, + "loss": 0.021, + "num_tokens": 19667050.0, + "reward": 0.1160714328289032, + "reward_std": 0.23404696583747864, + "rewards/itbench_correctness/mean": 0.1160714328289032, + "rewards/itbench_correctness/std": 0.25404882431030273, + "step": 1000, + "step_time": 844.8066724454984 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 19667050, + "num_train_epochs": 6, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}