{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 230.65625, "completions/mean_terminated_length": 93.80000305175781, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 2.7413549423217773, "epoch": 8e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 7605.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 1, "step_time": 20.038708471984137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 220.71875, "completions/mean_terminated_length": 114.875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 2.8548775017261505, "epoch": 0.00016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2438299999999998e-06, "loss": 0.0, "num_tokens": 14888.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 2, "step_time": 22.31387728100526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 216.3125, "completions/mean_terminated_length": 129.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 2.8777474462985992, "epoch": 0.00024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.4876599999999997e-06, "loss": 0.0, "num_tokens": 22034.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 3, "step_time": 19.508486614991853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 225.40625, "completions/mean_terminated_length": 92.83333587646484, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 2.708147943019867, "epoch": 0.00032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.73149e-06, "loss": 0.0, "num_tokens": 29471.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 4, "step_time": 20.417726718005724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 92.44444274902344, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.7702046930789948, "epoch": 0.0004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.975319999999999e-06, "loss": 0.0, "num_tokens": 36415.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 5, "step_time": 19.458035143004963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 180.46875, "completions/mean_terminated_length": 70.0769271850586, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 2.7136347889900208, "epoch": 0.00048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.2191499999999996e-06, "loss": 0.0, "num_tokens": 42410.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 6, "step_time": 22.11808781498985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 215.84375, "completions/mean_terminated_length": 113.22222137451172, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 2.875216066837311, "epoch": 0.00056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 7.46298e-06, "loss": 0.0, "num_tokens": 49537.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 7, "step_time": 21.699966289990698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 219.625, "completions/mean_terminated_length": 110.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 2.737216532230377, "epoch": 0.00064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 8.70681e-06, "loss": 0.0, "num_tokens": 56789.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 8, "step_time": 19.435475739024696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 210.9375, "completions/mean_terminated_length": 95.77777862548828, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 2.9164214432239532, "epoch": 0.00072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.950639999999999e-06, "loss": 0.0, "num_tokens": 63759.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 9, "step_time": 23.149850735993823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 245.96875, "completions/mean_terminated_length": 95.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 2.7655889093875885, "epoch": 0.0008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.1194469999999999e-05, "loss": 0.0, "num_tokens": 71850.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 10, "step_time": 21.312995460008096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 119.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.4570625126361847, "epoch": 0.00088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.2438299999999999e-05, "loss": 0.0, "num_tokens": 79170.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 11, "step_time": 18.84161558598862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 221.59375, "completions/mean_terminated_length": 98.71428680419922, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 2.7960894107818604, "epoch": 0.00096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.368213e-05, "loss": 0.0, "num_tokens": 86481.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 12, "step_time": 21.481500715010043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 175.84375, "completions/mean_terminated_length": 72.78572082519531, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 2.777267038822174, "epoch": 0.00104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.492596e-05, "loss": 0.0, "num_tokens": 92316.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 13, "step_time": 22.648648835995118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 222.03125, "completions/mean_terminated_length": 120.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 2.7303763031959534, "epoch": 0.00112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.6169789999999998e-05, "loss": 0.0, "num_tokens": 99645.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 14, "step_time": 19.483697141011362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 110.80000305175781, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 2.6232912838459015, "epoch": 0.0012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.741362e-05, "loss": 0.0, "num_tokens": 106597.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 15, "step_time": 22.048389301991847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 232.78125, "completions/mean_terminated_length": 70.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 2.666172578930855, "epoch": 0.00128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.865745e-05, "loss": 0.0, "num_tokens": 114266.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 16, "step_time": 21.76690764699015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 215.90625, "completions/mean_terminated_length": 113.44444274902344, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.6928308606147766, "epoch": 0.00136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.9901279999999997e-05, "loss": 0.0, "num_tokens": 121399.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 17, "step_time": 19.46455569099635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 183.09375, "completions/mean_terminated_length": 76.53846740722656, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.5316834151744843, "epoch": 0.00144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.114511e-05, "loss": 0.0, "num_tokens": 127478.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 18, "step_time": 22.669331256991427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 178.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 2.7606712579727173, "epoch": 0.00152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.2388939999999998e-05, "loss": 0.0, "num_tokens": 135734.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 19, "step_time": 22.320524194008613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 221.90625, "completions/mean_terminated_length": 100.14286041259766, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 2.6717658638954163, "epoch": 0.0016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.3632769999999996e-05, "loss": 0.0, "num_tokens": 143047.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 20, "step_time": 22.302162043000862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.96875, "completions/mean_terminated_length": 103.875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 2.536282777786255, "epoch": 0.00168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.4876599999999998e-05, "loss": 0.0, "num_tokens": 150246.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 21, "step_time": 19.36471923001227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 206.90625, "completions/mean_terminated_length": 98.9000015258789, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 2.541882336139679, "epoch": 0.00176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.6120429999999997e-05, "loss": 0.0, "num_tokens": 157083.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 22, "step_time": 22.757351590000326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 223.78125, "completions/mean_terminated_length": 84.16667175292969, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 2.657119005918503, "epoch": 0.00184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.736426e-05, "loss": 0.0, "num_tokens": 164464.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 23, "step_time": 22.50556095898355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 219.0625, "completions/mean_terminated_length": 108.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 2.8032337725162506, "epoch": 0.00192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8608089999999997e-05, "loss": 0.0, "num_tokens": 171698.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 24, "step_time": 19.709908160984924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 195.96875, "completions/mean_terminated_length": 63.900001525878906, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.8214994370937347, "epoch": 0.002, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.985192e-05, "loss": 0.0, "num_tokens": 178181.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 25, "step_time": 22.27659140600008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 228.3125, "completions/mean_terminated_length": 129.42857360839844, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 2.782959371805191, "epoch": 0.00208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.109575e-05, "loss": 0.0, "num_tokens": 185703.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 26, "step_time": 21.666986704993178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 150.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 2.837796986103058, "epoch": 0.00216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.2339579999999996e-05, "loss": 0.0, "num_tokens": 193275.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 27, "step_time": 19.754789013990376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 76.80000305175781, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 2.697370797395706, "epoch": 0.00224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.358341e-05, "loss": 0.0, "num_tokens": 200791.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 28, "step_time": 22.38940100500622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 81.20000457763672, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 2.6589381992816925, "epoch": 0.00232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.482724e-05, "loss": 0.0, "num_tokens": 207459.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 29, "step_time": 19.54482721599925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 195.15625, "completions/mean_terminated_length": 93.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 2.9770112335681915, "epoch": 0.0024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.607107e-05, "loss": 0.0, "num_tokens": 213928.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 30, "step_time": 19.39816167599929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 217.6875, "completions/mean_terminated_length": 153.83334350585938, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 2.709945023059845, "epoch": 0.00248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.73149e-05, "loss": 0.0, "num_tokens": 221118.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 31, "step_time": 19.455606768009602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 213.4375, "completions/mean_terminated_length": 119.80000305175781, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 2.7361134737730026, "epoch": 0.00256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.8558729999999996e-05, "loss": 0.0, "num_tokens": 228168.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 32, "step_time": 22.964359290992434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 96.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.909568816423416, "epoch": 0.00264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.9802559999999995e-05, "loss": 0.0, "num_tokens": 235304.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 33, "step_time": 19.29167694800708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 209.1875, "completions/mean_terminated_length": 89.55555725097656, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 2.879765272140503, "epoch": 0.00272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.104638999999999e-05, "loss": 0.0, "num_tokens": 242218.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 34, "step_time": 21.556990506993316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 243.1875, "completions/mean_terminated_length": 119.33333587646484, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 2.5094977021217346, "epoch": 0.0028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.229022e-05, "loss": 0.0, "num_tokens": 250216.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 35, "step_time": 21.540133035996405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 96.44444274902344, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 2.7391299456357956, "epoch": 0.00288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353405e-05, "loss": 0.0, "num_tokens": 257196.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 36, "step_time": 20.61158860699652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 188.9375, "completions/mean_terminated_length": 90.92308044433594, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.5645452737808228, "epoch": 0.00296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3534049870739164e-05, "loss": 0.0, "num_tokens": 263466.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 37, "step_time": 19.463059345995134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 243.15625, "completions/mean_terminated_length": 119.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 2.6515900790691376, "epoch": 0.00304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3534049482956666e-05, "loss": 0.0, "num_tokens": 271471.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 38, "step_time": 19.32687450600497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 233.4375, "completions/mean_terminated_length": 152.85714721679688, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 2.815293073654175, "epoch": 0.00312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353404883665252e-05, "loss": 0.0, "num_tokens": 279165.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 39, "step_time": 19.368901928013656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 227.8125, "completions/mean_terminated_length": 105.66667175292969, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 2.8266728222370148, "epoch": 0.0032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353404793182672e-05, "loss": 0.0, "num_tokens": 286679.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 40, "step_time": 19.347783612996864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 78.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 2.8580541610717773, "epoch": 0.00328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3534046768479294e-05, "loss": 0.0, "num_tokens": 293667.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 41, "step_time": 22.78021706399886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 219.9375, "completions/mean_terminated_length": 63.66666793823242, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 2.999121993780136, "epoch": 0.00336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353404534661025e-05, "loss": 0.0, "num_tokens": 300929.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 42, "step_time": 19.530366815997695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 222.59375, "completions/mean_terminated_length": 122.375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 2.805495321750641, "epoch": 0.00344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353404366621962e-05, "loss": 0.0, "num_tokens": 308276.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 43, "step_time": 19.35092342599819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 233.1875, "completions/mean_terminated_length": 134.33334350585938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 2.8153499960899353, "epoch": 0.00352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3534041727307414e-05, "loss": 0.0, "num_tokens": 315962.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 44, "step_time": 19.47554490200855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 182.71875, "completions/mean_terminated_length": 75.61538696289062, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6707731187343597, "epoch": 0.0036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3534039529873685e-05, "loss": 0.0, "num_tokens": 322029.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 45, "step_time": 22.798208642001555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 236.9375, "completions/mean_terminated_length": 134.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 2.5431984215974808, "epoch": 0.00368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3534037073918466e-05, "loss": 0.0, "num_tokens": 329835.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 46, "step_time": 19.88411584899586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 222.0, "completions/mean_terminated_length": 147.1999969482422, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 2.9005468487739563, "epoch": 0.00376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353403435944177e-05, "loss": 0.0, "num_tokens": 337163.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 47, "step_time": 19.48703931599448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 198.15625, "completions/mean_terminated_length": 101.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 2.7508918046951294, "epoch": 0.00384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353403138644366e-05, "loss": 0.0, "num_tokens": 343724.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 48, "step_time": 22.090118679989246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 226.65625, "completions/mean_terminated_length": 99.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 2.669710338115692, "epoch": 0.00392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3534028154924186e-05, "loss": 0.0, "num_tokens": 351201.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 49, "step_time": 19.470153064998158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 219.40625, "completions/mean_terminated_length": 60.833335876464844, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 2.4048453122377396, "epoch": 0.004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.35340246648834e-05, "loss": 0.0, "num_tokens": 358438.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 50, "step_time": 22.60661829500168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 222.65625, "completions/mean_terminated_length": 122.625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 2.9614007472991943, "epoch": 0.00408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3534020916321335e-05, "loss": 0.0, "num_tokens": 365783.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 51, "step_time": 22.327632517990423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 126.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 2.654112696647644, "epoch": 0.00416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3534016909238075e-05, "loss": 0.0, "num_tokens": 373419.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 52, "step_time": 19.20923549200961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 215.09375, "completions/mean_terminated_length": 92.375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 2.7122311294078827, "epoch": 0.00424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353401264363367e-05, "loss": 0.0, "num_tokens": 380526.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 53, "step_time": 19.398160734999692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 246.46875, "completions/mean_terminated_length": 154.33334350585938, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 2.6178116649389267, "epoch": 0.00432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.35340081195082e-05, "loss": 0.0, "num_tokens": 388637.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 54, "step_time": 19.338012945008813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 230.5625, "completions/mean_terminated_length": 52.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 2.716535747051239, "epoch": 0.0044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353400333686172e-05, "loss": 0.0, "num_tokens": 396239.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 55, "step_time": 20.151991571001417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 218.3125, "completions/mean_terminated_length": 135.40000915527344, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 2.8078980445861816, "epoch": 0.00448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353399829569432e-05, "loss": 0.0, "num_tokens": 403433.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 56, "step_time": 22.507306526997127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 217.375, "completions/mean_terminated_length": 118.66666412353516, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 2.833368271589279, "epoch": 0.00456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353399299600607e-05, "loss": 0.0, "num_tokens": 410613.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 57, "step_time": 19.39536341799976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 89.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 2.6436397433280945, "epoch": 0.00464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353398743779707e-05, "loss": 0.0, "num_tokens": 417697.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 58, "step_time": 19.44620426499023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 216.5625, "completions/mean_terminated_length": 98.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 2.812237471342087, "epoch": 0.00472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353398162106738e-05, "loss": 0.0, "num_tokens": 424847.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 59, "step_time": 22.402232911990723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 198.5625, "completions/mean_terminated_length": 72.20000457763672, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 2.834031730890274, "epoch": 0.0048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353397554581712e-05, "loss": 0.0, "num_tokens": 431425.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 60, "step_time": 19.969688828998187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 202.3125, "completions/mean_terminated_length": 112.83333587646484, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 2.8446905314922333, "epoch": 0.00488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3533969212046366e-05, "loss": 0.0, "num_tokens": 438123.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 61, "step_time": 19.51650980200793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 218.71875, "completions/mean_terminated_length": 85.5714340209961, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 2.6947758495807648, "epoch": 0.00496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3533962619755234e-05, "loss": 0.0, "num_tokens": 445342.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 62, "step_time": 21.935334763016726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 187.90625, "completions/mean_terminated_length": 74.41667175292969, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 2.6052669137716293, "epoch": 0.00504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353395576894381e-05, "loss": 0.0, "num_tokens": 451571.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 63, "step_time": 22.139962298009777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 219.90625, "completions/mean_terminated_length": 63.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 2.774052321910858, "epoch": 0.00512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353394865961223e-05, "loss": 0.0, "num_tokens": 458828.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 64, "step_time": 21.979154282984382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 206.4375, "completions/mean_terminated_length": 97.4000015258789, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 2.6954946517944336, "epoch": 0.0052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353394129176058e-05, "loss": 0.0, "num_tokens": 465654.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 65, "step_time": 22.214402237004833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 131.33334350585938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 2.571521297097206, "epoch": 0.00528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353393366538898e-05, "loss": 0.0, "num_tokens": 473322.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 66, "step_time": 19.304359686997486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 121.42857360839844, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 2.909770429134369, "epoch": 0.00536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353392578049757e-05, "loss": 0.0, "num_tokens": 479854.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 67, "step_time": 19.36013725100929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 140.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 2.744248181581497, "epoch": 0.00544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353391763708646e-05, "loss": 0.0, "num_tokens": 487338.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 68, "step_time": 21.965071903985518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 223.9375, "completions/mean_terminated_length": 85.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 2.6102449893951416, "epoch": 0.00552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353390923515578e-05, "loss": 0.0, "num_tokens": 494716.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 69, "step_time": 23.3430329200055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 210.8125, "completions/mean_terminated_length": 111.4000015258789, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 2.576720654964447, "epoch": 0.0056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353390057470567e-05, "loss": 0.0, "num_tokens": 501686.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 70, "step_time": 19.697308761002205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 233.6875, "completions/mean_terminated_length": 113.20000457763672, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 2.9254136979579926, "epoch": 0.00568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353389165573626e-05, "loss": 0.0, "num_tokens": 509388.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 71, "step_time": 19.443207848002203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 227.15625, "completions/mean_terminated_length": 102.16667175292969, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.8628475964069366, "epoch": 0.00576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353388247824768e-05, "loss": 0.0, "num_tokens": 516881.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 72, "step_time": 20.01639660699584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 231.375, "completions/mean_terminated_length": 124.66667175292969, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 2.296522408723831, "epoch": 0.00584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.3533873042240096e-05, "loss": 0.0, "num_tokens": 524501.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 73, "step_time": 23.112452601002587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 236.3125, "completions/mean_terminated_length": 186.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 2.8620297014713287, "epoch": 0.00592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353386334771366e-05, "loss": 0.0, "num_tokens": 532283.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 74, "step_time": 23.19768616399233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 236.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.5197259187698364, "epoch": 0.006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.353385339466851e-05, "loss": 0.0, "num_tokens": 540059.0, "reward": 0.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 0.0, "step": 75, "step_time": 19.854602511004487 } ], "logging_steps": 1.0, "max_steps": 25000, "num_input_tokens_seen": 540059, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }