{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.571428571428571, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.060131815262138844, "epoch": 0.014285714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.05771088972687721, "kl": 0.0, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 17832.0, "reward": 1.0437500476837158, "reward_std": 0.0353553369641304, "rewards/oai_reward_function/mean": 0.5218750014901161, "rewards/oai_reward_function/std": 0.043879419565200806, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06413675658404827, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.03477979078888893, "kl": 0.0003001746808877215, "learning_rate": 4.928571428571429e-05, "loss": 0.0, "num_tokens": 35712.0, "reward": 1.046875, "reward_std": 0.028149789199233055, "rewards/oai_reward_function/mean": 0.5234375, "rewards/oai_reward_function/std": 0.049161311239004135, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.052969515323638916, "epoch": 0.04285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005888506420888007, "kl": 0.0004545010087895207, "learning_rate": 4.8571428571428576e-05, "loss": 0.0, "num_tokens": 53424.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06199027318507433, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.04447643458843231, "kl": 0.0005710393161280081, "learning_rate": 4.785714285714286e-05, "loss": 0.0, "num_tokens": 71248.0, "reward": 1.2265625, "reward_std": 0.004419416189193726, "rewards/oai_reward_function/mean": 0.61328125, "rewards/oai_reward_function/std": 0.1993926614522934, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0633242828771472, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.043302807956933975, "kl": 0.001818844728404656, "learning_rate": 4.714285714285714e-05, "loss": 0.0, "num_tokens": 89000.0, "reward": 1.032812476158142, "reward_std": 0.022097092121839523, "rewards/oai_reward_function/mean": 0.5164062492549419, "rewards/oai_reward_function/std": 0.03570114076137543, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06267449539154768, "epoch": 0.08571428571428572, "frac_reward_zero_std": 0.75, "grad_norm": 0.048733897507190704, "kl": 0.0011250173120060936, "learning_rate": 4.642857142857143e-05, "loss": 0.0, "num_tokens": 106816.0, "reward": 1.071874976158142, "reward_std": 0.03390505909919739, "rewards/oai_reward_function/mean": 0.5359375029802322, "rewards/oai_reward_function/std": 0.07097747921943665, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06247459910809994, "epoch": 0.1, "frac_reward_zero_std": 0.5, "grad_norm": 0.07503627240657806, "kl": 0.0016785123152658343, "learning_rate": 4.5714285714285716e-05, "loss": 0.0, "num_tokens": 124592.0, "reward": 1.181249976158142, "reward_std": 0.06808801740407944, "rewards/oai_reward_function/mean": 0.5906250029802322, "rewards/oai_reward_function/std": 0.13951963186264038, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09173925407230854, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.75, "grad_norm": 0.046880681067705154, "kl": 0.004017388273496181, "learning_rate": 4.5e-05, "loss": 0.0, "num_tokens": 142368.0, "reward": 1.001562476158142, "reward_std": 0.004419416189193726, "rewards/oai_reward_function/mean": 0.5007812500116415, "rewards/oai_reward_function/std": 0.0044194175861775875, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07956545054912567, "epoch": 0.12857142857142856, "frac_reward_zero_std": 0.75, "grad_norm": 0.061871547251939774, "kl": 0.00639598595444113, "learning_rate": 4.428571428571428e-05, "loss": 0.0001, "num_tokens": 160160.0, "reward": 1.021875023841858, "reward_std": 0.052504248917102814, "rewards/oai_reward_function/mean": 0.5109375007450581, "rewards/oai_reward_function/std": 0.053482551127672195, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06956008821725845, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.06243785470724106, "kl": 0.00972771504893899, "learning_rate": 4.3571428571428576e-05, "loss": 0.0001, "num_tokens": 177984.0, "reward": 1.2296874523162842, "reward_std": 0.01684970036149025, "rewards/oai_reward_function/mean": 0.6148437485098839, "rewards/oai_reward_function/std": 0.1987723708152771, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07967641018331051, "epoch": 0.15714285714285714, "frac_reward_zero_std": 0.25, "grad_norm": 0.07661325484514236, "kl": 0.0069638064596802, "learning_rate": 4.2857142857142856e-05, "loss": 0.0001, "num_tokens": 195896.0, "reward": 1.1062500476837158, "reward_std": 0.06087504327297211, "rewards/oai_reward_function/mean": 0.5531250014901161, "rewards/oai_reward_function/std": 0.08584260195493698, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0841637123376131, "epoch": 0.17142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.05056445300579071, "kl": 0.011949660489335656, "learning_rate": 4.214285714285714e-05, "loss": 0.0001, "num_tokens": 213760.0, "reward": 1.131250023841858, "reward_std": 0.029124131426215172, "rewards/oai_reward_function/mean": 0.5656249970197678, "rewards/oai_reward_function/std": 0.11875531077384949, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0837175901979208, "epoch": 0.18571428571428572, "frac_reward_zero_std": 0.25, "grad_norm": 0.1177188903093338, "kl": 0.01176721346564591, "learning_rate": 4.1428571428571437e-05, "loss": 0.0001, "num_tokens": 231664.0, "reward": 1.2421875, "reward_std": 0.02758825570344925, "rewards/oai_reward_function/mean": 0.62109375, "rewards/oai_reward_function/std": 0.1868790090084076, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07715502567589283, "epoch": 0.2, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020164160523563623, "kl": 0.013234916375949979, "learning_rate": 4.0714285714285717e-05, "loss": 0.0001, "num_tokens": 249528.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0780396144837141, "epoch": 0.21428571428571427, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018555221613496542, "kl": 0.011373426881618798, "learning_rate": 4e-05, "loss": 0.0001, "num_tokens": 267168.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0733959898352623, "epoch": 0.22857142857142856, "frac_reward_zero_std": 0.5, "grad_norm": 0.09124160557985306, "kl": 0.021819928660988808, "learning_rate": 3.928571428571429e-05, "loss": 0.0002, "num_tokens": 284928.0, "reward": 1.0484375953674316, "reward_std": 0.05051835626363754, "rewards/oai_reward_function/mean": 0.5242187511175871, "rewards/oai_reward_function/std": 0.044669199734926224, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09740176424384117, "epoch": 0.24285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 0.052958983927965164, "kl": 0.028434510342776775, "learning_rate": 3.857142857142858e-05, "loss": 0.0003, "num_tokens": 302816.0, "reward": 1.071874976158142, "reward_std": 0.06469365209341049, "rewards/oai_reward_function/mean": 0.5359374992549419, "rewards/oai_reward_function/std": 0.0882028192281723, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08574963361024857, "epoch": 0.2571428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.04292497783899307, "kl": 0.033173230942338705, "learning_rate": 3.785714285714286e-05, "loss": 0.0003, "num_tokens": 320584.0, "reward": 1.001562476158142, "reward_std": 0.004419416189193726, "rewards/oai_reward_function/mean": 0.5007812500116415, "rewards/oai_reward_function/std": 0.0044194175861775875, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12318380549550056, "epoch": 0.2714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.06502827256917953, "kl": 0.03774468321353197, "learning_rate": 3.7142857142857143e-05, "loss": 0.0004, "num_tokens": 338448.0, "reward": 1.109375, "reward_std": 0.05164698138833046, "rewards/oai_reward_function/mean": 0.5546875, "rewards/oai_reward_function/std": 0.10803177952766418, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08734610676765442, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.07792048156261444, "kl": 0.028081147465854883, "learning_rate": 3.642857142857143e-05, "loss": 0.0003, "num_tokens": 356200.0, "reward": 1.03125, "reward_std": 0.047612957656383514, "rewards/oai_reward_function/mean": 0.515625, "rewards/oai_reward_function/std": 0.04151855409145355, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08050715737044811, "epoch": 0.3, "frac_reward_zero_std": 0.75, "grad_norm": 0.047350119799375534, "kl": 0.026192680466920137, "learning_rate": 3.571428571428572e-05, "loss": 0.0003, "num_tokens": 373912.0, "reward": 0.503125011920929, "reward_std": 0.008838832378387451, "rewards/oai_reward_function/mean": 0.25156250002328306, "rewards/oai_reward_function/std": 0.26283908169716597, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07859978079795837, "epoch": 0.3142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.18296176195144653, "kl": 0.035550063010305166, "learning_rate": 3.5e-05, "loss": 0.0004, "num_tokens": 391880.0, "reward": 0.2578125, "reward_std": 0.4363012909889221, "rewards/oai_reward_function/mean": 0.12890625, "rewards/oai_reward_function/std": 0.28480061888694763, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07332467474043369, "epoch": 0.32857142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.26302284002304077, "kl": 0.02304189372807741, "learning_rate": 3.428571428571429e-05, "loss": 0.0002, "num_tokens": 409592.0, "reward": 0.4375, "reward_std": 0.3335031569004059, "rewards/oai_reward_function/mean": 0.21875, "rewards/oai_reward_function/std": 0.2520080506801605, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08370361104607582, "epoch": 0.34285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.07096434384584427, "kl": 0.02303632628172636, "learning_rate": 3.357142857142857e-05, "loss": 0.0002, "num_tokens": 427504.0, "reward": 1.0906250476837158, "reward_std": 0.12288369983434677, "rewards/oai_reward_function/mean": 0.5453124977648258, "rewards/oai_reward_function/std": 0.18977738916873932, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10516241379082203, "epoch": 0.35714285714285715, "frac_reward_zero_std": 0.5, "grad_norm": 0.10253780335187912, "kl": 0.022893703542649746, "learning_rate": 3.285714285714286e-05, "loss": 0.0002, "num_tokens": 445464.0, "reward": 1.0281250476837158, "reward_std": 0.11285631358623505, "rewards/oai_reward_function/mean": 0.5140625014901161, "rewards/oai_reward_function/std": 0.13734418153762817, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10455058515071869, "epoch": 0.37142857142857144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020646003540605307, "kl": 0.013847913593053818, "learning_rate": 3.2142857142857144e-05, "loss": 0.0001, "num_tokens": 463176.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10624882206320763, "epoch": 0.38571428571428573, "frac_reward_zero_std": 0.25, "grad_norm": 0.08771698921918869, "kl": 0.023737956769764423, "learning_rate": 3.142857142857143e-05, "loss": 0.0002, "num_tokens": 480896.0, "reward": 1.1234374046325684, "reward_std": 0.12076057493686676, "rewards/oai_reward_function/mean": 0.5617187507450581, "rewards/oai_reward_function/std": 0.09692539274692535, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09867865778505802, "epoch": 0.4, "frac_reward_zero_std": 0.75, "grad_norm": 0.04759760946035385, "kl": 0.016957666259258986, "learning_rate": 3.071428571428572e-05, "loss": 0.0002, "num_tokens": 498752.0, "reward": 1.0515625476837158, "reward_std": 0.016952523961663246, "rewards/oai_reward_function/mean": 0.525781249627471, "rewards/oai_reward_function/std": 0.048144761472940445, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12629481963813305, "epoch": 0.4142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.058567993342876434, "kl": 0.017663696315139532, "learning_rate": 3e-05, "loss": 0.0002, "num_tokens": 516552.0, "reward": 1.2234375476837158, "reward_std": 0.018139135092496872, "rewards/oai_reward_function/mean": 0.6117187514901161, "rewards/oai_reward_function/std": 0.1933349370956421, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.1236942820250988, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.5, "grad_norm": 0.08224395662546158, "kl": 0.011707060737535357, "learning_rate": 2.9285714285714288e-05, "loss": 0.0001, "num_tokens": 534336.0, "reward": 1.1906249523162842, "reward_std": 0.09417471289634705, "rewards/oai_reward_function/mean": 0.5953124985098839, "rewards/oai_reward_function/std": 0.2836897447705269, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12007096596062183, "epoch": 0.44285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.12164021283388138, "kl": 0.015199759975075722, "learning_rate": 2.857142857142857e-05, "loss": 0.0002, "num_tokens": 552288.0, "reward": 1.459375023841858, "reward_std": 0.23513765633106232, "rewards/oai_reward_function/mean": 0.729687511920929, "rewards/oai_reward_function/std": 0.31374088674783707, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12509393319487572, "epoch": 0.45714285714285713, "frac_reward_zero_std": 0.5, "grad_norm": 0.09083209186792374, "kl": 0.01757637900300324, "learning_rate": 2.785714285714286e-05, "loss": 0.0002, "num_tokens": 570160.0, "reward": 1.076562523841858, "reward_std": 0.04446931555867195, "rewards/oai_reward_function/mean": 0.5382812470197678, "rewards/oai_reward_function/std": 0.07267481088638306, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12463634088635445, "epoch": 0.4714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.002431818749755621, "kl": 0.014958202606067061, "learning_rate": 2.714285714285714e-05, "loss": 0.0001, "num_tokens": 587872.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11686164513230324, "epoch": 0.4857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.05484098196029663, "kl": 0.010952673037536442, "learning_rate": 2.642857142857143e-05, "loss": 0.0001, "num_tokens": 605824.0, "reward": 1.0968749523162842, "reward_std": 0.09722718596458435, "rewards/oai_reward_function/mean": 0.5484375022351742, "rewards/oai_reward_function/std": 0.0920066386461258, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12041523866355419, "epoch": 0.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.05225397273898125, "kl": 0.006640716805122793, "learning_rate": 2.5714285714285714e-05, "loss": 0.0001, "num_tokens": 623624.0, "reward": 1.0046875476837158, "reward_std": 0.0093002924695611, "rewards/oai_reward_function/mean": 0.5023437500931323, "rewards/oai_reward_function/std": 0.009753772988915443, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12032002210617065, "epoch": 0.5142857142857142, "frac_reward_zero_std": 0.5, "grad_norm": 0.07161174714565277, "kl": 0.010428835870698094, "learning_rate": 2.5e-05, "loss": 0.0001, "num_tokens": 641400.0, "reward": 1.0437500476837158, "reward_std": 0.052891530096530914, "rewards/oai_reward_function/mean": 0.521874999627471, "rewards/oai_reward_function/std": 0.04741290956735611, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.13013662584125996, "epoch": 0.5285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.05804259702563286, "kl": 0.01170262903906405, "learning_rate": 2.4285714285714288e-05, "loss": 0.0001, "num_tokens": 659192.0, "reward": 1.0812499523162842, "reward_std": 0.07288689911365509, "rewards/oai_reward_function/mean": 0.5406250022351742, "rewards/oai_reward_function/std": 0.09954533725976944, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09727449901401997, "epoch": 0.5428571428571428, "frac_reward_zero_std": 0.25, "grad_norm": 0.07038255035877228, "kl": 0.009029814857058227, "learning_rate": 2.357142857142857e-05, "loss": 0.0001, "num_tokens": 677088.0, "reward": 1.423437476158142, "reward_std": 0.03818885609507561, "rewards/oai_reward_function/mean": 0.711718738079071, "rewards/oai_reward_function/std": 0.18094559013843536, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11586509644985199, "epoch": 0.5571428571428572, "frac_reward_zero_std": 0.25, "grad_norm": 0.08595240861177444, "kl": 0.011346436338499188, "learning_rate": 2.2857142857142858e-05, "loss": 0.0001, "num_tokens": 694920.0, "reward": 1.3796875476837158, "reward_std": 0.049540840089321136, "rewards/oai_reward_function/mean": 0.6898437440395355, "rewards/oai_reward_function/std": 0.20018735527992249, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.1129021979868412, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.03896208480000496, "kl": 0.011648714076727629, "learning_rate": 2.214285714285714e-05, "loss": 0.0001, "num_tokens": 712560.0, "reward": 1.0031249523162842, "reward_std": 0.008838832378387451, "rewards/oai_reward_function/mean": 0.5015625000232831, "rewards/oai_reward_function/std": 0.008838835172355175, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12671913765370846, "epoch": 0.5857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.060481104999780655, "kl": 0.009595283307135105, "learning_rate": 2.1428571428571428e-05, "loss": 0.0001, "num_tokens": 730352.0, "reward": 1.037500023841858, "reward_std": 0.026726119220256805, "rewards/oai_reward_function/mean": 0.5187500007450581, "rewards/oai_reward_function/std": 0.0416397787630558, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.1355144940316677, "epoch": 0.6, "frac_reward_zero_std": 0.5, "grad_norm": 0.10250900685787201, "kl": 0.010706432163715363, "learning_rate": 2.0714285714285718e-05, "loss": 0.0001, "num_tokens": 748080.0, "reward": 0.971875011920929, "reward_std": 0.16737449169158936, "rewards/oai_reward_function/mean": 0.48593750037252903, "rewards/oai_reward_function/std": 0.18062228709459305, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11447549611330032, "epoch": 0.6142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.07029257714748383, "kl": 0.011148489313200116, "learning_rate": 2e-05, "loss": 0.0001, "num_tokens": 765848.0, "reward": 1.029687523841858, "reward_std": 0.06395581364631653, "rewards/oai_reward_function/mean": 0.5148437507450581, "rewards/oai_reward_function/std": 0.05420219525694847, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12138544581830502, "epoch": 0.6285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.07489942759275436, "kl": 0.009310122113674879, "learning_rate": 1.928571428571429e-05, "loss": 0.0001, "num_tokens": 783552.0, "reward": 1.015625, "reward_std": 0.03808925300836563, "rewards/oai_reward_function/mean": 0.5078125, "rewards/oai_reward_function/std": 0.02870701625943184, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11241224221885204, "epoch": 0.6428571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.06217681244015694, "kl": 0.015002928674221039, "learning_rate": 1.8571428571428572e-05, "loss": 0.0001, "num_tokens": 801392.0, "reward": 1.171875, "reward_std": 0.12756596505641937, "rewards/oai_reward_function/mean": 0.5859375, "rewards/oai_reward_function/std": 0.13151375949382782, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10430784896016121, "epoch": 0.6571428571428571, "frac_reward_zero_std": 0.25, "grad_norm": 0.07851167023181915, "kl": 0.013715020613744855, "learning_rate": 1.785714285714286e-05, "loss": 0.0001, "num_tokens": 819120.0, "reward": 1.1765625476837158, "reward_std": 0.11721621453762054, "rewards/oai_reward_function/mean": 0.5882812440395355, "rewards/oai_reward_function/std": 0.23893966525793076, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09174064546823502, "epoch": 0.6714285714285714, "frac_reward_zero_std": 0.25, "grad_norm": 0.08239107578992844, "kl": 0.0339348167181015, "learning_rate": 1.7142857142857145e-05, "loss": 0.0003, "num_tokens": 836976.0, "reward": 1.1734375953674316, "reward_std": 0.07495103776454926, "rewards/oai_reward_function/mean": 0.5867187529802322, "rewards/oai_reward_function/std": 0.09024705737829208, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12333916500210762, "epoch": 0.6857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.06266991049051285, "kl": 0.011174799175933003, "learning_rate": 1.642857142857143e-05, "loss": 0.0001, "num_tokens": 854808.0, "reward": 1.021875023841858, "reward_std": 0.03390507400035858, "rewards/oai_reward_function/mean": 0.5109375007450581, "rewards/oai_reward_function/std": 0.03753358870744705, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11699695512652397, "epoch": 0.7, "frac_reward_zero_std": 0.5, "grad_norm": 0.06871096044778824, "kl": 0.011643779696896672, "learning_rate": 1.5714285714285715e-05, "loss": 0.0001, "num_tokens": 872616.0, "reward": 1.2109375, "reward_std": 0.020290398970246315, "rewards/oai_reward_function/mean": 0.6054687574505806, "rewards/oai_reward_function/std": 0.18247121572494507, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11293753050267696, "epoch": 0.7142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.03746737167239189, "kl": 0.008202132536098361, "learning_rate": 1.5e-05, "loss": 0.0001, "num_tokens": 890472.0, "reward": 1.0281250476837158, "reward_std": 0.008838837966322899, "rewards/oai_reward_function/mean": 0.514062499627471, "rewards/oai_reward_function/std": 0.026133574545383453, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.12108040601015091, "epoch": 0.7285714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.09108272194862366, "kl": 0.009169791359454393, "learning_rate": 1.4285714285714285e-05, "loss": 0.0001, "num_tokens": 908352.0, "reward": 1.126562476158142, "reward_std": 0.1380167454481125, "rewards/oai_reward_function/mean": 0.5632812455296516, "rewards/oai_reward_function/std": 0.1459098607301712, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10115997679531574, "epoch": 0.7428571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.06731049716472626, "kl": 0.007746399496681988, "learning_rate": 1.357142857142857e-05, "loss": 0.0001, "num_tokens": 926032.0, "reward": 1.045312523841858, "reward_std": 0.04133228585124016, "rewards/oai_reward_function/mean": 0.5226562507450581, "rewards/oai_reward_function/std": 0.04369957000017166, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.114451814442873, "epoch": 0.7571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016380356391891837, "kl": 0.008339080261066556, "learning_rate": 1.2857142857142857e-05, "loss": 0.0001, "num_tokens": 943784.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.1146883126348257, "epoch": 0.7714285714285715, "frac_reward_zero_std": 0.5, "grad_norm": 0.0652991309762001, "kl": 0.014742115745320916, "learning_rate": 1.2142857142857144e-05, "loss": 0.0001, "num_tokens": 961592.0, "reward": 1.162500023841858, "reward_std": 0.09099893271923065, "rewards/oai_reward_function/mean": 0.5812500044703484, "rewards/oai_reward_function/std": 0.11896733194589615, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0963958241045475, "epoch": 0.7857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.06369847059249878, "kl": 0.00836158636957407, "learning_rate": 1.1428571428571429e-05, "loss": 0.0001, "num_tokens": 979312.0, "reward": 1.1375000476837158, "reward_std": 0.055009134113788605, "rewards/oai_reward_function/mean": 0.5687500014901161, "rewards/oai_reward_function/std": 0.12556324899196625, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11964921839535236, "epoch": 0.8, "frac_reward_zero_std": 0.5, "grad_norm": 0.10623525083065033, "kl": 0.008312122779898345, "learning_rate": 1.0714285714285714e-05, "loss": 0.0001, "num_tokens": 997072.0, "reward": 1.0031249523162842, "reward_std": 0.11129148304462433, "rewards/oai_reward_function/mean": 0.501562500372529, "rewards/oai_reward_function/std": 0.12565238773822784, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10028179734945297, "epoch": 0.8142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.05555571988224983, "kl": 0.012380573665723205, "learning_rate": 1e-05, "loss": 0.0001, "num_tokens": 1014864.0, "reward": 1.015625, "reward_std": 0.0265165027230978, "rewards/oai_reward_function/mean": 0.5078125, "rewards/oai_reward_function/std": 0.02870701625943184, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09480222314596176, "epoch": 0.8285714285714286, "frac_reward_zero_std": 0.25, "grad_norm": 0.09733164310455322, "kl": 0.010292174993082881, "learning_rate": 9.285714285714286e-06, "loss": 0.0001, "num_tokens": 1032656.0, "reward": 1.125, "reward_std": 0.10169674456119537, "rewards/oai_reward_function/mean": 0.5624999962747097, "rewards/oai_reward_function/std": 0.10375995188951492, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10845490545034409, "epoch": 0.8428571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.07499091327190399, "kl": 0.009870404610410333, "learning_rate": 8.571428571428573e-06, "loss": 0.0001, "num_tokens": 1050472.0, "reward": 1.0187499523162842, "reward_std": 0.02493581920862198, "rewards/oai_reward_function/mean": 0.509375000372529, "rewards/oai_reward_function/std": 0.019827887415885925, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11447742953896523, "epoch": 0.8571428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.05116976425051689, "kl": 0.005832118098624051, "learning_rate": 7.857142857142858e-06, "loss": 0.0001, "num_tokens": 1068288.0, "reward": 1.0343749523162842, "reward_std": 0.029693374410271645, "rewards/oai_reward_function/mean": 0.517187500372529, "rewards/oai_reward_function/std": 0.04136652871966362, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10109574533998966, "epoch": 0.8714285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.06300117075443268, "kl": 0.007976277614943683, "learning_rate": 7.142857142857143e-06, "loss": 0.0001, "num_tokens": 1086200.0, "reward": 1.0109374523162842, "reward_std": 0.023685520514845848, "rewards/oai_reward_function/mean": 0.5054687499068677, "rewards/oai_reward_function/std": 0.01765984110534191, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10328171029686928, "epoch": 0.8857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.041226889938116074, "kl": 0.00717292504850775, "learning_rate": 6.428571428571429e-06, "loss": 0.0001, "num_tokens": 1104056.0, "reward": 1.09375, "reward_std": 0.03720119222998619, "rewards/oai_reward_function/mean": 0.546875, "rewards/oai_reward_function/std": 0.08974651247262955, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09696869738399982, "epoch": 0.9, "frac_reward_zero_std": 0.75, "grad_norm": 0.03586564213037491, "kl": 0.009956882800906897, "learning_rate": 5.7142857142857145e-06, "loss": 0.0001, "num_tokens": 1121872.0, "reward": 1.1218750476837158, "reward_std": 0.031160593032836914, "rewards/oai_reward_function/mean": 0.5609375014901161, "rewards/oai_reward_function/std": 0.11124978214502335, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10472088679671288, "epoch": 0.9142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.045453496277332306, "kl": 0.008746590930968523, "learning_rate": 5e-06, "loss": 0.0001, "num_tokens": 1139616.0, "reward": 1.0046875476837158, "reward_std": 0.00930030457675457, "rewards/oai_reward_function/mean": 0.5023437500931323, "rewards/oai_reward_function/std": 0.009753772988915443, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09775208681821823, "epoch": 0.9285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.07111279666423798, "kl": 0.007679712725803256, "learning_rate": 4.285714285714286e-06, "loss": 0.0001, "num_tokens": 1157472.0, "reward": 1.0890624523162842, "reward_std": 0.04253753647208214, "rewards/oai_reward_function/mean": 0.5445312485098839, "rewards/oai_reward_function/std": 0.08174862712621689, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10566045716404915, "epoch": 0.9428571428571428, "frac_reward_zero_std": 0.25, "grad_norm": 0.09820882230997086, "kl": 0.005833235685713589, "learning_rate": 3.5714285714285714e-06, "loss": 0.0001, "num_tokens": 1175344.0, "reward": 1.3125, "reward_std": 0.0763113722205162, "rewards/oai_reward_function/mean": 0.65625, "rewards/oai_reward_function/std": 0.199495330452919, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09616570547223091, "epoch": 0.9571428571428572, "frac_reward_zero_std": 0.5, "grad_norm": 0.060254957526922226, "kl": 0.005365552264265716, "learning_rate": 2.8571428571428573e-06, "loss": 0.0001, "num_tokens": 1193248.0, "reward": 1.0703125, "reward_std": 0.026579536497592926, "rewards/oai_reward_function/mean": 0.53515625, "rewards/oai_reward_function/std": 0.06377232819795609, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09799160063266754, "epoch": 0.9714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033068626653403044, "kl": 0.010790573665872216, "learning_rate": 2.142857142857143e-06, "loss": 0.0001, "num_tokens": 1210992.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09545023553073406, "epoch": 0.9857142857142858, "frac_reward_zero_std": 0.75, "grad_norm": 0.05336514860391617, "kl": 0.005807638866826892, "learning_rate": 1.4285714285714286e-06, "loss": 0.0001, "num_tokens": 1228816.0, "reward": 1.0125000476837158, "reward_std": 0.013363069854676723, "rewards/oai_reward_function/mean": 0.5062500000931323, "rewards/oai_reward_function/std": 0.016800537705421448, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10368440486490726, "epoch": 1.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.050165776163339615, "kl": 0.008004623581655324, "learning_rate": 7.142857142857143e-07, "loss": 0.0001, "num_tokens": 1246584.0, "reward": 1.017187476158142, "reward_std": 0.017598580569028854, "rewards/oai_reward_function/mean": 0.5085937501862645, "rewards/oai_reward_function/std": 0.022548669949173927, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09804531745612621, "epoch": 1.0142857142857142, "frac_reward_zero_std": 0.5, "grad_norm": 0.05341744422912598, "kl": 0.01710776425898075, "learning_rate": 0.0, "loss": 0.0002, "num_tokens": 1264416.0, "reward": 1.0875000953674316, "reward_std": 0.03174196928739548, "rewards/oai_reward_function/mean": 0.5437500029802322, "rewards/oai_reward_function/std": 0.0375671461224556, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09854021109640598, "epoch": 1.0285714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.06506258249282837, "kl": 0.009508747374638915, "learning_rate": 4.4928571428571434e-05, "loss": 0.0001, "num_tokens": 1282296.0, "reward": 1.0406250953674316, "reward_std": 0.0222018975764513, "rewards/oai_reward_function/mean": 0.5203125011175871, "rewards/oai_reward_function/std": 0.035603947937488556, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08583058044314384, "epoch": 1.042857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.07061073184013367, "kl": 0.005951485480181873, "learning_rate": 4.485714285714286e-05, "loss": 0.0001, "num_tokens": 1300008.0, "reward": 1.0234375, "reward_std": 0.01804211549460888, "rewards/oai_reward_function/mean": 0.51171875, "rewards/oai_reward_function/std": 0.020064787939190865, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09507345780730247, "epoch": 1.0571428571428572, "frac_reward_zero_std": 0.5, "grad_norm": 0.05930415913462639, "kl": 0.007875082548707724, "learning_rate": 4.478571428571429e-05, "loss": 0.0001, "num_tokens": 1317832.0, "reward": 1.234375, "reward_std": 0.01088879257440567, "rewards/oai_reward_function/mean": 0.6171875, "rewards/oai_reward_function/std": 0.20452910661697388, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09059166349470615, "epoch": 1.0714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.04293040931224823, "kl": 0.01093563821632415, "learning_rate": 4.471428571428571e-05, "loss": 0.0001, "num_tokens": 1335584.0, "reward": 1.0281250476837158, "reward_std": 0.008838837966322899, "rewards/oai_reward_function/mean": 0.514062499627471, "rewards/oai_reward_function/std": 0.026133574545383453, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08499786630272865, "epoch": 1.0857142857142856, "frac_reward_zero_std": 0.5, "grad_norm": 0.06890492141246796, "kl": 0.007947787176817656, "learning_rate": 4.464285714285715e-05, "loss": 0.0001, "num_tokens": 1353400.0, "reward": 1.0734374523162842, "reward_std": 0.03388907015323639, "rewards/oai_reward_function/mean": 0.5367187522351742, "rewards/oai_reward_function/std": 0.05607611685991287, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08190344646573067, "epoch": 1.1, "frac_reward_zero_std": 0.5, "grad_norm": 0.07835045456886292, "kl": 0.010214838432148099, "learning_rate": 4.4571428571428574e-05, "loss": 0.0001, "num_tokens": 1371176.0, "reward": 1.21875, "reward_std": 0.03153933212161064, "rewards/oai_reward_function/mean": 0.609375, "rewards/oai_reward_function/std": 0.12727762758731842, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09901309013366699, "epoch": 1.1142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.07858891785144806, "kl": 0.006452999892644584, "learning_rate": 4.4500000000000004e-05, "loss": 0.0001, "num_tokens": 1388952.0, "reward": 1.053125023841858, "reward_std": 0.028757737949490547, "rewards/oai_reward_function/mean": 0.5265625007450581, "rewards/oai_reward_function/std": 0.02905604988336563, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09012427926063538, "epoch": 1.1285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.08503545820713043, "kl": 0.01038876292295754, "learning_rate": 4.442857142857143e-05, "loss": 0.0001, "num_tokens": 1406744.0, "reward": 1.084375023841858, "reward_std": 0.07080081105232239, "rewards/oai_reward_function/mean": 0.5421874970197678, "rewards/oai_reward_function/std": 0.09233474731445312, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07716062478721142, "epoch": 1.1428571428571428, "frac_reward_zero_std": 0.25, "grad_norm": 0.0729447677731514, "kl": 0.012507579056546092, "learning_rate": 4.435714285714286e-05, "loss": 0.0001, "num_tokens": 1424568.0, "reward": 1.2468750476837158, "reward_std": 0.03139737993478775, "rewards/oai_reward_function/mean": 0.6234374940395355, "rewards/oai_reward_function/std": 0.19123300909996033, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09090105071663857, "epoch": 1.157142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.07926075905561447, "kl": 0.010987127898260951, "learning_rate": 4.428571428571428e-05, "loss": 0.0001, "num_tokens": 1442480.0, "reward": 1.0984375476837158, "reward_std": 0.0697232112288475, "rewards/oai_reward_function/mean": 0.5492187514901161, "rewards/oai_reward_function/std": 0.06006864085793495, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08928278088569641, "epoch": 1.1714285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 0.08457206189632416, "kl": 0.004951049922965467, "learning_rate": 4.4214285714285714e-05, "loss": 0.0, "num_tokens": 1460344.0, "reward": 1.0875000953674316, "reward_std": 0.04518735408782959, "rewards/oai_reward_function/mean": 0.5437499992549419, "rewards/oai_reward_function/std": 0.07156093418598175, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08692280948162079, "epoch": 1.1857142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.09246931225061417, "kl": 0.015749768121168017, "learning_rate": 4.4142857142857144e-05, "loss": 0.0002, "num_tokens": 1478248.0, "reward": 1.264062523841858, "reward_std": 0.03826536983251572, "rewards/oai_reward_function/mean": 0.6320312470197678, "rewards/oai_reward_function/std": 0.17668935656547546, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08908558450639248, "epoch": 1.2, "frac_reward_zero_std": 0.75, "grad_norm": 0.05387440696358681, "kl": 0.0058196637546643615, "learning_rate": 4.4071428571428575e-05, "loss": 0.0001, "num_tokens": 1496112.0, "reward": 1.0078125, "reward_std": 0.011451572179794312, "rewards/oai_reward_function/mean": 0.50390625, "rewards/oai_reward_function/std": 0.012872475199401379, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08258137106895447, "epoch": 1.2142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.05658518522977829, "kl": 0.004440092074219137, "learning_rate": 4.4000000000000006e-05, "loss": 0.0, "num_tokens": 1513752.0, "reward": 1.001562476158142, "reward_std": 0.004419416189193726, "rewards/oai_reward_function/mean": 0.5007812500116415, "rewards/oai_reward_function/std": 0.0044194175861775875, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07238267548382282, "epoch": 1.2285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.07421422004699707, "kl": 0.006456690724007785, "learning_rate": 4.392857142857143e-05, "loss": 0.0001, "num_tokens": 1531512.0, "reward": 1.048437476158142, "reward_std": 0.023024337366223335, "rewards/oai_reward_function/mean": 0.5242187492549419, "rewards/oai_reward_function/std": 0.030772563070058823, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08944158256053925, "epoch": 1.2428571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.06827304512262344, "kl": 0.006732087349519134, "learning_rate": 4.385714285714286e-05, "loss": 0.0001, "num_tokens": 1549400.0, "reward": 1.1703124046325684, "reward_std": 0.06780597567558289, "rewards/oai_reward_function/mean": 0.5851562544703484, "rewards/oai_reward_function/std": 0.15987133979797363, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08645510673522949, "epoch": 1.2571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.06966782361268997, "kl": 0.008698969963006675, "learning_rate": 4.3785714285714284e-05, "loss": 0.0001, "num_tokens": 1567168.0, "reward": 1.0187499523162842, "reward_std": 0.018725106492638588, "rewards/oai_reward_function/mean": 0.509375000372529, "rewards/oai_reward_function/std": 0.01878357119858265, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10576100833714008, "epoch": 1.2714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.04759611934423447, "kl": 0.009460748406127095, "learning_rate": 4.371428571428572e-05, "loss": 0.0001, "num_tokens": 1585032.0, "reward": 1.0812499523162842, "reward_std": 0.07165143638849258, "rewards/oai_reward_function/mean": 0.5406249985098839, "rewards/oai_reward_function/std": 0.0987318754196167, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07568562775850296, "epoch": 1.2857142857142856, "frac_reward_zero_std": 0.5, "grad_norm": 0.052769020199775696, "kl": 0.005130159552209079, "learning_rate": 4.3642857142857146e-05, "loss": 0.0001, "num_tokens": 1602784.0, "reward": 1.0562500953674316, "reward_std": 0.03836483508348465, "rewards/oai_reward_function/mean": 0.5281250011175871, "rewards/oai_reward_function/std": 0.03952847048640251, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08074977435171604, "epoch": 1.3, "frac_reward_zero_std": 0.5, "grad_norm": 0.07572436332702637, "kl": 0.00757291610352695, "learning_rate": 4.3571428571428576e-05, "loss": 0.0001, "num_tokens": 1620496.0, "reward": 1.0546875, "reward_std": 0.032445792108774185, "rewards/oai_reward_function/mean": 0.52734375, "rewards/oai_reward_function/std": 0.03321446478366852, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07088322378695011, "epoch": 1.3142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.07621358335018158, "kl": 0.005998459528200328, "learning_rate": 4.35e-05, "loss": 0.0001, "num_tokens": 1638464.0, "reward": 1.2015624046325684, "reward_std": 0.11482575535774231, "rewards/oai_reward_function/mean": 0.6007812544703484, "rewards/oai_reward_function/std": 0.26282399147748947, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06381132267415524, "epoch": 1.3285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 0.030587607994675636, "kl": 0.007369687547907233, "learning_rate": 4.342857142857143e-05, "loss": 0.0001, "num_tokens": 1656176.0, "reward": 1.0031249523162842, "reward_std": 0.008838832378387451, "rewards/oai_reward_function/mean": 0.5015625000232831, "rewards/oai_reward_function/std": 0.008838835172355175, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07285293377935886, "epoch": 1.342857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.0500265508890152, "kl": 0.006194314104504883, "learning_rate": 4.3357142857142855e-05, "loss": 0.0001, "num_tokens": 1674088.0, "reward": 1.109375, "reward_std": 0.04590248316526413, "rewards/oai_reward_function/mean": 0.5546875037252903, "rewards/oai_reward_function/std": 0.07967613637447357, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.086557412520051, "epoch": 1.3571428571428572, "frac_reward_zero_std": 0.5, "grad_norm": 0.07414961606264114, "kl": 0.010996793280355632, "learning_rate": 4.328571428571429e-05, "loss": 0.0001, "num_tokens": 1692048.0, "reward": 1.0671875476837158, "reward_std": 0.03708447515964508, "rewards/oai_reward_function/mean": 0.5335937514901161, "rewards/oai_reward_function/std": 0.04561823233962059, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08167718537151814, "epoch": 1.3714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010896283201873302, "kl": 0.004855156294070184, "learning_rate": 4.3214285714285716e-05, "loss": 0.0, "num_tokens": 1709760.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07798840664327145, "epoch": 1.3857142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.09227096289396286, "kl": 0.014819784788414836, "learning_rate": 4.314285714285715e-05, "loss": 0.0001, "num_tokens": 1727480.0, "reward": 1.2296874523162842, "reward_std": 0.1029118224978447, "rewards/oai_reward_function/mean": 0.6148437485098839, "rewards/oai_reward_function/std": 0.12918156385421753, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07132465578615665, "epoch": 1.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.07690515369176865, "kl": 0.0082227170933038, "learning_rate": 4.307142857142857e-05, "loss": 0.0001, "num_tokens": 1745336.0, "reward": 1.037500023841858, "reward_std": 0.023145508021116257, "rewards/oai_reward_function/mean": 0.5187500007450581, "rewards/oai_reward_function/std": 0.030453559011220932, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08682013675570488, "epoch": 1.4142857142857144, "frac_reward_zero_std": 0.25, "grad_norm": 0.09899340569972992, "kl": 0.007965923519805074, "learning_rate": 4.3e-05, "loss": 0.0001, "num_tokens": 1763136.0, "reward": 1.1531250476837158, "reward_std": 0.13869836926460266, "rewards/oai_reward_function/mean": 0.5765625014901161, "rewards/oai_reward_function/std": 0.2821534648537636, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08919607102870941, "epoch": 1.4285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.080818772315979, "kl": 0.007981272647157311, "learning_rate": 4.292857142857143e-05, "loss": 0.0001, "num_tokens": 1780920.0, "reward": 1.2093749046325684, "reward_std": 0.020411580801010132, "rewards/oai_reward_function/mean": 0.6046875044703484, "rewards/oai_reward_function/std": 0.18110741674900055, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07852962799370289, "epoch": 1.4428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.09296028316020966, "kl": 0.012157463701441884, "learning_rate": 4.2857142857142856e-05, "loss": 0.0001, "num_tokens": 1798872.0, "reward": 1.5265624523162842, "reward_std": 0.04206090793013573, "rewards/oai_reward_function/mean": 0.7632812559604645, "rewards/oai_reward_function/std": 0.22771519422531128, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.086120730265975, "epoch": 1.457142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.0995735228061676, "kl": 0.012111627496778965, "learning_rate": 4.278571428571429e-05, "loss": 0.0001, "num_tokens": 1816744.0, "reward": 1.0968749523162842, "reward_std": 0.049927353858947754, "rewards/oai_reward_function/mean": 0.5484374985098839, "rewards/oai_reward_function/std": 0.049974795430898666, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07998536713421345, "epoch": 1.4714285714285715, "frac_reward_zero_std": 0.5, "grad_norm": 0.06524667888879776, "kl": 0.006055153091438115, "learning_rate": 4.271428571428572e-05, "loss": 0.0001, "num_tokens": 1834456.0, "reward": 1.0125000476837158, "reward_std": 0.02314549870789051, "rewards/oai_reward_function/mean": 0.5062500000931323, "rewards/oai_reward_function/std": 0.016800537705421448, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07001950591802597, "epoch": 1.4857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.11485958099365234, "kl": 0.00914135156199336, "learning_rate": 4.264285714285715e-05, "loss": 0.0001, "num_tokens": 1852408.0, "reward": 1.1859374046325684, "reward_std": 0.08434940874576569, "rewards/oai_reward_function/mean": 0.5929687544703484, "rewards/oai_reward_function/std": 0.09821683913469315, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07728070393204689, "epoch": 1.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.07163766771554947, "kl": 0.006072127376683056, "learning_rate": 4.257142857142857e-05, "loss": 0.0001, "num_tokens": 1870208.0, "reward": 1.0125000476837158, "reward_std": 0.02314549870789051, "rewards/oai_reward_function/mean": 0.5062500000931323, "rewards/oai_reward_function/std": 0.016800537705421448, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08027334697544575, "epoch": 1.5142857142857142, "frac_reward_zero_std": 0.5, "grad_norm": 0.07313752919435501, "kl": 0.011275349417701364, "learning_rate": 4.25e-05, "loss": 0.0001, "num_tokens": 1887984.0, "reward": 1.0593750476837158, "reward_std": 0.022558562457561493, "rewards/oai_reward_function/mean": 0.529687499627471, "rewards/oai_reward_function/std": 0.03386256843805313, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08169634826481342, "epoch": 1.5285714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.09352786093950272, "kl": 0.014267339138314128, "learning_rate": 4.242857142857143e-05, "loss": 0.0001, "num_tokens": 1905776.0, "reward": 1.115625023841858, "reward_std": 0.055196452885866165, "rewards/oai_reward_function/mean": 0.5578125044703484, "rewards/oai_reward_function/std": 0.09427942335605621, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05949794687330723, "epoch": 1.5428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.08965466171503067, "kl": 0.014173903269693255, "learning_rate": 4.2357142857142864e-05, "loss": 0.0001, "num_tokens": 1923672.0, "reward": 1.357812523841858, "reward_std": 0.07401138544082642, "rewards/oai_reward_function/mean": 0.6789062470197678, "rewards/oai_reward_function/std": 0.1690949946641922, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08150264620780945, "epoch": 1.5571428571428572, "frac_reward_zero_std": 0.25, "grad_norm": 0.08684907853603363, "kl": 0.015842870343476534, "learning_rate": 4.228571428571429e-05, "loss": 0.0002, "num_tokens": 1941504.0, "reward": 1.334375023841858, "reward_std": 0.03491953760385513, "rewards/oai_reward_function/mean": 0.6671874970197678, "rewards/oai_reward_function/std": 0.18364998698234558, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06982677057385445, "epoch": 1.5714285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.06661536544561386, "kl": 0.008391729556024075, "learning_rate": 4.221428571428572e-05, "loss": 0.0001, "num_tokens": 1959144.0, "reward": 1.03125, "reward_std": 0.019731827080249786, "rewards/oai_reward_function/mean": 0.515625, "rewards/oai_reward_function/std": 0.025988519191741943, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09021224454045296, "epoch": 1.5857142857142859, "frac_reward_zero_std": 0.75, "grad_norm": 0.05824963003396988, "kl": 0.008994318312034011, "learning_rate": 4.214285714285714e-05, "loss": 0.0001, "num_tokens": 1976936.0, "reward": 1.037500023841858, "reward_std": 0.013363069854676723, "rewards/oai_reward_function/mean": 0.5187500007450581, "rewards/oai_reward_function/std": 0.0353553369641304, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09183148294687271, "epoch": 1.6, "frac_reward_zero_std": 0.5, "grad_norm": 0.08340641111135483, "kl": 0.010920959059149027, "learning_rate": 4.2071428571428574e-05, "loss": 0.0001, "num_tokens": 1994664.0, "reward": 1.0390625, "reward_std": 0.027564914897084236, "rewards/oai_reward_function/mean": 0.51953125, "rewards/oai_reward_function/std": 0.0395205020904541, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08369805663824081, "epoch": 1.6142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.09711972624063492, "kl": 0.009947408339940012, "learning_rate": 4.2e-05, "loss": 0.0001, "num_tokens": 2012432.0, "reward": 1.0703125, "reward_std": 0.02308514341711998, "rewards/oai_reward_function/mean": 0.53515625, "rewards/oai_reward_function/std": 0.04438621550798416, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.096822340041399, "epoch": 1.6285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.07382018864154816, "kl": 0.017859197221696377, "learning_rate": 4.192857142857143e-05, "loss": 0.0002, "num_tokens": 2030136.0, "reward": 1.0359375476837158, "reward_std": 0.035533398389816284, "rewards/oai_reward_function/mean": 0.517968749627471, "rewards/oai_reward_function/std": 0.03252441808581352, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08726120926439762, "epoch": 1.6428571428571428, "frac_reward_zero_std": 0.5, "grad_norm": 0.0782691165804863, "kl": 0.012930417666211724, "learning_rate": 4.185714285714286e-05, "loss": 0.0001, "num_tokens": 2047976.0, "reward": 1.1749999523162842, "reward_std": 0.08762745559215546, "rewards/oai_reward_function/mean": 0.5874999985098839, "rewards/oai_reward_function/std": 0.1177750751376152, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07392177730798721, "epoch": 1.657142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.07505157589912415, "kl": 0.012273511849343777, "learning_rate": 4.178571428571429e-05, "loss": 0.0001, "num_tokens": 2065704.0, "reward": 1.1953125, "reward_std": 0.06742400676012039, "rewards/oai_reward_function/mean": 0.59765625, "rewards/oai_reward_function/std": 0.13019207119941711, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.065590625628829, "epoch": 1.6714285714285713, "frac_reward_zero_std": 0.25, "grad_norm": 0.08693981915712357, "kl": 0.02323699276894331, "learning_rate": 4.1714285714285714e-05, "loss": 0.0002, "num_tokens": 2083560.0, "reward": 1.1328125, "reward_std": 0.04593653976917267, "rewards/oai_reward_function/mean": 0.56640625, "rewards/oai_reward_function/std": 0.04902656376361847, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09886737167835236, "epoch": 1.6857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.05793704837560654, "kl": 0.015232619596645236, "learning_rate": 4.1642857142857144e-05, "loss": 0.0002, "num_tokens": 2101392.0, "reward": 1.03125, "reward_std": 0.011572758667171001, "rewards/oai_reward_function/mean": 0.515625, "rewards/oai_reward_function/std": 0.029614457860589027, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09572554007172585, "epoch": 1.7, "frac_reward_zero_std": 0.75, "grad_norm": 0.03639426827430725, "kl": 0.017274728044867516, "learning_rate": 4.1571428571428575e-05, "loss": 0.0002, "num_tokens": 2119200.0, "reward": 1.2000000476837158, "reward_std": 0.018898215144872665, "rewards/oai_reward_function/mean": 0.5999999940395355, "rewards/oai_reward_function/std": 0.17689070105552673, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08603023178875446, "epoch": 1.7142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.10388008505105972, "kl": 0.022097071167081594, "learning_rate": 4.15e-05, "loss": 0.0002, "num_tokens": 2137056.0, "reward": 1.0578125715255737, "reward_std": 0.04522190988063812, "rewards/oai_reward_function/mean": 0.528906250372529, "rewards/oai_reward_function/std": 0.03971134498715401, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0950616579502821, "epoch": 1.7285714285714286, "frac_reward_zero_std": 0.25, "grad_norm": 0.09005559235811234, "kl": 0.015552334254607558, "learning_rate": 4.1428571428571437e-05, "loss": 0.0002, "num_tokens": 2154936.0, "reward": 1.268125057220459, "reward_std": 0.0341712087392807, "rewards/oai_reward_function/mean": 0.6340624988079071, "rewards/oai_reward_function/std": 0.2080029398202896, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07687668316066265, "epoch": 1.7428571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.08279760181903839, "kl": 0.009949938859790564, "learning_rate": 4.135714285714286e-05, "loss": 0.0001, "num_tokens": 2172616.0, "reward": 1.0625, "reward_std": 0.04204372316598892, "rewards/oai_reward_function/mean": 0.53125, "rewards/oai_reward_function/std": 0.04353345185518265, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08731912076473236, "epoch": 1.7571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.06586393713951111, "kl": 0.008837034576572478, "learning_rate": 4.128571428571429e-05, "loss": 0.0001, "num_tokens": 2190368.0, "reward": 1.0187499523162842, "reward_std": 0.021777570247650146, "rewards/oai_reward_function/mean": 0.509375000372529, "rewards/oai_reward_function/std": 0.019827887415885925, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08952882327139378, "epoch": 1.7714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.09899036586284637, "kl": 0.013539569219574332, "learning_rate": 4.1214285714285715e-05, "loss": 0.0001, "num_tokens": 2208176.0, "reward": 1.1140625476837158, "reward_std": 0.06264616549015045, "rewards/oai_reward_function/mean": 0.5570312514901161, "rewards/oai_reward_function/std": 0.04455622285604477, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07565303146839142, "epoch": 1.7857142857142856, "frac_reward_zero_std": 0.25, "grad_norm": 0.08115291595458984, "kl": 0.009995393920689821, "learning_rate": 4.1142857142857146e-05, "loss": 0.0001, "num_tokens": 2225896.0, "reward": 1.1749999523162842, "reward_std": 0.06661029160022736, "rewards/oai_reward_function/mean": 0.5874999985098839, "rewards/oai_reward_function/std": 0.1399884670972824, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09428555145859718, "epoch": 1.8, "frac_reward_zero_std": 0.75, "grad_norm": 0.06345849484205246, "kl": 0.01248577213846147, "learning_rate": 4.107142857142857e-05, "loss": 0.0001, "num_tokens": 2243656.0, "reward": 1.0390625, "reward_std": 0.02052600309252739, "rewards/oai_reward_function/mean": 0.51953125, "rewards/oai_reward_function/std": 0.0395205020904541, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07595096342265606, "epoch": 1.8142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.09258188307285309, "kl": 0.00919699075166136, "learning_rate": 4.1e-05, "loss": 0.0001, "num_tokens": 2261448.0, "reward": 1.0343749523162842, "reward_std": 0.03808924928307533, "rewards/oai_reward_function/mean": 0.517187500372529, "rewards/oai_reward_function/std": 0.029400940984487534, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08019419759511948, "epoch": 1.8285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.13585439324378967, "kl": 0.019885767716914415, "learning_rate": 4.092857142857143e-05, "loss": 0.0002, "num_tokens": 2279240.0, "reward": 1.3046875, "reward_std": 0.1113169863820076, "rewards/oai_reward_function/mean": 0.65234375, "rewards/oai_reward_function/std": 0.17964564263820648, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09248529188334942, "epoch": 1.842857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.08260349929332733, "kl": 0.013745760545134544, "learning_rate": 4.085714285714286e-05, "loss": 0.0001, "num_tokens": 2297056.0, "reward": 1.0734375715255737, "reward_std": 0.034589797258377075, "rewards/oai_reward_function/mean": 0.5367187522351742, "rewards/oai_reward_function/std": 0.03359169885516167, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09751161187887192, "epoch": 1.8571428571428572, "frac_reward_zero_std": 0.75, "grad_norm": 0.06302473694086075, "kl": 0.011824949877336621, "learning_rate": 4.0785714285714286e-05, "loss": 0.0001, "num_tokens": 2314872.0, "reward": 1.0203125476837158, "reward_std": 0.013258256018161774, "rewards/oai_reward_function/mean": 0.510156249627471, "rewards/oai_reward_function/std": 0.021867798641324043, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09040896967053413, "epoch": 1.8714285714285714, "frac_reward_zero_std": 0.25, "grad_norm": 0.09831514209508896, "kl": 0.01766400644555688, "learning_rate": 4.0714285714285717e-05, "loss": 0.0002, "num_tokens": 2332784.0, "reward": 1.0187499523162842, "reward_std": 0.028380058705806732, "rewards/oai_reward_function/mean": 0.509375000372529, "rewards/oai_reward_function/std": 0.01878357119858265, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09301545284688473, "epoch": 1.8857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.08136897534132004, "kl": 0.02844544965773821, "learning_rate": 4.064285714285714e-05, "loss": 0.0003, "num_tokens": 2350640.0, "reward": 1.060937523841858, "reward_std": 0.03093591332435608, "rewards/oai_reward_function/mean": 0.5304687507450581, "rewards/oai_reward_function/std": 0.054895199835300446, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08573882840573788, "epoch": 1.9, "frac_reward_zero_std": 0.5, "grad_norm": 0.0735137015581131, "kl": 0.023084456101059914, "learning_rate": 4.057142857142857e-05, "loss": 0.0002, "num_tokens": 2368456.0, "reward": 1.0734374523162842, "reward_std": 0.02894335612654686, "rewards/oai_reward_function/mean": 0.5367187522351742, "rewards/oai_reward_function/std": 0.05461905151605606, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10109273716807365, "epoch": 1.9142857142857141, "frac_reward_zero_std": 0.25, "grad_norm": 0.10477015376091003, "kl": 0.03489594021812081, "learning_rate": 4.05e-05, "loss": 0.0003, "num_tokens": 2386200.0, "reward": 1.0046875476837158, "reward_std": 0.11875393241643906, "rewards/oai_reward_function/mean": 0.5023437514901161, "rewards/oai_reward_function/std": 0.11234594881534576, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08120713755488396, "epoch": 1.9285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.10714246332645416, "kl": 0.024183190893381834, "learning_rate": 4.042857142857143e-05, "loss": 0.0002, "num_tokens": 2404056.0, "reward": 1.09375, "reward_std": 0.05726175755262375, "rewards/oai_reward_function/mean": 0.546875, "rewards/oai_reward_function/std": 0.05982164293527603, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.089906245470047, "epoch": 1.9428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.12431693077087402, "kl": 0.038056795950978994, "learning_rate": 4.035714285714286e-05, "loss": 0.0004, "num_tokens": 2421928.0, "reward": 1.3406250476837158, "reward_std": 0.05260005593299866, "rewards/oai_reward_function/mean": 0.6703125089406967, "rewards/oai_reward_function/std": 0.19317355751991272, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09174446761608124, "epoch": 1.9571428571428573, "frac_reward_zero_std": 0.5, "grad_norm": 0.07566652446985245, "kl": 0.023516141809523106, "learning_rate": 4.028571428571429e-05, "loss": 0.0002, "num_tokens": 2439832.0, "reward": 1.0640625953674316, "reward_std": 0.026437407359480858, "rewards/oai_reward_function/mean": 0.5320312529802322, "rewards/oai_reward_function/std": 0.04027845337986946, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0811004675924778, "epoch": 1.9714285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 0.1272999793291092, "kl": 0.035705497954040766, "learning_rate": 4.021428571428572e-05, "loss": 0.0004, "num_tokens": 2457576.0, "reward": 1.0421874523162842, "reward_std": 0.04250866919755936, "rewards/oai_reward_function/mean": 0.521093750372529, "rewards/oai_reward_function/std": 0.03971134498715401, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08940452709794044, "epoch": 1.9857142857142858, "frac_reward_zero_std": 0.25, "grad_norm": 0.10179316252470016, "kl": 0.03542056027799845, "learning_rate": 4.014285714285714e-05, "loss": 0.0004, "num_tokens": 2475400.0, "reward": 1.0484375953674316, "reward_std": 0.03541836887598038, "rewards/oai_reward_function/mean": 0.5242187511175871, "rewards/oai_reward_function/std": 0.04375720024108887, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08982641063630581, "epoch": 2.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.05322287976741791, "kl": 0.024960508104413748, "learning_rate": 4.007142857142857e-05, "loss": 0.0002, "num_tokens": 2493168.0, "reward": 1.0421874523162842, "reward_std": 0.024032622575759888, "rewards/oai_reward_function/mean": 0.521093750372529, "rewards/oai_reward_function/std": 0.04358407482504845, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07347713969647884, "epoch": 2.0142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.054996706545352936, "kl": 0.029410825110971928, "learning_rate": 4e-05, "loss": 0.0003, "num_tokens": 2510968.0, "reward": 1.2937500476837158, "reward_std": 0.006681524682790041, "rewards/oai_reward_function/mean": 0.6468750089406967, "rewards/oai_reward_function/std": 0.2041652947664261, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11200576089322567, "epoch": 2.0285714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.07548272609710693, "kl": 0.03502320311963558, "learning_rate": 3.9928571428571434e-05, "loss": 0.0004, "num_tokens": 2528744.0, "reward": 1.095312476158142, "reward_std": 0.0437462255358696, "rewards/oai_reward_function/mean": 0.5476562529802322, "rewards/oai_reward_function/std": 0.05692360922694206, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09486313536763191, "epoch": 2.0428571428571427, "frac_reward_zero_std": 0.75, "grad_norm": 0.05399833247065544, "kl": 0.03851825185120106, "learning_rate": 3.985714285714286e-05, "loss": 0.0004, "num_tokens": 2546488.0, "reward": 1.0125000476837158, "reward_std": 0.01336306519806385, "rewards/oai_reward_function/mean": 0.5062500000931323, "rewards/oai_reward_function/std": 0.016800537705421448, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08246604166924953, "epoch": 2.057142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.042957283556461334, "kl": 0.03783240728080273, "learning_rate": 3.978571428571429e-05, "loss": 0.0004, "num_tokens": 2564176.0, "reward": 1.0234375, "reward_std": 0.01695253700017929, "rewards/oai_reward_function/mean": 0.51171875, "rewards/oai_reward_function/std": 0.026169713586568832, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10432570241391659, "epoch": 2.0714285714285716, "frac_reward_zero_std": 0.25, "grad_norm": 0.09850599616765976, "kl": 0.037014870904386044, "learning_rate": 3.971428571428571e-05, "loss": 0.0004, "num_tokens": 2581944.0, "reward": 1.0250000953674316, "reward_std": 0.15622428059577942, "rewards/oai_reward_function/mean": 0.5124999992549419, "rewards/oai_reward_function/std": 0.17416272684931755, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09751013852655888, "epoch": 2.085714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.060189735144376755, "kl": 0.050427704118192196, "learning_rate": 3.964285714285714e-05, "loss": 0.0005, "num_tokens": 2599616.0, "reward": 1.0265624523162842, "reward_std": 0.008010865189135075, "rewards/oai_reward_function/mean": 0.513281250372529, "rewards/oai_reward_function/std": 0.024580655619502068, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.1012180857360363, "epoch": 2.1, "frac_reward_zero_std": 0.25, "grad_norm": 0.09098206460475922, "kl": 0.05104807484894991, "learning_rate": 3.9571428571428574e-05, "loss": 0.0005, "num_tokens": 2617576.0, "reward": 1.2890625, "reward_std": 0.033694587647914886, "rewards/oai_reward_function/mean": 0.6445312350988388, "rewards/oai_reward_function/std": 0.1918431520462036, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08674592711031437, "epoch": 2.1142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.0632624626159668, "kl": 0.035351223312318325, "learning_rate": 3.9500000000000005e-05, "loss": 0.0004, "num_tokens": 2635248.0, "reward": 1.0265624523162842, "reward_std": 0.01813914254307747, "rewards/oai_reward_function/mean": 0.513281250372529, "rewards/oai_reward_function/std": 0.021982740610837936, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11086461879312992, "epoch": 2.1285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.13065817952156067, "kl": 0.06040171813219786, "learning_rate": 3.942857142857143e-05, "loss": 0.0006, "num_tokens": 2653096.0, "reward": 1.037500023841858, "reward_std": 0.14793866872787476, "rewards/oai_reward_function/mean": 0.5187500044703484, "rewards/oai_reward_function/std": 0.12740343809127808, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.11983237601816654, "epoch": 2.142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.11544425040483475, "kl": 0.06588536128401756, "learning_rate": 3.935714285714286e-05, "loss": 0.0007, "num_tokens": 2670944.0, "reward": 1.0812499523162842, "reward_std": 0.035140641033649445, "rewards/oai_reward_function/mean": 0.5406249985098839, "rewards/oai_reward_function/std": 0.023546453565359116, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08963452652096748, "epoch": 2.157142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.06575662642717361, "kl": 0.05113219376653433, "learning_rate": 3.928571428571429e-05, "loss": 0.0005, "num_tokens": 2688680.0, "reward": 1.154687523841858, "reward_std": 0.03592789173126221, "rewards/oai_reward_function/mean": 0.5773437470197678, "rewards/oai_reward_function/std": 0.13520585000514984, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10959535092115402, "epoch": 2.1714285714285713, "frac_reward_zero_std": 0.25, "grad_norm": 0.09100169688463211, "kl": 0.04979555029422045, "learning_rate": 3.9214285714285714e-05, "loss": 0.0005, "num_tokens": 2706528.0, "reward": 1.3046875, "reward_std": 0.032156482338905334, "rewards/oai_reward_function/mean": 0.6523437350988388, "rewards/oai_reward_function/std": 0.1942063421010971, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10072515532374382, "epoch": 2.185714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.11344427615404129, "kl": 0.08104220405220985, "learning_rate": 3.9142857142857145e-05, "loss": 0.0008, "num_tokens": 2724424.0, "reward": 1.3984375, "reward_std": 0.06430189311504364, "rewards/oai_reward_function/mean": 0.69921875, "rewards/oai_reward_function/std": 0.19055142998695374, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.13380656391382217, "epoch": 2.2, "frac_reward_zero_std": 0.5, "grad_norm": 0.10259576886892319, "kl": 0.047852903604507446, "learning_rate": 3.9071428571428575e-05, "loss": 0.0005, "num_tokens": 2742272.0, "reward": 1.0578124523162842, "reward_std": 0.026579542085528374, "rewards/oai_reward_function/mean": 0.5289062485098839, "rewards/oai_reward_function/std": 0.05354730039834976, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.1082126721739769, "epoch": 2.2142857142857144, "frac_reward_zero_std": 0.25, "grad_norm": 0.10249610245227814, "kl": 0.07078076247125864, "learning_rate": 3.9000000000000006e-05, "loss": 0.0007, "num_tokens": 2760088.0, "reward": 1.0593750476837158, "reward_std": 0.036339618265628815, "rewards/oai_reward_function/mean": 0.5296875014901161, "rewards/oai_reward_function/std": 0.04327215999364853, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10915260016918182, "epoch": 2.2285714285714286, "frac_reward_zero_std": 0.25, "grad_norm": 0.08330399543046951, "kl": 0.07353132590651512, "learning_rate": 3.892857142857143e-05, "loss": 0.0007, "num_tokens": 2777936.0, "reward": 1.25, "reward_std": 0.046066030859947205, "rewards/oai_reward_function/mean": 0.625, "rewards/oai_reward_function/std": 0.1287345290184021, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08624540269374847, "epoch": 2.242857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.0943220779299736, "kl": 0.06203949544578791, "learning_rate": 3.885714285714286e-05, "loss": 0.0006, "num_tokens": 2795664.0, "reward": 1.024999976158142, "reward_std": 0.023145508021116257, "rewards/oai_reward_function/mean": 0.5125000001862645, "rewards/oai_reward_function/std": 0.02199706807732582, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09054199792444706, "epoch": 2.257142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.07400333881378174, "kl": 0.04232563078403473, "learning_rate": 3.8785714285714285e-05, "loss": 0.0004, "num_tokens": 2813352.0, "reward": 1.0499999523162842, "reward_std": 0.0258774571120739, "rewards/oai_reward_function/mean": 0.5250000022351742, "rewards/oai_reward_function/std": 0.03810004144906998, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.10258343070745468, "epoch": 2.2714285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.09145762026309967, "kl": 0.07726636342704296, "learning_rate": 3.8714285714285715e-05, "loss": 0.0008, "num_tokens": 2831304.0, "reward": 1.0593750476837158, "reward_std": 0.03966484218835831, "rewards/oai_reward_function/mean": 0.5296875014901161, "rewards/oai_reward_function/std": 0.05057631433010101, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08886106871068478, "epoch": 2.2857142857142856, "frac_reward_zero_std": 0.75, "grad_norm": 0.050053730607032776, "kl": 0.0593466404825449, "learning_rate": 3.8642857142857146e-05, "loss": 0.0006, "num_tokens": 2849216.0, "reward": 1.0031249523162842, "reward_std": 0.008838832378387451, "rewards/oai_reward_function/mean": 0.5015625000232831, "rewards/oai_reward_function/std": 0.008838835172355175, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08666450530290604, "epoch": 2.3, "frac_reward_zero_std": 0.25, "grad_norm": 0.09668877720832825, "kl": 0.037179723381996155, "learning_rate": 3.857142857142858e-05, "loss": 0.0004, "num_tokens": 2867016.0, "reward": 1.0671875476837158, "reward_std": 0.034592773765325546, "rewards/oai_reward_function/mean": 0.5335937514901161, "rewards/oai_reward_function/std": 0.038942355662584305, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08739319443702698, "epoch": 2.314285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.05101403221487999, "kl": 0.016047978308051825, "learning_rate": 3.85e-05, "loss": 0.0002, "num_tokens": 2884784.0, "reward": 1.001562476158142, "reward_std": 0.004419416189193726, "rewards/oai_reward_function/mean": 0.5007812500116415, "rewards/oai_reward_function/std": 0.0044194175861775875, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07174593396484852, "epoch": 2.3285714285714287, "frac_reward_zero_std": 0.25, "grad_norm": 0.09460754692554474, "kl": 0.031096406280994415, "learning_rate": 3.842857142857143e-05, "loss": 0.0003, "num_tokens": 2902576.0, "reward": 1.0315624475479126, "reward_std": 0.03596320003271103, "rewards/oai_reward_function/mean": 0.5157812498509884, "rewards/oai_reward_function/std": 0.02667333371937275, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0965243298560381, "epoch": 2.342857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.10237058997154236, "kl": 0.025380919221788645, "learning_rate": 3.8357142857142855e-05, "loss": 0.0003, "num_tokens": 2920416.0, "reward": 1.071874976158142, "reward_std": 0.045641690492630005, "rewards/oai_reward_function/mean": 0.5359374992549419, "rewards/oai_reward_function/std": 0.04396548494696617, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0735629927366972, "epoch": 2.357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.001382152666337788, "kl": 0.011448808014392853, "learning_rate": 3.8285714285714286e-05, "loss": 0.0001, "num_tokens": 2938224.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07639794796705246, "epoch": 2.3714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.05336588993668556, "kl": 0.0100309734698385, "learning_rate": 3.821428571428572e-05, "loss": 0.0001, "num_tokens": 2956000.0, "reward": 1.0187499523162842, "reward_std": 0.017677675932645798, "rewards/oai_reward_function/mean": 0.509375000372529, "rewards/oai_reward_function/std": 0.023546453565359116, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0797042902559042, "epoch": 2.3857142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.09860816597938538, "kl": 0.02236688695847988, "learning_rate": 3.814285714285715e-05, "loss": 0.0002, "num_tokens": 2973728.0, "reward": 1.0421874523162842, "reward_std": 0.03380424156785011, "rewards/oai_reward_function/mean": 0.5210937485098839, "rewards/oai_reward_function/std": 0.03052588365972042, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.09247681871056557, "epoch": 2.4, "frac_reward_zero_std": 0.25, "grad_norm": 0.11807835847139359, "kl": 0.028510943986475468, "learning_rate": 3.807142857142857e-05, "loss": 0.0003, "num_tokens": 2991648.0, "reward": 1.0703125, "reward_std": 0.04926247149705887, "rewards/oai_reward_function/mean": 0.53515625, "rewards/oai_reward_function/std": 0.036400206387043, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0499194972217083, "epoch": 2.414285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.11116263270378113, "kl": 0.04615373630076647, "learning_rate": 3.8e-05, "loss": 0.0005, "num_tokens": 3009472.0, "reward": 1.131250023841858, "reward_std": 0.0681503415107727, "rewards/oai_reward_function/mean": 0.5656249970197678, "rewards/oai_reward_function/std": 0.06772513687610626, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07691787928342819, "epoch": 2.4285714285714284, "frac_reward_zero_std": 0.75, "grad_norm": 0.043317168951034546, "kl": 0.026559457648545504, "learning_rate": 3.792857142857143e-05, "loss": 0.0003, "num_tokens": 3027312.0, "reward": 1.0812499523162842, "reward_std": 0.013363059610128403, "rewards/oai_reward_function/mean": 0.5406249985098839, "rewards/oai_reward_function/std": 0.04867187887430191, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08763985149562359, "epoch": 2.442857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.11738862097263336, "kl": 0.028244417626410723, "learning_rate": 3.785714285714286e-05, "loss": 0.0003, "num_tokens": 3045248.0, "reward": 1.2421875, "reward_std": 0.038010139018297195, "rewards/oai_reward_function/mean": 0.62109375, "rewards/oai_reward_function/std": 0.17051976919174194, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07618978433310986, "epoch": 2.4571428571428573, "frac_reward_zero_std": 0.75, "grad_norm": 0.06447312235832214, "kl": 0.009853521827608347, "learning_rate": 3.778571428571429e-05, "loss": 0.0001, "num_tokens": 3063104.0, "reward": 1.0125000476837158, "reward_std": 0.013363069854676723, "rewards/oai_reward_function/mean": 0.5062500000931323, "rewards/oai_reward_function/std": 0.016800537705421448, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05904076434671879, "epoch": 2.4714285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 0.08978112041950226, "kl": 0.03310586418956518, "learning_rate": 3.771428571428572e-05, "loss": 0.0003, "num_tokens": 3081032.0, "reward": 1.109375, "reward_std": 0.04799327254295349, "rewards/oai_reward_function/mean": 0.5546875, "rewards/oai_reward_function/std": 0.05903713405132294, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0563393235206604, "epoch": 2.4857142857142858, "frac_reward_zero_std": 0.25, "grad_norm": 0.11673219501972198, "kl": 0.01918662153184414, "learning_rate": 3.764285714285715e-05, "loss": 0.0002, "num_tokens": 3098880.0, "reward": 1.1156249046325684, "reward_std": 0.04642024636268616, "rewards/oai_reward_function/mean": 0.5578125007450581, "rewards/oai_reward_function/std": 0.04554221034049988, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.054440722800791264, "epoch": 2.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016670229379087687, "kl": 0.011914134491235018, "learning_rate": 3.757142857142857e-05, "loss": 0.0001, "num_tokens": 3116528.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07713918946683407, "epoch": 2.5142857142857142, "frac_reward_zero_std": 0.25, "grad_norm": 0.09991607069969177, "kl": 0.014265456004068255, "learning_rate": 3.7500000000000003e-05, "loss": 0.0001, "num_tokens": 3134280.0, "reward": 1.0140624046325684, "reward_std": 0.026196977123618126, "rewards/oai_reward_function/mean": 0.5070312502793968, "rewards/oai_reward_function/std": 0.017079481855034828, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.060492053627967834, "epoch": 2.5285714285714285, "frac_reward_zero_std": 0.25, "grad_norm": 0.11897552013397217, "kl": 0.03578268736600876, "learning_rate": 3.742857142857143e-05, "loss": 0.0004, "num_tokens": 3152096.0, "reward": 1.09375, "reward_std": 0.05294632539153099, "rewards/oai_reward_function/mean": 0.546875, "rewards/oai_reward_function/std": 0.044336508959531784, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.062405264005064964, "epoch": 2.5428571428571427, "frac_reward_zero_std": 0.5, "grad_norm": 0.09012026339769363, "kl": 0.017184360651299357, "learning_rate": 3.735714285714286e-05, "loss": 0.0002, "num_tokens": 3169776.0, "reward": 1.2531249523162842, "reward_std": 0.029978279024362564, "rewards/oai_reward_function/mean": 0.6265625059604645, "rewards/oai_reward_function/std": 0.17049944400787354, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06565988063812256, "epoch": 2.557142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.126982182264328, "kl": 0.038017953746020794, "learning_rate": 3.728571428571428e-05, "loss": 0.0004, "num_tokens": 3187712.0, "reward": 1.3125, "reward_std": 0.041240036487579346, "rewards/oai_reward_function/mean": 0.65625, "rewards/oai_reward_function/std": 0.1866512894630432, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.040435372851789, "epoch": 2.571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.07750152051448822, "kl": 0.03242550138384104, "learning_rate": 3.721428571428572e-05, "loss": 0.0003, "num_tokens": 3205592.0, "reward": 1.256250023841858, "reward_std": 0.025646153837442398, "rewards/oai_reward_function/mean": 0.628125011920929, "rewards/oai_reward_function/std": 0.18651622533798218, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.047216037288308144, "epoch": 2.585714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.12098561972379684, "kl": 0.026824071537703276, "learning_rate": 3.7142857142857143e-05, "loss": 0.0003, "num_tokens": 3223480.0, "reward": 1.0093750953674316, "reward_std": 0.19897010922431946, "rewards/oai_reward_function/mean": 0.5046875029802322, "rewards/oai_reward_function/std": 0.21528521552681923, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08010220341384411, "epoch": 2.6, "frac_reward_zero_std": 0.25, "grad_norm": 0.11290978640317917, "kl": 0.022963001858443022, "learning_rate": 3.7071428571428574e-05, "loss": 0.0002, "num_tokens": 3241304.0, "reward": 1.1468751430511475, "reward_std": 0.08732541650533676, "rewards/oai_reward_function/mean": 0.5734374970197678, "rewards/oai_reward_function/std": 0.11707756668329239, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06848571076989174, "epoch": 2.6142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.10999053716659546, "kl": 0.024655529763549566, "learning_rate": 3.7e-05, "loss": 0.0002, "num_tokens": 3259048.0, "reward": 1.165624976158142, "reward_std": 0.0838727056980133, "rewards/oai_reward_function/mean": 0.5828125029802322, "rewards/oai_reward_function/std": 0.14652389287948608, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.049521847628057, "epoch": 2.6285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.10953173786401749, "kl": 0.02982867369428277, "learning_rate": 3.692857142857143e-05, "loss": 0.0003, "num_tokens": 3276808.0, "reward": 1.171875, "reward_std": 0.061461035162210464, "rewards/oai_reward_function/mean": 0.5859375, "rewards/oai_reward_function/std": 0.1271488517522812, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07211006805300713, "epoch": 2.642857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.1074090376496315, "kl": 0.036498697474598885, "learning_rate": 3.685714285714286e-05, "loss": 0.0004, "num_tokens": 3294808.0, "reward": 1.162500023841858, "reward_std": 0.13342483341693878, "rewards/oai_reward_function/mean": 0.5812499970197678, "rewards/oai_reward_function/std": 0.1636282056570053, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.04203084297478199, "epoch": 2.657142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.07064300775527954, "kl": 0.02704466599971056, "learning_rate": 3.678571428571429e-05, "loss": 0.0003, "num_tokens": 3312640.0, "reward": 1.0750000476837158, "reward_std": 0.018898211419582367, "rewards/oai_reward_function/mean": 0.5375000014901161, "rewards/oai_reward_function/std": 0.06839166581630707, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.054328473284840584, "epoch": 2.6714285714285713, "frac_reward_zero_std": 0.75, "grad_norm": 0.06820113956928253, "kl": 0.022003832273185253, "learning_rate": 3.671428571428572e-05, "loss": 0.0002, "num_tokens": 3330408.0, "reward": 1.1531250476837158, "reward_std": 0.0646936446428299, "rewards/oai_reward_function/mean": 0.5765625014901161, "rewards/oai_reward_function/std": 0.14809781312942505, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.051017552614212036, "epoch": 2.685714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.08293981850147247, "kl": 0.02239195117726922, "learning_rate": 3.6642857142857145e-05, "loss": 0.0002, "num_tokens": 3348064.0, "reward": 1.0109374523162842, "reward_std": 0.017358144745230675, "rewards/oai_reward_function/mean": 0.505468750372529, "rewards/oai_reward_function/std": 0.015206077136099339, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.049897488206624985, "epoch": 2.7, "frac_reward_zero_std": 0.25, "grad_norm": 0.09536179155111313, "kl": 0.05423136055469513, "learning_rate": 3.6571428571428576e-05, "loss": 0.0005, "num_tokens": 3365896.0, "reward": 1.1484375, "reward_std": 0.06608611345291138, "rewards/oai_reward_function/mean": 0.57421875, "rewards/oai_reward_function/std": 0.10268264263868332, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05249054729938507, "epoch": 2.7142857142857144, "frac_reward_zero_std": 0.25, "grad_norm": 0.13304099440574646, "kl": 0.04763131029903889, "learning_rate": 3.65e-05, "loss": 0.0005, "num_tokens": 3383800.0, "reward": 1.0828125476837158, "reward_std": 0.04099529981613159, "rewards/oai_reward_function/mean": 0.5414062514901161, "rewards/oai_reward_function/std": 0.04943608492612839, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.062329867854714394, "epoch": 2.7285714285714286, "frac_reward_zero_std": 0.25, "grad_norm": 0.11173869669437408, "kl": 0.03640593169257045, "learning_rate": 3.642857142857143e-05, "loss": 0.0004, "num_tokens": 3401648.0, "reward": 1.0562500953674316, "reward_std": 0.023689784109592438, "rewards/oai_reward_function/mean": 0.5281250011175871, "rewards/oai_reward_function/std": 0.0274963341653347, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.058776866644620895, "epoch": 2.742857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.10382409393787384, "kl": 0.03305045561864972, "learning_rate": 3.6357142857142854e-05, "loss": 0.0003, "num_tokens": 3419408.0, "reward": 1.217187523841858, "reward_std": 0.02610759809613228, "rewards/oai_reward_function/mean": 0.6085937470197678, "rewards/oai_reward_function/std": 0.1844356507062912, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.04789746552705765, "epoch": 2.757142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.05706682801246643, "kl": 0.03994939010590315, "learning_rate": 3.628571428571429e-05, "loss": 0.0004, "num_tokens": 3437112.0, "reward": 1.015625, "reward_std": 0.01860060542821884, "rewards/oai_reward_function/mean": 0.5078125, "rewards/oai_reward_function/std": 0.022394467145204544, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.062057855539023876, "epoch": 2.7714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.15330444276332855, "kl": 0.084334472194314, "learning_rate": 3.6214285714285716e-05, "loss": 0.0008, "num_tokens": 3454904.0, "reward": 1.470312476158142, "reward_std": 0.04739333689212799, "rewards/oai_reward_function/mean": 0.735156238079071, "rewards/oai_reward_function/std": 0.19775548577308655, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.04913834575563669, "epoch": 2.7857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.11146184056997299, "kl": 0.06489744689315557, "learning_rate": 3.6142857142857146e-05, "loss": 0.0006, "num_tokens": 3472632.0, "reward": 1.0703125, "reward_std": 0.045694079250097275, "rewards/oai_reward_function/mean": 0.53515625, "rewards/oai_reward_function/std": 0.04438621550798416, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.057510885410010815, "epoch": 2.8, "frac_reward_zero_std": 0.5, "grad_norm": 0.1315256953239441, "kl": 0.06008041184395552, "learning_rate": 3.607142857142857e-05, "loss": 0.0006, "num_tokens": 3490592.0, "reward": 1.060937523841858, "reward_std": 0.02575094997882843, "rewards/oai_reward_function/mean": 0.5304687507450581, "rewards/oai_reward_function/std": 0.04522986710071564, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05490284040570259, "epoch": 2.814285714285714, "frac_reward_zero_std": 0.25, "grad_norm": 0.1558970808982849, "kl": 0.0605736318975687, "learning_rate": 3.6e-05, "loss": 0.0006, "num_tokens": 3508408.0, "reward": 1.1359374523162842, "reward_std": 0.16093073785305023, "rewards/oai_reward_function/mean": 0.5679687559604645, "rewards/oai_reward_function/std": 0.12798601388931274, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05418549384921789, "epoch": 2.8285714285714287, "frac_reward_zero_std": 0.75, "grad_norm": 0.060409124940633774, "kl": 0.07027391903102398, "learning_rate": 3.5928571428571425e-05, "loss": 0.0007, "num_tokens": 3526168.0, "reward": 1.0281250476837158, "reward_std": 0.008838837966322899, "rewards/oai_reward_function/mean": 0.514062499627471, "rewards/oai_reward_function/std": 0.026133574545383453, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.03633992746472359, "epoch": 2.842857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.003255989169701934, "kl": 0.07370059937238693, "learning_rate": 3.585714285714286e-05, "loss": 0.0007, "num_tokens": 3543864.0, "reward": 1.0, "reward_std": 0.0, "rewards/oai_reward_function/mean": 0.5, "rewards/oai_reward_function/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.057329680770635605, "epoch": 2.857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.11008545011281967, "kl": 0.06796468701213598, "learning_rate": 3.5785714285714286e-05, "loss": 0.0007, "num_tokens": 3561688.0, "reward": 1.25, "reward_std": 0.014625202864408493, "rewards/oai_reward_function/mean": 0.625, "rewards/oai_reward_function/std": 0.21655291318893433, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.04236162081360817, "epoch": 2.8714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.15067800879478455, "kl": 0.10442700423300266, "learning_rate": 3.571428571428572e-05, "loss": 0.001, "num_tokens": 3579560.0, "reward": 1.2906250953674316, "reward_std": 0.06347659230232239, "rewards/oai_reward_function/mean": 0.6453125029802322, "rewards/oai_reward_function/std": 0.18144108355045319, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.04874769877642393, "epoch": 2.8857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.08031316101551056, "kl": 0.06067081820219755, "learning_rate": 3.564285714285715e-05, "loss": 0.0006, "num_tokens": 3597200.0, "reward": 1.037500023841858, "reward_std": 0.019918914884328842, "rewards/oai_reward_function/mean": 0.5187500007450581, "rewards/oai_reward_function/std": 0.023759547621011734, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06220845878124237, "epoch": 2.9, "frac_reward_zero_std": 0.25, "grad_norm": 0.12046143412590027, "kl": 0.05884059518575668, "learning_rate": 3.557142857142857e-05, "loss": 0.0006, "num_tokens": 3615112.0, "reward": 1.076562523841858, "reward_std": 0.05444490164518356, "rewards/oai_reward_function/mean": 0.5382812507450581, "rewards/oai_reward_function/std": 0.04835369065403938, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05830034799873829, "epoch": 2.914285714285714, "frac_reward_zero_std": 0.25, "grad_norm": 0.09531212598085403, "kl": 0.05973371770232916, "learning_rate": 3.55e-05, "loss": 0.0006, "num_tokens": 3632936.0, "reward": 1.1078124046325684, "reward_std": 0.037323713302612305, "rewards/oai_reward_function/mean": 0.5539062544703484, "rewards/oai_reward_function/std": 0.07622901350259781, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05407467018812895, "epoch": 2.928571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.07925046980381012, "kl": 0.07386120036244392, "learning_rate": 3.5428571428571426e-05, "loss": 0.0007, "num_tokens": 3650760.0, "reward": 1.0140624046325684, "reward_std": 0.02122672274708748, "rewards/oai_reward_function/mean": 0.5070312502793968, "rewards/oai_reward_function/std": 0.017079481855034828, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06182014662772417, "epoch": 2.942857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.11716300994157791, "kl": 0.06741901952773333, "learning_rate": 3.5357142857142864e-05, "loss": 0.0007, "num_tokens": 3668512.0, "reward": 1.0906250476837158, "reward_std": 0.055445872247219086, "rewards/oai_reward_function/mean": 0.5453125014901161, "rewards/oai_reward_function/std": 0.06968752294778824, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.059788716956973076, "epoch": 2.9571428571428573, "frac_reward_zero_std": 0.25, "grad_norm": 0.13249847292900085, "kl": 0.08083864115178585, "learning_rate": 3.528571428571429e-05, "loss": 0.0008, "num_tokens": 3686296.0, "reward": 1.2609375715255737, "reward_std": 0.032799478620290756, "rewards/oai_reward_function/mean": 0.6304687410593033, "rewards/oai_reward_function/std": 0.16235841810703278, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06789828836917877, "epoch": 2.9714285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 0.1304040104150772, "kl": 0.07522418349981308, "learning_rate": 3.521428571428572e-05, "loss": 0.0008, "num_tokens": 3704008.0, "reward": 1.2593750953674316, "reward_std": 0.05023520812392235, "rewards/oai_reward_function/mean": 0.6296875029802322, "rewards/oai_reward_function/std": 0.1668539047241211, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.04057574924081564, "epoch": 2.9857142857142858, "frac_reward_zero_std": 0.25, "grad_norm": 0.1246933788061142, "kl": 0.10525520890951157, "learning_rate": 3.514285714285714e-05, "loss": 0.0011, "num_tokens": 3721888.0, "reward": 1.2625000476837158, "reward_std": 0.03328196331858635, "rewards/oai_reward_function/mean": 0.6312500089406967, "rewards/oai_reward_function/std": 0.1866512894630432, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06538868602365255, "epoch": 3.0, "frac_reward_zero_std": 0.25, "grad_norm": 0.11987200379371643, "kl": 0.090326476842165, "learning_rate": 3.507142857142857e-05, "loss": 0.0009, "num_tokens": 3739752.0, "reward": 1.0437500476837158, "reward_std": 0.040318816900253296, "rewards/oai_reward_function/mean": 0.5218750014901161, "rewards/oai_reward_function/std": 0.03521248698234558, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06806700490415096, "epoch": 3.0142857142857142, "frac_reward_zero_std": 0.5, "grad_norm": 0.08975692093372345, "kl": 0.08757120184600353, "learning_rate": 3.5e-05, "loss": 0.0009, "num_tokens": 3757520.0, "reward": 1.0203125476837158, "reward_std": 0.024814628064632416, "rewards/oai_reward_function/mean": 0.510156249627471, "rewards/oai_reward_function/std": 0.019938793033361435, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05651993863284588, "epoch": 3.0285714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.10675106197595596, "kl": 0.09534911066293716, "learning_rate": 3.4928571428571434e-05, "loss": 0.001, "num_tokens": 3775296.0, "reward": 1.0750000476837158, "reward_std": 0.06767623126506805, "rewards/oai_reward_function/mean": 0.5375000014901161, "rewards/oai_reward_function/std": 0.07378040999174118, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0639553228393197, "epoch": 3.0428571428571427, "frac_reward_zero_std": 0.5, "grad_norm": 0.09777996689081192, "kl": 0.07890664599835873, "learning_rate": 3.485714285714286e-05, "loss": 0.0008, "num_tokens": 3793032.0, "reward": 1.0281250476837158, "reward_std": 0.020751874893903732, "rewards/oai_reward_function/mean": 0.5140625005587935, "rewards/oai_reward_function/std": 0.021939707919955254, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06698552891612053, "epoch": 3.057142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.1115046888589859, "kl": 0.07001018989831209, "learning_rate": 3.478571428571429e-05, "loss": 0.0007, "num_tokens": 3810744.0, "reward": 1.056249976158142, "reward_std": 0.030470959842205048, "rewards/oai_reward_function/mean": 0.5281250011175871, "rewards/oai_reward_function/std": 0.04741290956735611, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06257231812924147, "epoch": 3.0714285714285716, "frac_reward_zero_std": 0.75, "grad_norm": 0.10517910867929459, "kl": 0.07934985496103764, "learning_rate": 3.471428571428571e-05, "loss": 0.0008, "num_tokens": 3828496.0, "reward": 1.235937476158142, "reward_std": 0.012387894093990326, "rewards/oai_reward_function/mean": 0.6179687529802322, "rewards/oai_reward_function/std": 0.20793089270591736, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.055911571718752384, "epoch": 3.085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.15163163840770721, "kl": 0.11663151904940605, "learning_rate": 3.4642857142857144e-05, "loss": 0.0012, "num_tokens": 3846408.0, "reward": 1.2062499523162842, "reward_std": 0.1645711362361908, "rewards/oai_reward_function/mean": 0.6031250059604645, "rewards/oai_reward_function/std": 0.10957211256027222, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05231211241334677, "epoch": 3.1, "frac_reward_zero_std": 0.25, "grad_norm": 0.17118224501609802, "kl": 0.12115776538848877, "learning_rate": 3.4571428571428574e-05, "loss": 0.0012, "num_tokens": 3864168.0, "reward": 1.0859375, "reward_std": 0.1350831389427185, "rewards/oai_reward_function/mean": 0.5429687462747097, "rewards/oai_reward_function/std": 0.11469355970621109, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05661669000983238, "epoch": 3.1142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.13312071561813354, "kl": 0.08971596322953701, "learning_rate": 3.45e-05, "loss": 0.0009, "num_tokens": 3882016.0, "reward": 1.2296874523162842, "reward_std": 0.02697797492146492, "rewards/oai_reward_function/mean": 0.6148437485098839, "rewards/oai_reward_function/std": 0.1935303658246994, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0506694195792079, "epoch": 3.1285714285714286, "frac_reward_zero_std": 0.25, "grad_norm": 0.10720871388912201, "kl": 0.07849705778062344, "learning_rate": 3.442857142857143e-05, "loss": 0.0008, "num_tokens": 3899952.0, "reward": 1.1015625, "reward_std": 0.044115059077739716, "rewards/oai_reward_function/mean": 0.55078125, "rewards/oai_reward_function/std": 0.055534202605485916, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05330024380236864, "epoch": 3.142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.12851352989673615, "kl": 0.0983127523213625, "learning_rate": 3.435714285714286e-05, "loss": 0.001, "num_tokens": 3917688.0, "reward": 1.365625023841858, "reward_std": 0.12765255570411682, "rewards/oai_reward_function/mean": 0.6828124970197678, "rewards/oai_reward_function/std": 0.20135001838207245, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05522188264876604, "epoch": 3.157142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.11083745956420898, "kl": 0.07289117947220802, "learning_rate": 3.428571428571429e-05, "loss": 0.0007, "num_tokens": 3935528.0, "reward": 1.046875, "reward_std": 0.0414334312081337, "rewards/oai_reward_function/mean": 0.5234375, "rewards/oai_reward_function/std": 0.039623990654945374, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05520590580999851, "epoch": 3.1714285714285713, "frac_reward_zero_std": 0.5, "grad_norm": 0.11108041554689407, "kl": 0.08299623243510723, "learning_rate": 3.4214285714285714e-05, "loss": 0.0008, "num_tokens": 3953320.0, "reward": 1.2531250715255737, "reward_std": 0.017311176285147667, "rewards/oai_reward_function/mean": 0.6265624910593033, "rewards/oai_reward_function/std": 0.18985748291015625, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05928301624953747, "epoch": 3.185714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.1239665076136589, "kl": 0.11972067691385746, "learning_rate": 3.4142857142857145e-05, "loss": 0.0012, "num_tokens": 3971032.0, "reward": 1.1640625, "reward_std": 0.04833199828863144, "rewards/oai_reward_function/mean": 0.58203125, "rewards/oai_reward_function/std": 0.12608151137828827, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.042910450138151646, "epoch": 3.2, "frac_reward_zero_std": 0.25, "grad_norm": 0.1018366813659668, "kl": 0.08956374414265156, "learning_rate": 3.407142857142857e-05, "loss": 0.0009, "num_tokens": 3988864.0, "reward": 1.2765624523162842, "reward_std": 0.06503090262413025, "rewards/oai_reward_function/mean": 0.6382812410593033, "rewards/oai_reward_function/std": 0.19134333729743958, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06987146660685539, "epoch": 3.2142857142857144, "frac_reward_zero_std": 0.25, "grad_norm": 0.13029474020004272, "kl": 0.11290079541504383, "learning_rate": 3.4000000000000007e-05, "loss": 0.0011, "num_tokens": 4006800.0, "reward": 1.3203125, "reward_std": 0.04008040949702263, "rewards/oai_reward_function/mean": 0.66015625, "rewards/oai_reward_function/std": 0.20209181308746338, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07145651057362556, "epoch": 3.2285714285714286, "frac_reward_zero_std": 0.25, "grad_norm": 0.1022149994969368, "kl": 0.05857388116419315, "learning_rate": 3.392857142857143e-05, "loss": 0.0006, "num_tokens": 4024680.0, "reward": 1.0499999523162842, "reward_std": 0.03877411410212517, "rewards/oai_reward_function/mean": 0.5249999985098839, "rewards/oai_reward_function/std": 0.0416397750377655, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.048385005444288254, "epoch": 3.242857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.12312141805887222, "kl": 0.07377888821065426, "learning_rate": 3.385714285714286e-05, "loss": 0.0007, "num_tokens": 4042472.0, "reward": 1.4500000476837158, "reward_std": 0.02340090088546276, "rewards/oai_reward_function/mean": 0.7249999940395355, "rewards/oai_reward_function/std": 0.23026981949806213, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0545792318880558, "epoch": 3.257142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.09947756677865982, "kl": 0.09583424963057041, "learning_rate": 3.3785714285714285e-05, "loss": 0.001, "num_tokens": 4060248.0, "reward": 1.0265624523162842, "reward_std": 0.10836321860551834, "rewards/oai_reward_function/mean": 0.5132812485098839, "rewards/oai_reward_function/std": 0.12380600348114967, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06396409310400486, "epoch": 3.2714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.0686774030327797, "kl": 0.09425997547805309, "learning_rate": 3.3714285714285716e-05, "loss": 0.0009, "num_tokens": 4077880.0, "reward": 1.2703125476837158, "reward_std": 0.017598576843738556, "rewards/oai_reward_function/mean": 0.6351562440395355, "rewards/oai_reward_function/std": 0.2018921971321106, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08321201242506504, "epoch": 3.2857142857142856, "frac_reward_zero_std": 0.75, "grad_norm": 0.06190980598330498, "kl": 0.03877187706530094, "learning_rate": 3.364285714285714e-05, "loss": 0.0004, "num_tokens": 4095672.0, "reward": 1.0046875476837158, "reward_std": 0.00646935636177659, "rewards/oai_reward_function/mean": 0.5023437500931323, "rewards/oai_reward_function/std": 0.007403614930808544, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.0575382262468338, "epoch": 3.3, "frac_reward_zero_std": 0.5, "grad_norm": 0.11537446081638336, "kl": 0.08915554732084274, "learning_rate": 3.357142857142857e-05, "loss": 0.0009, "num_tokens": 4113480.0, "reward": 1.2359375953674316, "reward_std": 0.08257875591516495, "rewards/oai_reward_function/mean": 0.617968738079071, "rewards/oai_reward_function/std": 0.156090646982193, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05642772279679775, "epoch": 3.314285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.13337863981723785, "kl": 0.10237299278378487, "learning_rate": 3.35e-05, "loss": 0.001, "num_tokens": 4131224.0, "reward": 1.4734375476837158, "reward_std": 0.01958364248275757, "rewards/oai_reward_function/mean": 0.7367187440395355, "rewards/oai_reward_function/std": 0.2408807873725891, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06528778094798326, "epoch": 3.3285714285714287, "frac_reward_zero_std": 0.25, "grad_norm": 0.13227899372577667, "kl": 0.11398253589868546, "learning_rate": 3.342857142857143e-05, "loss": 0.0011, "num_tokens": 4149184.0, "reward": 1.3203125, "reward_std": 0.03324369713664055, "rewards/oai_reward_function/mean": 0.6601562350988388, "rewards/oai_reward_function/std": 0.19673332571983337, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06553995609283447, "epoch": 3.342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.20257574319839478, "kl": 0.12704718858003616, "learning_rate": 3.3357142857142856e-05, "loss": 0.0013, "num_tokens": 4167104.0, "reward": 1.4609375, "reward_std": 0.1858925223350525, "rewards/oai_reward_function/mean": 0.73046875, "rewards/oai_reward_function/std": 0.1834629327058792, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.04393093287944794, "epoch": 3.357142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.12950977683067322, "kl": 0.09361258894205093, "learning_rate": 3.3285714285714286e-05, "loss": 0.0009, "num_tokens": 4184944.0, "reward": 1.1656250953674316, "reward_std": 0.150077685713768, "rewards/oai_reward_function/mean": 0.5828124955296516, "rewards/oai_reward_function/std": 0.13220180571079254, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06838994100689888, "epoch": 3.3714285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.14525781571865082, "kl": 0.09100262448191643, "learning_rate": 3.321428571428572e-05, "loss": 0.0009, "num_tokens": 4202672.0, "reward": 1.4187500476837158, "reward_std": 0.02699536457657814, "rewards/oai_reward_function/mean": 0.7093749940395355, "rewards/oai_reward_function/std": 0.21884500980377197, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05996893718838692, "epoch": 3.3857142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 0.12374205142259598, "kl": 0.13878228701651096, "learning_rate": 3.314285714285714e-05, "loss": 0.0014, "num_tokens": 4220472.0, "reward": 1.443750023841858, "reward_std": 0.060242824256420135, "rewards/oai_reward_function/mean": 0.721875011920929, "rewards/oai_reward_function/std": 0.18898604810237885, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.04969180002808571, "epoch": 3.4, "frac_reward_zero_std": 0.25, "grad_norm": 0.14094124734401703, "kl": 0.11433868668973446, "learning_rate": 3.307142857142858e-05, "loss": 0.0011, "num_tokens": 4238352.0, "reward": 1.5062499046325684, "reward_std": 0.04232252389192581, "rewards/oai_reward_function/mean": 0.7531249821186066, "rewards/oai_reward_function/std": 0.21019864082336426, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05842717830091715, "epoch": 3.414285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.13528573513031006, "kl": 0.13230286352336407, "learning_rate": 3.3e-05, "loss": 0.0013, "num_tokens": 4256072.0, "reward": 1.5250000953674316, "reward_std": 0.0736992210149765, "rewards/oai_reward_function/mean": 0.762499988079071, "rewards/oai_reward_function/std": 0.19999998807907104, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.053723374381661415, "epoch": 3.4285714285714284, "frac_reward_zero_std": 0.5, "grad_norm": 0.10746068507432938, "kl": 0.0968917403370142, "learning_rate": 3.292857142857143e-05, "loss": 0.001, "num_tokens": 4273848.0, "reward": 1.0499999523162842, "reward_std": 0.017677675932645798, "rewards/oai_reward_function/mean": 0.525000000372529, "rewards/oai_reward_function/std": 0.028398092836141586, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.08724895678460598, "epoch": 3.442857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.1673547476530075, "kl": 0.16037143021821976, "learning_rate": 3.285714285714286e-05, "loss": 0.0016, "num_tokens": 4291872.0, "reward": 1.603124976158142, "reward_std": 0.11311184614896774, "rewards/oai_reward_function/mean": 0.801562488079071, "rewards/oai_reward_function/std": 0.22014817595481873, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06301301345229149, "epoch": 3.4571428571428573, "frac_reward_zero_std": 0.5, "grad_norm": 0.127055823802948, "kl": 0.10341309197247028, "learning_rate": 3.278571428571429e-05, "loss": 0.001, "num_tokens": 4309640.0, "reward": 1.265625, "reward_std": 0.02265283279120922, "rewards/oai_reward_function/mean": 0.6328125, "rewards/oai_reward_function/std": 0.19933734834194183, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.04968675132840872, "epoch": 3.4714285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 0.1374424248933792, "kl": 0.1188412457704544, "learning_rate": 3.271428571428571e-05, "loss": 0.0012, "num_tokens": 4327360.0, "reward": 1.303125023841858, "reward_std": 0.10906177759170532, "rewards/oai_reward_function/mean": 0.651562511920929, "rewards/oai_reward_function/std": 0.2058555632829666, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06477249693125486, "epoch": 3.4857142857142858, "frac_reward_zero_std": 0.25, "grad_norm": 0.1306895911693573, "kl": 0.11430021747946739, "learning_rate": 3.264285714285714e-05, "loss": 0.0011, "num_tokens": 4345192.0, "reward": 1.4375, "reward_std": 0.1379069834947586, "rewards/oai_reward_function/mean": 0.71875, "rewards/oai_reward_function/std": 0.18447834253311157, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06405621953308582, "epoch": 3.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.1333678811788559, "kl": 0.17231638357043266, "learning_rate": 3.257142857142857e-05, "loss": 0.0017, "num_tokens": 4363064.0, "reward": 1.5109375715255737, "reward_std": 0.0335906445980072, "rewards/oai_reward_function/mean": 0.7554687559604645, "rewards/oai_reward_function/std": 0.22403325140476227, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.067863704636693, "epoch": 3.5142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.13658057153224945, "kl": 0.17316893115639687, "learning_rate": 3.2500000000000004e-05, "loss": 0.0017, "num_tokens": 4380968.0, "reward": 1.5546875, "reward_std": 0.05777457728981972, "rewards/oai_reward_function/mean": 0.77734375, "rewards/oai_reward_function/std": 0.2121661901473999, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.05905670113861561, "epoch": 3.5285714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.09611335396766663, "kl": 0.050939660519361496, "learning_rate": 3.242857142857143e-05, "loss": 0.0005, "num_tokens": 4398608.0, "reward": 1.0125000476837158, "reward_std": 0.02314549870789051, "rewards/oai_reward_function/mean": 0.5062500000931323, "rewards/oai_reward_function/std": 0.016800537705421448, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07766996510326862, "epoch": 3.5428571428571427, "frac_reward_zero_std": 0.25, "grad_norm": 0.13582761585712433, "kl": 0.12659209407866, "learning_rate": 3.235714285714286e-05, "loss": 0.0013, "num_tokens": 4416392.0, "reward": 1.5640625953674316, "reward_std": 0.07988262921571732, "rewards/oai_reward_function/mean": 0.782031238079071, "rewards/oai_reward_function/std": 0.2193496972322464, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.07497746869921684, "epoch": 3.557142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.1760389357805252, "kl": 0.24591631814837456, "learning_rate": 3.228571428571428e-05, "loss": 0.0025, "num_tokens": 4434272.0, "reward": 1.798437476158142, "reward_std": 0.12221544235944748, "rewards/oai_reward_function/mean": 0.899218738079071, "rewards/oai_reward_function/std": 0.10424157232046127, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "entropy": 0.06046187411993742, "epoch": 3.571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.15973122417926788, "kl": 0.22475793957710266, "learning_rate": 3.221428571428571e-05, "loss": 0.0022, "num_tokens": 4452104.0, "reward": 1.6359374523162842, "reward_std": 0.0966869369149208, "rewards/oai_reward_function/mean": 0.8179687559604645, "rewards/oai_reward_function/std": 0.19666926562786102, "step": 250 } ], "logging_steps": 1, "max_steps": 700, "num_input_tokens_seen": 4452104, "num_train_epochs": 10, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }