trl_model_step_250 / trainer_state.json
adraganov's picture
Upload folder using huggingface_hub
c67b758 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.571428571428571,
"eval_steps": 500,
"global_step": 250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.060131815262138844,
"epoch": 0.014285714285714285,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.05771088972687721,
"kl": 0.0,
"learning_rate": 5e-05,
"loss": 0.0,
"num_tokens": 17832.0,
"reward": 1.0437500476837158,
"reward_std": 0.0353553369641304,
"rewards/oai_reward_function/mean": 0.5218750014901161,
"rewards/oai_reward_function/std": 0.043879419565200806,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06413675658404827,
"epoch": 0.02857142857142857,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.03477979078888893,
"kl": 0.0003001746808877215,
"learning_rate": 4.928571428571429e-05,
"loss": 0.0,
"num_tokens": 35712.0,
"reward": 1.046875,
"reward_std": 0.028149789199233055,
"rewards/oai_reward_function/mean": 0.5234375,
"rewards/oai_reward_function/std": 0.049161311239004135,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.052969515323638916,
"epoch": 0.04285714285714286,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0005888506420888007,
"kl": 0.0004545010087895207,
"learning_rate": 4.8571428571428576e-05,
"loss": 0.0,
"num_tokens": 53424.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06199027318507433,
"epoch": 0.05714285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.04447643458843231,
"kl": 0.0005710393161280081,
"learning_rate": 4.785714285714286e-05,
"loss": 0.0,
"num_tokens": 71248.0,
"reward": 1.2265625,
"reward_std": 0.004419416189193726,
"rewards/oai_reward_function/mean": 0.61328125,
"rewards/oai_reward_function/std": 0.1993926614522934,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0633242828771472,
"epoch": 0.07142857142857142,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.043302807956933975,
"kl": 0.001818844728404656,
"learning_rate": 4.714285714285714e-05,
"loss": 0.0,
"num_tokens": 89000.0,
"reward": 1.032812476158142,
"reward_std": 0.022097092121839523,
"rewards/oai_reward_function/mean": 0.5164062492549419,
"rewards/oai_reward_function/std": 0.03570114076137543,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06267449539154768,
"epoch": 0.08571428571428572,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.048733897507190704,
"kl": 0.0011250173120060936,
"learning_rate": 4.642857142857143e-05,
"loss": 0.0,
"num_tokens": 106816.0,
"reward": 1.071874976158142,
"reward_std": 0.03390505909919739,
"rewards/oai_reward_function/mean": 0.5359375029802322,
"rewards/oai_reward_function/std": 0.07097747921943665,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06247459910809994,
"epoch": 0.1,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07503627240657806,
"kl": 0.0016785123152658343,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.0,
"num_tokens": 124592.0,
"reward": 1.181249976158142,
"reward_std": 0.06808801740407944,
"rewards/oai_reward_function/mean": 0.5906250029802322,
"rewards/oai_reward_function/std": 0.13951963186264038,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09173925407230854,
"epoch": 0.11428571428571428,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.046880681067705154,
"kl": 0.004017388273496181,
"learning_rate": 4.5e-05,
"loss": 0.0,
"num_tokens": 142368.0,
"reward": 1.001562476158142,
"reward_std": 0.004419416189193726,
"rewards/oai_reward_function/mean": 0.5007812500116415,
"rewards/oai_reward_function/std": 0.0044194175861775875,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07956545054912567,
"epoch": 0.12857142857142856,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.061871547251939774,
"kl": 0.00639598595444113,
"learning_rate": 4.428571428571428e-05,
"loss": 0.0001,
"num_tokens": 160160.0,
"reward": 1.021875023841858,
"reward_std": 0.052504248917102814,
"rewards/oai_reward_function/mean": 0.5109375007450581,
"rewards/oai_reward_function/std": 0.053482551127672195,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06956008821725845,
"epoch": 0.14285714285714285,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06243785470724106,
"kl": 0.00972771504893899,
"learning_rate": 4.3571428571428576e-05,
"loss": 0.0001,
"num_tokens": 177984.0,
"reward": 1.2296874523162842,
"reward_std": 0.01684970036149025,
"rewards/oai_reward_function/mean": 0.6148437485098839,
"rewards/oai_reward_function/std": 0.1987723708152771,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07967641018331051,
"epoch": 0.15714285714285714,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.07661325484514236,
"kl": 0.0069638064596802,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.0001,
"num_tokens": 195896.0,
"reward": 1.1062500476837158,
"reward_std": 0.06087504327297211,
"rewards/oai_reward_function/mean": 0.5531250014901161,
"rewards/oai_reward_function/std": 0.08584260195493698,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0841637123376131,
"epoch": 0.17142857142857143,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05056445300579071,
"kl": 0.011949660489335656,
"learning_rate": 4.214285714285714e-05,
"loss": 0.0001,
"num_tokens": 213760.0,
"reward": 1.131250023841858,
"reward_std": 0.029124131426215172,
"rewards/oai_reward_function/mean": 0.5656249970197678,
"rewards/oai_reward_function/std": 0.11875531077384949,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0837175901979208,
"epoch": 0.18571428571428572,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1177188903093338,
"kl": 0.01176721346564591,
"learning_rate": 4.1428571428571437e-05,
"loss": 0.0001,
"num_tokens": 231664.0,
"reward": 1.2421875,
"reward_std": 0.02758825570344925,
"rewards/oai_reward_function/mean": 0.62109375,
"rewards/oai_reward_function/std": 0.1868790090084076,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07715502567589283,
"epoch": 0.2,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0020164160523563623,
"kl": 0.013234916375949979,
"learning_rate": 4.0714285714285717e-05,
"loss": 0.0001,
"num_tokens": 249528.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0780396144837141,
"epoch": 0.21428571428571427,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0018555221613496542,
"kl": 0.011373426881618798,
"learning_rate": 4e-05,
"loss": 0.0001,
"num_tokens": 267168.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0733959898352623,
"epoch": 0.22857142857142856,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09124160557985306,
"kl": 0.021819928660988808,
"learning_rate": 3.928571428571429e-05,
"loss": 0.0002,
"num_tokens": 284928.0,
"reward": 1.0484375953674316,
"reward_std": 0.05051835626363754,
"rewards/oai_reward_function/mean": 0.5242187511175871,
"rewards/oai_reward_function/std": 0.044669199734926224,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09740176424384117,
"epoch": 0.24285714285714285,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.052958983927965164,
"kl": 0.028434510342776775,
"learning_rate": 3.857142857142858e-05,
"loss": 0.0003,
"num_tokens": 302816.0,
"reward": 1.071874976158142,
"reward_std": 0.06469365209341049,
"rewards/oai_reward_function/mean": 0.5359374992549419,
"rewards/oai_reward_function/std": 0.0882028192281723,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08574963361024857,
"epoch": 0.2571428571428571,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.04292497783899307,
"kl": 0.033173230942338705,
"learning_rate": 3.785714285714286e-05,
"loss": 0.0003,
"num_tokens": 320584.0,
"reward": 1.001562476158142,
"reward_std": 0.004419416189193726,
"rewards/oai_reward_function/mean": 0.5007812500116415,
"rewards/oai_reward_function/std": 0.0044194175861775875,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12318380549550056,
"epoch": 0.2714285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.06502827256917953,
"kl": 0.03774468321353197,
"learning_rate": 3.7142857142857143e-05,
"loss": 0.0004,
"num_tokens": 338448.0,
"reward": 1.109375,
"reward_std": 0.05164698138833046,
"rewards/oai_reward_function/mean": 0.5546875,
"rewards/oai_reward_function/std": 0.10803177952766418,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08734610676765442,
"epoch": 0.2857142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07792048156261444,
"kl": 0.028081147465854883,
"learning_rate": 3.642857142857143e-05,
"loss": 0.0003,
"num_tokens": 356200.0,
"reward": 1.03125,
"reward_std": 0.047612957656383514,
"rewards/oai_reward_function/mean": 0.515625,
"rewards/oai_reward_function/std": 0.04151855409145355,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08050715737044811,
"epoch": 0.3,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.047350119799375534,
"kl": 0.026192680466920137,
"learning_rate": 3.571428571428572e-05,
"loss": 0.0003,
"num_tokens": 373912.0,
"reward": 0.503125011920929,
"reward_std": 0.008838832378387451,
"rewards/oai_reward_function/mean": 0.25156250002328306,
"rewards/oai_reward_function/std": 0.26283908169716597,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07859978079795837,
"epoch": 0.3142857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.18296176195144653,
"kl": 0.035550063010305166,
"learning_rate": 3.5e-05,
"loss": 0.0004,
"num_tokens": 391880.0,
"reward": 0.2578125,
"reward_std": 0.4363012909889221,
"rewards/oai_reward_function/mean": 0.12890625,
"rewards/oai_reward_function/std": 0.28480061888694763,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07332467474043369,
"epoch": 0.32857142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.26302284002304077,
"kl": 0.02304189372807741,
"learning_rate": 3.428571428571429e-05,
"loss": 0.0002,
"num_tokens": 409592.0,
"reward": 0.4375,
"reward_std": 0.3335031569004059,
"rewards/oai_reward_function/mean": 0.21875,
"rewards/oai_reward_function/std": 0.2520080506801605,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08370361104607582,
"epoch": 0.34285714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07096434384584427,
"kl": 0.02303632628172636,
"learning_rate": 3.357142857142857e-05,
"loss": 0.0002,
"num_tokens": 427504.0,
"reward": 1.0906250476837158,
"reward_std": 0.12288369983434677,
"rewards/oai_reward_function/mean": 0.5453124977648258,
"rewards/oai_reward_function/std": 0.18977738916873932,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10516241379082203,
"epoch": 0.35714285714285715,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.10253780335187912,
"kl": 0.022893703542649746,
"learning_rate": 3.285714285714286e-05,
"loss": 0.0002,
"num_tokens": 445464.0,
"reward": 1.0281250476837158,
"reward_std": 0.11285631358623505,
"rewards/oai_reward_function/mean": 0.5140625014901161,
"rewards/oai_reward_function/std": 0.13734418153762817,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10455058515071869,
"epoch": 0.37142857142857144,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0020646003540605307,
"kl": 0.013847913593053818,
"learning_rate": 3.2142857142857144e-05,
"loss": 0.0001,
"num_tokens": 463176.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10624882206320763,
"epoch": 0.38571428571428573,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08771698921918869,
"kl": 0.023737956769764423,
"learning_rate": 3.142857142857143e-05,
"loss": 0.0002,
"num_tokens": 480896.0,
"reward": 1.1234374046325684,
"reward_std": 0.12076057493686676,
"rewards/oai_reward_function/mean": 0.5617187507450581,
"rewards/oai_reward_function/std": 0.09692539274692535,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09867865778505802,
"epoch": 0.4,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.04759760946035385,
"kl": 0.016957666259258986,
"learning_rate": 3.071428571428572e-05,
"loss": 0.0002,
"num_tokens": 498752.0,
"reward": 1.0515625476837158,
"reward_std": 0.016952523961663246,
"rewards/oai_reward_function/mean": 0.525781249627471,
"rewards/oai_reward_function/std": 0.048144761472940445,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12629481963813305,
"epoch": 0.4142857142857143,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.058567993342876434,
"kl": 0.017663696315139532,
"learning_rate": 3e-05,
"loss": 0.0002,
"num_tokens": 516552.0,
"reward": 1.2234375476837158,
"reward_std": 0.018139135092496872,
"rewards/oai_reward_function/mean": 0.6117187514901161,
"rewards/oai_reward_function/std": 0.1933349370956421,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.1236942820250988,
"epoch": 0.42857142857142855,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.08224395662546158,
"kl": 0.011707060737535357,
"learning_rate": 2.9285714285714288e-05,
"loss": 0.0001,
"num_tokens": 534336.0,
"reward": 1.1906249523162842,
"reward_std": 0.09417471289634705,
"rewards/oai_reward_function/mean": 0.5953124985098839,
"rewards/oai_reward_function/std": 0.2836897447705269,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12007096596062183,
"epoch": 0.44285714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12164021283388138,
"kl": 0.015199759975075722,
"learning_rate": 2.857142857142857e-05,
"loss": 0.0002,
"num_tokens": 552288.0,
"reward": 1.459375023841858,
"reward_std": 0.23513765633106232,
"rewards/oai_reward_function/mean": 0.729687511920929,
"rewards/oai_reward_function/std": 0.31374088674783707,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12509393319487572,
"epoch": 0.45714285714285713,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09083209186792374,
"kl": 0.01757637900300324,
"learning_rate": 2.785714285714286e-05,
"loss": 0.0002,
"num_tokens": 570160.0,
"reward": 1.076562523841858,
"reward_std": 0.04446931555867195,
"rewards/oai_reward_function/mean": 0.5382812470197678,
"rewards/oai_reward_function/std": 0.07267481088638306,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12463634088635445,
"epoch": 0.4714285714285714,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002431818749755621,
"kl": 0.014958202606067061,
"learning_rate": 2.714285714285714e-05,
"loss": 0.0001,
"num_tokens": 587872.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11686164513230324,
"epoch": 0.4857142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.05484098196029663,
"kl": 0.010952673037536442,
"learning_rate": 2.642857142857143e-05,
"loss": 0.0001,
"num_tokens": 605824.0,
"reward": 1.0968749523162842,
"reward_std": 0.09722718596458435,
"rewards/oai_reward_function/mean": 0.5484375022351742,
"rewards/oai_reward_function/std": 0.0920066386461258,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12041523866355419,
"epoch": 0.5,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05225397273898125,
"kl": 0.006640716805122793,
"learning_rate": 2.5714285714285714e-05,
"loss": 0.0001,
"num_tokens": 623624.0,
"reward": 1.0046875476837158,
"reward_std": 0.0093002924695611,
"rewards/oai_reward_function/mean": 0.5023437500931323,
"rewards/oai_reward_function/std": 0.009753772988915443,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12032002210617065,
"epoch": 0.5142857142857142,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07161174714565277,
"kl": 0.010428835870698094,
"learning_rate": 2.5e-05,
"loss": 0.0001,
"num_tokens": 641400.0,
"reward": 1.0437500476837158,
"reward_std": 0.052891530096530914,
"rewards/oai_reward_function/mean": 0.521874999627471,
"rewards/oai_reward_function/std": 0.04741290956735611,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.13013662584125996,
"epoch": 0.5285714285714286,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05804259702563286,
"kl": 0.01170262903906405,
"learning_rate": 2.4285714285714288e-05,
"loss": 0.0001,
"num_tokens": 659192.0,
"reward": 1.0812499523162842,
"reward_std": 0.07288689911365509,
"rewards/oai_reward_function/mean": 0.5406250022351742,
"rewards/oai_reward_function/std": 0.09954533725976944,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09727449901401997,
"epoch": 0.5428571428571428,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.07038255035877228,
"kl": 0.009029814857058227,
"learning_rate": 2.357142857142857e-05,
"loss": 0.0001,
"num_tokens": 677088.0,
"reward": 1.423437476158142,
"reward_std": 0.03818885609507561,
"rewards/oai_reward_function/mean": 0.711718738079071,
"rewards/oai_reward_function/std": 0.18094559013843536,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11586509644985199,
"epoch": 0.5571428571428572,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08595240861177444,
"kl": 0.011346436338499188,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.0001,
"num_tokens": 694920.0,
"reward": 1.3796875476837158,
"reward_std": 0.049540840089321136,
"rewards/oai_reward_function/mean": 0.6898437440395355,
"rewards/oai_reward_function/std": 0.20018735527992249,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.1129021979868412,
"epoch": 0.5714285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.03896208480000496,
"kl": 0.011648714076727629,
"learning_rate": 2.214285714285714e-05,
"loss": 0.0001,
"num_tokens": 712560.0,
"reward": 1.0031249523162842,
"reward_std": 0.008838832378387451,
"rewards/oai_reward_function/mean": 0.5015625000232831,
"rewards/oai_reward_function/std": 0.008838835172355175,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12671913765370846,
"epoch": 0.5857142857142857,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.060481104999780655,
"kl": 0.009595283307135105,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.0001,
"num_tokens": 730352.0,
"reward": 1.037500023841858,
"reward_std": 0.026726119220256805,
"rewards/oai_reward_function/mean": 0.5187500007450581,
"rewards/oai_reward_function/std": 0.0416397787630558,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.1355144940316677,
"epoch": 0.6,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.10250900685787201,
"kl": 0.010706432163715363,
"learning_rate": 2.0714285714285718e-05,
"loss": 0.0001,
"num_tokens": 748080.0,
"reward": 0.971875011920929,
"reward_std": 0.16737449169158936,
"rewards/oai_reward_function/mean": 0.48593750037252903,
"rewards/oai_reward_function/std": 0.18062228709459305,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11447549611330032,
"epoch": 0.6142857142857143,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07029257714748383,
"kl": 0.011148489313200116,
"learning_rate": 2e-05,
"loss": 0.0001,
"num_tokens": 765848.0,
"reward": 1.029687523841858,
"reward_std": 0.06395581364631653,
"rewards/oai_reward_function/mean": 0.5148437507450581,
"rewards/oai_reward_function/std": 0.05420219525694847,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12138544581830502,
"epoch": 0.6285714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07489942759275436,
"kl": 0.009310122113674879,
"learning_rate": 1.928571428571429e-05,
"loss": 0.0001,
"num_tokens": 783552.0,
"reward": 1.015625,
"reward_std": 0.03808925300836563,
"rewards/oai_reward_function/mean": 0.5078125,
"rewards/oai_reward_function/std": 0.02870701625943184,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11241224221885204,
"epoch": 0.6428571428571429,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06217681244015694,
"kl": 0.015002928674221039,
"learning_rate": 1.8571428571428572e-05,
"loss": 0.0001,
"num_tokens": 801392.0,
"reward": 1.171875,
"reward_std": 0.12756596505641937,
"rewards/oai_reward_function/mean": 0.5859375,
"rewards/oai_reward_function/std": 0.13151375949382782,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10430784896016121,
"epoch": 0.6571428571428571,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.07851167023181915,
"kl": 0.013715020613744855,
"learning_rate": 1.785714285714286e-05,
"loss": 0.0001,
"num_tokens": 819120.0,
"reward": 1.1765625476837158,
"reward_std": 0.11721621453762054,
"rewards/oai_reward_function/mean": 0.5882812440395355,
"rewards/oai_reward_function/std": 0.23893966525793076,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09174064546823502,
"epoch": 0.6714285714285714,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08239107578992844,
"kl": 0.0339348167181015,
"learning_rate": 1.7142857142857145e-05,
"loss": 0.0003,
"num_tokens": 836976.0,
"reward": 1.1734375953674316,
"reward_std": 0.07495103776454926,
"rewards/oai_reward_function/mean": 0.5867187529802322,
"rewards/oai_reward_function/std": 0.09024705737829208,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12333916500210762,
"epoch": 0.6857142857142857,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.06266991049051285,
"kl": 0.011174799175933003,
"learning_rate": 1.642857142857143e-05,
"loss": 0.0001,
"num_tokens": 854808.0,
"reward": 1.021875023841858,
"reward_std": 0.03390507400035858,
"rewards/oai_reward_function/mean": 0.5109375007450581,
"rewards/oai_reward_function/std": 0.03753358870744705,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11699695512652397,
"epoch": 0.7,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06871096044778824,
"kl": 0.011643779696896672,
"learning_rate": 1.5714285714285715e-05,
"loss": 0.0001,
"num_tokens": 872616.0,
"reward": 1.2109375,
"reward_std": 0.020290398970246315,
"rewards/oai_reward_function/mean": 0.6054687574505806,
"rewards/oai_reward_function/std": 0.18247121572494507,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11293753050267696,
"epoch": 0.7142857142857143,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.03746737167239189,
"kl": 0.008202132536098361,
"learning_rate": 1.5e-05,
"loss": 0.0001,
"num_tokens": 890472.0,
"reward": 1.0281250476837158,
"reward_std": 0.008838837966322899,
"rewards/oai_reward_function/mean": 0.514062499627471,
"rewards/oai_reward_function/std": 0.026133574545383453,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.12108040601015091,
"epoch": 0.7285714285714285,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09108272194862366,
"kl": 0.009169791359454393,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.0001,
"num_tokens": 908352.0,
"reward": 1.126562476158142,
"reward_std": 0.1380167454481125,
"rewards/oai_reward_function/mean": 0.5632812455296516,
"rewards/oai_reward_function/std": 0.1459098607301712,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10115997679531574,
"epoch": 0.7428571428571429,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06731049716472626,
"kl": 0.007746399496681988,
"learning_rate": 1.357142857142857e-05,
"loss": 0.0001,
"num_tokens": 926032.0,
"reward": 1.045312523841858,
"reward_std": 0.04133228585124016,
"rewards/oai_reward_function/mean": 0.5226562507450581,
"rewards/oai_reward_function/std": 0.04369957000017166,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.114451814442873,
"epoch": 0.7571428571428571,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0016380356391891837,
"kl": 0.008339080261066556,
"learning_rate": 1.2857142857142857e-05,
"loss": 0.0001,
"num_tokens": 943784.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.1146883126348257,
"epoch": 0.7714285714285715,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0652991309762001,
"kl": 0.014742115745320916,
"learning_rate": 1.2142857142857144e-05,
"loss": 0.0001,
"num_tokens": 961592.0,
"reward": 1.162500023841858,
"reward_std": 0.09099893271923065,
"rewards/oai_reward_function/mean": 0.5812500044703484,
"rewards/oai_reward_function/std": 0.11896733194589615,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0963958241045475,
"epoch": 0.7857142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06369847059249878,
"kl": 0.00836158636957407,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.0001,
"num_tokens": 979312.0,
"reward": 1.1375000476837158,
"reward_std": 0.055009134113788605,
"rewards/oai_reward_function/mean": 0.5687500014901161,
"rewards/oai_reward_function/std": 0.12556324899196625,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11964921839535236,
"epoch": 0.8,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.10623525083065033,
"kl": 0.008312122779898345,
"learning_rate": 1.0714285714285714e-05,
"loss": 0.0001,
"num_tokens": 997072.0,
"reward": 1.0031249523162842,
"reward_std": 0.11129148304462433,
"rewards/oai_reward_function/mean": 0.501562500372529,
"rewards/oai_reward_function/std": 0.12565238773822784,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10028179734945297,
"epoch": 0.8142857142857143,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05555571988224983,
"kl": 0.012380573665723205,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 1014864.0,
"reward": 1.015625,
"reward_std": 0.0265165027230978,
"rewards/oai_reward_function/mean": 0.5078125,
"rewards/oai_reward_function/std": 0.02870701625943184,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09480222314596176,
"epoch": 0.8285714285714286,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09733164310455322,
"kl": 0.010292174993082881,
"learning_rate": 9.285714285714286e-06,
"loss": 0.0001,
"num_tokens": 1032656.0,
"reward": 1.125,
"reward_std": 0.10169674456119537,
"rewards/oai_reward_function/mean": 0.5624999962747097,
"rewards/oai_reward_function/std": 0.10375995188951492,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10845490545034409,
"epoch": 0.8428571428571429,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07499091327190399,
"kl": 0.009870404610410333,
"learning_rate": 8.571428571428573e-06,
"loss": 0.0001,
"num_tokens": 1050472.0,
"reward": 1.0187499523162842,
"reward_std": 0.02493581920862198,
"rewards/oai_reward_function/mean": 0.509375000372529,
"rewards/oai_reward_function/std": 0.019827887415885925,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11447742953896523,
"epoch": 0.8571428571428571,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05116976425051689,
"kl": 0.005832118098624051,
"learning_rate": 7.857142857142858e-06,
"loss": 0.0001,
"num_tokens": 1068288.0,
"reward": 1.0343749523162842,
"reward_std": 0.029693374410271645,
"rewards/oai_reward_function/mean": 0.517187500372529,
"rewards/oai_reward_function/std": 0.04136652871966362,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10109574533998966,
"epoch": 0.8714285714285714,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06300117075443268,
"kl": 0.007976277614943683,
"learning_rate": 7.142857142857143e-06,
"loss": 0.0001,
"num_tokens": 1086200.0,
"reward": 1.0109374523162842,
"reward_std": 0.023685520514845848,
"rewards/oai_reward_function/mean": 0.5054687499068677,
"rewards/oai_reward_function/std": 0.01765984110534191,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10328171029686928,
"epoch": 0.8857142857142857,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.041226889938116074,
"kl": 0.00717292504850775,
"learning_rate": 6.428571428571429e-06,
"loss": 0.0001,
"num_tokens": 1104056.0,
"reward": 1.09375,
"reward_std": 0.03720119222998619,
"rewards/oai_reward_function/mean": 0.546875,
"rewards/oai_reward_function/std": 0.08974651247262955,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09696869738399982,
"epoch": 0.9,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.03586564213037491,
"kl": 0.009956882800906897,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.0001,
"num_tokens": 1121872.0,
"reward": 1.1218750476837158,
"reward_std": 0.031160593032836914,
"rewards/oai_reward_function/mean": 0.5609375014901161,
"rewards/oai_reward_function/std": 0.11124978214502335,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10472088679671288,
"epoch": 0.9142857142857143,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.045453496277332306,
"kl": 0.008746590930968523,
"learning_rate": 5e-06,
"loss": 0.0001,
"num_tokens": 1139616.0,
"reward": 1.0046875476837158,
"reward_std": 0.00930030457675457,
"rewards/oai_reward_function/mean": 0.5023437500931323,
"rewards/oai_reward_function/std": 0.009753772988915443,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09775208681821823,
"epoch": 0.9285714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07111279666423798,
"kl": 0.007679712725803256,
"learning_rate": 4.285714285714286e-06,
"loss": 0.0001,
"num_tokens": 1157472.0,
"reward": 1.0890624523162842,
"reward_std": 0.04253753647208214,
"rewards/oai_reward_function/mean": 0.5445312485098839,
"rewards/oai_reward_function/std": 0.08174862712621689,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10566045716404915,
"epoch": 0.9428571428571428,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09820882230997086,
"kl": 0.005833235685713589,
"learning_rate": 3.5714285714285714e-06,
"loss": 0.0001,
"num_tokens": 1175344.0,
"reward": 1.3125,
"reward_std": 0.0763113722205162,
"rewards/oai_reward_function/mean": 0.65625,
"rewards/oai_reward_function/std": 0.199495330452919,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09616570547223091,
"epoch": 0.9571428571428572,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.060254957526922226,
"kl": 0.005365552264265716,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0001,
"num_tokens": 1193248.0,
"reward": 1.0703125,
"reward_std": 0.026579536497592926,
"rewards/oai_reward_function/mean": 0.53515625,
"rewards/oai_reward_function/std": 0.06377232819795609,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09799160063266754,
"epoch": 0.9714285714285714,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0033068626653403044,
"kl": 0.010790573665872216,
"learning_rate": 2.142857142857143e-06,
"loss": 0.0001,
"num_tokens": 1210992.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09545023553073406,
"epoch": 0.9857142857142858,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05336514860391617,
"kl": 0.005807638866826892,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.0001,
"num_tokens": 1228816.0,
"reward": 1.0125000476837158,
"reward_std": 0.013363069854676723,
"rewards/oai_reward_function/mean": 0.5062500000931323,
"rewards/oai_reward_function/std": 0.016800537705421448,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10368440486490726,
"epoch": 1.0,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.050165776163339615,
"kl": 0.008004623581655324,
"learning_rate": 7.142857142857143e-07,
"loss": 0.0001,
"num_tokens": 1246584.0,
"reward": 1.017187476158142,
"reward_std": 0.017598580569028854,
"rewards/oai_reward_function/mean": 0.5085937501862645,
"rewards/oai_reward_function/std": 0.022548669949173927,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09804531745612621,
"epoch": 1.0142857142857142,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.05341744422912598,
"kl": 0.01710776425898075,
"learning_rate": 0.0,
"loss": 0.0002,
"num_tokens": 1264416.0,
"reward": 1.0875000953674316,
"reward_std": 0.03174196928739548,
"rewards/oai_reward_function/mean": 0.5437500029802322,
"rewards/oai_reward_function/std": 0.0375671461224556,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09854021109640598,
"epoch": 1.0285714285714285,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06506258249282837,
"kl": 0.009508747374638915,
"learning_rate": 4.4928571428571434e-05,
"loss": 0.0001,
"num_tokens": 1282296.0,
"reward": 1.0406250953674316,
"reward_std": 0.0222018975764513,
"rewards/oai_reward_function/mean": 0.5203125011175871,
"rewards/oai_reward_function/std": 0.035603947937488556,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08583058044314384,
"epoch": 1.042857142857143,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07061073184013367,
"kl": 0.005951485480181873,
"learning_rate": 4.485714285714286e-05,
"loss": 0.0001,
"num_tokens": 1300008.0,
"reward": 1.0234375,
"reward_std": 0.01804211549460888,
"rewards/oai_reward_function/mean": 0.51171875,
"rewards/oai_reward_function/std": 0.020064787939190865,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09507345780730247,
"epoch": 1.0571428571428572,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.05930415913462639,
"kl": 0.007875082548707724,
"learning_rate": 4.478571428571429e-05,
"loss": 0.0001,
"num_tokens": 1317832.0,
"reward": 1.234375,
"reward_std": 0.01088879257440567,
"rewards/oai_reward_function/mean": 0.6171875,
"rewards/oai_reward_function/std": 0.20452910661697388,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09059166349470615,
"epoch": 1.0714285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.04293040931224823,
"kl": 0.01093563821632415,
"learning_rate": 4.471428571428571e-05,
"loss": 0.0001,
"num_tokens": 1335584.0,
"reward": 1.0281250476837158,
"reward_std": 0.008838837966322899,
"rewards/oai_reward_function/mean": 0.514062499627471,
"rewards/oai_reward_function/std": 0.026133574545383453,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08499786630272865,
"epoch": 1.0857142857142856,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06890492141246796,
"kl": 0.007947787176817656,
"learning_rate": 4.464285714285715e-05,
"loss": 0.0001,
"num_tokens": 1353400.0,
"reward": 1.0734374523162842,
"reward_std": 0.03388907015323639,
"rewards/oai_reward_function/mean": 0.5367187522351742,
"rewards/oai_reward_function/std": 0.05607611685991287,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08190344646573067,
"epoch": 1.1,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07835045456886292,
"kl": 0.010214838432148099,
"learning_rate": 4.4571428571428574e-05,
"loss": 0.0001,
"num_tokens": 1371176.0,
"reward": 1.21875,
"reward_std": 0.03153933212161064,
"rewards/oai_reward_function/mean": 0.609375,
"rewards/oai_reward_function/std": 0.12727762758731842,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09901309013366699,
"epoch": 1.1142857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.07858891785144806,
"kl": 0.006452999892644584,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.0001,
"num_tokens": 1388952.0,
"reward": 1.053125023841858,
"reward_std": 0.028757737949490547,
"rewards/oai_reward_function/mean": 0.5265625007450581,
"rewards/oai_reward_function/std": 0.02905604988336563,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09012427926063538,
"epoch": 1.1285714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.08503545820713043,
"kl": 0.01038876292295754,
"learning_rate": 4.442857142857143e-05,
"loss": 0.0001,
"num_tokens": 1406744.0,
"reward": 1.084375023841858,
"reward_std": 0.07080081105232239,
"rewards/oai_reward_function/mean": 0.5421874970197678,
"rewards/oai_reward_function/std": 0.09233474731445312,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07716062478721142,
"epoch": 1.1428571428571428,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.0729447677731514,
"kl": 0.012507579056546092,
"learning_rate": 4.435714285714286e-05,
"loss": 0.0001,
"num_tokens": 1424568.0,
"reward": 1.2468750476837158,
"reward_std": 0.03139737993478775,
"rewards/oai_reward_function/mean": 0.6234374940395355,
"rewards/oai_reward_function/std": 0.19123300909996033,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09090105071663857,
"epoch": 1.157142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.07926075905561447,
"kl": 0.010987127898260951,
"learning_rate": 4.428571428571428e-05,
"loss": 0.0001,
"num_tokens": 1442480.0,
"reward": 1.0984375476837158,
"reward_std": 0.0697232112288475,
"rewards/oai_reward_function/mean": 0.5492187514901161,
"rewards/oai_reward_function/std": 0.06006864085793495,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08928278088569641,
"epoch": 1.1714285714285715,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08457206189632416,
"kl": 0.004951049922965467,
"learning_rate": 4.4214285714285714e-05,
"loss": 0.0,
"num_tokens": 1460344.0,
"reward": 1.0875000953674316,
"reward_std": 0.04518735408782959,
"rewards/oai_reward_function/mean": 0.5437499992549419,
"rewards/oai_reward_function/std": 0.07156093418598175,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08692280948162079,
"epoch": 1.1857142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09246931225061417,
"kl": 0.015749768121168017,
"learning_rate": 4.4142857142857144e-05,
"loss": 0.0002,
"num_tokens": 1478248.0,
"reward": 1.264062523841858,
"reward_std": 0.03826536983251572,
"rewards/oai_reward_function/mean": 0.6320312470197678,
"rewards/oai_reward_function/std": 0.17668935656547546,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08908558450639248,
"epoch": 1.2,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05387440696358681,
"kl": 0.0058196637546643615,
"learning_rate": 4.4071428571428575e-05,
"loss": 0.0001,
"num_tokens": 1496112.0,
"reward": 1.0078125,
"reward_std": 0.011451572179794312,
"rewards/oai_reward_function/mean": 0.50390625,
"rewards/oai_reward_function/std": 0.012872475199401379,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08258137106895447,
"epoch": 1.2142857142857142,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05658518522977829,
"kl": 0.004440092074219137,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.0,
"num_tokens": 1513752.0,
"reward": 1.001562476158142,
"reward_std": 0.004419416189193726,
"rewards/oai_reward_function/mean": 0.5007812500116415,
"rewards/oai_reward_function/std": 0.0044194175861775875,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07238267548382282,
"epoch": 1.2285714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07421422004699707,
"kl": 0.006456690724007785,
"learning_rate": 4.392857142857143e-05,
"loss": 0.0001,
"num_tokens": 1531512.0,
"reward": 1.048437476158142,
"reward_std": 0.023024337366223335,
"rewards/oai_reward_function/mean": 0.5242187492549419,
"rewards/oai_reward_function/std": 0.030772563070058823,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08944158256053925,
"epoch": 1.2428571428571429,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06827304512262344,
"kl": 0.006732087349519134,
"learning_rate": 4.385714285714286e-05,
"loss": 0.0001,
"num_tokens": 1549400.0,
"reward": 1.1703124046325684,
"reward_std": 0.06780597567558289,
"rewards/oai_reward_function/mean": 0.5851562544703484,
"rewards/oai_reward_function/std": 0.15987133979797363,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08645510673522949,
"epoch": 1.2571428571428571,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06966782361268997,
"kl": 0.008698969963006675,
"learning_rate": 4.3785714285714284e-05,
"loss": 0.0001,
"num_tokens": 1567168.0,
"reward": 1.0187499523162842,
"reward_std": 0.018725106492638588,
"rewards/oai_reward_function/mean": 0.509375000372529,
"rewards/oai_reward_function/std": 0.01878357119858265,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10576100833714008,
"epoch": 1.2714285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.04759611934423447,
"kl": 0.009460748406127095,
"learning_rate": 4.371428571428572e-05,
"loss": 0.0001,
"num_tokens": 1585032.0,
"reward": 1.0812499523162842,
"reward_std": 0.07165143638849258,
"rewards/oai_reward_function/mean": 0.5406249985098839,
"rewards/oai_reward_function/std": 0.0987318754196167,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07568562775850296,
"epoch": 1.2857142857142856,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.052769020199775696,
"kl": 0.005130159552209079,
"learning_rate": 4.3642857142857146e-05,
"loss": 0.0001,
"num_tokens": 1602784.0,
"reward": 1.0562500953674316,
"reward_std": 0.03836483508348465,
"rewards/oai_reward_function/mean": 0.5281250011175871,
"rewards/oai_reward_function/std": 0.03952847048640251,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08074977435171604,
"epoch": 1.3,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07572436332702637,
"kl": 0.00757291610352695,
"learning_rate": 4.3571428571428576e-05,
"loss": 0.0001,
"num_tokens": 1620496.0,
"reward": 1.0546875,
"reward_std": 0.032445792108774185,
"rewards/oai_reward_function/mean": 0.52734375,
"rewards/oai_reward_function/std": 0.03321446478366852,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07088322378695011,
"epoch": 1.3142857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.07621358335018158,
"kl": 0.005998459528200328,
"learning_rate": 4.35e-05,
"loss": 0.0001,
"num_tokens": 1638464.0,
"reward": 1.2015624046325684,
"reward_std": 0.11482575535774231,
"rewards/oai_reward_function/mean": 0.6007812544703484,
"rewards/oai_reward_function/std": 0.26282399147748947,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06381132267415524,
"epoch": 1.3285714285714285,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.030587607994675636,
"kl": 0.007369687547907233,
"learning_rate": 4.342857142857143e-05,
"loss": 0.0001,
"num_tokens": 1656176.0,
"reward": 1.0031249523162842,
"reward_std": 0.008838832378387451,
"rewards/oai_reward_function/mean": 0.5015625000232831,
"rewards/oai_reward_function/std": 0.008838835172355175,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07285293377935886,
"epoch": 1.342857142857143,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0500265508890152,
"kl": 0.006194314104504883,
"learning_rate": 4.3357142857142855e-05,
"loss": 0.0001,
"num_tokens": 1674088.0,
"reward": 1.109375,
"reward_std": 0.04590248316526413,
"rewards/oai_reward_function/mean": 0.5546875037252903,
"rewards/oai_reward_function/std": 0.07967613637447357,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.086557412520051,
"epoch": 1.3571428571428572,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07414961606264114,
"kl": 0.010996793280355632,
"learning_rate": 4.328571428571429e-05,
"loss": 0.0001,
"num_tokens": 1692048.0,
"reward": 1.0671875476837158,
"reward_std": 0.03708447515964508,
"rewards/oai_reward_function/mean": 0.5335937514901161,
"rewards/oai_reward_function/std": 0.04561823233962059,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08167718537151814,
"epoch": 1.3714285714285714,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0010896283201873302,
"kl": 0.004855156294070184,
"learning_rate": 4.3214285714285716e-05,
"loss": 0.0,
"num_tokens": 1709760.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07798840664327145,
"epoch": 1.3857142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09227096289396286,
"kl": 0.014819784788414836,
"learning_rate": 4.314285714285715e-05,
"loss": 0.0001,
"num_tokens": 1727480.0,
"reward": 1.2296874523162842,
"reward_std": 0.1029118224978447,
"rewards/oai_reward_function/mean": 0.6148437485098839,
"rewards/oai_reward_function/std": 0.12918156385421753,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07132465578615665,
"epoch": 1.4,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07690515369176865,
"kl": 0.0082227170933038,
"learning_rate": 4.307142857142857e-05,
"loss": 0.0001,
"num_tokens": 1745336.0,
"reward": 1.037500023841858,
"reward_std": 0.023145508021116257,
"rewards/oai_reward_function/mean": 0.5187500007450581,
"rewards/oai_reward_function/std": 0.030453559011220932,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08682013675570488,
"epoch": 1.4142857142857144,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09899340569972992,
"kl": 0.007965923519805074,
"learning_rate": 4.3e-05,
"loss": 0.0001,
"num_tokens": 1763136.0,
"reward": 1.1531250476837158,
"reward_std": 0.13869836926460266,
"rewards/oai_reward_function/mean": 0.5765625014901161,
"rewards/oai_reward_function/std": 0.2821534648537636,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08919607102870941,
"epoch": 1.4285714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.080818772315979,
"kl": 0.007981272647157311,
"learning_rate": 4.292857142857143e-05,
"loss": 0.0001,
"num_tokens": 1780920.0,
"reward": 1.2093749046325684,
"reward_std": 0.020411580801010132,
"rewards/oai_reward_function/mean": 0.6046875044703484,
"rewards/oai_reward_function/std": 0.18110741674900055,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07852962799370289,
"epoch": 1.4428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09296028316020966,
"kl": 0.012157463701441884,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.0001,
"num_tokens": 1798872.0,
"reward": 1.5265624523162842,
"reward_std": 0.04206090793013573,
"rewards/oai_reward_function/mean": 0.7632812559604645,
"rewards/oai_reward_function/std": 0.22771519422531128,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.086120730265975,
"epoch": 1.457142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.0995735228061676,
"kl": 0.012111627496778965,
"learning_rate": 4.278571428571429e-05,
"loss": 0.0001,
"num_tokens": 1816744.0,
"reward": 1.0968749523162842,
"reward_std": 0.049927353858947754,
"rewards/oai_reward_function/mean": 0.5484374985098839,
"rewards/oai_reward_function/std": 0.049974795430898666,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07998536713421345,
"epoch": 1.4714285714285715,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06524667888879776,
"kl": 0.006055153091438115,
"learning_rate": 4.271428571428572e-05,
"loss": 0.0001,
"num_tokens": 1834456.0,
"reward": 1.0125000476837158,
"reward_std": 0.02314549870789051,
"rewards/oai_reward_function/mean": 0.5062500000931323,
"rewards/oai_reward_function/std": 0.016800537705421448,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07001950591802597,
"epoch": 1.4857142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11485958099365234,
"kl": 0.00914135156199336,
"learning_rate": 4.264285714285715e-05,
"loss": 0.0001,
"num_tokens": 1852408.0,
"reward": 1.1859374046325684,
"reward_std": 0.08434940874576569,
"rewards/oai_reward_function/mean": 0.5929687544703484,
"rewards/oai_reward_function/std": 0.09821683913469315,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07728070393204689,
"epoch": 1.5,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07163766771554947,
"kl": 0.006072127376683056,
"learning_rate": 4.257142857142857e-05,
"loss": 0.0001,
"num_tokens": 1870208.0,
"reward": 1.0125000476837158,
"reward_std": 0.02314549870789051,
"rewards/oai_reward_function/mean": 0.5062500000931323,
"rewards/oai_reward_function/std": 0.016800537705421448,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08027334697544575,
"epoch": 1.5142857142857142,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07313752919435501,
"kl": 0.011275349417701364,
"learning_rate": 4.25e-05,
"loss": 0.0001,
"num_tokens": 1887984.0,
"reward": 1.0593750476837158,
"reward_std": 0.022558562457561493,
"rewards/oai_reward_function/mean": 0.529687499627471,
"rewards/oai_reward_function/std": 0.03386256843805313,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08169634826481342,
"epoch": 1.5285714285714285,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09352786093950272,
"kl": 0.014267339138314128,
"learning_rate": 4.242857142857143e-05,
"loss": 0.0001,
"num_tokens": 1905776.0,
"reward": 1.115625023841858,
"reward_std": 0.055196452885866165,
"rewards/oai_reward_function/mean": 0.5578125044703484,
"rewards/oai_reward_function/std": 0.09427942335605621,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05949794687330723,
"epoch": 1.5428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.08965466171503067,
"kl": 0.014173903269693255,
"learning_rate": 4.2357142857142864e-05,
"loss": 0.0001,
"num_tokens": 1923672.0,
"reward": 1.357812523841858,
"reward_std": 0.07401138544082642,
"rewards/oai_reward_function/mean": 0.6789062470197678,
"rewards/oai_reward_function/std": 0.1690949946641922,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08150264620780945,
"epoch": 1.5571428571428572,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08684907853603363,
"kl": 0.015842870343476534,
"learning_rate": 4.228571428571429e-05,
"loss": 0.0002,
"num_tokens": 1941504.0,
"reward": 1.334375023841858,
"reward_std": 0.03491953760385513,
"rewards/oai_reward_function/mean": 0.6671874970197678,
"rewards/oai_reward_function/std": 0.18364998698234558,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06982677057385445,
"epoch": 1.5714285714285714,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06661536544561386,
"kl": 0.008391729556024075,
"learning_rate": 4.221428571428572e-05,
"loss": 0.0001,
"num_tokens": 1959144.0,
"reward": 1.03125,
"reward_std": 0.019731827080249786,
"rewards/oai_reward_function/mean": 0.515625,
"rewards/oai_reward_function/std": 0.025988519191741943,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09021224454045296,
"epoch": 1.5857142857142859,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05824963003396988,
"kl": 0.008994318312034011,
"learning_rate": 4.214285714285714e-05,
"loss": 0.0001,
"num_tokens": 1976936.0,
"reward": 1.037500023841858,
"reward_std": 0.013363069854676723,
"rewards/oai_reward_function/mean": 0.5187500007450581,
"rewards/oai_reward_function/std": 0.0353553369641304,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09183148294687271,
"epoch": 1.6,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.08340641111135483,
"kl": 0.010920959059149027,
"learning_rate": 4.2071428571428574e-05,
"loss": 0.0001,
"num_tokens": 1994664.0,
"reward": 1.0390625,
"reward_std": 0.027564914897084236,
"rewards/oai_reward_function/mean": 0.51953125,
"rewards/oai_reward_function/std": 0.0395205020904541,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08369805663824081,
"epoch": 1.6142857142857143,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.09711972624063492,
"kl": 0.009947408339940012,
"learning_rate": 4.2e-05,
"loss": 0.0001,
"num_tokens": 2012432.0,
"reward": 1.0703125,
"reward_std": 0.02308514341711998,
"rewards/oai_reward_function/mean": 0.53515625,
"rewards/oai_reward_function/std": 0.04438621550798416,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.096822340041399,
"epoch": 1.6285714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07382018864154816,
"kl": 0.017859197221696377,
"learning_rate": 4.192857142857143e-05,
"loss": 0.0002,
"num_tokens": 2030136.0,
"reward": 1.0359375476837158,
"reward_std": 0.035533398389816284,
"rewards/oai_reward_function/mean": 0.517968749627471,
"rewards/oai_reward_function/std": 0.03252441808581352,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08726120926439762,
"epoch": 1.6428571428571428,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0782691165804863,
"kl": 0.012930417666211724,
"learning_rate": 4.185714285714286e-05,
"loss": 0.0001,
"num_tokens": 2047976.0,
"reward": 1.1749999523162842,
"reward_std": 0.08762745559215546,
"rewards/oai_reward_function/mean": 0.5874999985098839,
"rewards/oai_reward_function/std": 0.1177750751376152,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07392177730798721,
"epoch": 1.657142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.07505157589912415,
"kl": 0.012273511849343777,
"learning_rate": 4.178571428571429e-05,
"loss": 0.0001,
"num_tokens": 2065704.0,
"reward": 1.1953125,
"reward_std": 0.06742400676012039,
"rewards/oai_reward_function/mean": 0.59765625,
"rewards/oai_reward_function/std": 0.13019207119941711,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.065590625628829,
"epoch": 1.6714285714285713,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08693981915712357,
"kl": 0.02323699276894331,
"learning_rate": 4.1714285714285714e-05,
"loss": 0.0002,
"num_tokens": 2083560.0,
"reward": 1.1328125,
"reward_std": 0.04593653976917267,
"rewards/oai_reward_function/mean": 0.56640625,
"rewards/oai_reward_function/std": 0.04902656376361847,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09886737167835236,
"epoch": 1.6857142857142857,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05793704837560654,
"kl": 0.015232619596645236,
"learning_rate": 4.1642857142857144e-05,
"loss": 0.0002,
"num_tokens": 2101392.0,
"reward": 1.03125,
"reward_std": 0.011572758667171001,
"rewards/oai_reward_function/mean": 0.515625,
"rewards/oai_reward_function/std": 0.029614457860589027,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09572554007172585,
"epoch": 1.7,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.03639426827430725,
"kl": 0.017274728044867516,
"learning_rate": 4.1571428571428575e-05,
"loss": 0.0002,
"num_tokens": 2119200.0,
"reward": 1.2000000476837158,
"reward_std": 0.018898215144872665,
"rewards/oai_reward_function/mean": 0.5999999940395355,
"rewards/oai_reward_function/std": 0.17689070105552673,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08603023178875446,
"epoch": 1.7142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10388008505105972,
"kl": 0.022097071167081594,
"learning_rate": 4.15e-05,
"loss": 0.0002,
"num_tokens": 2137056.0,
"reward": 1.0578125715255737,
"reward_std": 0.04522190988063812,
"rewards/oai_reward_function/mean": 0.528906250372529,
"rewards/oai_reward_function/std": 0.03971134498715401,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0950616579502821,
"epoch": 1.7285714285714286,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09005559235811234,
"kl": 0.015552334254607558,
"learning_rate": 4.1428571428571437e-05,
"loss": 0.0002,
"num_tokens": 2154936.0,
"reward": 1.268125057220459,
"reward_std": 0.0341712087392807,
"rewards/oai_reward_function/mean": 0.6340624988079071,
"rewards/oai_reward_function/std": 0.2080029398202896,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07687668316066265,
"epoch": 1.7428571428571429,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.08279760181903839,
"kl": 0.009949938859790564,
"learning_rate": 4.135714285714286e-05,
"loss": 0.0001,
"num_tokens": 2172616.0,
"reward": 1.0625,
"reward_std": 0.04204372316598892,
"rewards/oai_reward_function/mean": 0.53125,
"rewards/oai_reward_function/std": 0.04353345185518265,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08731912076473236,
"epoch": 1.7571428571428571,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06586393713951111,
"kl": 0.008837034576572478,
"learning_rate": 4.128571428571429e-05,
"loss": 0.0001,
"num_tokens": 2190368.0,
"reward": 1.0187499523162842,
"reward_std": 0.021777570247650146,
"rewards/oai_reward_function/mean": 0.509375000372529,
"rewards/oai_reward_function/std": 0.019827887415885925,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08952882327139378,
"epoch": 1.7714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09899036586284637,
"kl": 0.013539569219574332,
"learning_rate": 4.1214285714285715e-05,
"loss": 0.0001,
"num_tokens": 2208176.0,
"reward": 1.1140625476837158,
"reward_std": 0.06264616549015045,
"rewards/oai_reward_function/mean": 0.5570312514901161,
"rewards/oai_reward_function/std": 0.04455622285604477,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07565303146839142,
"epoch": 1.7857142857142856,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08115291595458984,
"kl": 0.009995393920689821,
"learning_rate": 4.1142857142857146e-05,
"loss": 0.0001,
"num_tokens": 2225896.0,
"reward": 1.1749999523162842,
"reward_std": 0.06661029160022736,
"rewards/oai_reward_function/mean": 0.5874999985098839,
"rewards/oai_reward_function/std": 0.1399884670972824,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09428555145859718,
"epoch": 1.8,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.06345849484205246,
"kl": 0.01248577213846147,
"learning_rate": 4.107142857142857e-05,
"loss": 0.0001,
"num_tokens": 2243656.0,
"reward": 1.0390625,
"reward_std": 0.02052600309252739,
"rewards/oai_reward_function/mean": 0.51953125,
"rewards/oai_reward_function/std": 0.0395205020904541,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07595096342265606,
"epoch": 1.8142857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09258188307285309,
"kl": 0.00919699075166136,
"learning_rate": 4.1e-05,
"loss": 0.0001,
"num_tokens": 2261448.0,
"reward": 1.0343749523162842,
"reward_std": 0.03808924928307533,
"rewards/oai_reward_function/mean": 0.517187500372529,
"rewards/oai_reward_function/std": 0.029400940984487534,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08019419759511948,
"epoch": 1.8285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13585439324378967,
"kl": 0.019885767716914415,
"learning_rate": 4.092857142857143e-05,
"loss": 0.0002,
"num_tokens": 2279240.0,
"reward": 1.3046875,
"reward_std": 0.1113169863820076,
"rewards/oai_reward_function/mean": 0.65234375,
"rewards/oai_reward_function/std": 0.17964564263820648,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09248529188334942,
"epoch": 1.842857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08260349929332733,
"kl": 0.013745760545134544,
"learning_rate": 4.085714285714286e-05,
"loss": 0.0001,
"num_tokens": 2297056.0,
"reward": 1.0734375715255737,
"reward_std": 0.034589797258377075,
"rewards/oai_reward_function/mean": 0.5367187522351742,
"rewards/oai_reward_function/std": 0.03359169885516167,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09751161187887192,
"epoch": 1.8571428571428572,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.06302473694086075,
"kl": 0.011824949877336621,
"learning_rate": 4.0785714285714286e-05,
"loss": 0.0001,
"num_tokens": 2314872.0,
"reward": 1.0203125476837158,
"reward_std": 0.013258256018161774,
"rewards/oai_reward_function/mean": 0.510156249627471,
"rewards/oai_reward_function/std": 0.021867798641324043,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09040896967053413,
"epoch": 1.8714285714285714,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09831514209508896,
"kl": 0.01766400644555688,
"learning_rate": 4.0714285714285717e-05,
"loss": 0.0002,
"num_tokens": 2332784.0,
"reward": 1.0187499523162842,
"reward_std": 0.028380058705806732,
"rewards/oai_reward_function/mean": 0.509375000372529,
"rewards/oai_reward_function/std": 0.01878357119858265,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09301545284688473,
"epoch": 1.8857142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.08136897534132004,
"kl": 0.02844544965773821,
"learning_rate": 4.064285714285714e-05,
"loss": 0.0003,
"num_tokens": 2350640.0,
"reward": 1.060937523841858,
"reward_std": 0.03093591332435608,
"rewards/oai_reward_function/mean": 0.5304687507450581,
"rewards/oai_reward_function/std": 0.054895199835300446,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08573882840573788,
"epoch": 1.9,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0735137015581131,
"kl": 0.023084456101059914,
"learning_rate": 4.057142857142857e-05,
"loss": 0.0002,
"num_tokens": 2368456.0,
"reward": 1.0734374523162842,
"reward_std": 0.02894335612654686,
"rewards/oai_reward_function/mean": 0.5367187522351742,
"rewards/oai_reward_function/std": 0.05461905151605606,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10109273716807365,
"epoch": 1.9142857142857141,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10477015376091003,
"kl": 0.03489594021812081,
"learning_rate": 4.05e-05,
"loss": 0.0003,
"num_tokens": 2386200.0,
"reward": 1.0046875476837158,
"reward_std": 0.11875393241643906,
"rewards/oai_reward_function/mean": 0.5023437514901161,
"rewards/oai_reward_function/std": 0.11234594881534576,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08120713755488396,
"epoch": 1.9285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10714246332645416,
"kl": 0.024183190893381834,
"learning_rate": 4.042857142857143e-05,
"loss": 0.0002,
"num_tokens": 2404056.0,
"reward": 1.09375,
"reward_std": 0.05726175755262375,
"rewards/oai_reward_function/mean": 0.546875,
"rewards/oai_reward_function/std": 0.05982164293527603,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.089906245470047,
"epoch": 1.9428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12431693077087402,
"kl": 0.038056795950978994,
"learning_rate": 4.035714285714286e-05,
"loss": 0.0004,
"num_tokens": 2421928.0,
"reward": 1.3406250476837158,
"reward_std": 0.05260005593299866,
"rewards/oai_reward_function/mean": 0.6703125089406967,
"rewards/oai_reward_function/std": 0.19317355751991272,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09174446761608124,
"epoch": 1.9571428571428573,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07566652446985245,
"kl": 0.023516141809523106,
"learning_rate": 4.028571428571429e-05,
"loss": 0.0002,
"num_tokens": 2439832.0,
"reward": 1.0640625953674316,
"reward_std": 0.026437407359480858,
"rewards/oai_reward_function/mean": 0.5320312529802322,
"rewards/oai_reward_function/std": 0.04027845337986946,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0811004675924778,
"epoch": 1.9714285714285715,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1272999793291092,
"kl": 0.035705497954040766,
"learning_rate": 4.021428571428572e-05,
"loss": 0.0004,
"num_tokens": 2457576.0,
"reward": 1.0421874523162842,
"reward_std": 0.04250866919755936,
"rewards/oai_reward_function/mean": 0.521093750372529,
"rewards/oai_reward_function/std": 0.03971134498715401,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08940452709794044,
"epoch": 1.9857142857142858,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10179316252470016,
"kl": 0.03542056027799845,
"learning_rate": 4.014285714285714e-05,
"loss": 0.0004,
"num_tokens": 2475400.0,
"reward": 1.0484375953674316,
"reward_std": 0.03541836887598038,
"rewards/oai_reward_function/mean": 0.5242187511175871,
"rewards/oai_reward_function/std": 0.04375720024108887,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08982641063630581,
"epoch": 2.0,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05322287976741791,
"kl": 0.024960508104413748,
"learning_rate": 4.007142857142857e-05,
"loss": 0.0002,
"num_tokens": 2493168.0,
"reward": 1.0421874523162842,
"reward_std": 0.024032622575759888,
"rewards/oai_reward_function/mean": 0.521093750372529,
"rewards/oai_reward_function/std": 0.04358407482504845,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07347713969647884,
"epoch": 2.0142857142857142,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.054996706545352936,
"kl": 0.029410825110971928,
"learning_rate": 4e-05,
"loss": 0.0003,
"num_tokens": 2510968.0,
"reward": 1.2937500476837158,
"reward_std": 0.006681524682790041,
"rewards/oai_reward_function/mean": 0.6468750089406967,
"rewards/oai_reward_function/std": 0.2041652947664261,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11200576089322567,
"epoch": 2.0285714285714285,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07548272609710693,
"kl": 0.03502320311963558,
"learning_rate": 3.9928571428571434e-05,
"loss": 0.0004,
"num_tokens": 2528744.0,
"reward": 1.095312476158142,
"reward_std": 0.0437462255358696,
"rewards/oai_reward_function/mean": 0.5476562529802322,
"rewards/oai_reward_function/std": 0.05692360922694206,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09486313536763191,
"epoch": 2.0428571428571427,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05399833247065544,
"kl": 0.03851825185120106,
"learning_rate": 3.985714285714286e-05,
"loss": 0.0004,
"num_tokens": 2546488.0,
"reward": 1.0125000476837158,
"reward_std": 0.01336306519806385,
"rewards/oai_reward_function/mean": 0.5062500000931323,
"rewards/oai_reward_function/std": 0.016800537705421448,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08246604166924953,
"epoch": 2.057142857142857,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.042957283556461334,
"kl": 0.03783240728080273,
"learning_rate": 3.978571428571429e-05,
"loss": 0.0004,
"num_tokens": 2564176.0,
"reward": 1.0234375,
"reward_std": 0.01695253700017929,
"rewards/oai_reward_function/mean": 0.51171875,
"rewards/oai_reward_function/std": 0.026169713586568832,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10432570241391659,
"epoch": 2.0714285714285716,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09850599616765976,
"kl": 0.037014870904386044,
"learning_rate": 3.971428571428571e-05,
"loss": 0.0004,
"num_tokens": 2581944.0,
"reward": 1.0250000953674316,
"reward_std": 0.15622428059577942,
"rewards/oai_reward_function/mean": 0.5124999992549419,
"rewards/oai_reward_function/std": 0.17416272684931755,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09751013852655888,
"epoch": 2.085714285714286,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.060189735144376755,
"kl": 0.050427704118192196,
"learning_rate": 3.964285714285714e-05,
"loss": 0.0005,
"num_tokens": 2599616.0,
"reward": 1.0265624523162842,
"reward_std": 0.008010865189135075,
"rewards/oai_reward_function/mean": 0.513281250372529,
"rewards/oai_reward_function/std": 0.024580655619502068,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.1012180857360363,
"epoch": 2.1,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09098206460475922,
"kl": 0.05104807484894991,
"learning_rate": 3.9571428571428574e-05,
"loss": 0.0005,
"num_tokens": 2617576.0,
"reward": 1.2890625,
"reward_std": 0.033694587647914886,
"rewards/oai_reward_function/mean": 0.6445312350988388,
"rewards/oai_reward_function/std": 0.1918431520462036,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08674592711031437,
"epoch": 2.1142857142857143,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0632624626159668,
"kl": 0.035351223312318325,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.0004,
"num_tokens": 2635248.0,
"reward": 1.0265624523162842,
"reward_std": 0.01813914254307747,
"rewards/oai_reward_function/mean": 0.513281250372529,
"rewards/oai_reward_function/std": 0.021982740610837936,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11086461879312992,
"epoch": 2.1285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13065817952156067,
"kl": 0.06040171813219786,
"learning_rate": 3.942857142857143e-05,
"loss": 0.0006,
"num_tokens": 2653096.0,
"reward": 1.037500023841858,
"reward_std": 0.14793866872787476,
"rewards/oai_reward_function/mean": 0.5187500044703484,
"rewards/oai_reward_function/std": 0.12740343809127808,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.11983237601816654,
"epoch": 2.142857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11544425040483475,
"kl": 0.06588536128401756,
"learning_rate": 3.935714285714286e-05,
"loss": 0.0007,
"num_tokens": 2670944.0,
"reward": 1.0812499523162842,
"reward_std": 0.035140641033649445,
"rewards/oai_reward_function/mean": 0.5406249985098839,
"rewards/oai_reward_function/std": 0.023546453565359116,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08963452652096748,
"epoch": 2.157142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06575662642717361,
"kl": 0.05113219376653433,
"learning_rate": 3.928571428571429e-05,
"loss": 0.0005,
"num_tokens": 2688680.0,
"reward": 1.154687523841858,
"reward_std": 0.03592789173126221,
"rewards/oai_reward_function/mean": 0.5773437470197678,
"rewards/oai_reward_function/std": 0.13520585000514984,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10959535092115402,
"epoch": 2.1714285714285713,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09100169688463211,
"kl": 0.04979555029422045,
"learning_rate": 3.9214285714285714e-05,
"loss": 0.0005,
"num_tokens": 2706528.0,
"reward": 1.3046875,
"reward_std": 0.032156482338905334,
"rewards/oai_reward_function/mean": 0.6523437350988388,
"rewards/oai_reward_function/std": 0.1942063421010971,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10072515532374382,
"epoch": 2.185714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11344427615404129,
"kl": 0.08104220405220985,
"learning_rate": 3.9142857142857145e-05,
"loss": 0.0008,
"num_tokens": 2724424.0,
"reward": 1.3984375,
"reward_std": 0.06430189311504364,
"rewards/oai_reward_function/mean": 0.69921875,
"rewards/oai_reward_function/std": 0.19055142998695374,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.13380656391382217,
"epoch": 2.2,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.10259576886892319,
"kl": 0.047852903604507446,
"learning_rate": 3.9071428571428575e-05,
"loss": 0.0005,
"num_tokens": 2742272.0,
"reward": 1.0578124523162842,
"reward_std": 0.026579542085528374,
"rewards/oai_reward_function/mean": 0.5289062485098839,
"rewards/oai_reward_function/std": 0.05354730039834976,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.1082126721739769,
"epoch": 2.2142857142857144,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10249610245227814,
"kl": 0.07078076247125864,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.0007,
"num_tokens": 2760088.0,
"reward": 1.0593750476837158,
"reward_std": 0.036339618265628815,
"rewards/oai_reward_function/mean": 0.5296875014901161,
"rewards/oai_reward_function/std": 0.04327215999364853,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10915260016918182,
"epoch": 2.2285714285714286,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08330399543046951,
"kl": 0.07353132590651512,
"learning_rate": 3.892857142857143e-05,
"loss": 0.0007,
"num_tokens": 2777936.0,
"reward": 1.25,
"reward_std": 0.046066030859947205,
"rewards/oai_reward_function/mean": 0.625,
"rewards/oai_reward_function/std": 0.1287345290184021,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08624540269374847,
"epoch": 2.242857142857143,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0943220779299736,
"kl": 0.06203949544578791,
"learning_rate": 3.885714285714286e-05,
"loss": 0.0006,
"num_tokens": 2795664.0,
"reward": 1.024999976158142,
"reward_std": 0.023145508021116257,
"rewards/oai_reward_function/mean": 0.5125000001862645,
"rewards/oai_reward_function/std": 0.02199706807732582,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09054199792444706,
"epoch": 2.257142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07400333881378174,
"kl": 0.04232563078403473,
"learning_rate": 3.8785714285714285e-05,
"loss": 0.0004,
"num_tokens": 2813352.0,
"reward": 1.0499999523162842,
"reward_std": 0.0258774571120739,
"rewards/oai_reward_function/mean": 0.5250000022351742,
"rewards/oai_reward_function/std": 0.03810004144906998,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.10258343070745468,
"epoch": 2.2714285714285714,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09145762026309967,
"kl": 0.07726636342704296,
"learning_rate": 3.8714285714285715e-05,
"loss": 0.0008,
"num_tokens": 2831304.0,
"reward": 1.0593750476837158,
"reward_std": 0.03966484218835831,
"rewards/oai_reward_function/mean": 0.5296875014901161,
"rewards/oai_reward_function/std": 0.05057631433010101,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08886106871068478,
"epoch": 2.2857142857142856,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.050053730607032776,
"kl": 0.0593466404825449,
"learning_rate": 3.8642857142857146e-05,
"loss": 0.0006,
"num_tokens": 2849216.0,
"reward": 1.0031249523162842,
"reward_std": 0.008838832378387451,
"rewards/oai_reward_function/mean": 0.5015625000232831,
"rewards/oai_reward_function/std": 0.008838835172355175,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08666450530290604,
"epoch": 2.3,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09668877720832825,
"kl": 0.037179723381996155,
"learning_rate": 3.857142857142858e-05,
"loss": 0.0004,
"num_tokens": 2867016.0,
"reward": 1.0671875476837158,
"reward_std": 0.034592773765325546,
"rewards/oai_reward_function/mean": 0.5335937514901161,
"rewards/oai_reward_function/std": 0.038942355662584305,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08739319443702698,
"epoch": 2.314285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05101403221487999,
"kl": 0.016047978308051825,
"learning_rate": 3.85e-05,
"loss": 0.0002,
"num_tokens": 2884784.0,
"reward": 1.001562476158142,
"reward_std": 0.004419416189193726,
"rewards/oai_reward_function/mean": 0.5007812500116415,
"rewards/oai_reward_function/std": 0.0044194175861775875,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07174593396484852,
"epoch": 2.3285714285714287,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09460754692554474,
"kl": 0.031096406280994415,
"learning_rate": 3.842857142857143e-05,
"loss": 0.0003,
"num_tokens": 2902576.0,
"reward": 1.0315624475479126,
"reward_std": 0.03596320003271103,
"rewards/oai_reward_function/mean": 0.5157812498509884,
"rewards/oai_reward_function/std": 0.02667333371937275,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0965243298560381,
"epoch": 2.342857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10237058997154236,
"kl": 0.025380919221788645,
"learning_rate": 3.8357142857142855e-05,
"loss": 0.0003,
"num_tokens": 2920416.0,
"reward": 1.071874976158142,
"reward_std": 0.045641690492630005,
"rewards/oai_reward_function/mean": 0.5359374992549419,
"rewards/oai_reward_function/std": 0.04396548494696617,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0735629927366972,
"epoch": 2.357142857142857,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.001382152666337788,
"kl": 0.011448808014392853,
"learning_rate": 3.8285714285714286e-05,
"loss": 0.0001,
"num_tokens": 2938224.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07639794796705246,
"epoch": 2.3714285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05336588993668556,
"kl": 0.0100309734698385,
"learning_rate": 3.821428571428572e-05,
"loss": 0.0001,
"num_tokens": 2956000.0,
"reward": 1.0187499523162842,
"reward_std": 0.017677675932645798,
"rewards/oai_reward_function/mean": 0.509375000372529,
"rewards/oai_reward_function/std": 0.023546453565359116,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0797042902559042,
"epoch": 2.3857142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09860816597938538,
"kl": 0.02236688695847988,
"learning_rate": 3.814285714285715e-05,
"loss": 0.0002,
"num_tokens": 2973728.0,
"reward": 1.0421874523162842,
"reward_std": 0.03380424156785011,
"rewards/oai_reward_function/mean": 0.5210937485098839,
"rewards/oai_reward_function/std": 0.03052588365972042,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.09247681871056557,
"epoch": 2.4,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11807835847139359,
"kl": 0.028510943986475468,
"learning_rate": 3.807142857142857e-05,
"loss": 0.0003,
"num_tokens": 2991648.0,
"reward": 1.0703125,
"reward_std": 0.04926247149705887,
"rewards/oai_reward_function/mean": 0.53515625,
"rewards/oai_reward_function/std": 0.036400206387043,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0499194972217083,
"epoch": 2.414285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11116263270378113,
"kl": 0.04615373630076647,
"learning_rate": 3.8e-05,
"loss": 0.0005,
"num_tokens": 3009472.0,
"reward": 1.131250023841858,
"reward_std": 0.0681503415107727,
"rewards/oai_reward_function/mean": 0.5656249970197678,
"rewards/oai_reward_function/std": 0.06772513687610626,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07691787928342819,
"epoch": 2.4285714285714284,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.043317168951034546,
"kl": 0.026559457648545504,
"learning_rate": 3.792857142857143e-05,
"loss": 0.0003,
"num_tokens": 3027312.0,
"reward": 1.0812499523162842,
"reward_std": 0.013363059610128403,
"rewards/oai_reward_function/mean": 0.5406249985098839,
"rewards/oai_reward_function/std": 0.04867187887430191,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08763985149562359,
"epoch": 2.442857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11738862097263336,
"kl": 0.028244417626410723,
"learning_rate": 3.785714285714286e-05,
"loss": 0.0003,
"num_tokens": 3045248.0,
"reward": 1.2421875,
"reward_std": 0.038010139018297195,
"rewards/oai_reward_function/mean": 0.62109375,
"rewards/oai_reward_function/std": 0.17051976919174194,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07618978433310986,
"epoch": 2.4571428571428573,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.06447312235832214,
"kl": 0.009853521827608347,
"learning_rate": 3.778571428571429e-05,
"loss": 0.0001,
"num_tokens": 3063104.0,
"reward": 1.0125000476837158,
"reward_std": 0.013363069854676723,
"rewards/oai_reward_function/mean": 0.5062500000931323,
"rewards/oai_reward_function/std": 0.016800537705421448,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05904076434671879,
"epoch": 2.4714285714285715,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.08978112041950226,
"kl": 0.03310586418956518,
"learning_rate": 3.771428571428572e-05,
"loss": 0.0003,
"num_tokens": 3081032.0,
"reward": 1.109375,
"reward_std": 0.04799327254295349,
"rewards/oai_reward_function/mean": 0.5546875,
"rewards/oai_reward_function/std": 0.05903713405132294,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0563393235206604,
"epoch": 2.4857142857142858,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11673219501972198,
"kl": 0.01918662153184414,
"learning_rate": 3.764285714285715e-05,
"loss": 0.0002,
"num_tokens": 3098880.0,
"reward": 1.1156249046325684,
"reward_std": 0.04642024636268616,
"rewards/oai_reward_function/mean": 0.5578125007450581,
"rewards/oai_reward_function/std": 0.04554221034049988,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.054440722800791264,
"epoch": 2.5,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0016670229379087687,
"kl": 0.011914134491235018,
"learning_rate": 3.757142857142857e-05,
"loss": 0.0001,
"num_tokens": 3116528.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07713918946683407,
"epoch": 2.5142857142857142,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09991607069969177,
"kl": 0.014265456004068255,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0001,
"num_tokens": 3134280.0,
"reward": 1.0140624046325684,
"reward_std": 0.026196977123618126,
"rewards/oai_reward_function/mean": 0.5070312502793968,
"rewards/oai_reward_function/std": 0.017079481855034828,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.060492053627967834,
"epoch": 2.5285714285714285,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11897552013397217,
"kl": 0.03578268736600876,
"learning_rate": 3.742857142857143e-05,
"loss": 0.0004,
"num_tokens": 3152096.0,
"reward": 1.09375,
"reward_std": 0.05294632539153099,
"rewards/oai_reward_function/mean": 0.546875,
"rewards/oai_reward_function/std": 0.044336508959531784,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.062405264005064964,
"epoch": 2.5428571428571427,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09012026339769363,
"kl": 0.017184360651299357,
"learning_rate": 3.735714285714286e-05,
"loss": 0.0002,
"num_tokens": 3169776.0,
"reward": 1.2531249523162842,
"reward_std": 0.029978279024362564,
"rewards/oai_reward_function/mean": 0.6265625059604645,
"rewards/oai_reward_function/std": 0.17049944400787354,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06565988063812256,
"epoch": 2.557142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.126982182264328,
"kl": 0.038017953746020794,
"learning_rate": 3.728571428571428e-05,
"loss": 0.0004,
"num_tokens": 3187712.0,
"reward": 1.3125,
"reward_std": 0.041240036487579346,
"rewards/oai_reward_function/mean": 0.65625,
"rewards/oai_reward_function/std": 0.1866512894630432,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.040435372851789,
"epoch": 2.571428571428571,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07750152051448822,
"kl": 0.03242550138384104,
"learning_rate": 3.721428571428572e-05,
"loss": 0.0003,
"num_tokens": 3205592.0,
"reward": 1.256250023841858,
"reward_std": 0.025646153837442398,
"rewards/oai_reward_function/mean": 0.628125011920929,
"rewards/oai_reward_function/std": 0.18651622533798218,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.047216037288308144,
"epoch": 2.585714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12098561972379684,
"kl": 0.026824071537703276,
"learning_rate": 3.7142857142857143e-05,
"loss": 0.0003,
"num_tokens": 3223480.0,
"reward": 1.0093750953674316,
"reward_std": 0.19897010922431946,
"rewards/oai_reward_function/mean": 0.5046875029802322,
"rewards/oai_reward_function/std": 0.21528521552681923,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08010220341384411,
"epoch": 2.6,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11290978640317917,
"kl": 0.022963001858443022,
"learning_rate": 3.7071428571428574e-05,
"loss": 0.0002,
"num_tokens": 3241304.0,
"reward": 1.1468751430511475,
"reward_std": 0.08732541650533676,
"rewards/oai_reward_function/mean": 0.5734374970197678,
"rewards/oai_reward_function/std": 0.11707756668329239,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06848571076989174,
"epoch": 2.6142857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10999053716659546,
"kl": 0.024655529763549566,
"learning_rate": 3.7e-05,
"loss": 0.0002,
"num_tokens": 3259048.0,
"reward": 1.165624976158142,
"reward_std": 0.0838727056980133,
"rewards/oai_reward_function/mean": 0.5828125029802322,
"rewards/oai_reward_function/std": 0.14652389287948608,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.049521847628057,
"epoch": 2.6285714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.10953173786401749,
"kl": 0.02982867369428277,
"learning_rate": 3.692857142857143e-05,
"loss": 0.0003,
"num_tokens": 3276808.0,
"reward": 1.171875,
"reward_std": 0.061461035162210464,
"rewards/oai_reward_function/mean": 0.5859375,
"rewards/oai_reward_function/std": 0.1271488517522812,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07211006805300713,
"epoch": 2.642857142857143,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.1074090376496315,
"kl": 0.036498697474598885,
"learning_rate": 3.685714285714286e-05,
"loss": 0.0004,
"num_tokens": 3294808.0,
"reward": 1.162500023841858,
"reward_std": 0.13342483341693878,
"rewards/oai_reward_function/mean": 0.5812499970197678,
"rewards/oai_reward_function/std": 0.1636282056570053,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.04203084297478199,
"epoch": 2.657142857142857,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.07064300775527954,
"kl": 0.02704466599971056,
"learning_rate": 3.678571428571429e-05,
"loss": 0.0003,
"num_tokens": 3312640.0,
"reward": 1.0750000476837158,
"reward_std": 0.018898211419582367,
"rewards/oai_reward_function/mean": 0.5375000014901161,
"rewards/oai_reward_function/std": 0.06839166581630707,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.054328473284840584,
"epoch": 2.6714285714285713,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.06820113956928253,
"kl": 0.022003832273185253,
"learning_rate": 3.671428571428572e-05,
"loss": 0.0002,
"num_tokens": 3330408.0,
"reward": 1.1531250476837158,
"reward_std": 0.0646936446428299,
"rewards/oai_reward_function/mean": 0.5765625014901161,
"rewards/oai_reward_function/std": 0.14809781312942505,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.051017552614212036,
"epoch": 2.685714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.08293981850147247,
"kl": 0.02239195117726922,
"learning_rate": 3.6642857142857145e-05,
"loss": 0.0002,
"num_tokens": 3348064.0,
"reward": 1.0109374523162842,
"reward_std": 0.017358144745230675,
"rewards/oai_reward_function/mean": 0.505468750372529,
"rewards/oai_reward_function/std": 0.015206077136099339,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.049897488206624985,
"epoch": 2.7,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09536179155111313,
"kl": 0.05423136055469513,
"learning_rate": 3.6571428571428576e-05,
"loss": 0.0005,
"num_tokens": 3365896.0,
"reward": 1.1484375,
"reward_std": 0.06608611345291138,
"rewards/oai_reward_function/mean": 0.57421875,
"rewards/oai_reward_function/std": 0.10268264263868332,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05249054729938507,
"epoch": 2.7142857142857144,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.13304099440574646,
"kl": 0.04763131029903889,
"learning_rate": 3.65e-05,
"loss": 0.0005,
"num_tokens": 3383800.0,
"reward": 1.0828125476837158,
"reward_std": 0.04099529981613159,
"rewards/oai_reward_function/mean": 0.5414062514901161,
"rewards/oai_reward_function/std": 0.04943608492612839,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.062329867854714394,
"epoch": 2.7285714285714286,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11173869669437408,
"kl": 0.03640593169257045,
"learning_rate": 3.642857142857143e-05,
"loss": 0.0004,
"num_tokens": 3401648.0,
"reward": 1.0562500953674316,
"reward_std": 0.023689784109592438,
"rewards/oai_reward_function/mean": 0.5281250011175871,
"rewards/oai_reward_function/std": 0.0274963341653347,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.058776866644620895,
"epoch": 2.742857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10382409393787384,
"kl": 0.03305045561864972,
"learning_rate": 3.6357142857142854e-05,
"loss": 0.0003,
"num_tokens": 3419408.0,
"reward": 1.217187523841858,
"reward_std": 0.02610759809613228,
"rewards/oai_reward_function/mean": 0.6085937470197678,
"rewards/oai_reward_function/std": 0.1844356507062912,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.04789746552705765,
"epoch": 2.757142857142857,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.05706682801246643,
"kl": 0.03994939010590315,
"learning_rate": 3.628571428571429e-05,
"loss": 0.0004,
"num_tokens": 3437112.0,
"reward": 1.015625,
"reward_std": 0.01860060542821884,
"rewards/oai_reward_function/mean": 0.5078125,
"rewards/oai_reward_function/std": 0.022394467145204544,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.062057855539023876,
"epoch": 2.7714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15330444276332855,
"kl": 0.084334472194314,
"learning_rate": 3.6214285714285716e-05,
"loss": 0.0008,
"num_tokens": 3454904.0,
"reward": 1.470312476158142,
"reward_std": 0.04739333689212799,
"rewards/oai_reward_function/mean": 0.735156238079071,
"rewards/oai_reward_function/std": 0.19775548577308655,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.04913834575563669,
"epoch": 2.7857142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11146184056997299,
"kl": 0.06489744689315557,
"learning_rate": 3.6142857142857146e-05,
"loss": 0.0006,
"num_tokens": 3472632.0,
"reward": 1.0703125,
"reward_std": 0.045694079250097275,
"rewards/oai_reward_function/mean": 0.53515625,
"rewards/oai_reward_function/std": 0.04438621550798416,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.057510885410010815,
"epoch": 2.8,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.1315256953239441,
"kl": 0.06008041184395552,
"learning_rate": 3.607142857142857e-05,
"loss": 0.0006,
"num_tokens": 3490592.0,
"reward": 1.060937523841858,
"reward_std": 0.02575094997882843,
"rewards/oai_reward_function/mean": 0.5304687507450581,
"rewards/oai_reward_function/std": 0.04522986710071564,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05490284040570259,
"epoch": 2.814285714285714,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1558970808982849,
"kl": 0.0605736318975687,
"learning_rate": 3.6e-05,
"loss": 0.0006,
"num_tokens": 3508408.0,
"reward": 1.1359374523162842,
"reward_std": 0.16093073785305023,
"rewards/oai_reward_function/mean": 0.5679687559604645,
"rewards/oai_reward_function/std": 0.12798601388931274,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05418549384921789,
"epoch": 2.8285714285714287,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.060409124940633774,
"kl": 0.07027391903102398,
"learning_rate": 3.5928571428571425e-05,
"loss": 0.0007,
"num_tokens": 3526168.0,
"reward": 1.0281250476837158,
"reward_std": 0.008838837966322899,
"rewards/oai_reward_function/mean": 0.514062499627471,
"rewards/oai_reward_function/std": 0.026133574545383453,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.03633992746472359,
"epoch": 2.842857142857143,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003255989169701934,
"kl": 0.07370059937238693,
"learning_rate": 3.585714285714286e-05,
"loss": 0.0007,
"num_tokens": 3543864.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/oai_reward_function/mean": 0.5,
"rewards/oai_reward_function/std": 0.0,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.057329680770635605,
"epoch": 2.857142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.11008545011281967,
"kl": 0.06796468701213598,
"learning_rate": 3.5785714285714286e-05,
"loss": 0.0007,
"num_tokens": 3561688.0,
"reward": 1.25,
"reward_std": 0.014625202864408493,
"rewards/oai_reward_function/mean": 0.625,
"rewards/oai_reward_function/std": 0.21655291318893433,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.04236162081360817,
"epoch": 2.8714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15067800879478455,
"kl": 0.10442700423300266,
"learning_rate": 3.571428571428572e-05,
"loss": 0.001,
"num_tokens": 3579560.0,
"reward": 1.2906250953674316,
"reward_std": 0.06347659230232239,
"rewards/oai_reward_function/mean": 0.6453125029802322,
"rewards/oai_reward_function/std": 0.18144108355045319,
"step": 201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.04874769877642393,
"epoch": 2.8857142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.08031316101551056,
"kl": 0.06067081820219755,
"learning_rate": 3.564285714285715e-05,
"loss": 0.0006,
"num_tokens": 3597200.0,
"reward": 1.037500023841858,
"reward_std": 0.019918914884328842,
"rewards/oai_reward_function/mean": 0.5187500007450581,
"rewards/oai_reward_function/std": 0.023759547621011734,
"step": 202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06220845878124237,
"epoch": 2.9,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.12046143412590027,
"kl": 0.05884059518575668,
"learning_rate": 3.557142857142857e-05,
"loss": 0.0006,
"num_tokens": 3615112.0,
"reward": 1.076562523841858,
"reward_std": 0.05444490164518356,
"rewards/oai_reward_function/mean": 0.5382812507450581,
"rewards/oai_reward_function/std": 0.04835369065403938,
"step": 203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05830034799873829,
"epoch": 2.914285714285714,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.09531212598085403,
"kl": 0.05973371770232916,
"learning_rate": 3.55e-05,
"loss": 0.0006,
"num_tokens": 3632936.0,
"reward": 1.1078124046325684,
"reward_std": 0.037323713302612305,
"rewards/oai_reward_function/mean": 0.5539062544703484,
"rewards/oai_reward_function/std": 0.07622901350259781,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05407467018812895,
"epoch": 2.928571428571429,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.07925046980381012,
"kl": 0.07386120036244392,
"learning_rate": 3.5428571428571426e-05,
"loss": 0.0007,
"num_tokens": 3650760.0,
"reward": 1.0140624046325684,
"reward_std": 0.02122672274708748,
"rewards/oai_reward_function/mean": 0.5070312502793968,
"rewards/oai_reward_function/std": 0.017079481855034828,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06182014662772417,
"epoch": 2.942857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11716300994157791,
"kl": 0.06741901952773333,
"learning_rate": 3.5357142857142864e-05,
"loss": 0.0007,
"num_tokens": 3668512.0,
"reward": 1.0906250476837158,
"reward_std": 0.055445872247219086,
"rewards/oai_reward_function/mean": 0.5453125014901161,
"rewards/oai_reward_function/std": 0.06968752294778824,
"step": 206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.059788716956973076,
"epoch": 2.9571428571428573,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.13249847292900085,
"kl": 0.08083864115178585,
"learning_rate": 3.528571428571429e-05,
"loss": 0.0008,
"num_tokens": 3686296.0,
"reward": 1.2609375715255737,
"reward_std": 0.032799478620290756,
"rewards/oai_reward_function/mean": 0.6304687410593033,
"rewards/oai_reward_function/std": 0.16235841810703278,
"step": 207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06789828836917877,
"epoch": 2.9714285714285715,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1304040104150772,
"kl": 0.07522418349981308,
"learning_rate": 3.521428571428572e-05,
"loss": 0.0008,
"num_tokens": 3704008.0,
"reward": 1.2593750953674316,
"reward_std": 0.05023520812392235,
"rewards/oai_reward_function/mean": 0.6296875029802322,
"rewards/oai_reward_function/std": 0.1668539047241211,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.04057574924081564,
"epoch": 2.9857142857142858,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1246933788061142,
"kl": 0.10525520890951157,
"learning_rate": 3.514285714285714e-05,
"loss": 0.0011,
"num_tokens": 3721888.0,
"reward": 1.2625000476837158,
"reward_std": 0.03328196331858635,
"rewards/oai_reward_function/mean": 0.6312500089406967,
"rewards/oai_reward_function/std": 0.1866512894630432,
"step": 209
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06538868602365255,
"epoch": 3.0,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11987200379371643,
"kl": 0.090326476842165,
"learning_rate": 3.507142857142857e-05,
"loss": 0.0009,
"num_tokens": 3739752.0,
"reward": 1.0437500476837158,
"reward_std": 0.040318816900253296,
"rewards/oai_reward_function/mean": 0.5218750014901161,
"rewards/oai_reward_function/std": 0.03521248698234558,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06806700490415096,
"epoch": 3.0142857142857142,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.08975692093372345,
"kl": 0.08757120184600353,
"learning_rate": 3.5e-05,
"loss": 0.0009,
"num_tokens": 3757520.0,
"reward": 1.0203125476837158,
"reward_std": 0.024814628064632416,
"rewards/oai_reward_function/mean": 0.510156249627471,
"rewards/oai_reward_function/std": 0.019938793033361435,
"step": 211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05651993863284588,
"epoch": 3.0285714285714285,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.10675106197595596,
"kl": 0.09534911066293716,
"learning_rate": 3.4928571428571434e-05,
"loss": 0.001,
"num_tokens": 3775296.0,
"reward": 1.0750000476837158,
"reward_std": 0.06767623126506805,
"rewards/oai_reward_function/mean": 0.5375000014901161,
"rewards/oai_reward_function/std": 0.07378040999174118,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0639553228393197,
"epoch": 3.0428571428571427,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09777996689081192,
"kl": 0.07890664599835873,
"learning_rate": 3.485714285714286e-05,
"loss": 0.0008,
"num_tokens": 3793032.0,
"reward": 1.0281250476837158,
"reward_std": 0.020751874893903732,
"rewards/oai_reward_function/mean": 0.5140625005587935,
"rewards/oai_reward_function/std": 0.021939707919955254,
"step": 213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06698552891612053,
"epoch": 3.057142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.1115046888589859,
"kl": 0.07001018989831209,
"learning_rate": 3.478571428571429e-05,
"loss": 0.0007,
"num_tokens": 3810744.0,
"reward": 1.056249976158142,
"reward_std": 0.030470959842205048,
"rewards/oai_reward_function/mean": 0.5281250011175871,
"rewards/oai_reward_function/std": 0.04741290956735611,
"step": 214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06257231812924147,
"epoch": 3.0714285714285716,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.10517910867929459,
"kl": 0.07934985496103764,
"learning_rate": 3.471428571428571e-05,
"loss": 0.0008,
"num_tokens": 3828496.0,
"reward": 1.235937476158142,
"reward_std": 0.012387894093990326,
"rewards/oai_reward_function/mean": 0.6179687529802322,
"rewards/oai_reward_function/std": 0.20793089270591736,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.055911571718752384,
"epoch": 3.085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15163163840770721,
"kl": 0.11663151904940605,
"learning_rate": 3.4642857142857144e-05,
"loss": 0.0012,
"num_tokens": 3846408.0,
"reward": 1.2062499523162842,
"reward_std": 0.1645711362361908,
"rewards/oai_reward_function/mean": 0.6031250059604645,
"rewards/oai_reward_function/std": 0.10957211256027222,
"step": 216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05231211241334677,
"epoch": 3.1,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.17118224501609802,
"kl": 0.12115776538848877,
"learning_rate": 3.4571428571428574e-05,
"loss": 0.0012,
"num_tokens": 3864168.0,
"reward": 1.0859375,
"reward_std": 0.1350831389427185,
"rewards/oai_reward_function/mean": 0.5429687462747097,
"rewards/oai_reward_function/std": 0.11469355970621109,
"step": 217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05661669000983238,
"epoch": 3.1142857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.13312071561813354,
"kl": 0.08971596322953701,
"learning_rate": 3.45e-05,
"loss": 0.0009,
"num_tokens": 3882016.0,
"reward": 1.2296874523162842,
"reward_std": 0.02697797492146492,
"rewards/oai_reward_function/mean": 0.6148437485098839,
"rewards/oai_reward_function/std": 0.1935303658246994,
"step": 218
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0506694195792079,
"epoch": 3.1285714285714286,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.10720871388912201,
"kl": 0.07849705778062344,
"learning_rate": 3.442857142857143e-05,
"loss": 0.0008,
"num_tokens": 3899952.0,
"reward": 1.1015625,
"reward_std": 0.044115059077739716,
"rewards/oai_reward_function/mean": 0.55078125,
"rewards/oai_reward_function/std": 0.055534202605485916,
"step": 219
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05330024380236864,
"epoch": 3.142857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.12851352989673615,
"kl": 0.0983127523213625,
"learning_rate": 3.435714285714286e-05,
"loss": 0.001,
"num_tokens": 3917688.0,
"reward": 1.365625023841858,
"reward_std": 0.12765255570411682,
"rewards/oai_reward_function/mean": 0.6828124970197678,
"rewards/oai_reward_function/std": 0.20135001838207245,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05522188264876604,
"epoch": 3.157142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.11083745956420898,
"kl": 0.07289117947220802,
"learning_rate": 3.428571428571429e-05,
"loss": 0.0007,
"num_tokens": 3935528.0,
"reward": 1.046875,
"reward_std": 0.0414334312081337,
"rewards/oai_reward_function/mean": 0.5234375,
"rewards/oai_reward_function/std": 0.039623990654945374,
"step": 221
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05520590580999851,
"epoch": 3.1714285714285713,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.11108041554689407,
"kl": 0.08299623243510723,
"learning_rate": 3.4214285714285714e-05,
"loss": 0.0008,
"num_tokens": 3953320.0,
"reward": 1.2531250715255737,
"reward_std": 0.017311176285147667,
"rewards/oai_reward_function/mean": 0.6265624910593033,
"rewards/oai_reward_function/std": 0.18985748291015625,
"step": 222
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05928301624953747,
"epoch": 3.185714285714286,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.1239665076136589,
"kl": 0.11972067691385746,
"learning_rate": 3.4142857142857145e-05,
"loss": 0.0012,
"num_tokens": 3971032.0,
"reward": 1.1640625,
"reward_std": 0.04833199828863144,
"rewards/oai_reward_function/mean": 0.58203125,
"rewards/oai_reward_function/std": 0.12608151137828827,
"step": 223
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.042910450138151646,
"epoch": 3.2,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1018366813659668,
"kl": 0.08956374414265156,
"learning_rate": 3.407142857142857e-05,
"loss": 0.0009,
"num_tokens": 3988864.0,
"reward": 1.2765624523162842,
"reward_std": 0.06503090262413025,
"rewards/oai_reward_function/mean": 0.6382812410593033,
"rewards/oai_reward_function/std": 0.19134333729743958,
"step": 224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06987146660685539,
"epoch": 3.2142857142857144,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.13029474020004272,
"kl": 0.11290079541504383,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.0011,
"num_tokens": 4006800.0,
"reward": 1.3203125,
"reward_std": 0.04008040949702263,
"rewards/oai_reward_function/mean": 0.66015625,
"rewards/oai_reward_function/std": 0.20209181308746338,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07145651057362556,
"epoch": 3.2285714285714286,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1022149994969368,
"kl": 0.05857388116419315,
"learning_rate": 3.392857142857143e-05,
"loss": 0.0006,
"num_tokens": 4024680.0,
"reward": 1.0499999523162842,
"reward_std": 0.03877411410212517,
"rewards/oai_reward_function/mean": 0.5249999985098839,
"rewards/oai_reward_function/std": 0.0416397750377655,
"step": 226
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.048385005444288254,
"epoch": 3.242857142857143,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.12312141805887222,
"kl": 0.07377888821065426,
"learning_rate": 3.385714285714286e-05,
"loss": 0.0007,
"num_tokens": 4042472.0,
"reward": 1.4500000476837158,
"reward_std": 0.02340090088546276,
"rewards/oai_reward_function/mean": 0.7249999940395355,
"rewards/oai_reward_function/std": 0.23026981949806213,
"step": 227
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0545792318880558,
"epoch": 3.257142857142857,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09947756677865982,
"kl": 0.09583424963057041,
"learning_rate": 3.3785714285714285e-05,
"loss": 0.001,
"num_tokens": 4060248.0,
"reward": 1.0265624523162842,
"reward_std": 0.10836321860551834,
"rewards/oai_reward_function/mean": 0.5132812485098839,
"rewards/oai_reward_function/std": 0.12380600348114967,
"step": 228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06396409310400486,
"epoch": 3.2714285714285714,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0686774030327797,
"kl": 0.09425997547805309,
"learning_rate": 3.3714285714285716e-05,
"loss": 0.0009,
"num_tokens": 4077880.0,
"reward": 1.2703125476837158,
"reward_std": 0.017598576843738556,
"rewards/oai_reward_function/mean": 0.6351562440395355,
"rewards/oai_reward_function/std": 0.2018921971321106,
"step": 229
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08321201242506504,
"epoch": 3.2857142857142856,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.06190980598330498,
"kl": 0.03877187706530094,
"learning_rate": 3.364285714285714e-05,
"loss": 0.0004,
"num_tokens": 4095672.0,
"reward": 1.0046875476837158,
"reward_std": 0.00646935636177659,
"rewards/oai_reward_function/mean": 0.5023437500931323,
"rewards/oai_reward_function/std": 0.007403614930808544,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.0575382262468338,
"epoch": 3.3,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.11537446081638336,
"kl": 0.08915554732084274,
"learning_rate": 3.357142857142857e-05,
"loss": 0.0009,
"num_tokens": 4113480.0,
"reward": 1.2359375953674316,
"reward_std": 0.08257875591516495,
"rewards/oai_reward_function/mean": 0.617968738079071,
"rewards/oai_reward_function/std": 0.156090646982193,
"step": 231
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05642772279679775,
"epoch": 3.314285714285714,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.13337863981723785,
"kl": 0.10237299278378487,
"learning_rate": 3.35e-05,
"loss": 0.001,
"num_tokens": 4131224.0,
"reward": 1.4734375476837158,
"reward_std": 0.01958364248275757,
"rewards/oai_reward_function/mean": 0.7367187440395355,
"rewards/oai_reward_function/std": 0.2408807873725891,
"step": 232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06528778094798326,
"epoch": 3.3285714285714287,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.13227899372577667,
"kl": 0.11398253589868546,
"learning_rate": 3.342857142857143e-05,
"loss": 0.0011,
"num_tokens": 4149184.0,
"reward": 1.3203125,
"reward_std": 0.03324369713664055,
"rewards/oai_reward_function/mean": 0.6601562350988388,
"rewards/oai_reward_function/std": 0.19673332571983337,
"step": 233
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06553995609283447,
"epoch": 3.342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20257574319839478,
"kl": 0.12704718858003616,
"learning_rate": 3.3357142857142856e-05,
"loss": 0.0013,
"num_tokens": 4167104.0,
"reward": 1.4609375,
"reward_std": 0.1858925223350525,
"rewards/oai_reward_function/mean": 0.73046875,
"rewards/oai_reward_function/std": 0.1834629327058792,
"step": 234
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.04393093287944794,
"epoch": 3.357142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.12950977683067322,
"kl": 0.09361258894205093,
"learning_rate": 3.3285714285714286e-05,
"loss": 0.0009,
"num_tokens": 4184944.0,
"reward": 1.1656250953674316,
"reward_std": 0.150077685713768,
"rewards/oai_reward_function/mean": 0.5828124955296516,
"rewards/oai_reward_function/std": 0.13220180571079254,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06838994100689888,
"epoch": 3.3714285714285714,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.14525781571865082,
"kl": 0.09100262448191643,
"learning_rate": 3.321428571428572e-05,
"loss": 0.0009,
"num_tokens": 4202672.0,
"reward": 1.4187500476837158,
"reward_std": 0.02699536457657814,
"rewards/oai_reward_function/mean": 0.7093749940395355,
"rewards/oai_reward_function/std": 0.21884500980377197,
"step": 236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05996893718838692,
"epoch": 3.3857142857142857,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.12374205142259598,
"kl": 0.13878228701651096,
"learning_rate": 3.314285714285714e-05,
"loss": 0.0014,
"num_tokens": 4220472.0,
"reward": 1.443750023841858,
"reward_std": 0.060242824256420135,
"rewards/oai_reward_function/mean": 0.721875011920929,
"rewards/oai_reward_function/std": 0.18898604810237885,
"step": 237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.04969180002808571,
"epoch": 3.4,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.14094124734401703,
"kl": 0.11433868668973446,
"learning_rate": 3.307142857142858e-05,
"loss": 0.0011,
"num_tokens": 4238352.0,
"reward": 1.5062499046325684,
"reward_std": 0.04232252389192581,
"rewards/oai_reward_function/mean": 0.7531249821186066,
"rewards/oai_reward_function/std": 0.21019864082336426,
"step": 238
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05842717830091715,
"epoch": 3.414285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13528573513031006,
"kl": 0.13230286352336407,
"learning_rate": 3.3e-05,
"loss": 0.0013,
"num_tokens": 4256072.0,
"reward": 1.5250000953674316,
"reward_std": 0.0736992210149765,
"rewards/oai_reward_function/mean": 0.762499988079071,
"rewards/oai_reward_function/std": 0.19999998807907104,
"step": 239
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.053723374381661415,
"epoch": 3.4285714285714284,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.10746068507432938,
"kl": 0.0968917403370142,
"learning_rate": 3.292857142857143e-05,
"loss": 0.001,
"num_tokens": 4273848.0,
"reward": 1.0499999523162842,
"reward_std": 0.017677675932645798,
"rewards/oai_reward_function/mean": 0.525000000372529,
"rewards/oai_reward_function/std": 0.028398092836141586,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.08724895678460598,
"epoch": 3.442857142857143,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1673547476530075,
"kl": 0.16037143021821976,
"learning_rate": 3.285714285714286e-05,
"loss": 0.0016,
"num_tokens": 4291872.0,
"reward": 1.603124976158142,
"reward_std": 0.11311184614896774,
"rewards/oai_reward_function/mean": 0.801562488079071,
"rewards/oai_reward_function/std": 0.22014817595481873,
"step": 241
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06301301345229149,
"epoch": 3.4571428571428573,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.127055823802948,
"kl": 0.10341309197247028,
"learning_rate": 3.278571428571429e-05,
"loss": 0.001,
"num_tokens": 4309640.0,
"reward": 1.265625,
"reward_std": 0.02265283279120922,
"rewards/oai_reward_function/mean": 0.6328125,
"rewards/oai_reward_function/std": 0.19933734834194183,
"step": 242
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.04968675132840872,
"epoch": 3.4714285714285715,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1374424248933792,
"kl": 0.1188412457704544,
"learning_rate": 3.271428571428571e-05,
"loss": 0.0012,
"num_tokens": 4327360.0,
"reward": 1.303125023841858,
"reward_std": 0.10906177759170532,
"rewards/oai_reward_function/mean": 0.651562511920929,
"rewards/oai_reward_function/std": 0.2058555632829666,
"step": 243
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06477249693125486,
"epoch": 3.4857142857142858,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1306895911693573,
"kl": 0.11430021747946739,
"learning_rate": 3.264285714285714e-05,
"loss": 0.0011,
"num_tokens": 4345192.0,
"reward": 1.4375,
"reward_std": 0.1379069834947586,
"rewards/oai_reward_function/mean": 0.71875,
"rewards/oai_reward_function/std": 0.18447834253311157,
"step": 244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06405621953308582,
"epoch": 3.5,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.1333678811788559,
"kl": 0.17231638357043266,
"learning_rate": 3.257142857142857e-05,
"loss": 0.0017,
"num_tokens": 4363064.0,
"reward": 1.5109375715255737,
"reward_std": 0.0335906445980072,
"rewards/oai_reward_function/mean": 0.7554687559604645,
"rewards/oai_reward_function/std": 0.22403325140476227,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.067863704636693,
"epoch": 3.5142857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13658057153224945,
"kl": 0.17316893115639687,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.0017,
"num_tokens": 4380968.0,
"reward": 1.5546875,
"reward_std": 0.05777457728981972,
"rewards/oai_reward_function/mean": 0.77734375,
"rewards/oai_reward_function/std": 0.2121661901473999,
"step": 246
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.05905670113861561,
"epoch": 3.5285714285714285,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.09611335396766663,
"kl": 0.050939660519361496,
"learning_rate": 3.242857142857143e-05,
"loss": 0.0005,
"num_tokens": 4398608.0,
"reward": 1.0125000476837158,
"reward_std": 0.02314549870789051,
"rewards/oai_reward_function/mean": 0.5062500000931323,
"rewards/oai_reward_function/std": 0.016800537705421448,
"step": 247
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07766996510326862,
"epoch": 3.5428571428571427,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.13582761585712433,
"kl": 0.12659209407866,
"learning_rate": 3.235714285714286e-05,
"loss": 0.0013,
"num_tokens": 4416392.0,
"reward": 1.5640625953674316,
"reward_std": 0.07988262921571732,
"rewards/oai_reward_function/mean": 0.782031238079071,
"rewards/oai_reward_function/std": 0.2193496972322464,
"step": 248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.07497746869921684,
"epoch": 3.557142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1760389357805252,
"kl": 0.24591631814837456,
"learning_rate": 3.228571428571428e-05,
"loss": 0.0025,
"num_tokens": 4434272.0,
"reward": 1.798437476158142,
"reward_std": 0.12221544235944748,
"rewards/oai_reward_function/mean": 0.899218738079071,
"rewards/oai_reward_function/std": 0.10424157232046127,
"step": 249
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.06046187411993742,
"epoch": 3.571428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15973122417926788,
"kl": 0.22475793957710266,
"learning_rate": 3.221428571428571e-05,
"loss": 0.0022,
"num_tokens": 4452104.0,
"reward": 1.6359374523162842,
"reward_std": 0.0966869369149208,
"rewards/oai_reward_function/mean": 0.8179687559604645,
"rewards/oai_reward_function/std": 0.19666926562786102,
"step": 250
}
],
"logging_steps": 1,
"max_steps": 700,
"num_input_tokens_seen": 4452104,
"num_train_epochs": 10,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}