{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.571428571428571,
  "eval_steps": 500,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.060131815262138844,
      "epoch": 0.014285714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.05771088972687721,
      "kl": 0.0,
      "learning_rate": 5e-05,
      "loss": 0.0,
      "num_tokens": 17832.0,
      "reward": 1.0437500476837158,
      "reward_std": 0.0353553369641304,
      "rewards/oai_reward_function/mean": 0.5218750014901161,
      "rewards/oai_reward_function/std": 0.043879419565200806,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06413675658404827,
      "epoch": 0.02857142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.03477979078888893,
      "kl": 0.0003001746808877215,
      "learning_rate": 4.928571428571429e-05,
      "loss": 0.0,
      "num_tokens": 35712.0,
      "reward": 1.046875,
      "reward_std": 0.028149789199233055,
      "rewards/oai_reward_function/mean": 0.5234375,
      "rewards/oai_reward_function/std": 0.049161311239004135,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.052969515323638916,
      "epoch": 0.04285714285714286,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0005888506420888007,
      "kl": 0.0004545010087895207,
      "learning_rate": 4.8571428571428576e-05,
      "loss": 0.0,
      "num_tokens": 53424.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06199027318507433,
      "epoch": 0.05714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04447643458843231,
      "kl": 0.0005710393161280081,
      "learning_rate": 4.785714285714286e-05,
      "loss": 0.0,
      "num_tokens": 71248.0,
      "reward": 1.2265625,
      "reward_std": 0.004419416189193726,
      "rewards/oai_reward_function/mean": 0.61328125,
      "rewards/oai_reward_function/std": 0.1993926614522934,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0633242828771472,
      "epoch": 0.07142857142857142,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.043302807956933975,
      "kl": 0.001818844728404656,
      "learning_rate": 4.714285714285714e-05,
      "loss": 0.0,
      "num_tokens": 89000.0,
      "reward": 1.032812476158142,
      "reward_std": 0.022097092121839523,
      "rewards/oai_reward_function/mean": 0.5164062492549419,
      "rewards/oai_reward_function/std": 0.03570114076137543,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06267449539154768,
      "epoch": 0.08571428571428572,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.048733897507190704,
      "kl": 0.0011250173120060936,
      "learning_rate": 4.642857142857143e-05,
      "loss": 0.0,
      "num_tokens": 106816.0,
      "reward": 1.071874976158142,
      "reward_std": 0.03390505909919739,
      "rewards/oai_reward_function/mean": 0.5359375029802322,
      "rewards/oai_reward_function/std": 0.07097747921943665,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06247459910809994,
      "epoch": 0.1,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07503627240657806,
      "kl": 0.0016785123152658343,
      "learning_rate": 4.5714285714285716e-05,
      "loss": 0.0,
      "num_tokens": 124592.0,
      "reward": 1.181249976158142,
      "reward_std": 0.06808801740407944,
      "rewards/oai_reward_function/mean": 0.5906250029802322,
      "rewards/oai_reward_function/std": 0.13951963186264038,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09173925407230854,
      "epoch": 0.11428571428571428,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.046880681067705154,
      "kl": 0.004017388273496181,
      "learning_rate": 4.5e-05,
      "loss": 0.0,
      "num_tokens": 142368.0,
      "reward": 1.001562476158142,
      "reward_std": 0.004419416189193726,
      "rewards/oai_reward_function/mean": 0.5007812500116415,
      "rewards/oai_reward_function/std": 0.0044194175861775875,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07956545054912567,
      "epoch": 0.12857142857142856,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.061871547251939774,
      "kl": 0.00639598595444113,
      "learning_rate": 4.428571428571428e-05,
      "loss": 0.0001,
      "num_tokens": 160160.0,
      "reward": 1.021875023841858,
      "reward_std": 0.052504248917102814,
      "rewards/oai_reward_function/mean": 0.5109375007450581,
      "rewards/oai_reward_function/std": 0.053482551127672195,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06956008821725845,
      "epoch": 0.14285714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06243785470724106,
      "kl": 0.00972771504893899,
      "learning_rate": 4.3571428571428576e-05,
      "loss": 0.0001,
      "num_tokens": 177984.0,
      "reward": 1.2296874523162842,
      "reward_std": 0.01684970036149025,
      "rewards/oai_reward_function/mean": 0.6148437485098839,
      "rewards/oai_reward_function/std": 0.1987723708152771,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07967641018331051,
      "epoch": 0.15714285714285714,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.07661325484514236,
      "kl": 0.0069638064596802,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 0.0001,
      "num_tokens": 195896.0,
      "reward": 1.1062500476837158,
      "reward_std": 0.06087504327297211,
      "rewards/oai_reward_function/mean": 0.5531250014901161,
      "rewards/oai_reward_function/std": 0.08584260195493698,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0841637123376131,
      "epoch": 0.17142857142857143,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05056445300579071,
      "kl": 0.011949660489335656,
      "learning_rate": 4.214285714285714e-05,
      "loss": 0.0001,
      "num_tokens": 213760.0,
      "reward": 1.131250023841858,
      "reward_std": 0.029124131426215172,
      "rewards/oai_reward_function/mean": 0.5656249970197678,
      "rewards/oai_reward_function/std": 0.11875531077384949,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0837175901979208,
      "epoch": 0.18571428571428572,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1177188903093338,
      "kl": 0.01176721346564591,
      "learning_rate": 4.1428571428571437e-05,
      "loss": 0.0001,
      "num_tokens": 231664.0,
      "reward": 1.2421875,
      "reward_std": 0.02758825570344925,
      "rewards/oai_reward_function/mean": 0.62109375,
      "rewards/oai_reward_function/std": 0.1868790090084076,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07715502567589283,
      "epoch": 0.2,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020164160523563623,
      "kl": 0.013234916375949979,
      "learning_rate": 4.0714285714285717e-05,
      "loss": 0.0001,
      "num_tokens": 249528.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0780396144837141,
      "epoch": 0.21428571428571427,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0018555221613496542,
      "kl": 0.011373426881618798,
      "learning_rate": 4e-05,
      "loss": 0.0001,
      "num_tokens": 267168.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0733959898352623,
      "epoch": 0.22857142857142856,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09124160557985306,
      "kl": 0.021819928660988808,
      "learning_rate": 3.928571428571429e-05,
      "loss": 0.0002,
      "num_tokens": 284928.0,
      "reward": 1.0484375953674316,
      "reward_std": 0.05051835626363754,
      "rewards/oai_reward_function/mean": 0.5242187511175871,
      "rewards/oai_reward_function/std": 0.044669199734926224,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09740176424384117,
      "epoch": 0.24285714285714285,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.052958983927965164,
      "kl": 0.028434510342776775,
      "learning_rate": 3.857142857142858e-05,
      "loss": 0.0003,
      "num_tokens": 302816.0,
      "reward": 1.071874976158142,
      "reward_std": 0.06469365209341049,
      "rewards/oai_reward_function/mean": 0.5359374992549419,
      "rewards/oai_reward_function/std": 0.0882028192281723,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08574963361024857,
      "epoch": 0.2571428571428571,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04292497783899307,
      "kl": 0.033173230942338705,
      "learning_rate": 3.785714285714286e-05,
      "loss": 0.0003,
      "num_tokens": 320584.0,
      "reward": 1.001562476158142,
      "reward_std": 0.004419416189193726,
      "rewards/oai_reward_function/mean": 0.5007812500116415,
      "rewards/oai_reward_function/std": 0.0044194175861775875,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12318380549550056,
      "epoch": 0.2714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06502827256917953,
      "kl": 0.03774468321353197,
      "learning_rate": 3.7142857142857143e-05,
      "loss": 0.0004,
      "num_tokens": 338448.0,
      "reward": 1.109375,
      "reward_std": 0.05164698138833046,
      "rewards/oai_reward_function/mean": 0.5546875,
      "rewards/oai_reward_function/std": 0.10803177952766418,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08734610676765442,
      "epoch": 0.2857142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07792048156261444,
      "kl": 0.028081147465854883,
      "learning_rate": 3.642857142857143e-05,
      "loss": 0.0003,
      "num_tokens": 356200.0,
      "reward": 1.03125,
      "reward_std": 0.047612957656383514,
      "rewards/oai_reward_function/mean": 0.515625,
      "rewards/oai_reward_function/std": 0.04151855409145355,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08050715737044811,
      "epoch": 0.3,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.047350119799375534,
      "kl": 0.026192680466920137,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.0003,
      "num_tokens": 373912.0,
      "reward": 0.503125011920929,
      "reward_std": 0.008838832378387451,
      "rewards/oai_reward_function/mean": 0.25156250002328306,
      "rewards/oai_reward_function/std": 0.26283908169716597,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07859978079795837,
      "epoch": 0.3142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.18296176195144653,
      "kl": 0.035550063010305166,
      "learning_rate": 3.5e-05,
      "loss": 0.0004,
      "num_tokens": 391880.0,
      "reward": 0.2578125,
      "reward_std": 0.4363012909889221,
      "rewards/oai_reward_function/mean": 0.12890625,
      "rewards/oai_reward_function/std": 0.28480061888694763,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07332467474043369,
      "epoch": 0.32857142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.26302284002304077,
      "kl": 0.02304189372807741,
      "learning_rate": 3.428571428571429e-05,
      "loss": 0.0002,
      "num_tokens": 409592.0,
      "reward": 0.4375,
      "reward_std": 0.3335031569004059,
      "rewards/oai_reward_function/mean": 0.21875,
      "rewards/oai_reward_function/std": 0.2520080506801605,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08370361104607582,
      "epoch": 0.34285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07096434384584427,
      "kl": 0.02303632628172636,
      "learning_rate": 3.357142857142857e-05,
      "loss": 0.0002,
      "num_tokens": 427504.0,
      "reward": 1.0906250476837158,
      "reward_std": 0.12288369983434677,
      "rewards/oai_reward_function/mean": 0.5453124977648258,
      "rewards/oai_reward_function/std": 0.18977738916873932,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10516241379082203,
      "epoch": 0.35714285714285715,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10253780335187912,
      "kl": 0.022893703542649746,
      "learning_rate": 3.285714285714286e-05,
      "loss": 0.0002,
      "num_tokens": 445464.0,
      "reward": 1.0281250476837158,
      "reward_std": 0.11285631358623505,
      "rewards/oai_reward_function/mean": 0.5140625014901161,
      "rewards/oai_reward_function/std": 0.13734418153762817,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10455058515071869,
      "epoch": 0.37142857142857144,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0020646003540605307,
      "kl": 0.013847913593053818,
      "learning_rate": 3.2142857142857144e-05,
      "loss": 0.0001,
      "num_tokens": 463176.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10624882206320763,
      "epoch": 0.38571428571428573,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08771698921918869,
      "kl": 0.023737956769764423,
      "learning_rate": 3.142857142857143e-05,
      "loss": 0.0002,
      "num_tokens": 480896.0,
      "reward": 1.1234374046325684,
      "reward_std": 0.12076057493686676,
      "rewards/oai_reward_function/mean": 0.5617187507450581,
      "rewards/oai_reward_function/std": 0.09692539274692535,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09867865778505802,
      "epoch": 0.4,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04759760946035385,
      "kl": 0.016957666259258986,
      "learning_rate": 3.071428571428572e-05,
      "loss": 0.0002,
      "num_tokens": 498752.0,
      "reward": 1.0515625476837158,
      "reward_std": 0.016952523961663246,
      "rewards/oai_reward_function/mean": 0.525781249627471,
      "rewards/oai_reward_function/std": 0.048144761472940445,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12629481963813305,
      "epoch": 0.4142857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.058567993342876434,
      "kl": 0.017663696315139532,
      "learning_rate": 3e-05,
      "loss": 0.0002,
      "num_tokens": 516552.0,
      "reward": 1.2234375476837158,
      "reward_std": 0.018139135092496872,
      "rewards/oai_reward_function/mean": 0.6117187514901161,
      "rewards/oai_reward_function/std": 0.1933349370956421,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.1236942820250988,
      "epoch": 0.42857142857142855,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.08224395662546158,
      "kl": 0.011707060737535357,
      "learning_rate": 2.9285714285714288e-05,
      "loss": 0.0001,
      "num_tokens": 534336.0,
      "reward": 1.1906249523162842,
      "reward_std": 0.09417471289634705,
      "rewards/oai_reward_function/mean": 0.5953124985098839,
      "rewards/oai_reward_function/std": 0.2836897447705269,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12007096596062183,
      "epoch": 0.44285714285714284,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12164021283388138,
      "kl": 0.015199759975075722,
      "learning_rate": 2.857142857142857e-05,
      "loss": 0.0002,
      "num_tokens": 552288.0,
      "reward": 1.459375023841858,
      "reward_std": 0.23513765633106232,
      "rewards/oai_reward_function/mean": 0.729687511920929,
      "rewards/oai_reward_function/std": 0.31374088674783707,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12509393319487572,
      "epoch": 0.45714285714285713,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09083209186792374,
      "kl": 0.01757637900300324,
      "learning_rate": 2.785714285714286e-05,
      "loss": 0.0002,
      "num_tokens": 570160.0,
      "reward": 1.076562523841858,
      "reward_std": 0.04446931555867195,
      "rewards/oai_reward_function/mean": 0.5382812470197678,
      "rewards/oai_reward_function/std": 0.07267481088638306,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12463634088635445,
      "epoch": 0.4714285714285714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.002431818749755621,
      "kl": 0.014958202606067061,
      "learning_rate": 2.714285714285714e-05,
      "loss": 0.0001,
      "num_tokens": 587872.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11686164513230324,
      "epoch": 0.4857142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.05484098196029663,
      "kl": 0.010952673037536442,
      "learning_rate": 2.642857142857143e-05,
      "loss": 0.0001,
      "num_tokens": 605824.0,
      "reward": 1.0968749523162842,
      "reward_std": 0.09722718596458435,
      "rewards/oai_reward_function/mean": 0.5484375022351742,
      "rewards/oai_reward_function/std": 0.0920066386461258,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12041523866355419,
      "epoch": 0.5,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05225397273898125,
      "kl": 0.006640716805122793,
      "learning_rate": 2.5714285714285714e-05,
      "loss": 0.0001,
      "num_tokens": 623624.0,
      "reward": 1.0046875476837158,
      "reward_std": 0.0093002924695611,
      "rewards/oai_reward_function/mean": 0.5023437500931323,
      "rewards/oai_reward_function/std": 0.009753772988915443,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12032002210617065,
      "epoch": 0.5142857142857142,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07161174714565277,
      "kl": 0.010428835870698094,
      "learning_rate": 2.5e-05,
      "loss": 0.0001,
      "num_tokens": 641400.0,
      "reward": 1.0437500476837158,
      "reward_std": 0.052891530096530914,
      "rewards/oai_reward_function/mean": 0.521874999627471,
      "rewards/oai_reward_function/std": 0.04741290956735611,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.13013662584125996,
      "epoch": 0.5285714285714286,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05804259702563286,
      "kl": 0.01170262903906405,
      "learning_rate": 2.4285714285714288e-05,
      "loss": 0.0001,
      "num_tokens": 659192.0,
      "reward": 1.0812499523162842,
      "reward_std": 0.07288689911365509,
      "rewards/oai_reward_function/mean": 0.5406250022351742,
      "rewards/oai_reward_function/std": 0.09954533725976944,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09727449901401997,
      "epoch": 0.5428571428571428,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.07038255035877228,
      "kl": 0.009029814857058227,
      "learning_rate": 2.357142857142857e-05,
      "loss": 0.0001,
      "num_tokens": 677088.0,
      "reward": 1.423437476158142,
      "reward_std": 0.03818885609507561,
      "rewards/oai_reward_function/mean": 0.711718738079071,
      "rewards/oai_reward_function/std": 0.18094559013843536,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11586509644985199,
      "epoch": 0.5571428571428572,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08595240861177444,
      "kl": 0.011346436338499188,
      "learning_rate": 2.2857142857142858e-05,
      "loss": 0.0001,
      "num_tokens": 694920.0,
      "reward": 1.3796875476837158,
      "reward_std": 0.049540840089321136,
      "rewards/oai_reward_function/mean": 0.6898437440395355,
      "rewards/oai_reward_function/std": 0.20018735527992249,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.1129021979868412,
      "epoch": 0.5714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.03896208480000496,
      "kl": 0.011648714076727629,
      "learning_rate": 2.214285714285714e-05,
      "loss": 0.0001,
      "num_tokens": 712560.0,
      "reward": 1.0031249523162842,
      "reward_std": 0.008838832378387451,
      "rewards/oai_reward_function/mean": 0.5015625000232831,
      "rewards/oai_reward_function/std": 0.008838835172355175,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12671913765370846,
      "epoch": 0.5857142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.060481104999780655,
      "kl": 0.009595283307135105,
      "learning_rate": 2.1428571428571428e-05,
      "loss": 0.0001,
      "num_tokens": 730352.0,
      "reward": 1.037500023841858,
      "reward_std": 0.026726119220256805,
      "rewards/oai_reward_function/mean": 0.5187500007450581,
      "rewards/oai_reward_function/std": 0.0416397787630558,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.1355144940316677,
      "epoch": 0.6,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10250900685787201,
      "kl": 0.010706432163715363,
      "learning_rate": 2.0714285714285718e-05,
      "loss": 0.0001,
      "num_tokens": 748080.0,
      "reward": 0.971875011920929,
      "reward_std": 0.16737449169158936,
      "rewards/oai_reward_function/mean": 0.48593750037252903,
      "rewards/oai_reward_function/std": 0.18062228709459305,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11447549611330032,
      "epoch": 0.6142857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07029257714748383,
      "kl": 0.011148489313200116,
      "learning_rate": 2e-05,
      "loss": 0.0001,
      "num_tokens": 765848.0,
      "reward": 1.029687523841858,
      "reward_std": 0.06395581364631653,
      "rewards/oai_reward_function/mean": 0.5148437507450581,
      "rewards/oai_reward_function/std": 0.05420219525694847,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12138544581830502,
      "epoch": 0.6285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07489942759275436,
      "kl": 0.009310122113674879,
      "learning_rate": 1.928571428571429e-05,
      "loss": 0.0001,
      "num_tokens": 783552.0,
      "reward": 1.015625,
      "reward_std": 0.03808925300836563,
      "rewards/oai_reward_function/mean": 0.5078125,
      "rewards/oai_reward_function/std": 0.02870701625943184,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11241224221885204,
      "epoch": 0.6428571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06217681244015694,
      "kl": 0.015002928674221039,
      "learning_rate": 1.8571428571428572e-05,
      "loss": 0.0001,
      "num_tokens": 801392.0,
      "reward": 1.171875,
      "reward_std": 0.12756596505641937,
      "rewards/oai_reward_function/mean": 0.5859375,
      "rewards/oai_reward_function/std": 0.13151375949382782,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10430784896016121,
      "epoch": 0.6571428571428571,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.07851167023181915,
      "kl": 0.013715020613744855,
      "learning_rate": 1.785714285714286e-05,
      "loss": 0.0001,
      "num_tokens": 819120.0,
      "reward": 1.1765625476837158,
      "reward_std": 0.11721621453762054,
      "rewards/oai_reward_function/mean": 0.5882812440395355,
      "rewards/oai_reward_function/std": 0.23893966525793076,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09174064546823502,
      "epoch": 0.6714285714285714,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08239107578992844,
      "kl": 0.0339348167181015,
      "learning_rate": 1.7142857142857145e-05,
      "loss": 0.0003,
      "num_tokens": 836976.0,
      "reward": 1.1734375953674316,
      "reward_std": 0.07495103776454926,
      "rewards/oai_reward_function/mean": 0.5867187529802322,
      "rewards/oai_reward_function/std": 0.09024705737829208,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12333916500210762,
      "epoch": 0.6857142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06266991049051285,
      "kl": 0.011174799175933003,
      "learning_rate": 1.642857142857143e-05,
      "loss": 0.0001,
      "num_tokens": 854808.0,
      "reward": 1.021875023841858,
      "reward_std": 0.03390507400035858,
      "rewards/oai_reward_function/mean": 0.5109375007450581,
      "rewards/oai_reward_function/std": 0.03753358870744705,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11699695512652397,
      "epoch": 0.7,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06871096044778824,
      "kl": 0.011643779696896672,
      "learning_rate": 1.5714285714285715e-05,
      "loss": 0.0001,
      "num_tokens": 872616.0,
      "reward": 1.2109375,
      "reward_std": 0.020290398970246315,
      "rewards/oai_reward_function/mean": 0.6054687574505806,
      "rewards/oai_reward_function/std": 0.18247121572494507,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11293753050267696,
      "epoch": 0.7142857142857143,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.03746737167239189,
      "kl": 0.008202132536098361,
      "learning_rate": 1.5e-05,
      "loss": 0.0001,
      "num_tokens": 890472.0,
      "reward": 1.0281250476837158,
      "reward_std": 0.008838837966322899,
      "rewards/oai_reward_function/mean": 0.514062499627471,
      "rewards/oai_reward_function/std": 0.026133574545383453,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.12108040601015091,
      "epoch": 0.7285714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09108272194862366,
      "kl": 0.009169791359454393,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 0.0001,
      "num_tokens": 908352.0,
      "reward": 1.126562476158142,
      "reward_std": 0.1380167454481125,
      "rewards/oai_reward_function/mean": 0.5632812455296516,
      "rewards/oai_reward_function/std": 0.1459098607301712,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10115997679531574,
      "epoch": 0.7428571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06731049716472626,
      "kl": 0.007746399496681988,
      "learning_rate": 1.357142857142857e-05,
      "loss": 0.0001,
      "num_tokens": 926032.0,
      "reward": 1.045312523841858,
      "reward_std": 0.04133228585124016,
      "rewards/oai_reward_function/mean": 0.5226562507450581,
      "rewards/oai_reward_function/std": 0.04369957000017166,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.114451814442873,
      "epoch": 0.7571428571428571,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016380356391891837,
      "kl": 0.008339080261066556,
      "learning_rate": 1.2857142857142857e-05,
      "loss": 0.0001,
      "num_tokens": 943784.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.1146883126348257,
      "epoch": 0.7714285714285715,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0652991309762001,
      "kl": 0.014742115745320916,
      "learning_rate": 1.2142857142857144e-05,
      "loss": 0.0001,
      "num_tokens": 961592.0,
      "reward": 1.162500023841858,
      "reward_std": 0.09099893271923065,
      "rewards/oai_reward_function/mean": 0.5812500044703484,
      "rewards/oai_reward_function/std": 0.11896733194589615,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0963958241045475,
      "epoch": 0.7857142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06369847059249878,
      "kl": 0.00836158636957407,
      "learning_rate": 1.1428571428571429e-05,
      "loss": 0.0001,
      "num_tokens": 979312.0,
      "reward": 1.1375000476837158,
      "reward_std": 0.055009134113788605,
      "rewards/oai_reward_function/mean": 0.5687500014901161,
      "rewards/oai_reward_function/std": 0.12556324899196625,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11964921839535236,
      "epoch": 0.8,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10623525083065033,
      "kl": 0.008312122779898345,
      "learning_rate": 1.0714285714285714e-05,
      "loss": 0.0001,
      "num_tokens": 997072.0,
      "reward": 1.0031249523162842,
      "reward_std": 0.11129148304462433,
      "rewards/oai_reward_function/mean": 0.501562500372529,
      "rewards/oai_reward_function/std": 0.12565238773822784,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10028179734945297,
      "epoch": 0.8142857142857143,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05555571988224983,
      "kl": 0.012380573665723205,
      "learning_rate": 1e-05,
      "loss": 0.0001,
      "num_tokens": 1014864.0,
      "reward": 1.015625,
      "reward_std": 0.0265165027230978,
      "rewards/oai_reward_function/mean": 0.5078125,
      "rewards/oai_reward_function/std": 0.02870701625943184,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09480222314596176,
      "epoch": 0.8285714285714286,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09733164310455322,
      "kl": 0.010292174993082881,
      "learning_rate": 9.285714285714286e-06,
      "loss": 0.0001,
      "num_tokens": 1032656.0,
      "reward": 1.125,
      "reward_std": 0.10169674456119537,
      "rewards/oai_reward_function/mean": 0.5624999962747097,
      "rewards/oai_reward_function/std": 0.10375995188951492,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10845490545034409,
      "epoch": 0.8428571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07499091327190399,
      "kl": 0.009870404610410333,
      "learning_rate": 8.571428571428573e-06,
      "loss": 0.0001,
      "num_tokens": 1050472.0,
      "reward": 1.0187499523162842,
      "reward_std": 0.02493581920862198,
      "rewards/oai_reward_function/mean": 0.509375000372529,
      "rewards/oai_reward_function/std": 0.019827887415885925,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11447742953896523,
      "epoch": 0.8571428571428571,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05116976425051689,
      "kl": 0.005832118098624051,
      "learning_rate": 7.857142857142858e-06,
      "loss": 0.0001,
      "num_tokens": 1068288.0,
      "reward": 1.0343749523162842,
      "reward_std": 0.029693374410271645,
      "rewards/oai_reward_function/mean": 0.517187500372529,
      "rewards/oai_reward_function/std": 0.04136652871966362,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10109574533998966,
      "epoch": 0.8714285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06300117075443268,
      "kl": 0.007976277614943683,
      "learning_rate": 7.142857142857143e-06,
      "loss": 0.0001,
      "num_tokens": 1086200.0,
      "reward": 1.0109374523162842,
      "reward_std": 0.023685520514845848,
      "rewards/oai_reward_function/mean": 0.5054687499068677,
      "rewards/oai_reward_function/std": 0.01765984110534191,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10328171029686928,
      "epoch": 0.8857142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.041226889938116074,
      "kl": 0.00717292504850775,
      "learning_rate": 6.428571428571429e-06,
      "loss": 0.0001,
      "num_tokens": 1104056.0,
      "reward": 1.09375,
      "reward_std": 0.03720119222998619,
      "rewards/oai_reward_function/mean": 0.546875,
      "rewards/oai_reward_function/std": 0.08974651247262955,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09696869738399982,
      "epoch": 0.9,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.03586564213037491,
      "kl": 0.009956882800906897,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 0.0001,
      "num_tokens": 1121872.0,
      "reward": 1.1218750476837158,
      "reward_std": 0.031160593032836914,
      "rewards/oai_reward_function/mean": 0.5609375014901161,
      "rewards/oai_reward_function/std": 0.11124978214502335,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10472088679671288,
      "epoch": 0.9142857142857143,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.045453496277332306,
      "kl": 0.008746590930968523,
      "learning_rate": 5e-06,
      "loss": 0.0001,
      "num_tokens": 1139616.0,
      "reward": 1.0046875476837158,
      "reward_std": 0.00930030457675457,
      "rewards/oai_reward_function/mean": 0.5023437500931323,
      "rewards/oai_reward_function/std": 0.009753772988915443,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09775208681821823,
      "epoch": 0.9285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07111279666423798,
      "kl": 0.007679712725803256,
      "learning_rate": 4.285714285714286e-06,
      "loss": 0.0001,
      "num_tokens": 1157472.0,
      "reward": 1.0890624523162842,
      "reward_std": 0.04253753647208214,
      "rewards/oai_reward_function/mean": 0.5445312485098839,
      "rewards/oai_reward_function/std": 0.08174862712621689,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10566045716404915,
      "epoch": 0.9428571428571428,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09820882230997086,
      "kl": 0.005833235685713589,
      "learning_rate": 3.5714285714285714e-06,
      "loss": 0.0001,
      "num_tokens": 1175344.0,
      "reward": 1.3125,
      "reward_std": 0.0763113722205162,
      "rewards/oai_reward_function/mean": 0.65625,
      "rewards/oai_reward_function/std": 0.199495330452919,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09616570547223091,
      "epoch": 0.9571428571428572,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.060254957526922226,
      "kl": 0.005365552264265716,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 0.0001,
      "num_tokens": 1193248.0,
      "reward": 1.0703125,
      "reward_std": 0.026579536497592926,
      "rewards/oai_reward_function/mean": 0.53515625,
      "rewards/oai_reward_function/std": 0.06377232819795609,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09799160063266754,
      "epoch": 0.9714285714285714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0033068626653403044,
      "kl": 0.010790573665872216,
      "learning_rate": 2.142857142857143e-06,
      "loss": 0.0001,
      "num_tokens": 1210992.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09545023553073406,
      "epoch": 0.9857142857142858,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05336514860391617,
      "kl": 0.005807638866826892,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 0.0001,
      "num_tokens": 1228816.0,
      "reward": 1.0125000476837158,
      "reward_std": 0.013363069854676723,
      "rewards/oai_reward_function/mean": 0.5062500000931323,
      "rewards/oai_reward_function/std": 0.016800537705421448,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10368440486490726,
      "epoch": 1.0,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.050165776163339615,
      "kl": 0.008004623581655324,
      "learning_rate": 7.142857142857143e-07,
      "loss": 0.0001,
      "num_tokens": 1246584.0,
      "reward": 1.017187476158142,
      "reward_std": 0.017598580569028854,
      "rewards/oai_reward_function/mean": 0.5085937501862645,
      "rewards/oai_reward_function/std": 0.022548669949173927,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09804531745612621,
      "epoch": 1.0142857142857142,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.05341744422912598,
      "kl": 0.01710776425898075,
      "learning_rate": 0.0,
      "loss": 0.0002,
      "num_tokens": 1264416.0,
      "reward": 1.0875000953674316,
      "reward_std": 0.03174196928739548,
      "rewards/oai_reward_function/mean": 0.5437500029802322,
      "rewards/oai_reward_function/std": 0.0375671461224556,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09854021109640598,
      "epoch": 1.0285714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06506258249282837,
      "kl": 0.009508747374638915,
      "learning_rate": 4.4928571428571434e-05,
      "loss": 0.0001,
      "num_tokens": 1282296.0,
      "reward": 1.0406250953674316,
      "reward_std": 0.0222018975764513,
      "rewards/oai_reward_function/mean": 0.5203125011175871,
      "rewards/oai_reward_function/std": 0.035603947937488556,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08583058044314384,
      "epoch": 1.042857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07061073184013367,
      "kl": 0.005951485480181873,
      "learning_rate": 4.485714285714286e-05,
      "loss": 0.0001,
      "num_tokens": 1300008.0,
      "reward": 1.0234375,
      "reward_std": 0.01804211549460888,
      "rewards/oai_reward_function/mean": 0.51171875,
      "rewards/oai_reward_function/std": 0.020064787939190865,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09507345780730247,
      "epoch": 1.0571428571428572,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.05930415913462639,
      "kl": 0.007875082548707724,
      "learning_rate": 4.478571428571429e-05,
      "loss": 0.0001,
      "num_tokens": 1317832.0,
      "reward": 1.234375,
      "reward_std": 0.01088879257440567,
      "rewards/oai_reward_function/mean": 0.6171875,
      "rewards/oai_reward_function/std": 0.20452910661697388,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09059166349470615,
      "epoch": 1.0714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04293040931224823,
      "kl": 0.01093563821632415,
      "learning_rate": 4.471428571428571e-05,
      "loss": 0.0001,
      "num_tokens": 1335584.0,
      "reward": 1.0281250476837158,
      "reward_std": 0.008838837966322899,
      "rewards/oai_reward_function/mean": 0.514062499627471,
      "rewards/oai_reward_function/std": 0.026133574545383453,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08499786630272865,
      "epoch": 1.0857142857142856,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06890492141246796,
      "kl": 0.007947787176817656,
      "learning_rate": 4.464285714285715e-05,
      "loss": 0.0001,
      "num_tokens": 1353400.0,
      "reward": 1.0734374523162842,
      "reward_std": 0.03388907015323639,
      "rewards/oai_reward_function/mean": 0.5367187522351742,
      "rewards/oai_reward_function/std": 0.05607611685991287,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08190344646573067,
      "epoch": 1.1,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07835045456886292,
      "kl": 0.010214838432148099,
      "learning_rate": 4.4571428571428574e-05,
      "loss": 0.0001,
      "num_tokens": 1371176.0,
      "reward": 1.21875,
      "reward_std": 0.03153933212161064,
      "rewards/oai_reward_function/mean": 0.609375,
      "rewards/oai_reward_function/std": 0.12727762758731842,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09901309013366699,
      "epoch": 1.1142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.07858891785144806,
      "kl": 0.006452999892644584,
      "learning_rate": 4.4500000000000004e-05,
      "loss": 0.0001,
      "num_tokens": 1388952.0,
      "reward": 1.053125023841858,
      "reward_std": 0.028757737949490547,
      "rewards/oai_reward_function/mean": 0.5265625007450581,
      "rewards/oai_reward_function/std": 0.02905604988336563,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09012427926063538,
      "epoch": 1.1285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.08503545820713043,
      "kl": 0.01038876292295754,
      "learning_rate": 4.442857142857143e-05,
      "loss": 0.0001,
      "num_tokens": 1406744.0,
      "reward": 1.084375023841858,
      "reward_std": 0.07080081105232239,
      "rewards/oai_reward_function/mean": 0.5421874970197678,
      "rewards/oai_reward_function/std": 0.09233474731445312,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07716062478721142,
      "epoch": 1.1428571428571428,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.0729447677731514,
      "kl": 0.012507579056546092,
      "learning_rate": 4.435714285714286e-05,
      "loss": 0.0001,
      "num_tokens": 1424568.0,
      "reward": 1.2468750476837158,
      "reward_std": 0.03139737993478775,
      "rewards/oai_reward_function/mean": 0.6234374940395355,
      "rewards/oai_reward_function/std": 0.19123300909996033,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09090105071663857,
      "epoch": 1.157142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.07926075905561447,
      "kl": 0.010987127898260951,
      "learning_rate": 4.428571428571428e-05,
      "loss": 0.0001,
      "num_tokens": 1442480.0,
      "reward": 1.0984375476837158,
      "reward_std": 0.0697232112288475,
      "rewards/oai_reward_function/mean": 0.5492187514901161,
      "rewards/oai_reward_function/std": 0.06006864085793495,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08928278088569641,
      "epoch": 1.1714285714285715,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08457206189632416,
      "kl": 0.004951049922965467,
      "learning_rate": 4.4214285714285714e-05,
      "loss": 0.0,
      "num_tokens": 1460344.0,
      "reward": 1.0875000953674316,
      "reward_std": 0.04518735408782959,
      "rewards/oai_reward_function/mean": 0.5437499992549419,
      "rewards/oai_reward_function/std": 0.07156093418598175,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08692280948162079,
      "epoch": 1.1857142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09246931225061417,
      "kl": 0.015749768121168017,
      "learning_rate": 4.4142857142857144e-05,
      "loss": 0.0002,
      "num_tokens": 1478248.0,
      "reward": 1.264062523841858,
      "reward_std": 0.03826536983251572,
      "rewards/oai_reward_function/mean": 0.6320312470197678,
      "rewards/oai_reward_function/std": 0.17668935656547546,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08908558450639248,
      "epoch": 1.2,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05387440696358681,
      "kl": 0.0058196637546643615,
      "learning_rate": 4.4071428571428575e-05,
      "loss": 0.0001,
      "num_tokens": 1496112.0,
      "reward": 1.0078125,
      "reward_std": 0.011451572179794312,
      "rewards/oai_reward_function/mean": 0.50390625,
      "rewards/oai_reward_function/std": 0.012872475199401379,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08258137106895447,
      "epoch": 1.2142857142857142,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05658518522977829,
      "kl": 0.004440092074219137,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 0.0,
      "num_tokens": 1513752.0,
      "reward": 1.001562476158142,
      "reward_std": 0.004419416189193726,
      "rewards/oai_reward_function/mean": 0.5007812500116415,
      "rewards/oai_reward_function/std": 0.0044194175861775875,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07238267548382282,
      "epoch": 1.2285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07421422004699707,
      "kl": 0.006456690724007785,
      "learning_rate": 4.392857142857143e-05,
      "loss": 0.0001,
      "num_tokens": 1531512.0,
      "reward": 1.048437476158142,
      "reward_std": 0.023024337366223335,
      "rewards/oai_reward_function/mean": 0.5242187492549419,
      "rewards/oai_reward_function/std": 0.030772563070058823,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08944158256053925,
      "epoch": 1.2428571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06827304512262344,
      "kl": 0.006732087349519134,
      "learning_rate": 4.385714285714286e-05,
      "loss": 0.0001,
      "num_tokens": 1549400.0,
      "reward": 1.1703124046325684,
      "reward_std": 0.06780597567558289,
      "rewards/oai_reward_function/mean": 0.5851562544703484,
      "rewards/oai_reward_function/std": 0.15987133979797363,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08645510673522949,
      "epoch": 1.2571428571428571,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06966782361268997,
      "kl": 0.008698969963006675,
      "learning_rate": 4.3785714285714284e-05,
      "loss": 0.0001,
      "num_tokens": 1567168.0,
      "reward": 1.0187499523162842,
      "reward_std": 0.018725106492638588,
      "rewards/oai_reward_function/mean": 0.509375000372529,
      "rewards/oai_reward_function/std": 0.01878357119858265,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10576100833714008,
      "epoch": 1.2714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.04759611934423447,
      "kl": 0.009460748406127095,
      "learning_rate": 4.371428571428572e-05,
      "loss": 0.0001,
      "num_tokens": 1585032.0,
      "reward": 1.0812499523162842,
      "reward_std": 0.07165143638849258,
      "rewards/oai_reward_function/mean": 0.5406249985098839,
      "rewards/oai_reward_function/std": 0.0987318754196167,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07568562775850296,
      "epoch": 1.2857142857142856,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.052769020199775696,
      "kl": 0.005130159552209079,
      "learning_rate": 4.3642857142857146e-05,
      "loss": 0.0001,
      "num_tokens": 1602784.0,
      "reward": 1.0562500953674316,
      "reward_std": 0.03836483508348465,
      "rewards/oai_reward_function/mean": 0.5281250011175871,
      "rewards/oai_reward_function/std": 0.03952847048640251,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08074977435171604,
      "epoch": 1.3,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07572436332702637,
      "kl": 0.00757291610352695,
      "learning_rate": 4.3571428571428576e-05,
      "loss": 0.0001,
      "num_tokens": 1620496.0,
      "reward": 1.0546875,
      "reward_std": 0.032445792108774185,
      "rewards/oai_reward_function/mean": 0.52734375,
      "rewards/oai_reward_function/std": 0.03321446478366852,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07088322378695011,
      "epoch": 1.3142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.07621358335018158,
      "kl": 0.005998459528200328,
      "learning_rate": 4.35e-05,
      "loss": 0.0001,
      "num_tokens": 1638464.0,
      "reward": 1.2015624046325684,
      "reward_std": 0.11482575535774231,
      "rewards/oai_reward_function/mean": 0.6007812544703484,
      "rewards/oai_reward_function/std": 0.26282399147748947,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06381132267415524,
      "epoch": 1.3285714285714285,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.030587607994675636,
      "kl": 0.007369687547907233,
      "learning_rate": 4.342857142857143e-05,
      "loss": 0.0001,
      "num_tokens": 1656176.0,
      "reward": 1.0031249523162842,
      "reward_std": 0.008838832378387451,
      "rewards/oai_reward_function/mean": 0.5015625000232831,
      "rewards/oai_reward_function/std": 0.008838835172355175,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07285293377935886,
      "epoch": 1.342857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0500265508890152,
      "kl": 0.006194314104504883,
      "learning_rate": 4.3357142857142855e-05,
      "loss": 0.0001,
      "num_tokens": 1674088.0,
      "reward": 1.109375,
      "reward_std": 0.04590248316526413,
      "rewards/oai_reward_function/mean": 0.5546875037252903,
      "rewards/oai_reward_function/std": 0.07967613637447357,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.086557412520051,
      "epoch": 1.3571428571428572,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07414961606264114,
      "kl": 0.010996793280355632,
      "learning_rate": 4.328571428571429e-05,
      "loss": 0.0001,
      "num_tokens": 1692048.0,
      "reward": 1.0671875476837158,
      "reward_std": 0.03708447515964508,
      "rewards/oai_reward_function/mean": 0.5335937514901161,
      "rewards/oai_reward_function/std": 0.04561823233962059,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08167718537151814,
      "epoch": 1.3714285714285714,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0010896283201873302,
      "kl": 0.004855156294070184,
      "learning_rate": 4.3214285714285716e-05,
      "loss": 0.0,
      "num_tokens": 1709760.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07798840664327145,
      "epoch": 1.3857142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09227096289396286,
      "kl": 0.014819784788414836,
      "learning_rate": 4.314285714285715e-05,
      "loss": 0.0001,
      "num_tokens": 1727480.0,
      "reward": 1.2296874523162842,
      "reward_std": 0.1029118224978447,
      "rewards/oai_reward_function/mean": 0.6148437485098839,
      "rewards/oai_reward_function/std": 0.12918156385421753,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07132465578615665,
      "epoch": 1.4,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07690515369176865,
      "kl": 0.0082227170933038,
      "learning_rate": 4.307142857142857e-05,
      "loss": 0.0001,
      "num_tokens": 1745336.0,
      "reward": 1.037500023841858,
      "reward_std": 0.023145508021116257,
      "rewards/oai_reward_function/mean": 0.5187500007450581,
      "rewards/oai_reward_function/std": 0.030453559011220932,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08682013675570488,
      "epoch": 1.4142857142857144,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09899340569972992,
      "kl": 0.007965923519805074,
      "learning_rate": 4.3e-05,
      "loss": 0.0001,
      "num_tokens": 1763136.0,
      "reward": 1.1531250476837158,
      "reward_std": 0.13869836926460266,
      "rewards/oai_reward_function/mean": 0.5765625014901161,
      "rewards/oai_reward_function/std": 0.2821534648537636,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08919607102870941,
      "epoch": 1.4285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.080818772315979,
      "kl": 0.007981272647157311,
      "learning_rate": 4.292857142857143e-05,
      "loss": 0.0001,
      "num_tokens": 1780920.0,
      "reward": 1.2093749046325684,
      "reward_std": 0.020411580801010132,
      "rewards/oai_reward_function/mean": 0.6046875044703484,
      "rewards/oai_reward_function/std": 0.18110741674900055,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07852962799370289,
      "epoch": 1.4428571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09296028316020966,
      "kl": 0.012157463701441884,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 0.0001,
      "num_tokens": 1798872.0,
      "reward": 1.5265624523162842,
      "reward_std": 0.04206090793013573,
      "rewards/oai_reward_function/mean": 0.7632812559604645,
      "rewards/oai_reward_function/std": 0.22771519422531128,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.086120730265975,
      "epoch": 1.457142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.0995735228061676,
      "kl": 0.012111627496778965,
      "learning_rate": 4.278571428571429e-05,
      "loss": 0.0001,
      "num_tokens": 1816744.0,
      "reward": 1.0968749523162842,
      "reward_std": 0.049927353858947754,
      "rewards/oai_reward_function/mean": 0.5484374985098839,
      "rewards/oai_reward_function/std": 0.049974795430898666,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07998536713421345,
      "epoch": 1.4714285714285715,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06524667888879776,
      "kl": 0.006055153091438115,
      "learning_rate": 4.271428571428572e-05,
      "loss": 0.0001,
      "num_tokens": 1834456.0,
      "reward": 1.0125000476837158,
      "reward_std": 0.02314549870789051,
      "rewards/oai_reward_function/mean": 0.5062500000931323,
      "rewards/oai_reward_function/std": 0.016800537705421448,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07001950591802597,
      "epoch": 1.4857142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11485958099365234,
      "kl": 0.00914135156199336,
      "learning_rate": 4.264285714285715e-05,
      "loss": 0.0001,
      "num_tokens": 1852408.0,
      "reward": 1.1859374046325684,
      "reward_std": 0.08434940874576569,
      "rewards/oai_reward_function/mean": 0.5929687544703484,
      "rewards/oai_reward_function/std": 0.09821683913469315,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07728070393204689,
      "epoch": 1.5,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07163766771554947,
      "kl": 0.006072127376683056,
      "learning_rate": 4.257142857142857e-05,
      "loss": 0.0001,
      "num_tokens": 1870208.0,
      "reward": 1.0125000476837158,
      "reward_std": 0.02314549870789051,
      "rewards/oai_reward_function/mean": 0.5062500000931323,
      "rewards/oai_reward_function/std": 0.016800537705421448,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08027334697544575,
      "epoch": 1.5142857142857142,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07313752919435501,
      "kl": 0.011275349417701364,
      "learning_rate": 4.25e-05,
      "loss": 0.0001,
      "num_tokens": 1887984.0,
      "reward": 1.0593750476837158,
      "reward_std": 0.022558562457561493,
      "rewards/oai_reward_function/mean": 0.529687499627471,
      "rewards/oai_reward_function/std": 0.03386256843805313,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08169634826481342,
      "epoch": 1.5285714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09352786093950272,
      "kl": 0.014267339138314128,
      "learning_rate": 4.242857142857143e-05,
      "loss": 0.0001,
      "num_tokens": 1905776.0,
      "reward": 1.115625023841858,
      "reward_std": 0.055196452885866165,
      "rewards/oai_reward_function/mean": 0.5578125044703484,
      "rewards/oai_reward_function/std": 0.09427942335605621,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05949794687330723,
      "epoch": 1.5428571428571427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.08965466171503067,
      "kl": 0.014173903269693255,
      "learning_rate": 4.2357142857142864e-05,
      "loss": 0.0001,
      "num_tokens": 1923672.0,
      "reward": 1.357812523841858,
      "reward_std": 0.07401138544082642,
      "rewards/oai_reward_function/mean": 0.6789062470197678,
      "rewards/oai_reward_function/std": 0.1690949946641922,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08150264620780945,
      "epoch": 1.5571428571428572,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08684907853603363,
      "kl": 0.015842870343476534,
      "learning_rate": 4.228571428571429e-05,
      "loss": 0.0002,
      "num_tokens": 1941504.0,
      "reward": 1.334375023841858,
      "reward_std": 0.03491953760385513,
      "rewards/oai_reward_function/mean": 0.6671874970197678,
      "rewards/oai_reward_function/std": 0.18364998698234558,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06982677057385445,
      "epoch": 1.5714285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06661536544561386,
      "kl": 0.008391729556024075,
      "learning_rate": 4.221428571428572e-05,
      "loss": 0.0001,
      "num_tokens": 1959144.0,
      "reward": 1.03125,
      "reward_std": 0.019731827080249786,
      "rewards/oai_reward_function/mean": 0.515625,
      "rewards/oai_reward_function/std": 0.025988519191741943,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09021224454045296,
      "epoch": 1.5857142857142859,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05824963003396988,
      "kl": 0.008994318312034011,
      "learning_rate": 4.214285714285714e-05,
      "loss": 0.0001,
      "num_tokens": 1976936.0,
      "reward": 1.037500023841858,
      "reward_std": 0.013363069854676723,
      "rewards/oai_reward_function/mean": 0.5187500007450581,
      "rewards/oai_reward_function/std": 0.0353553369641304,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09183148294687271,
      "epoch": 1.6,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.08340641111135483,
      "kl": 0.010920959059149027,
      "learning_rate": 4.2071428571428574e-05,
      "loss": 0.0001,
      "num_tokens": 1994664.0,
      "reward": 1.0390625,
      "reward_std": 0.027564914897084236,
      "rewards/oai_reward_function/mean": 0.51953125,
      "rewards/oai_reward_function/std": 0.0395205020904541,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08369805663824081,
      "epoch": 1.6142857142857143,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.09711972624063492,
      "kl": 0.009947408339940012,
      "learning_rate": 4.2e-05,
      "loss": 0.0001,
      "num_tokens": 2012432.0,
      "reward": 1.0703125,
      "reward_std": 0.02308514341711998,
      "rewards/oai_reward_function/mean": 0.53515625,
      "rewards/oai_reward_function/std": 0.04438621550798416,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.096822340041399,
      "epoch": 1.6285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07382018864154816,
      "kl": 0.017859197221696377,
      "learning_rate": 4.192857142857143e-05,
      "loss": 0.0002,
      "num_tokens": 2030136.0,
      "reward": 1.0359375476837158,
      "reward_std": 0.035533398389816284,
      "rewards/oai_reward_function/mean": 0.517968749627471,
      "rewards/oai_reward_function/std": 0.03252441808581352,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08726120926439762,
      "epoch": 1.6428571428571428,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0782691165804863,
      "kl": 0.012930417666211724,
      "learning_rate": 4.185714285714286e-05,
      "loss": 0.0001,
      "num_tokens": 2047976.0,
      "reward": 1.1749999523162842,
      "reward_std": 0.08762745559215546,
      "rewards/oai_reward_function/mean": 0.5874999985098839,
      "rewards/oai_reward_function/std": 0.1177750751376152,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07392177730798721,
      "epoch": 1.657142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.07505157589912415,
      "kl": 0.012273511849343777,
      "learning_rate": 4.178571428571429e-05,
      "loss": 0.0001,
      "num_tokens": 2065704.0,
      "reward": 1.1953125,
      "reward_std": 0.06742400676012039,
      "rewards/oai_reward_function/mean": 0.59765625,
      "rewards/oai_reward_function/std": 0.13019207119941711,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.065590625628829,
      "epoch": 1.6714285714285713,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08693981915712357,
      "kl": 0.02323699276894331,
      "learning_rate": 4.1714285714285714e-05,
      "loss": 0.0002,
      "num_tokens": 2083560.0,
      "reward": 1.1328125,
      "reward_std": 0.04593653976917267,
      "rewards/oai_reward_function/mean": 0.56640625,
      "rewards/oai_reward_function/std": 0.04902656376361847,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09886737167835236,
      "epoch": 1.6857142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05793704837560654,
      "kl": 0.015232619596645236,
      "learning_rate": 4.1642857142857144e-05,
      "loss": 0.0002,
      "num_tokens": 2101392.0,
      "reward": 1.03125,
      "reward_std": 0.011572758667171001,
      "rewards/oai_reward_function/mean": 0.515625,
      "rewards/oai_reward_function/std": 0.029614457860589027,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09572554007172585,
      "epoch": 1.7,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.03639426827430725,
      "kl": 0.017274728044867516,
      "learning_rate": 4.1571428571428575e-05,
      "loss": 0.0002,
      "num_tokens": 2119200.0,
      "reward": 1.2000000476837158,
      "reward_std": 0.018898215144872665,
      "rewards/oai_reward_function/mean": 0.5999999940395355,
      "rewards/oai_reward_function/std": 0.17689070105552673,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08603023178875446,
      "epoch": 1.7142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10388008505105972,
      "kl": 0.022097071167081594,
      "learning_rate": 4.15e-05,
      "loss": 0.0002,
      "num_tokens": 2137056.0,
      "reward": 1.0578125715255737,
      "reward_std": 0.04522190988063812,
      "rewards/oai_reward_function/mean": 0.528906250372529,
      "rewards/oai_reward_function/std": 0.03971134498715401,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0950616579502821,
      "epoch": 1.7285714285714286,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09005559235811234,
      "kl": 0.015552334254607558,
      "learning_rate": 4.1428571428571437e-05,
      "loss": 0.0002,
      "num_tokens": 2154936.0,
      "reward": 1.268125057220459,
      "reward_std": 0.0341712087392807,
      "rewards/oai_reward_function/mean": 0.6340624988079071,
      "rewards/oai_reward_function/std": 0.2080029398202896,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07687668316066265,
      "epoch": 1.7428571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.08279760181903839,
      "kl": 0.009949938859790564,
      "learning_rate": 4.135714285714286e-05,
      "loss": 0.0001,
      "num_tokens": 2172616.0,
      "reward": 1.0625,
      "reward_std": 0.04204372316598892,
      "rewards/oai_reward_function/mean": 0.53125,
      "rewards/oai_reward_function/std": 0.04353345185518265,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08731912076473236,
      "epoch": 1.7571428571428571,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06586393713951111,
      "kl": 0.008837034576572478,
      "learning_rate": 4.128571428571429e-05,
      "loss": 0.0001,
      "num_tokens": 2190368.0,
      "reward": 1.0187499523162842,
      "reward_std": 0.021777570247650146,
      "rewards/oai_reward_function/mean": 0.509375000372529,
      "rewards/oai_reward_function/std": 0.019827887415885925,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08952882327139378,
      "epoch": 1.7714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.09899036586284637,
      "kl": 0.013539569219574332,
      "learning_rate": 4.1214285714285715e-05,
      "loss": 0.0001,
      "num_tokens": 2208176.0,
      "reward": 1.1140625476837158,
      "reward_std": 0.06264616549015045,
      "rewards/oai_reward_function/mean": 0.5570312514901161,
      "rewards/oai_reward_function/std": 0.04455622285604477,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07565303146839142,
      "epoch": 1.7857142857142856,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08115291595458984,
      "kl": 0.009995393920689821,
      "learning_rate": 4.1142857142857146e-05,
      "loss": 0.0001,
      "num_tokens": 2225896.0,
      "reward": 1.1749999523162842,
      "reward_std": 0.06661029160022736,
      "rewards/oai_reward_function/mean": 0.5874999985098839,
      "rewards/oai_reward_function/std": 0.1399884670972824,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09428555145859718,
      "epoch": 1.8,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06345849484205246,
      "kl": 0.01248577213846147,
      "learning_rate": 4.107142857142857e-05,
      "loss": 0.0001,
      "num_tokens": 2243656.0,
      "reward": 1.0390625,
      "reward_std": 0.02052600309252739,
      "rewards/oai_reward_function/mean": 0.51953125,
      "rewards/oai_reward_function/std": 0.0395205020904541,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07595096342265606,
      "epoch": 1.8142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09258188307285309,
      "kl": 0.00919699075166136,
      "learning_rate": 4.1e-05,
      "loss": 0.0001,
      "num_tokens": 2261448.0,
      "reward": 1.0343749523162842,
      "reward_std": 0.03808924928307533,
      "rewards/oai_reward_function/mean": 0.517187500372529,
      "rewards/oai_reward_function/std": 0.029400940984487534,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08019419759511948,
      "epoch": 1.8285714285714287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13585439324378967,
      "kl": 0.019885767716914415,
      "learning_rate": 4.092857142857143e-05,
      "loss": 0.0002,
      "num_tokens": 2279240.0,
      "reward": 1.3046875,
      "reward_std": 0.1113169863820076,
      "rewards/oai_reward_function/mean": 0.65234375,
      "rewards/oai_reward_function/std": 0.17964564263820648,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09248529188334942,
      "epoch": 1.842857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08260349929332733,
      "kl": 0.013745760545134544,
      "learning_rate": 4.085714285714286e-05,
      "loss": 0.0001,
      "num_tokens": 2297056.0,
      "reward": 1.0734375715255737,
      "reward_std": 0.034589797258377075,
      "rewards/oai_reward_function/mean": 0.5367187522351742,
      "rewards/oai_reward_function/std": 0.03359169885516167,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09751161187887192,
      "epoch": 1.8571428571428572,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06302473694086075,
      "kl": 0.011824949877336621,
      "learning_rate": 4.0785714285714286e-05,
      "loss": 0.0001,
      "num_tokens": 2314872.0,
      "reward": 1.0203125476837158,
      "reward_std": 0.013258256018161774,
      "rewards/oai_reward_function/mean": 0.510156249627471,
      "rewards/oai_reward_function/std": 0.021867798641324043,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09040896967053413,
      "epoch": 1.8714285714285714,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09831514209508896,
      "kl": 0.01766400644555688,
      "learning_rate": 4.0714285714285717e-05,
      "loss": 0.0002,
      "num_tokens": 2332784.0,
      "reward": 1.0187499523162842,
      "reward_std": 0.028380058705806732,
      "rewards/oai_reward_function/mean": 0.509375000372529,
      "rewards/oai_reward_function/std": 0.01878357119858265,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09301545284688473,
      "epoch": 1.8857142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.08136897534132004,
      "kl": 0.02844544965773821,
      "learning_rate": 4.064285714285714e-05,
      "loss": 0.0003,
      "num_tokens": 2350640.0,
      "reward": 1.060937523841858,
      "reward_std": 0.03093591332435608,
      "rewards/oai_reward_function/mean": 0.5304687507450581,
      "rewards/oai_reward_function/std": 0.054895199835300446,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08573882840573788,
      "epoch": 1.9,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0735137015581131,
      "kl": 0.023084456101059914,
      "learning_rate": 4.057142857142857e-05,
      "loss": 0.0002,
      "num_tokens": 2368456.0,
      "reward": 1.0734374523162842,
      "reward_std": 0.02894335612654686,
      "rewards/oai_reward_function/mean": 0.5367187522351742,
      "rewards/oai_reward_function/std": 0.05461905151605606,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10109273716807365,
      "epoch": 1.9142857142857141,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.10477015376091003,
      "kl": 0.03489594021812081,
      "learning_rate": 4.05e-05,
      "loss": 0.0003,
      "num_tokens": 2386200.0,
      "reward": 1.0046875476837158,
      "reward_std": 0.11875393241643906,
      "rewards/oai_reward_function/mean": 0.5023437514901161,
      "rewards/oai_reward_function/std": 0.11234594881534576,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08120713755488396,
      "epoch": 1.9285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10714246332645416,
      "kl": 0.024183190893381834,
      "learning_rate": 4.042857142857143e-05,
      "loss": 0.0002,
      "num_tokens": 2404056.0,
      "reward": 1.09375,
      "reward_std": 0.05726175755262375,
      "rewards/oai_reward_function/mean": 0.546875,
      "rewards/oai_reward_function/std": 0.05982164293527603,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.089906245470047,
      "epoch": 1.9428571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12431693077087402,
      "kl": 0.038056795950978994,
      "learning_rate": 4.035714285714286e-05,
      "loss": 0.0004,
      "num_tokens": 2421928.0,
      "reward": 1.3406250476837158,
      "reward_std": 0.05260005593299866,
      "rewards/oai_reward_function/mean": 0.6703125089406967,
      "rewards/oai_reward_function/std": 0.19317355751991272,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09174446761608124,
      "epoch": 1.9571428571428573,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07566652446985245,
      "kl": 0.023516141809523106,
      "learning_rate": 4.028571428571429e-05,
      "loss": 0.0002,
      "num_tokens": 2439832.0,
      "reward": 1.0640625953674316,
      "reward_std": 0.026437407359480858,
      "rewards/oai_reward_function/mean": 0.5320312529802322,
      "rewards/oai_reward_function/std": 0.04027845337986946,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0811004675924778,
      "epoch": 1.9714285714285715,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1272999793291092,
      "kl": 0.035705497954040766,
      "learning_rate": 4.021428571428572e-05,
      "loss": 0.0004,
      "num_tokens": 2457576.0,
      "reward": 1.0421874523162842,
      "reward_std": 0.04250866919755936,
      "rewards/oai_reward_function/mean": 0.521093750372529,
      "rewards/oai_reward_function/std": 0.03971134498715401,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08940452709794044,
      "epoch": 1.9857142857142858,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.10179316252470016,
      "kl": 0.03542056027799845,
      "learning_rate": 4.014285714285714e-05,
      "loss": 0.0004,
      "num_tokens": 2475400.0,
      "reward": 1.0484375953674316,
      "reward_std": 0.03541836887598038,
      "rewards/oai_reward_function/mean": 0.5242187511175871,
      "rewards/oai_reward_function/std": 0.04375720024108887,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08982641063630581,
      "epoch": 2.0,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05322287976741791,
      "kl": 0.024960508104413748,
      "learning_rate": 4.007142857142857e-05,
      "loss": 0.0002,
      "num_tokens": 2493168.0,
      "reward": 1.0421874523162842,
      "reward_std": 0.024032622575759888,
      "rewards/oai_reward_function/mean": 0.521093750372529,
      "rewards/oai_reward_function/std": 0.04358407482504845,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07347713969647884,
      "epoch": 2.0142857142857142,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.054996706545352936,
      "kl": 0.029410825110971928,
      "learning_rate": 4e-05,
      "loss": 0.0003,
      "num_tokens": 2510968.0,
      "reward": 1.2937500476837158,
      "reward_std": 0.006681524682790041,
      "rewards/oai_reward_function/mean": 0.6468750089406967,
      "rewards/oai_reward_function/std": 0.2041652947664261,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11200576089322567,
      "epoch": 2.0285714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07548272609710693,
      "kl": 0.03502320311963558,
      "learning_rate": 3.9928571428571434e-05,
      "loss": 0.0004,
      "num_tokens": 2528744.0,
      "reward": 1.095312476158142,
      "reward_std": 0.0437462255358696,
      "rewards/oai_reward_function/mean": 0.5476562529802322,
      "rewards/oai_reward_function/std": 0.05692360922694206,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09486313536763191,
      "epoch": 2.0428571428571427,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05399833247065544,
      "kl": 0.03851825185120106,
      "learning_rate": 3.985714285714286e-05,
      "loss": 0.0004,
      "num_tokens": 2546488.0,
      "reward": 1.0125000476837158,
      "reward_std": 0.01336306519806385,
      "rewards/oai_reward_function/mean": 0.5062500000931323,
      "rewards/oai_reward_function/std": 0.016800537705421448,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08246604166924953,
      "epoch": 2.057142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.042957283556461334,
      "kl": 0.03783240728080273,
      "learning_rate": 3.978571428571429e-05,
      "loss": 0.0004,
      "num_tokens": 2564176.0,
      "reward": 1.0234375,
      "reward_std": 0.01695253700017929,
      "rewards/oai_reward_function/mean": 0.51171875,
      "rewards/oai_reward_function/std": 0.026169713586568832,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10432570241391659,
      "epoch": 2.0714285714285716,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09850599616765976,
      "kl": 0.037014870904386044,
      "learning_rate": 3.971428571428571e-05,
      "loss": 0.0004,
      "num_tokens": 2581944.0,
      "reward": 1.0250000953674316,
      "reward_std": 0.15622428059577942,
      "rewards/oai_reward_function/mean": 0.5124999992549419,
      "rewards/oai_reward_function/std": 0.17416272684931755,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09751013852655888,
      "epoch": 2.085714285714286,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.060189735144376755,
      "kl": 0.050427704118192196,
      "learning_rate": 3.964285714285714e-05,
      "loss": 0.0005,
      "num_tokens": 2599616.0,
      "reward": 1.0265624523162842,
      "reward_std": 0.008010865189135075,
      "rewards/oai_reward_function/mean": 0.513281250372529,
      "rewards/oai_reward_function/std": 0.024580655619502068,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.1012180857360363,
      "epoch": 2.1,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09098206460475922,
      "kl": 0.05104807484894991,
      "learning_rate": 3.9571428571428574e-05,
      "loss": 0.0005,
      "num_tokens": 2617576.0,
      "reward": 1.2890625,
      "reward_std": 0.033694587647914886,
      "rewards/oai_reward_function/mean": 0.6445312350988388,
      "rewards/oai_reward_function/std": 0.1918431520462036,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08674592711031437,
      "epoch": 2.1142857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0632624626159668,
      "kl": 0.035351223312318325,
      "learning_rate": 3.9500000000000005e-05,
      "loss": 0.0004,
      "num_tokens": 2635248.0,
      "reward": 1.0265624523162842,
      "reward_std": 0.01813914254307747,
      "rewards/oai_reward_function/mean": 0.513281250372529,
      "rewards/oai_reward_function/std": 0.021982740610837936,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11086461879312992,
      "epoch": 2.1285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13065817952156067,
      "kl": 0.06040171813219786,
      "learning_rate": 3.942857142857143e-05,
      "loss": 0.0006,
      "num_tokens": 2653096.0,
      "reward": 1.037500023841858,
      "reward_std": 0.14793866872787476,
      "rewards/oai_reward_function/mean": 0.5187500044703484,
      "rewards/oai_reward_function/std": 0.12740343809127808,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.11983237601816654,
      "epoch": 2.142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11544425040483475,
      "kl": 0.06588536128401756,
      "learning_rate": 3.935714285714286e-05,
      "loss": 0.0007,
      "num_tokens": 2670944.0,
      "reward": 1.0812499523162842,
      "reward_std": 0.035140641033649445,
      "rewards/oai_reward_function/mean": 0.5406249985098839,
      "rewards/oai_reward_function/std": 0.023546453565359116,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08963452652096748,
      "epoch": 2.157142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.06575662642717361,
      "kl": 0.05113219376653433,
      "learning_rate": 3.928571428571429e-05,
      "loss": 0.0005,
      "num_tokens": 2688680.0,
      "reward": 1.154687523841858,
      "reward_std": 0.03592789173126221,
      "rewards/oai_reward_function/mean": 0.5773437470197678,
      "rewards/oai_reward_function/std": 0.13520585000514984,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10959535092115402,
      "epoch": 2.1714285714285713,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09100169688463211,
      "kl": 0.04979555029422045,
      "learning_rate": 3.9214285714285714e-05,
      "loss": 0.0005,
      "num_tokens": 2706528.0,
      "reward": 1.3046875,
      "reward_std": 0.032156482338905334,
      "rewards/oai_reward_function/mean": 0.6523437350988388,
      "rewards/oai_reward_function/std": 0.1942063421010971,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10072515532374382,
      "epoch": 2.185714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11344427615404129,
      "kl": 0.08104220405220985,
      "learning_rate": 3.9142857142857145e-05,
      "loss": 0.0008,
      "num_tokens": 2724424.0,
      "reward": 1.3984375,
      "reward_std": 0.06430189311504364,
      "rewards/oai_reward_function/mean": 0.69921875,
      "rewards/oai_reward_function/std": 0.19055142998695374,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.13380656391382217,
      "epoch": 2.2,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10259576886892319,
      "kl": 0.047852903604507446,
      "learning_rate": 3.9071428571428575e-05,
      "loss": 0.0005,
      "num_tokens": 2742272.0,
      "reward": 1.0578124523162842,
      "reward_std": 0.026579542085528374,
      "rewards/oai_reward_function/mean": 0.5289062485098839,
      "rewards/oai_reward_function/std": 0.05354730039834976,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.1082126721739769,
      "epoch": 2.2142857142857144,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.10249610245227814,
      "kl": 0.07078076247125864,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 0.0007,
      "num_tokens": 2760088.0,
      "reward": 1.0593750476837158,
      "reward_std": 0.036339618265628815,
      "rewards/oai_reward_function/mean": 0.5296875014901161,
      "rewards/oai_reward_function/std": 0.04327215999364853,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10915260016918182,
      "epoch": 2.2285714285714286,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08330399543046951,
      "kl": 0.07353132590651512,
      "learning_rate": 3.892857142857143e-05,
      "loss": 0.0007,
      "num_tokens": 2777936.0,
      "reward": 1.25,
      "reward_std": 0.046066030859947205,
      "rewards/oai_reward_function/mean": 0.625,
      "rewards/oai_reward_function/std": 0.1287345290184021,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08624540269374847,
      "epoch": 2.242857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.0943220779299736,
      "kl": 0.06203949544578791,
      "learning_rate": 3.885714285714286e-05,
      "loss": 0.0006,
      "num_tokens": 2795664.0,
      "reward": 1.024999976158142,
      "reward_std": 0.023145508021116257,
      "rewards/oai_reward_function/mean": 0.5125000001862645,
      "rewards/oai_reward_function/std": 0.02199706807732582,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09054199792444706,
      "epoch": 2.257142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07400333881378174,
      "kl": 0.04232563078403473,
      "learning_rate": 3.8785714285714285e-05,
      "loss": 0.0004,
      "num_tokens": 2813352.0,
      "reward": 1.0499999523162842,
      "reward_std": 0.0258774571120739,
      "rewards/oai_reward_function/mean": 0.5250000022351742,
      "rewards/oai_reward_function/std": 0.03810004144906998,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.10258343070745468,
      "epoch": 2.2714285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09145762026309967,
      "kl": 0.07726636342704296,
      "learning_rate": 3.8714285714285715e-05,
      "loss": 0.0008,
      "num_tokens": 2831304.0,
      "reward": 1.0593750476837158,
      "reward_std": 0.03966484218835831,
      "rewards/oai_reward_function/mean": 0.5296875014901161,
      "rewards/oai_reward_function/std": 0.05057631433010101,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08886106871068478,
      "epoch": 2.2857142857142856,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.050053730607032776,
      "kl": 0.0593466404825449,
      "learning_rate": 3.8642857142857146e-05,
      "loss": 0.0006,
      "num_tokens": 2849216.0,
      "reward": 1.0031249523162842,
      "reward_std": 0.008838832378387451,
      "rewards/oai_reward_function/mean": 0.5015625000232831,
      "rewards/oai_reward_function/std": 0.008838835172355175,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08666450530290604,
      "epoch": 2.3,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09668877720832825,
      "kl": 0.037179723381996155,
      "learning_rate": 3.857142857142858e-05,
      "loss": 0.0004,
      "num_tokens": 2867016.0,
      "reward": 1.0671875476837158,
      "reward_std": 0.034592773765325546,
      "rewards/oai_reward_function/mean": 0.5335937514901161,
      "rewards/oai_reward_function/std": 0.038942355662584305,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08739319443702698,
      "epoch": 2.314285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05101403221487999,
      "kl": 0.016047978308051825,
      "learning_rate": 3.85e-05,
      "loss": 0.0002,
      "num_tokens": 2884784.0,
      "reward": 1.001562476158142,
      "reward_std": 0.004419416189193726,
      "rewards/oai_reward_function/mean": 0.5007812500116415,
      "rewards/oai_reward_function/std": 0.0044194175861775875,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07174593396484852,
      "epoch": 2.3285714285714287,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09460754692554474,
      "kl": 0.031096406280994415,
      "learning_rate": 3.842857142857143e-05,
      "loss": 0.0003,
      "num_tokens": 2902576.0,
      "reward": 1.0315624475479126,
      "reward_std": 0.03596320003271103,
      "rewards/oai_reward_function/mean": 0.5157812498509884,
      "rewards/oai_reward_function/std": 0.02667333371937275,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0965243298560381,
      "epoch": 2.342857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.10237058997154236,
      "kl": 0.025380919221788645,
      "learning_rate": 3.8357142857142855e-05,
      "loss": 0.0003,
      "num_tokens": 2920416.0,
      "reward": 1.071874976158142,
      "reward_std": 0.045641690492630005,
      "rewards/oai_reward_function/mean": 0.5359374992549419,
      "rewards/oai_reward_function/std": 0.04396548494696617,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0735629927366972,
      "epoch": 2.357142857142857,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.001382152666337788,
      "kl": 0.011448808014392853,
      "learning_rate": 3.8285714285714286e-05,
      "loss": 0.0001,
      "num_tokens": 2938224.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07639794796705246,
      "epoch": 2.3714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05336588993668556,
      "kl": 0.0100309734698385,
      "learning_rate": 3.821428571428572e-05,
      "loss": 0.0001,
      "num_tokens": 2956000.0,
      "reward": 1.0187499523162842,
      "reward_std": 0.017677675932645798,
      "rewards/oai_reward_function/mean": 0.509375000372529,
      "rewards/oai_reward_function/std": 0.023546453565359116,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0797042902559042,
      "epoch": 2.3857142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09860816597938538,
      "kl": 0.02236688695847988,
      "learning_rate": 3.814285714285715e-05,
      "loss": 0.0002,
      "num_tokens": 2973728.0,
      "reward": 1.0421874523162842,
      "reward_std": 0.03380424156785011,
      "rewards/oai_reward_function/mean": 0.5210937485098839,
      "rewards/oai_reward_function/std": 0.03052588365972042,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.09247681871056557,
      "epoch": 2.4,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11807835847139359,
      "kl": 0.028510943986475468,
      "learning_rate": 3.807142857142857e-05,
      "loss": 0.0003,
      "num_tokens": 2991648.0,
      "reward": 1.0703125,
      "reward_std": 0.04926247149705887,
      "rewards/oai_reward_function/mean": 0.53515625,
      "rewards/oai_reward_function/std": 0.036400206387043,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0499194972217083,
      "epoch": 2.414285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11116263270378113,
      "kl": 0.04615373630076647,
      "learning_rate": 3.8e-05,
      "loss": 0.0005,
      "num_tokens": 3009472.0,
      "reward": 1.131250023841858,
      "reward_std": 0.0681503415107727,
      "rewards/oai_reward_function/mean": 0.5656249970197678,
      "rewards/oai_reward_function/std": 0.06772513687610626,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07691787928342819,
      "epoch": 2.4285714285714284,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.043317168951034546,
      "kl": 0.026559457648545504,
      "learning_rate": 3.792857142857143e-05,
      "loss": 0.0003,
      "num_tokens": 3027312.0,
      "reward": 1.0812499523162842,
      "reward_std": 0.013363059610128403,
      "rewards/oai_reward_function/mean": 0.5406249985098839,
      "rewards/oai_reward_function/std": 0.04867187887430191,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08763985149562359,
      "epoch": 2.442857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11738862097263336,
      "kl": 0.028244417626410723,
      "learning_rate": 3.785714285714286e-05,
      "loss": 0.0003,
      "num_tokens": 3045248.0,
      "reward": 1.2421875,
      "reward_std": 0.038010139018297195,
      "rewards/oai_reward_function/mean": 0.62109375,
      "rewards/oai_reward_function/std": 0.17051976919174194,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07618978433310986,
      "epoch": 2.4571428571428573,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06447312235832214,
      "kl": 0.009853521827608347,
      "learning_rate": 3.778571428571429e-05,
      "loss": 0.0001,
      "num_tokens": 3063104.0,
      "reward": 1.0125000476837158,
      "reward_std": 0.013363069854676723,
      "rewards/oai_reward_function/mean": 0.5062500000931323,
      "rewards/oai_reward_function/std": 0.016800537705421448,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05904076434671879,
      "epoch": 2.4714285714285715,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.08978112041950226,
      "kl": 0.03310586418956518,
      "learning_rate": 3.771428571428572e-05,
      "loss": 0.0003,
      "num_tokens": 3081032.0,
      "reward": 1.109375,
      "reward_std": 0.04799327254295349,
      "rewards/oai_reward_function/mean": 0.5546875,
      "rewards/oai_reward_function/std": 0.05903713405132294,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0563393235206604,
      "epoch": 2.4857142857142858,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11673219501972198,
      "kl": 0.01918662153184414,
      "learning_rate": 3.764285714285715e-05,
      "loss": 0.0002,
      "num_tokens": 3098880.0,
      "reward": 1.1156249046325684,
      "reward_std": 0.04642024636268616,
      "rewards/oai_reward_function/mean": 0.5578125007450581,
      "rewards/oai_reward_function/std": 0.04554221034049988,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.054440722800791264,
      "epoch": 2.5,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0016670229379087687,
      "kl": 0.011914134491235018,
      "learning_rate": 3.757142857142857e-05,
      "loss": 0.0001,
      "num_tokens": 3116528.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07713918946683407,
      "epoch": 2.5142857142857142,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09991607069969177,
      "kl": 0.014265456004068255,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.0001,
      "num_tokens": 3134280.0,
      "reward": 1.0140624046325684,
      "reward_std": 0.026196977123618126,
      "rewards/oai_reward_function/mean": 0.5070312502793968,
      "rewards/oai_reward_function/std": 0.017079481855034828,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.060492053627967834,
      "epoch": 2.5285714285714285,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11897552013397217,
      "kl": 0.03578268736600876,
      "learning_rate": 3.742857142857143e-05,
      "loss": 0.0004,
      "num_tokens": 3152096.0,
      "reward": 1.09375,
      "reward_std": 0.05294632539153099,
      "rewards/oai_reward_function/mean": 0.546875,
      "rewards/oai_reward_function/std": 0.044336508959531784,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.062405264005064964,
      "epoch": 2.5428571428571427,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09012026339769363,
      "kl": 0.017184360651299357,
      "learning_rate": 3.735714285714286e-05,
      "loss": 0.0002,
      "num_tokens": 3169776.0,
      "reward": 1.2531249523162842,
      "reward_std": 0.029978279024362564,
      "rewards/oai_reward_function/mean": 0.6265625059604645,
      "rewards/oai_reward_function/std": 0.17049944400787354,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06565988063812256,
      "epoch": 2.557142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.126982182264328,
      "kl": 0.038017953746020794,
      "learning_rate": 3.728571428571428e-05,
      "loss": 0.0004,
      "num_tokens": 3187712.0,
      "reward": 1.3125,
      "reward_std": 0.041240036487579346,
      "rewards/oai_reward_function/mean": 0.65625,
      "rewards/oai_reward_function/std": 0.1866512894630432,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.040435372851789,
      "epoch": 2.571428571428571,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07750152051448822,
      "kl": 0.03242550138384104,
      "learning_rate": 3.721428571428572e-05,
      "loss": 0.0003,
      "num_tokens": 3205592.0,
      "reward": 1.256250023841858,
      "reward_std": 0.025646153837442398,
      "rewards/oai_reward_function/mean": 0.628125011920929,
      "rewards/oai_reward_function/std": 0.18651622533798218,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.047216037288308144,
      "epoch": 2.585714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12098561972379684,
      "kl": 0.026824071537703276,
      "learning_rate": 3.7142857142857143e-05,
      "loss": 0.0003,
      "num_tokens": 3223480.0,
      "reward": 1.0093750953674316,
      "reward_std": 0.19897010922431946,
      "rewards/oai_reward_function/mean": 0.5046875029802322,
      "rewards/oai_reward_function/std": 0.21528521552681923,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08010220341384411,
      "epoch": 2.6,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11290978640317917,
      "kl": 0.022963001858443022,
      "learning_rate": 3.7071428571428574e-05,
      "loss": 0.0002,
      "num_tokens": 3241304.0,
      "reward": 1.1468751430511475,
      "reward_std": 0.08732541650533676,
      "rewards/oai_reward_function/mean": 0.5734374970197678,
      "rewards/oai_reward_function/std": 0.11707756668329239,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06848571076989174,
      "epoch": 2.6142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.10999053716659546,
      "kl": 0.024655529763549566,
      "learning_rate": 3.7e-05,
      "loss": 0.0002,
      "num_tokens": 3259048.0,
      "reward": 1.165624976158142,
      "reward_std": 0.0838727056980133,
      "rewards/oai_reward_function/mean": 0.5828125029802322,
      "rewards/oai_reward_function/std": 0.14652389287948608,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.049521847628057,
      "epoch": 2.6285714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10953173786401749,
      "kl": 0.02982867369428277,
      "learning_rate": 3.692857142857143e-05,
      "loss": 0.0003,
      "num_tokens": 3276808.0,
      "reward": 1.171875,
      "reward_std": 0.061461035162210464,
      "rewards/oai_reward_function/mean": 0.5859375,
      "rewards/oai_reward_function/std": 0.1271488517522812,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07211006805300713,
      "epoch": 2.642857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1074090376496315,
      "kl": 0.036498697474598885,
      "learning_rate": 3.685714285714286e-05,
      "loss": 0.0004,
      "num_tokens": 3294808.0,
      "reward": 1.162500023841858,
      "reward_std": 0.13342483341693878,
      "rewards/oai_reward_function/mean": 0.5812499970197678,
      "rewards/oai_reward_function/std": 0.1636282056570053,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.04203084297478199,
      "epoch": 2.657142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.07064300775527954,
      "kl": 0.02704466599971056,
      "learning_rate": 3.678571428571429e-05,
      "loss": 0.0003,
      "num_tokens": 3312640.0,
      "reward": 1.0750000476837158,
      "reward_std": 0.018898211419582367,
      "rewards/oai_reward_function/mean": 0.5375000014901161,
      "rewards/oai_reward_function/std": 0.06839166581630707,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.054328473284840584,
      "epoch": 2.6714285714285713,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06820113956928253,
      "kl": 0.022003832273185253,
      "learning_rate": 3.671428571428572e-05,
      "loss": 0.0002,
      "num_tokens": 3330408.0,
      "reward": 1.1531250476837158,
      "reward_std": 0.0646936446428299,
      "rewards/oai_reward_function/mean": 0.5765625014901161,
      "rewards/oai_reward_function/std": 0.14809781312942505,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.051017552614212036,
      "epoch": 2.685714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.08293981850147247,
      "kl": 0.02239195117726922,
      "learning_rate": 3.6642857142857145e-05,
      "loss": 0.0002,
      "num_tokens": 3348064.0,
      "reward": 1.0109374523162842,
      "reward_std": 0.017358144745230675,
      "rewards/oai_reward_function/mean": 0.505468750372529,
      "rewards/oai_reward_function/std": 0.015206077136099339,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.049897488206624985,
      "epoch": 2.7,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09536179155111313,
      "kl": 0.05423136055469513,
      "learning_rate": 3.6571428571428576e-05,
      "loss": 0.0005,
      "num_tokens": 3365896.0,
      "reward": 1.1484375,
      "reward_std": 0.06608611345291138,
      "rewards/oai_reward_function/mean": 0.57421875,
      "rewards/oai_reward_function/std": 0.10268264263868332,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05249054729938507,
      "epoch": 2.7142857142857144,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.13304099440574646,
      "kl": 0.04763131029903889,
      "learning_rate": 3.65e-05,
      "loss": 0.0005,
      "num_tokens": 3383800.0,
      "reward": 1.0828125476837158,
      "reward_std": 0.04099529981613159,
      "rewards/oai_reward_function/mean": 0.5414062514901161,
      "rewards/oai_reward_function/std": 0.04943608492612839,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.062329867854714394,
      "epoch": 2.7285714285714286,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11173869669437408,
      "kl": 0.03640593169257045,
      "learning_rate": 3.642857142857143e-05,
      "loss": 0.0004,
      "num_tokens": 3401648.0,
      "reward": 1.0562500953674316,
      "reward_std": 0.023689784109592438,
      "rewards/oai_reward_function/mean": 0.5281250011175871,
      "rewards/oai_reward_function/std": 0.0274963341653347,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.058776866644620895,
      "epoch": 2.742857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.10382409393787384,
      "kl": 0.03305045561864972,
      "learning_rate": 3.6357142857142854e-05,
      "loss": 0.0003,
      "num_tokens": 3419408.0,
      "reward": 1.217187523841858,
      "reward_std": 0.02610759809613228,
      "rewards/oai_reward_function/mean": 0.6085937470197678,
      "rewards/oai_reward_function/std": 0.1844356507062912,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.04789746552705765,
      "epoch": 2.757142857142857,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.05706682801246643,
      "kl": 0.03994939010590315,
      "learning_rate": 3.628571428571429e-05,
      "loss": 0.0004,
      "num_tokens": 3437112.0,
      "reward": 1.015625,
      "reward_std": 0.01860060542821884,
      "rewards/oai_reward_function/mean": 0.5078125,
      "rewards/oai_reward_function/std": 0.022394467145204544,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.062057855539023876,
      "epoch": 2.7714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15330444276332855,
      "kl": 0.084334472194314,
      "learning_rate": 3.6214285714285716e-05,
      "loss": 0.0008,
      "num_tokens": 3454904.0,
      "reward": 1.470312476158142,
      "reward_std": 0.04739333689212799,
      "rewards/oai_reward_function/mean": 0.735156238079071,
      "rewards/oai_reward_function/std": 0.19775548577308655,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.04913834575563669,
      "epoch": 2.7857142857142856,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11146184056997299,
      "kl": 0.06489744689315557,
      "learning_rate": 3.6142857142857146e-05,
      "loss": 0.0006,
      "num_tokens": 3472632.0,
      "reward": 1.0703125,
      "reward_std": 0.045694079250097275,
      "rewards/oai_reward_function/mean": 0.53515625,
      "rewards/oai_reward_function/std": 0.04438621550798416,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.057510885410010815,
      "epoch": 2.8,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1315256953239441,
      "kl": 0.06008041184395552,
      "learning_rate": 3.607142857142857e-05,
      "loss": 0.0006,
      "num_tokens": 3490592.0,
      "reward": 1.060937523841858,
      "reward_std": 0.02575094997882843,
      "rewards/oai_reward_function/mean": 0.5304687507450581,
      "rewards/oai_reward_function/std": 0.04522986710071564,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05490284040570259,
      "epoch": 2.814285714285714,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1558970808982849,
      "kl": 0.0605736318975687,
      "learning_rate": 3.6e-05,
      "loss": 0.0006,
      "num_tokens": 3508408.0,
      "reward": 1.1359374523162842,
      "reward_std": 0.16093073785305023,
      "rewards/oai_reward_function/mean": 0.5679687559604645,
      "rewards/oai_reward_function/std": 0.12798601388931274,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05418549384921789,
      "epoch": 2.8285714285714287,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.060409124940633774,
      "kl": 0.07027391903102398,
      "learning_rate": 3.5928571428571425e-05,
      "loss": 0.0007,
      "num_tokens": 3526168.0,
      "reward": 1.0281250476837158,
      "reward_std": 0.008838837966322899,
      "rewards/oai_reward_function/mean": 0.514062499627471,
      "rewards/oai_reward_function/std": 0.026133574545383453,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.03633992746472359,
      "epoch": 2.842857142857143,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.003255989169701934,
      "kl": 0.07370059937238693,
      "learning_rate": 3.585714285714286e-05,
      "loss": 0.0007,
      "num_tokens": 3543864.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/oai_reward_function/mean": 0.5,
      "rewards/oai_reward_function/std": 0.0,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.057329680770635605,
      "epoch": 2.857142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11008545011281967,
      "kl": 0.06796468701213598,
      "learning_rate": 3.5785714285714286e-05,
      "loss": 0.0007,
      "num_tokens": 3561688.0,
      "reward": 1.25,
      "reward_std": 0.014625202864408493,
      "rewards/oai_reward_function/mean": 0.625,
      "rewards/oai_reward_function/std": 0.21655291318893433,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.04236162081360817,
      "epoch": 2.8714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15067800879478455,
      "kl": 0.10442700423300266,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.001,
      "num_tokens": 3579560.0,
      "reward": 1.2906250953674316,
      "reward_std": 0.06347659230232239,
      "rewards/oai_reward_function/mean": 0.6453125029802322,
      "rewards/oai_reward_function/std": 0.18144108355045319,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.04874769877642393,
      "epoch": 2.8857142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.08031316101551056,
      "kl": 0.06067081820219755,
      "learning_rate": 3.564285714285715e-05,
      "loss": 0.0006,
      "num_tokens": 3597200.0,
      "reward": 1.037500023841858,
      "reward_std": 0.019918914884328842,
      "rewards/oai_reward_function/mean": 0.5187500007450581,
      "rewards/oai_reward_function/std": 0.023759547621011734,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06220845878124237,
      "epoch": 2.9,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.12046143412590027,
      "kl": 0.05884059518575668,
      "learning_rate": 3.557142857142857e-05,
      "loss": 0.0006,
      "num_tokens": 3615112.0,
      "reward": 1.076562523841858,
      "reward_std": 0.05444490164518356,
      "rewards/oai_reward_function/mean": 0.5382812507450581,
      "rewards/oai_reward_function/std": 0.04835369065403938,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05830034799873829,
      "epoch": 2.914285714285714,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.09531212598085403,
      "kl": 0.05973371770232916,
      "learning_rate": 3.55e-05,
      "loss": 0.0006,
      "num_tokens": 3632936.0,
      "reward": 1.1078124046325684,
      "reward_std": 0.037323713302612305,
      "rewards/oai_reward_function/mean": 0.5539062544703484,
      "rewards/oai_reward_function/std": 0.07622901350259781,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05407467018812895,
      "epoch": 2.928571428571429,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.07925046980381012,
      "kl": 0.07386120036244392,
      "learning_rate": 3.5428571428571426e-05,
      "loss": 0.0007,
      "num_tokens": 3650760.0,
      "reward": 1.0140624046325684,
      "reward_std": 0.02122672274708748,
      "rewards/oai_reward_function/mean": 0.5070312502793968,
      "rewards/oai_reward_function/std": 0.017079481855034828,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06182014662772417,
      "epoch": 2.942857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11716300994157791,
      "kl": 0.06741901952773333,
      "learning_rate": 3.5357142857142864e-05,
      "loss": 0.0007,
      "num_tokens": 3668512.0,
      "reward": 1.0906250476837158,
      "reward_std": 0.055445872247219086,
      "rewards/oai_reward_function/mean": 0.5453125014901161,
      "rewards/oai_reward_function/std": 0.06968752294778824,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.059788716956973076,
      "epoch": 2.9571428571428573,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.13249847292900085,
      "kl": 0.08083864115178585,
      "learning_rate": 3.528571428571429e-05,
      "loss": 0.0008,
      "num_tokens": 3686296.0,
      "reward": 1.2609375715255737,
      "reward_std": 0.032799478620290756,
      "rewards/oai_reward_function/mean": 0.6304687410593033,
      "rewards/oai_reward_function/std": 0.16235841810703278,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06789828836917877,
      "epoch": 2.9714285714285715,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1304040104150772,
      "kl": 0.07522418349981308,
      "learning_rate": 3.521428571428572e-05,
      "loss": 0.0008,
      "num_tokens": 3704008.0,
      "reward": 1.2593750953674316,
      "reward_std": 0.05023520812392235,
      "rewards/oai_reward_function/mean": 0.6296875029802322,
      "rewards/oai_reward_function/std": 0.1668539047241211,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.04057574924081564,
      "epoch": 2.9857142857142858,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1246933788061142,
      "kl": 0.10525520890951157,
      "learning_rate": 3.514285714285714e-05,
      "loss": 0.0011,
      "num_tokens": 3721888.0,
      "reward": 1.2625000476837158,
      "reward_std": 0.03328196331858635,
      "rewards/oai_reward_function/mean": 0.6312500089406967,
      "rewards/oai_reward_function/std": 0.1866512894630432,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06538868602365255,
      "epoch": 3.0,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11987200379371643,
      "kl": 0.090326476842165,
      "learning_rate": 3.507142857142857e-05,
      "loss": 0.0009,
      "num_tokens": 3739752.0,
      "reward": 1.0437500476837158,
      "reward_std": 0.040318816900253296,
      "rewards/oai_reward_function/mean": 0.5218750014901161,
      "rewards/oai_reward_function/std": 0.03521248698234558,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06806700490415096,
      "epoch": 3.0142857142857142,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.08975692093372345,
      "kl": 0.08757120184600353,
      "learning_rate": 3.5e-05,
      "loss": 0.0009,
      "num_tokens": 3757520.0,
      "reward": 1.0203125476837158,
      "reward_std": 0.024814628064632416,
      "rewards/oai_reward_function/mean": 0.510156249627471,
      "rewards/oai_reward_function/std": 0.019938793033361435,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05651993863284588,
      "epoch": 3.0285714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10675106197595596,
      "kl": 0.09534911066293716,
      "learning_rate": 3.4928571428571434e-05,
      "loss": 0.001,
      "num_tokens": 3775296.0,
      "reward": 1.0750000476837158,
      "reward_std": 0.06767623126506805,
      "rewards/oai_reward_function/mean": 0.5375000014901161,
      "rewards/oai_reward_function/std": 0.07378040999174118,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0639553228393197,
      "epoch": 3.0428571428571427,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09777996689081192,
      "kl": 0.07890664599835873,
      "learning_rate": 3.485714285714286e-05,
      "loss": 0.0008,
      "num_tokens": 3793032.0,
      "reward": 1.0281250476837158,
      "reward_std": 0.020751874893903732,
      "rewards/oai_reward_function/mean": 0.5140625005587935,
      "rewards/oai_reward_function/std": 0.021939707919955254,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06698552891612053,
      "epoch": 3.057142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1115046888589859,
      "kl": 0.07001018989831209,
      "learning_rate": 3.478571428571429e-05,
      "loss": 0.0007,
      "num_tokens": 3810744.0,
      "reward": 1.056249976158142,
      "reward_std": 0.030470959842205048,
      "rewards/oai_reward_function/mean": 0.5281250011175871,
      "rewards/oai_reward_function/std": 0.04741290956735611,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06257231812924147,
      "epoch": 3.0714285714285716,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.10517910867929459,
      "kl": 0.07934985496103764,
      "learning_rate": 3.471428571428571e-05,
      "loss": 0.0008,
      "num_tokens": 3828496.0,
      "reward": 1.235937476158142,
      "reward_std": 0.012387894093990326,
      "rewards/oai_reward_function/mean": 0.6179687529802322,
      "rewards/oai_reward_function/std": 0.20793089270591736,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.055911571718752384,
      "epoch": 3.085714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15163163840770721,
      "kl": 0.11663151904940605,
      "learning_rate": 3.4642857142857144e-05,
      "loss": 0.0012,
      "num_tokens": 3846408.0,
      "reward": 1.2062499523162842,
      "reward_std": 0.1645711362361908,
      "rewards/oai_reward_function/mean": 0.6031250059604645,
      "rewards/oai_reward_function/std": 0.10957211256027222,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05231211241334677,
      "epoch": 3.1,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.17118224501609802,
      "kl": 0.12115776538848877,
      "learning_rate": 3.4571428571428574e-05,
      "loss": 0.0012,
      "num_tokens": 3864168.0,
      "reward": 1.0859375,
      "reward_std": 0.1350831389427185,
      "rewards/oai_reward_function/mean": 0.5429687462747097,
      "rewards/oai_reward_function/std": 0.11469355970621109,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05661669000983238,
      "epoch": 3.1142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.13312071561813354,
      "kl": 0.08971596322953701,
      "learning_rate": 3.45e-05,
      "loss": 0.0009,
      "num_tokens": 3882016.0,
      "reward": 1.2296874523162842,
      "reward_std": 0.02697797492146492,
      "rewards/oai_reward_function/mean": 0.6148437485098839,
      "rewards/oai_reward_function/std": 0.1935303658246994,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0506694195792079,
      "epoch": 3.1285714285714286,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.10720871388912201,
      "kl": 0.07849705778062344,
      "learning_rate": 3.442857142857143e-05,
      "loss": 0.0008,
      "num_tokens": 3899952.0,
      "reward": 1.1015625,
      "reward_std": 0.044115059077739716,
      "rewards/oai_reward_function/mean": 0.55078125,
      "rewards/oai_reward_function/std": 0.055534202605485916,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05330024380236864,
      "epoch": 3.142857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.12851352989673615,
      "kl": 0.0983127523213625,
      "learning_rate": 3.435714285714286e-05,
      "loss": 0.001,
      "num_tokens": 3917688.0,
      "reward": 1.365625023841858,
      "reward_std": 0.12765255570411682,
      "rewards/oai_reward_function/mean": 0.6828124970197678,
      "rewards/oai_reward_function/std": 0.20135001838207245,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05522188264876604,
      "epoch": 3.157142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.11083745956420898,
      "kl": 0.07289117947220802,
      "learning_rate": 3.428571428571429e-05,
      "loss": 0.0007,
      "num_tokens": 3935528.0,
      "reward": 1.046875,
      "reward_std": 0.0414334312081337,
      "rewards/oai_reward_function/mean": 0.5234375,
      "rewards/oai_reward_function/std": 0.039623990654945374,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05520590580999851,
      "epoch": 3.1714285714285713,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11108041554689407,
      "kl": 0.08299623243510723,
      "learning_rate": 3.4214285714285714e-05,
      "loss": 0.0008,
      "num_tokens": 3953320.0,
      "reward": 1.2531250715255737,
      "reward_std": 0.017311176285147667,
      "rewards/oai_reward_function/mean": 0.6265624910593033,
      "rewards/oai_reward_function/std": 0.18985748291015625,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05928301624953747,
      "epoch": 3.185714285714286,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.1239665076136589,
      "kl": 0.11972067691385746,
      "learning_rate": 3.4142857142857145e-05,
      "loss": 0.0012,
      "num_tokens": 3971032.0,
      "reward": 1.1640625,
      "reward_std": 0.04833199828863144,
      "rewards/oai_reward_function/mean": 0.58203125,
      "rewards/oai_reward_function/std": 0.12608151137828827,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.042910450138151646,
      "epoch": 3.2,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1018366813659668,
      "kl": 0.08956374414265156,
      "learning_rate": 3.407142857142857e-05,
      "loss": 0.0009,
      "num_tokens": 3988864.0,
      "reward": 1.2765624523162842,
      "reward_std": 0.06503090262413025,
      "rewards/oai_reward_function/mean": 0.6382812410593033,
      "rewards/oai_reward_function/std": 0.19134333729743958,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06987146660685539,
      "epoch": 3.2142857142857144,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.13029474020004272,
      "kl": 0.11290079541504383,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 0.0011,
      "num_tokens": 4006800.0,
      "reward": 1.3203125,
      "reward_std": 0.04008040949702263,
      "rewards/oai_reward_function/mean": 0.66015625,
      "rewards/oai_reward_function/std": 0.20209181308746338,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07145651057362556,
      "epoch": 3.2285714285714286,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1022149994969368,
      "kl": 0.05857388116419315,
      "learning_rate": 3.392857142857143e-05,
      "loss": 0.0006,
      "num_tokens": 4024680.0,
      "reward": 1.0499999523162842,
      "reward_std": 0.03877411410212517,
      "rewards/oai_reward_function/mean": 0.5249999985098839,
      "rewards/oai_reward_function/std": 0.0416397750377655,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.048385005444288254,
      "epoch": 3.242857142857143,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.12312141805887222,
      "kl": 0.07377888821065426,
      "learning_rate": 3.385714285714286e-05,
      "loss": 0.0007,
      "num_tokens": 4042472.0,
      "reward": 1.4500000476837158,
      "reward_std": 0.02340090088546276,
      "rewards/oai_reward_function/mean": 0.7249999940395355,
      "rewards/oai_reward_function/std": 0.23026981949806213,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0545792318880558,
      "epoch": 3.257142857142857,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09947756677865982,
      "kl": 0.09583424963057041,
      "learning_rate": 3.3785714285714285e-05,
      "loss": 0.001,
      "num_tokens": 4060248.0,
      "reward": 1.0265624523162842,
      "reward_std": 0.10836321860551834,
      "rewards/oai_reward_function/mean": 0.5132812485098839,
      "rewards/oai_reward_function/std": 0.12380600348114967,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06396409310400486,
      "epoch": 3.2714285714285714,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.0686774030327797,
      "kl": 0.09425997547805309,
      "learning_rate": 3.3714285714285716e-05,
      "loss": 0.0009,
      "num_tokens": 4077880.0,
      "reward": 1.2703125476837158,
      "reward_std": 0.017598576843738556,
      "rewards/oai_reward_function/mean": 0.6351562440395355,
      "rewards/oai_reward_function/std": 0.2018921971321106,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08321201242506504,
      "epoch": 3.2857142857142856,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 0.06190980598330498,
      "kl": 0.03877187706530094,
      "learning_rate": 3.364285714285714e-05,
      "loss": 0.0004,
      "num_tokens": 4095672.0,
      "reward": 1.0046875476837158,
      "reward_std": 0.00646935636177659,
      "rewards/oai_reward_function/mean": 0.5023437500931323,
      "rewards/oai_reward_function/std": 0.007403614930808544,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.0575382262468338,
      "epoch": 3.3,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.11537446081638336,
      "kl": 0.08915554732084274,
      "learning_rate": 3.357142857142857e-05,
      "loss": 0.0009,
      "num_tokens": 4113480.0,
      "reward": 1.2359375953674316,
      "reward_std": 0.08257875591516495,
      "rewards/oai_reward_function/mean": 0.617968738079071,
      "rewards/oai_reward_function/std": 0.156090646982193,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05642772279679775,
      "epoch": 3.314285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.13337863981723785,
      "kl": 0.10237299278378487,
      "learning_rate": 3.35e-05,
      "loss": 0.001,
      "num_tokens": 4131224.0,
      "reward": 1.4734375476837158,
      "reward_std": 0.01958364248275757,
      "rewards/oai_reward_function/mean": 0.7367187440395355,
      "rewards/oai_reward_function/std": 0.2408807873725891,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06528778094798326,
      "epoch": 3.3285714285714287,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.13227899372577667,
      "kl": 0.11398253589868546,
      "learning_rate": 3.342857142857143e-05,
      "loss": 0.0011,
      "num_tokens": 4149184.0,
      "reward": 1.3203125,
      "reward_std": 0.03324369713664055,
      "rewards/oai_reward_function/mean": 0.6601562350988388,
      "rewards/oai_reward_function/std": 0.19673332571983337,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06553995609283447,
      "epoch": 3.342857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.20257574319839478,
      "kl": 0.12704718858003616,
      "learning_rate": 3.3357142857142856e-05,
      "loss": 0.0013,
      "num_tokens": 4167104.0,
      "reward": 1.4609375,
      "reward_std": 0.1858925223350525,
      "rewards/oai_reward_function/mean": 0.73046875,
      "rewards/oai_reward_function/std": 0.1834629327058792,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.04393093287944794,
      "epoch": 3.357142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.12950977683067322,
      "kl": 0.09361258894205093,
      "learning_rate": 3.3285714285714286e-05,
      "loss": 0.0009,
      "num_tokens": 4184944.0,
      "reward": 1.1656250953674316,
      "reward_std": 0.150077685713768,
      "rewards/oai_reward_function/mean": 0.5828124955296516,
      "rewards/oai_reward_function/std": 0.13220180571079254,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06838994100689888,
      "epoch": 3.3714285714285714,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.14525781571865082,
      "kl": 0.09100262448191643,
      "learning_rate": 3.321428571428572e-05,
      "loss": 0.0009,
      "num_tokens": 4202672.0,
      "reward": 1.4187500476837158,
      "reward_std": 0.02699536457657814,
      "rewards/oai_reward_function/mean": 0.7093749940395355,
      "rewards/oai_reward_function/std": 0.21884500980377197,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05996893718838692,
      "epoch": 3.3857142857142857,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.12374205142259598,
      "kl": 0.13878228701651096,
      "learning_rate": 3.314285714285714e-05,
      "loss": 0.0014,
      "num_tokens": 4220472.0,
      "reward": 1.443750023841858,
      "reward_std": 0.060242824256420135,
      "rewards/oai_reward_function/mean": 0.721875011920929,
      "rewards/oai_reward_function/std": 0.18898604810237885,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.04969180002808571,
      "epoch": 3.4,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.14094124734401703,
      "kl": 0.11433868668973446,
      "learning_rate": 3.307142857142858e-05,
      "loss": 0.0011,
      "num_tokens": 4238352.0,
      "reward": 1.5062499046325684,
      "reward_std": 0.04232252389192581,
      "rewards/oai_reward_function/mean": 0.7531249821186066,
      "rewards/oai_reward_function/std": 0.21019864082336426,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05842717830091715,
      "epoch": 3.414285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13528573513031006,
      "kl": 0.13230286352336407,
      "learning_rate": 3.3e-05,
      "loss": 0.0013,
      "num_tokens": 4256072.0,
      "reward": 1.5250000953674316,
      "reward_std": 0.0736992210149765,
      "rewards/oai_reward_function/mean": 0.762499988079071,
      "rewards/oai_reward_function/std": 0.19999998807907104,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.053723374381661415,
      "epoch": 3.4285714285714284,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.10746068507432938,
      "kl": 0.0968917403370142,
      "learning_rate": 3.292857142857143e-05,
      "loss": 0.001,
      "num_tokens": 4273848.0,
      "reward": 1.0499999523162842,
      "reward_std": 0.017677675932645798,
      "rewards/oai_reward_function/mean": 0.525000000372529,
      "rewards/oai_reward_function/std": 0.028398092836141586,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.08724895678460598,
      "epoch": 3.442857142857143,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1673547476530075,
      "kl": 0.16037143021821976,
      "learning_rate": 3.285714285714286e-05,
      "loss": 0.0016,
      "num_tokens": 4291872.0,
      "reward": 1.603124976158142,
      "reward_std": 0.11311184614896774,
      "rewards/oai_reward_function/mean": 0.801562488079071,
      "rewards/oai_reward_function/std": 0.22014817595481873,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06301301345229149,
      "epoch": 3.4571428571428573,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.127055823802948,
      "kl": 0.10341309197247028,
      "learning_rate": 3.278571428571429e-05,
      "loss": 0.001,
      "num_tokens": 4309640.0,
      "reward": 1.265625,
      "reward_std": 0.02265283279120922,
      "rewards/oai_reward_function/mean": 0.6328125,
      "rewards/oai_reward_function/std": 0.19933734834194183,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.04968675132840872,
      "epoch": 3.4714285714285715,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1374424248933792,
      "kl": 0.1188412457704544,
      "learning_rate": 3.271428571428571e-05,
      "loss": 0.0012,
      "num_tokens": 4327360.0,
      "reward": 1.303125023841858,
      "reward_std": 0.10906177759170532,
      "rewards/oai_reward_function/mean": 0.651562511920929,
      "rewards/oai_reward_function/std": 0.2058555632829666,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06477249693125486,
      "epoch": 3.4857142857142858,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1306895911693573,
      "kl": 0.11430021747946739,
      "learning_rate": 3.264285714285714e-05,
      "loss": 0.0011,
      "num_tokens": 4345192.0,
      "reward": 1.4375,
      "reward_std": 0.1379069834947586,
      "rewards/oai_reward_function/mean": 0.71875,
      "rewards/oai_reward_function/std": 0.18447834253311157,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06405621953308582,
      "epoch": 3.5,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.1333678811788559,
      "kl": 0.17231638357043266,
      "learning_rate": 3.257142857142857e-05,
      "loss": 0.0017,
      "num_tokens": 4363064.0,
      "reward": 1.5109375715255737,
      "reward_std": 0.0335906445980072,
      "rewards/oai_reward_function/mean": 0.7554687559604645,
      "rewards/oai_reward_function/std": 0.22403325140476227,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.067863704636693,
      "epoch": 3.5142857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13658057153224945,
      "kl": 0.17316893115639687,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 0.0017,
      "num_tokens": 4380968.0,
      "reward": 1.5546875,
      "reward_std": 0.05777457728981972,
      "rewards/oai_reward_function/mean": 0.77734375,
      "rewards/oai_reward_function/std": 0.2121661901473999,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.05905670113861561,
      "epoch": 3.5285714285714285,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 0.09611335396766663,
      "kl": 0.050939660519361496,
      "learning_rate": 3.242857142857143e-05,
      "loss": 0.0005,
      "num_tokens": 4398608.0,
      "reward": 1.0125000476837158,
      "reward_std": 0.02314549870789051,
      "rewards/oai_reward_function/mean": 0.5062500000931323,
      "rewards/oai_reward_function/std": 0.016800537705421448,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07766996510326862,
      "epoch": 3.5428571428571427,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 0.13582761585712433,
      "kl": 0.12659209407866,
      "learning_rate": 3.235714285714286e-05,
      "loss": 0.0013,
      "num_tokens": 4416392.0,
      "reward": 1.5640625953674316,
      "reward_std": 0.07988262921571732,
      "rewards/oai_reward_function/mean": 0.782031238079071,
      "rewards/oai_reward_function/std": 0.2193496972322464,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.07497746869921684,
      "epoch": 3.557142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1760389357805252,
      "kl": 0.24591631814837456,
      "learning_rate": 3.228571428571428e-05,
      "loss": 0.0025,
      "num_tokens": 4434272.0,
      "reward": 1.798437476158142,
      "reward_std": 0.12221544235944748,
      "rewards/oai_reward_function/mean": 0.899218738079071,
      "rewards/oai_reward_function/std": 0.10424157232046127,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 512.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 512.0,
      "completions/min_terminated_length": 0.0,
      "entropy": 0.06046187411993742,
      "epoch": 3.571428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.15973122417926788,
      "kl": 0.22475793957710266,
      "learning_rate": 3.221428571428571e-05,
      "loss": 0.0022,
      "num_tokens": 4452104.0,
      "reward": 1.6359374523162842,
      "reward_std": 0.0966869369149208,
      "rewards/oai_reward_function/mean": 0.8179687559604645,
      "rewards/oai_reward_function/std": 0.19666926562786102,
      "step": 250
    }
  ],
  "logging_steps": 1,
  "max_steps": 700,
  "num_input_tokens_seen": 4452104,
  "num_train_epochs": 10,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}